From b855b04a0b2213dbc1b59cf936056726e7ed97ad Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 6 Mar 2012 21:24:55 +0100
Subject: block: blk-throttle should be drained regardless of q->elevator

Currently, blk_cleanup_queue() doesn't call elv_drain_elevator() if
q->elevator doesn't exist; however, bio based drivers don't have
elevator initialized but can still use blk-throttle.  This patch moves
q->elevator test inside blk_drain_queue() such that only
elv_drain_elevator() is skipped if !q->elevator.

-v2: loop can have registered queue which has NULL request_fn.  Make
     sure we don't call into __blk_run_queue() in such cases.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Vivek Goyal <vgoyal@redhat.com>

Fold in bug fix from Vivek.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3a78b00..fccb250 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -365,17 +365,24 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
 
 		spin_lock_irq(q->queue_lock);
 
-		elv_drain_elevator(q);
+		/*
+		 * The caller might be trying to drain @q before its
+		 * elevator is initialized.
+		 */
+		if (q->elevator)
+			elv_drain_elevator(q);
+
 		if (drain_all)
 			blk_throtl_drain(q);
 
 		/*
 		 * This function might be called on a queue which failed
-		 * driver init after queue creation.  Some drivers
-		 * (e.g. fd) get unhappy in such cases.  Kick queue iff
-		 * dispatch queue has something on it.
+		 * driver init after queue creation or is not yet fully
+		 * active yet.  Some drivers (e.g. fd and loop) get unhappy
+		 * in such cases.  Kick queue iff dispatch queue has
+		 * something on it and @q has request_fn set.
 		 */
-		if (!list_empty(&q->queue_head))
+		if (!list_empty(&q->queue_head) && q->request_fn)
 			__blk_run_queue(q);
 
 		drain |= q->rq.elvpriv;
@@ -428,13 +435,8 @@ void blk_cleanup_queue(struct request_queue *q)
 	spin_unlock_irq(lock);
 	mutex_unlock(&q->sysfs_lock);
 
-	/*
-	 * Drain all requests queued before DEAD marking.  The caller might
-	 * be trying to tear down @q before its elevator is initialized, in
-	 * which case we don't want to call into draining.
-	 */
-	if (q->elevator)
-		blk_drain_queue(q, true);
+	/* drain all requests queued before DEAD marking */
+	blk_drain_queue(q, true);
 
 	/* @q won't process any more request, flush async actions */
 	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
@@ -504,6 +506,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
 		    laptop_mode_timer_fn, (unsigned long) q);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
+	INIT_LIST_HEAD(&q->queue_head);
 	INIT_LIST_HEAD(&q->timeout_list);
 	INIT_LIST_HEAD(&q->icq_list);
 	INIT_LIST_HEAD(&q->flush_queue[0]);
-- 
cgit v1.1


From 32e380aedc3de454c06ce1c254fe3bea35a676e1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:14:54 -0800
Subject: blkcg: make CONFIG_BLK_CGROUP bool

Block cgroup core can be built as module; however, it isn't too useful
as blk-throttle can only be built-in and cfq-iosched is usually the
default built-in scheduler.  Scheduled blkcg cleanup requires calling
into blkcg from block core.  To simplify that, disallow building blkcg
as module by making CONFIG_BLK_CGROUP bool.

If building blkcg core as module really matters, which I doubt, we can
revisit it after blkcg API cleanup.

-v2: Vivek pointed out that IOSCHED_CFQ was incorrectly updated to
     depend on BLK_CGROUP.  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/Kconfig.iosched |  4 ----
 block/blk-cgroup.c    | 17 -----------------
 block/blk-cgroup.h    | 10 ++--------
 init/Kconfig          |  2 +-
 4 files changed, 3 insertions(+), 30 deletions(-)

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 3199b76..421bef9 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,8 +23,6 @@ config IOSCHED_DEADLINE
 
 config IOSCHED_CFQ
 	tristate "CFQ I/O scheduler"
-	# If BLK_CGROUP is a module, CFQ has to be built as module.
-	depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
 	default y
 	---help---
 	  The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -34,8 +32,6 @@ config IOSCHED_CFQ
 
 	  This is the default I/O scheduler.
 
-	  Note: If BLK_CGROUP=m, then CFQ can be built only as module.
-
 config CFQ_GROUP_IOSCHED
 	bool "CFQ Group Scheduling support"
 	depends on IOSCHED_CFQ && BLK_CGROUP
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 75642a3..970a717 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -50,10 +50,7 @@ struct cgroup_subsys blkio_subsys = {
 	.attach = blkiocg_attach,
 	.destroy = blkiocg_destroy,
 	.populate = blkiocg_populate,
-#ifdef CONFIG_BLK_CGROUP
-	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
 	.subsys_id = blkio_subsys_id,
-#endif
 	.use_id = 1,
 	.module = THIS_MODULE,
 };
@@ -1679,17 +1676,3 @@ void blkio_policy_unregister(struct blkio_policy_type *blkiop)
 	spin_unlock(&blkio_list_lock);
 }
 EXPORT_SYMBOL_GPL(blkio_policy_unregister);
-
-static int __init init_cgroup_blkio(void)
-{
-	return cgroup_load_subsys(&blkio_subsys);
-}
-
-static void __exit exit_cgroup_blkio(void)
-{
-	cgroup_unload_subsys(&blkio_subsys);
-}
-
-module_init(init_cgroup_blkio);
-module_exit(exit_cgroup_blkio);
-MODULE_LICENSE("GPL");
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 6f3ace7..3551687 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -24,13 +24,7 @@ enum blkio_policy_id {
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX		UINT_MAX
 
-#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
-
-#ifndef CONFIG_BLK_CGROUP
-/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */
-extern struct cgroup_subsys blkio_subsys;
-#define blkio_subsys_id blkio_subsys.subsys_id
-#endif
+#ifdef CONFIG_BLK_CGROUP
 
 enum stat_type {
 	/* Total time spent (in ns) between request dispatch to the driver and
@@ -303,7 +297,7 @@ static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
 static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
 #endif
 
-#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
+#ifdef CONFIG_BLK_CGROUP
 extern struct blkio_cgroup blkio_root_cgroup;
 extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
 extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
diff --git a/init/Kconfig b/init/Kconfig
index 3f42cd6..da9222d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -766,7 +766,7 @@ config RT_GROUP_SCHED
 endif #CGROUP_SCHED
 
 config BLK_CGROUP
-	tristate "Block IO controller"
+	bool "Block IO controller"
 	depends on BLOCK
 	default n
 	---help---
-- 
cgit v1.1


From b95ada558c9e69c69ffd6950eb644ee8a3dba18f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:14:55 -0800
Subject: cfq: don't register propio policy if !CONFIG_CFQ_GROUP_IOSCHED

cfq has been registering zeroed blkio_policy_cfq if CFQ_GROUP_IOSCHED
is disabled.  This fortunately doesn't collide with blk-throtl as
BLKIO_POLICY_PROP is zero but is unnecessary and risky.  Just don't
register it if not enabled.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4572952..388fe01 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3878,8 +3878,6 @@ static struct blkio_policy_type blkio_policy_cfq = {
 	},
 	.plid = BLKIO_POLICY_PROP,
 };
-#else
-static struct blkio_policy_type blkio_policy_cfq;
 #endif
 
 static int __init cfq_init(void)
@@ -3910,14 +3908,17 @@ static int __init cfq_init(void)
 		return ret;
 	}
 
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
 	blkio_policy_register(&blkio_policy_cfq);
-
+#endif
 	return 0;
 }
 
 static void __exit cfq_exit(void)
 {
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
 	blkio_policy_unregister(&blkio_policy_cfq);
+#endif
 	elv_unregister(&iosched_cfq);
 	kmem_cache_destroy(cfq_pool);
 }
-- 
cgit v1.1


From 5a5bafdc396b1da7570f84fb96a0f8a288970c5e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:14:56 -0800
Subject: elevator: clear auxiliary data earlier during elevator switch

Elevator switch tries hard to keep as much context as possible until the
new elevator is ready so that it can revert to the original state if
initializing the new elevator fails for some reason.  Unfortunately,
with more auxiliary contexts to manage, this makes elevator init and
exit paths too complex and fragile.

This patch makes elevator_switch() unregister the current elevator and
flush icq's before it starts initializing the new one.  As we still keep
the old elevator itself, the only difference is that we lose icq's on
rare occasions of switching failure, which isn't critical at all.

Note that this makes explicit elevator parameter to
elevator_init_queue() and __elv_register_queue() unnecessary as they
always can use the current elevator.

This patch enables block cgroup cleanups.

-v2: blk_add_trace_msg() prints elevator name from @new_e instead of
     @e->type as the local variable no longer exists.  This caused
     build failure on CONFIG_BLK_DEV_IO_TRACE.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/elevator.c | 90 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index f016855..f8c08e1 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -121,11 +121,10 @@ static struct elevator_type *elevator_get(const char *name)
 	return e;
 }
 
-static int elevator_init_queue(struct request_queue *q,
-			       struct elevator_queue *eq)
+static int elevator_init_queue(struct request_queue *q)
 {
-	eq->elevator_data = eq->type->ops.elevator_init_fn(q);
-	if (eq->elevator_data)
+	q->elevator->elevator_data = q->elevator->type->ops.elevator_init_fn(q);
+	if (q->elevator->elevator_data)
 		return 0;
 	return -ENOMEM;
 }
@@ -188,7 +187,6 @@ static void elevator_release(struct kobject *kobj)
 int elevator_init(struct request_queue *q, char *name)
 {
 	struct elevator_type *e = NULL;
-	struct elevator_queue *eq;
 	int err;
 
 	if (unlikely(q->elevator))
@@ -222,17 +220,16 @@ int elevator_init(struct request_queue *q, char *name)
 		}
 	}
 
-	eq = elevator_alloc(q, e);
-	if (!eq)
+	q->elevator = elevator_alloc(q, e);
+	if (!q->elevator)
 		return -ENOMEM;
 
-	err = elevator_init_queue(q, eq);
+	err = elevator_init_queue(q);
 	if (err) {
-		kobject_put(&eq->kobj);
+		kobject_put(&q->elevator->kobj);
 		return err;
 	}
 
-	q->elevator = eq;
 	return 0;
 }
 EXPORT_SYMBOL(elevator_init);
@@ -801,8 +798,9 @@ static struct kobj_type elv_ktype = {
 	.release	= elevator_release,
 };
 
-int __elv_register_queue(struct request_queue *q, struct elevator_queue *e)
+int elv_register_queue(struct request_queue *q)
 {
+	struct elevator_queue *e = q->elevator;
 	int error;
 
 	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
@@ -820,11 +818,6 @@ int __elv_register_queue(struct request_queue *q, struct elevator_queue *e)
 	}
 	return error;
 }
-
-int elv_register_queue(struct request_queue *q)
-{
-	return __elv_register_queue(q, q->elevator);
-}
 EXPORT_SYMBOL(elv_register_queue);
 
 void elv_unregister_queue(struct request_queue *q)
@@ -907,51 +900,58 @@ EXPORT_SYMBOL_GPL(elv_unregister);
  */
 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
-	struct elevator_queue *old_elevator, *e;
+	struct elevator_queue *old = q->elevator;
+	bool registered = old->registered;
 	int err;
 
-	/* allocate new elevator */
-	e = elevator_alloc(q, new_e);
-	if (!e)
-		return -ENOMEM;
-
-	err = elevator_init_queue(q, e);
-	if (err) {
-		kobject_put(&e->kobj);
-		return err;
-	}
-
-	/* turn on BYPASS and drain all requests w/ elevator private data */
+	/*
+	 * Turn on BYPASS and drain all requests w/ elevator private data.
+	 * Block layer doesn't call into a quiesced elevator - all requests
+	 * are directly put on the dispatch list without elevator data
+	 * using INSERT_BACK.  All requests have SOFTBARRIER set and no
+	 * merge happens either.
+	 */
 	elv_quiesce_start(q);
 
-	/* unregister old queue, register new one and kill old elevator */
-	if (q->elevator->registered) {
+	/* unregister and clear all auxiliary data of the old elevator */
+	if (registered)
 		elv_unregister_queue(q);
-		err = __elv_register_queue(q, e);
-		if (err)
-			goto fail_register;
-	}
 
-	/* done, clear io_cq's, switch elevators and turn off BYPASS */
 	spin_lock_irq(q->queue_lock);
 	ioc_clear_queue(q);
-	old_elevator = q->elevator;
-	q->elevator = e;
 	spin_unlock_irq(q->queue_lock);
 
-	elevator_exit(old_elevator);
+	/* allocate, init and register new elevator */
+	err = -ENOMEM;
+	q->elevator = elevator_alloc(q, new_e);
+	if (!q->elevator)
+		goto fail_init;
+
+	err = elevator_init_queue(q);
+	if (err) {
+		kobject_put(&q->elevator->kobj);
+		goto fail_init;
+	}
+
+	if (registered) {
+		err = elv_register_queue(q);
+		if (err)
+			goto fail_register;
+	}
+
+	/* done, kill the old one and finish */
+	elevator_exit(old);
 	elv_quiesce_end(q);
 
-	blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name);
+	blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
 
 	return 0;
 
 fail_register:
-	/*
-	 * switch failed, exit the new io scheduler and reattach the old
-	 * one again (along with re-adding the sysfs dir)
-	 */
-	elevator_exit(e);
+	elevator_exit(q->elevator);
+fail_init:
+	/* switch failed, restore and re-register old elevator */
+	q->elevator = old;
 	elv_register_queue(q);
 	elv_quiesce_end(q);
 
-- 
cgit v1.1


From b2fab5acd28ead6f0dd6c3996ba23f0ef1772f15 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:14:57 -0800
Subject: elevator: make elevator_init_fn() return 0/-errno

elevator_ops->elevator_init_fn() has a weird return value.  It returns
a void * which the caller should assign to q->elevator->elevator_data
and %NULL return denotes init failure.

Update such that it returns integer 0/-errno and sets elevator_data
directly as necessary.

This makes the interface more conventional and eases further cleanup.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c      |  9 +++++----
 block/deadline-iosched.c |  8 +++++---
 block/elevator.c         | 12 ++----------
 block/noop-iosched.c     |  8 +++++---
 include/linux/elevator.h |  2 +-
 5 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 388fe01..72680a6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3656,7 +3656,7 @@ static void cfq_exit_queue(struct elevator_queue *e)
 	kfree(cfqd);
 }
 
-static void *cfq_init_queue(struct request_queue *q)
+static int cfq_init_queue(struct request_queue *q)
 {
 	struct cfq_data *cfqd;
 	int i, j;
@@ -3665,7 +3665,7 @@ static void *cfq_init_queue(struct request_queue *q)
 
 	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
 	if (!cfqd)
-		return NULL;
+		return -ENOMEM;
 
 	/* Init root service tree */
 	cfqd->grp_service_tree = CFQ_RB_ROOT;
@@ -3692,7 +3692,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
 		kfree(cfqg);
 		kfree(cfqd);
-		return NULL;
+		return -ENOMEM;
 	}
 
 	rcu_read_lock();
@@ -3723,6 +3723,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
 
 	cfqd->queue = q;
+	q->elevator->elevator_data = cfqd;
 
 	init_timer(&cfqd->idle_slice_timer);
 	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
@@ -3747,7 +3748,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	 * second, in order to have larger depth for async operations.
 	 */
 	cfqd->last_delayed_sync = jiffies - HZ;
-	return cfqd;
+	return 0;
 }
 
 /*
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 7bf12d7..599b12e 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -337,13 +337,13 @@ static void deadline_exit_queue(struct elevator_queue *e)
 /*
  * initialize elevator private data (deadline_data).
  */
-static void *deadline_init_queue(struct request_queue *q)
+static int deadline_init_queue(struct request_queue *q)
 {
 	struct deadline_data *dd;
 
 	dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
 	if (!dd)
-		return NULL;
+		return -ENOMEM;
 
 	INIT_LIST_HEAD(&dd->fifo_list[READ]);
 	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
@@ -354,7 +354,9 @@ static void *deadline_init_queue(struct request_queue *q)
 	dd->writes_starved = writes_starved;
 	dd->front_merges = 1;
 	dd->fifo_batch = fifo_batch;
-	return dd;
+
+	q->elevator->elevator_data = dd;
+	return 0;
 }
 
 /*
diff --git a/block/elevator.c b/block/elevator.c
index f8c08e1..f81c061 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -121,14 +121,6 @@ static struct elevator_type *elevator_get(const char *name)
 	return e;
 }
 
-static int elevator_init_queue(struct request_queue *q)
-{
-	q->elevator->elevator_data = q->elevator->type->ops.elevator_init_fn(q);
-	if (q->elevator->elevator_data)
-		return 0;
-	return -ENOMEM;
-}
-
 static char chosen_elevator[ELV_NAME_MAX];
 
 static int __init elevator_setup(char *str)
@@ -224,7 +216,7 @@ int elevator_init(struct request_queue *q, char *name)
 	if (!q->elevator)
 		return -ENOMEM;
 
-	err = elevator_init_queue(q);
+	err = e->ops.elevator_init_fn(q);
 	if (err) {
 		kobject_put(&q->elevator->kobj);
 		return err;
@@ -927,7 +919,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	if (!q->elevator)
 		goto fail_init;
 
-	err = elevator_init_queue(q);
+	err = new_e->ops.elevator_init_fn(q);
 	if (err) {
 		kobject_put(&q->elevator->kobj);
 		goto fail_init;
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 413a0b1..5d1bf70 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -59,15 +59,17 @@ noop_latter_request(struct request_queue *q, struct request *rq)
 	return list_entry(rq->queuelist.next, struct request, queuelist);
 }
 
-static void *noop_init_queue(struct request_queue *q)
+static int noop_init_queue(struct request_queue *q)
 {
 	struct noop_data *nd;
 
 	nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
 	if (!nd)
-		return NULL;
+		return -ENOMEM;
+
 	INIT_LIST_HEAD(&nd->queue);
-	return nd;
+	q->elevator->elevator_data = nd;
+	return 0;
 }
 
 static void noop_exit_queue(struct elevator_queue *e)
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 7d4e035..97fb255 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -33,7 +33,7 @@ typedef void (elevator_put_req_fn) (struct request *);
 typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
 typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
 
-typedef void *(elevator_init_fn) (struct request_queue *);
+typedef int (elevator_init_fn) (struct request_queue *);
 typedef void (elevator_exit_fn) (struct elevator_queue *);
 
 struct elevator_ops
-- 
cgit v1.1


From d732580b4eb31553c63744a47d590f770cafb8f0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:14:58 -0800
Subject: block: implement blk_queue_bypass_start/end()

Rename and extend elv_quiesce_start/end() to
blk_queue_bypass_start/end() which are exported and support nesting
via @q->bypass_depth.  Also add blk_queue_bypass() to test bypass
state.

This will be further extended and used for blkio_group management.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 39 +++++++++++++++++++++++++++++++++++++--
 block/blk.h            |  6 ++----
 block/elevator.c       | 25 +++----------------------
 include/linux/blkdev.h |  5 ++++-
 4 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index fccb250..98ddef4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -410,6 +410,42 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
 }
 
 /**
+ * blk_queue_bypass_start - enter queue bypass mode
+ * @q: queue of interest
+ *
+ * In bypass mode, only the dispatch FIFO queue of @q is used.  This
+ * function makes @q enter bypass mode and drains all requests which were
+ * issued before.  On return, it's guaranteed that no request has ELVPRIV
+ * set.
+ */
+void blk_queue_bypass_start(struct request_queue *q)
+{
+	spin_lock_irq(q->queue_lock);
+	q->bypass_depth++;
+	queue_flag_set(QUEUE_FLAG_BYPASS, q);
+	spin_unlock_irq(q->queue_lock);
+
+	blk_drain_queue(q, false);
+}
+EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
+
+/**
+ * blk_queue_bypass_end - leave queue bypass mode
+ * @q: queue of interest
+ *
+ * Leave bypass mode and restore the normal queueing behavior.
+ */
+void blk_queue_bypass_end(struct request_queue *q)
+{
+	spin_lock_irq(q->queue_lock);
+	if (!--q->bypass_depth)
+		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
+	WARN_ON_ONCE(q->bypass_depth < 0);
+	spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
+
+/**
  * blk_cleanup_queue - shutdown a request queue
  * @q: request queue to shutdown
  *
@@ -862,8 +898,7 @@ retry:
 	 * Also, lookup icq while holding queue_lock.  If it doesn't exist,
 	 * it will be created after releasing queue_lock.
 	 */
-	if (blk_rq_should_init_elevator(bio) &&
-	    !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
+	if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
 		rw_flags |= REQ_ELVPRIV;
 		rl->elvpriv++;
 		if (et->icq_cache && ioc)
diff --git a/block/blk.h b/block/blk.h
index 9c12f80..7422f31 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -23,7 +23,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
 int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 		      struct bio *bio);
-void blk_drain_queue(struct request_queue *q, bool drain_all);
+void blk_queue_bypass_start(struct request_queue *q);
+void blk_queue_bypass_end(struct request_queue *q);
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
 bool __blk_end_bidi_request(struct request *rq, int error,
@@ -144,9 +145,6 @@ void blk_queue_congestion_threshold(struct request_queue *q);
 
 int blk_dev_init(void);
 
-void elv_quiesce_start(struct request_queue *q);
-void elv_quiesce_end(struct request_queue *q);
-
 
 /*
  * Return the threshold (number of used requests) at which the queue is
diff --git a/block/elevator.c b/block/elevator.c
index f81c061..0bdea0e 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -553,25 +553,6 @@ void elv_drain_elevator(struct request_queue *q)
 	}
 }
 
-void elv_quiesce_start(struct request_queue *q)
-{
-	if (!q->elevator)
-		return;
-
-	spin_lock_irq(q->queue_lock);
-	queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
-	spin_unlock_irq(q->queue_lock);
-
-	blk_drain_queue(q, false);
-}
-
-void elv_quiesce_end(struct request_queue *q)
-{
-	spin_lock_irq(q->queue_lock);
-	queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
-	spin_unlock_irq(q->queue_lock);
-}
-
 void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 {
 	trace_block_rq_insert(q, rq);
@@ -903,7 +884,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	 * using INSERT_BACK.  All requests have SOFTBARRIER set and no
 	 * merge happens either.
 	 */
-	elv_quiesce_start(q);
+	blk_queue_bypass_start(q);
 
 	/* unregister and clear all auxiliary data of the old elevator */
 	if (registered)
@@ -933,7 +914,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 
 	/* done, kill the old one and finish */
 	elevator_exit(old);
-	elv_quiesce_end(q);
+	blk_queue_bypass_end(q);
 
 	blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
 
@@ -945,7 +926,7 @@ fail_init:
 	/* switch failed, restore and re-register old elevator */
 	q->elevator = old;
 	elv_register_queue(q);
-	elv_quiesce_end(q);
+	blk_queue_bypass_end(q);
 
 	return err;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 606cf33..315db1d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -389,6 +389,8 @@ struct request_queue {
 
 	struct mutex		sysfs_lock;
 
+	int			bypass_depth;
+
 #if defined(CONFIG_BLK_DEV_BSG)
 	bsg_job_fn		*bsg_job_fn;
 	int			bsg_job_size;
@@ -406,7 +408,7 @@ struct request_queue {
 #define	QUEUE_FLAG_SYNCFULL	3	/* read queue has been filled */
 #define QUEUE_FLAG_ASYNCFULL	4	/* write queue has been filled */
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
-#define QUEUE_FLAG_ELVSWITCH	6	/* don't use elevator, just do FIFO */
+#define QUEUE_FLAG_BYPASS	6	/* act as dumb FIFO queue */
 #define QUEUE_FLAG_BIDI		7	/* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES     8	/* disable merge attempts */
 #define QUEUE_FLAG_SAME_COMP	9	/* complete on same CPU-group */
@@ -494,6 +496,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dead(q)	test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
+#define blk_queue_bypass(q)	test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_noxmerges(q)	\
 	test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
-- 
cgit v1.1


From 6ecf23afab13c39d3bb0e2d826d0984b0dd53733 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:14:59 -0800
Subject: block: extend queue bypassing to cover blkcg policies

Extend queue bypassing such that dying queue is always bypassing and
blk-throttle is drained on bypass.  With blkcg policies updated to
test blk_queue_bypass() instead of blk_queue_dead(), this ensures that
no bio or request is held by or going through blkcg policies on a
bypassing queue.

This will be used to implement blkg cleanup on elevator switches and
policy changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c     | 12 ++++++++----
 block/blk-throttle.c |  4 ++--
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 98ddef4..7713c73 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -372,8 +372,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
 		if (q->elevator)
 			elv_drain_elevator(q);
 
-		if (drain_all)
-			blk_throtl_drain(q);
+		blk_throtl_drain(q);
 
 		/*
 		 * This function might be called on a queue which failed
@@ -415,8 +414,8 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
  *
  * In bypass mode, only the dispatch FIFO queue of @q is used.  This
  * function makes @q enter bypass mode and drains all requests which were
- * issued before.  On return, it's guaranteed that no request has ELVPRIV
- * set.
+ * throttled or issued before.  On return, it's guaranteed that no request
+ * is being throttled or has ELVPRIV set.
  */
 void blk_queue_bypass_start(struct request_queue *q)
 {
@@ -461,6 +460,11 @@ void blk_cleanup_queue(struct request_queue *q)
 	queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
 
 	spin_lock_irq(lock);
+
+	/* dead queue is permanently in bypass mode till released */
+	q->bypass_depth++;
+	queue_flag_set(QUEUE_FLAG_BYPASS, q);
+
 	queue_flag_set(QUEUE_FLAG_NOMERGES, q);
 	queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
 	queue_flag_set(QUEUE_FLAG_DEAD, q);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5eed6a7..702c0e6 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -310,7 +310,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	struct request_queue *q = td->queue;
 
 	/* no throttling for dead queue */
-	if (unlikely(blk_queue_dead(q)))
+	if (unlikely(blk_queue_bypass(q)))
 		return NULL;
 
 	rcu_read_lock();
@@ -335,7 +335,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	spin_lock_irq(q->queue_lock);
 
 	/* Make sure @q is still alive */
-	if (unlikely(blk_queue_dead(q))) {
+	if (unlikely(blk_queue_bypass(q))) {
 		kfree(tg);
 		return NULL;
 	}
-- 
cgit v1.1


From 72e06c255181537d0b3e1f657a9ed81655d745b1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:00 -0800
Subject: blkcg: shoot down blkio_groups on elevator switch

Elevator switch may involve changes to blkcg policies.  Implement
shoot down of blkio_groups.

Combined with the previous bypass updates, the end goal is updating
blkcg core such that it can ensure that blkcg's being affected become
quiescent and don't have any per-blkg data hanging around before
commencing any policy updates.  Until queues are made aware of the
policies that apply to them, as an interim step, all per-policy blkg
data will be shot down.

* blk-throtl doesn't need this change as it can't be disabled for a
  live queue; however, update it anyway as the scheduled blkg
  unification requires this behavior change.  This means that
  blk-throtl configuration will be unnecessarily lost over elevator
  switch.  This oddity will be removed after blkcg learns to associate
  individual policies with request_queues.

* blk-throtl doesn't shoot down root_tg.  This is to ease transition.
  Unified blkg will always have persistent root group and not shooting
  down root_tg for now eases transition to that point by avoiding
  having to update td->root_tg and is safe as blk-throtl can never be
  disabled.

-v2: Vivek pointed out that group list is not guaranteed to be empty
     on return from clear function if it raced cgroup removal and
     lost.  Fix it by waiting a bit and retrying.  This kludge will
     soon be removed once locking is updated such that blkg is never
     in limbo state between blkcg and request_queue locks.

     blk-throtl no longer shoots down root_tg to avoid breaking
     td->root_tg.

     Also, nest queue_lock inside blkio_list_lock, not the other way
     around, to avoid introducing a possible deadlock via blkcg lock.

-v3: blkcg_clear_queue() repositioned and renamed to
     blkg_destroy_all() to increase consistency with later changes.
     cfq_clear_queue() updated to check q->elevator before
     dereferencing it to avoid NULL dereference on not fully
     initialized queues (used by later change).

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 34 +++++++++++++++++++++++++++++++++-
 block/blk-cgroup.h   |  5 ++++-
 block/blk-throttle.c | 27 +++++++++++++++++++++++++--
 block/cfq-iosched.c  | 20 +++++++++++++++++++-
 block/elevator.c     |  3 +++
 5 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 970a717..159aef5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -17,8 +17,9 @@
 #include <linux/err.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
-#include "blk-cgroup.h"
 #include <linux/genhd.h>
+#include <linux/delay.h>
+#include "blk-cgroup.h"
 
 #define MAX_KEY_LEN 100
 
@@ -546,6 +547,37 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
 }
 EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
 
+void blkg_destroy_all(struct request_queue *q)
+{
+	struct blkio_policy_type *pol;
+
+	while (true) {
+		bool done = true;
+
+		spin_lock(&blkio_list_lock);
+		spin_lock_irq(q->queue_lock);
+
+		/*
+		 * clear_queue_fn() might return with non-empty group list
+		 * if it raced cgroup removal and lost.  cgroup removal is
+		 * guaranteed to make forward progress and retrying after a
+		 * while is enough.  This ugliness is scheduled to be
+		 * removed after locking update.
+		 */
+		list_for_each_entry(pol, &blkio_list, list)
+			if (!pol->ops.blkio_clear_queue_fn(q))
+				done = false;
+
+		spin_unlock_irq(q->queue_lock);
+		spin_unlock(&blkio_list_lock);
+
+		if (done)
+			break;
+
+		msleep(10);	/* just some random duration I like */
+	}
+}
+
 static void blkio_reset_stats_cpu(struct blkio_group *blkg)
 {
 	struct blkio_group_stats_cpu *stats_cpu;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 3551687..e5cfcbd 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -203,7 +203,7 @@ extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg,
 				     dev_t dev);
 
 typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
-
+typedef bool (blkio_clear_queue_fn)(struct request_queue *q);
 typedef void (blkio_update_group_weight_fn) (void *key,
 			struct blkio_group *blkg, unsigned int weight);
 typedef void (blkio_update_group_read_bps_fn) (void * key,
@@ -217,6 +217,7 @@ typedef void (blkio_update_group_write_iops_fn) (void *key,
 
 struct blkio_policy_ops {
 	blkio_unlink_group_fn *blkio_unlink_group_fn;
+	blkio_clear_queue_fn *blkio_clear_queue_fn;
 	blkio_update_group_weight_fn *blkio_update_group_weight_fn;
 	blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
 	blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
@@ -233,6 +234,7 @@ struct blkio_policy_type {
 /* Blkio controller policy registration */
 extern void blkio_policy_register(struct blkio_policy_type *);
 extern void blkio_policy_unregister(struct blkio_policy_type *);
+extern void blkg_destroy_all(struct request_queue *q);
 
 static inline char *blkg_path(struct blkio_group *blkg)
 {
@@ -249,6 +251,7 @@ struct blkio_policy_type {
 
 static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
 static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
+static inline void blkg_destroy_all(struct request_queue *q) { }
 
 static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 702c0e6..3699ab4 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -989,12 +989,17 @@ throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
 	td->nr_undestroyed_grps--;
 }
 
-static void throtl_release_tgs(struct throtl_data *td)
+static bool throtl_release_tgs(struct throtl_data *td, bool release_root)
 {
 	struct hlist_node *pos, *n;
 	struct throtl_grp *tg;
+	bool empty = true;
 
 	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
+		/* skip root? */
+		if (!release_root && tg == td->root_tg)
+			continue;
+
 		/*
 		 * If cgroup removal path got to blk_group first and removed
 		 * it from cgroup list, then it will take care of destroying
@@ -1002,7 +1007,10 @@ static void throtl_release_tgs(struct throtl_data *td)
 		 */
 		if (!blkiocg_del_blkio_group(&tg->blkg))
 			throtl_destroy_tg(td, tg);
+		else
+			empty = false;
 	}
+	return empty;
 }
 
 /*
@@ -1029,6 +1037,20 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
 	spin_unlock_irqrestore(td->queue->queue_lock, flags);
 }
 
+static bool throtl_clear_queue(struct request_queue *q)
+{
+	lockdep_assert_held(q->queue_lock);
+
+	/*
+	 * Clear tgs but leave the root one alone.  This is necessary
+	 * because root_tg is expected to be persistent and safe because
+	 * blk-throtl can never be disabled while @q is alive.  This is a
+	 * kludge to prepare for unified blkg.  This whole function will be
+	 * removed soon.
+	 */
+	return throtl_release_tgs(q->td, false);
+}
+
 static void throtl_update_blkio_group_common(struct throtl_data *td,
 				struct throtl_grp *tg)
 {
@@ -1097,6 +1119,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
 static struct blkio_policy_type blkio_policy_throtl = {
 	.ops = {
 		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
+		.blkio_clear_queue_fn = throtl_clear_queue,
 		.blkio_update_group_read_bps_fn =
 					throtl_update_blkio_group_read_bps,
 		.blkio_update_group_write_bps_fn =
@@ -1282,7 +1305,7 @@ void blk_throtl_exit(struct request_queue *q)
 	throtl_shutdown_wq(q);
 
 	spin_lock_irq(q->queue_lock);
-	throtl_release_tgs(td);
+	throtl_release_tgs(td, true);
 
 	/* If there are other groups */
 	if (td->nr_undestroyed_grps > 0)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 72680a6..61693d3 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1225,10 +1225,11 @@ static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	cfq_put_cfqg(cfqg);
 }
 
-static void cfq_release_cfq_groups(struct cfq_data *cfqd)
+static bool cfq_release_cfq_groups(struct cfq_data *cfqd)
 {
 	struct hlist_node *pos, *n;
 	struct cfq_group *cfqg;
+	bool empty = true;
 
 	hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
 		/*
@@ -1238,7 +1239,10 @@ static void cfq_release_cfq_groups(struct cfq_data *cfqd)
 		 */
 		if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
 			cfq_destroy_cfqg(cfqd, cfqg);
+		else
+			empty = false;
 	}
+	return empty;
 }
 
 /*
@@ -1265,6 +1269,19 @@ static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
 	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
 }
 
+static struct elevator_type iosched_cfq;
+
+static bool cfq_clear_queue(struct request_queue *q)
+{
+	lockdep_assert_held(q->queue_lock);
+
+	/* shoot down blkgs iff the current elevator is cfq */
+	if (!q->elevator || q->elevator->type != &iosched_cfq)
+		return true;
+
+	return cfq_release_cfq_groups(q->elevator->elevator_data);
+}
+
 #else /* GROUP_IOSCHED */
 static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
 {
@@ -3875,6 +3892,7 @@ static struct elevator_type iosched_cfq = {
 static struct blkio_policy_type blkio_policy_cfq = {
 	.ops = {
 		.blkio_unlink_group_fn =	cfq_unlink_blkio_group,
+		.blkio_clear_queue_fn = cfq_clear_queue,
 		.blkio_update_group_weight_fn =	cfq_update_blkio_group_weight,
 	},
 	.plid = BLKIO_POLICY_PROP,
diff --git a/block/elevator.c b/block/elevator.c
index 0bdea0e..8c7561f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -38,6 +38,7 @@
 #include <trace/events/block.h>
 
 #include "blk.h"
+#include "blk-cgroup.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
@@ -894,6 +895,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	ioc_clear_queue(q);
 	spin_unlock_irq(q->queue_lock);
 
+	blkg_destroy_all(q);
+
 	/* allocate, init and register new elevator */
 	err = -ENOMEM;
 	q->elevator = elevator_alloc(q, new_e);
-- 
cgit v1.1


From 2a7f124414b35645049e9c1b125a6f0b470aa5ae Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:01 -0800
Subject: blkcg: move rcu_read_lock() outside of blkio_group get functions

rcu_read_lock() in throtl_get_tg() and cfq_get_cfqg() holds onto
@blkcg while looking up blkg.  For API cleanup, the next patch will
make the caller responsible for determining the @blkcg to look blkg up
from and let it specify that as a parameter.  Move rcu read locking
out to the callers to prepare for the change.

-v2: Originally this patch was described as a fix for RCU read locking
     bug around @blkg, which Vivek pointed out to be incorrect.  It
     was from misunderstanding the role of rcu locking as protecting
     @blkg not @blkcg.  Patch description updated.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-throttle.c | 18 ++++++------------
 block/cfq-iosched.c  | 11 +++++------
 2 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 3699ab4..9beaac7 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -313,25 +313,23 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	if (unlikely(blk_queue_bypass(q)))
 		return NULL;
 
-	rcu_read_lock();
 	blkcg = task_blkio_cgroup(current);
 	tg = throtl_find_tg(td, blkcg);
-	if (tg) {
-		rcu_read_unlock();
+	if (tg)
 		return tg;
-	}
 
 	/*
 	 * Need to allocate a group. Allocation of group also needs allocation
 	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
 	 * we need to drop rcu lock and queue_lock before we call alloc.
 	 */
-	rcu_read_unlock();
 	spin_unlock_irq(q->queue_lock);
+	rcu_read_unlock();
 
 	tg = throtl_alloc_tg(td);
 
 	/* Group allocated and queue is still alive. take the lock */
+	rcu_read_lock();
 	spin_lock_irq(q->queue_lock);
 
 	/* Make sure @q is still alive */
@@ -343,7 +341,6 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	/*
 	 * Initialize the new group. After sleeping, read the blkcg again.
 	 */
-	rcu_read_lock();
 	blkcg = task_blkio_cgroup(current);
 
 	/*
@@ -354,7 +351,6 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 
 	if (__tg) {
 		kfree(tg);
-		rcu_read_unlock();
 		return __tg;
 	}
 
@@ -365,7 +361,6 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	}
 
 	throtl_init_add_tg_lists(td, tg, blkcg);
-	rcu_read_unlock();
 	return tg;
 }
 
@@ -1150,7 +1145,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	 * basic fields like stats and io rates. If a group has no rules,
 	 * just update the dispatch stats in lockless manner and return.
 	 */
-
 	rcu_read_lock();
 	blkcg = task_blkio_cgroup(current);
 	tg = throtl_find_tg(td, blkcg);
@@ -1160,11 +1154,9 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 		if (tg_no_rule_group(tg, rw)) {
 			blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
 					rw, rw_is_sync(bio->bi_rw));
-			rcu_read_unlock();
-			goto out;
+			goto out_unlock_rcu;
 		}
 	}
-	rcu_read_unlock();
 
 	/*
 	 * Either group has not been allocated yet or it is not an unlimited
@@ -1222,6 +1214,8 @@ queue_bio:
 
 out_unlock:
 	spin_unlock_irq(q->queue_lock);
+out_unlock_rcu:
+	rcu_read_unlock();
 out:
 	return throttled;
 }
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 61693d3..6063c44 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1128,13 +1128,10 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
 	struct cfq_group *cfqg = NULL, *__cfqg = NULL;
 	struct request_queue *q = cfqd->queue;
 
-	rcu_read_lock();
 	blkcg = task_blkio_cgroup(current);
 	cfqg = cfq_find_cfqg(cfqd, blkcg);
-	if (cfqg) {
-		rcu_read_unlock();
+	if (cfqg)
 		return cfqg;
-	}
 
 	/*
 	 * Need to allocate a group. Allocation of group also needs allocation
@@ -1164,7 +1161,6 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
 
 	if (__cfqg) {
 		kfree(cfqg);
-		rcu_read_unlock();
 		return __cfqg;
 	}
 
@@ -1172,7 +1168,6 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
 		cfqg = &cfqd->root_group;
 
 	cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
-	rcu_read_unlock();
 	return cfqg;
 }
 
@@ -2870,6 +2865,8 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
 	struct cfq_group *cfqg;
 
 retry:
+	rcu_read_lock();
+
 	cfqg = cfq_get_cfqg(cfqd);
 	cic = cfq_cic_lookup(cfqd, ioc);
 	/* cic always exists here */
@@ -2885,6 +2882,7 @@ retry:
 			cfqq = new_cfqq;
 			new_cfqq = NULL;
 		} else if (gfp_mask & __GFP_WAIT) {
+			rcu_read_unlock();
 			spin_unlock_irq(cfqd->queue->queue_lock);
 			new_cfqq = kmem_cache_alloc_node(cfq_pool,
 					gfp_mask | __GFP_ZERO,
@@ -2910,6 +2908,7 @@ retry:
 	if (new_cfqq)
 		kmem_cache_free(cfq_pool, new_cfqq);
 
+	rcu_read_unlock();
 	return cfqq;
 }
 
-- 
cgit v1.1


From 0a5a7d0e32be6643b881f0e7cd9d0d06fadde27a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:02 -0800
Subject: blkcg: update blkg get functions take blkio_cgroup as parameter

In both blkg get functions - throtl_get_tg() and cfq_get_cfqg(),
instead of obtaining blkcg of %current explicitly, let the caller
specify the blkcg to use as parameter and make both functions hold on
to the blkcg.

This is part of the block cgroup interface cleanup and will help make
the blkcg API more modular.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-throttle.c | 16 +++++++---------
 block/cfq-iosched.c  | 20 ++++++++++++--------
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 9beaac7..c252df9 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -303,21 +303,23 @@ throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 	return tg;
 }
 
-static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
+static struct throtl_grp *throtl_get_tg(struct throtl_data *td,
+					struct blkio_cgroup *blkcg)
 {
 	struct throtl_grp *tg = NULL, *__tg = NULL;
-	struct blkio_cgroup *blkcg;
 	struct request_queue *q = td->queue;
 
 	/* no throttling for dead queue */
 	if (unlikely(blk_queue_bypass(q)))
 		return NULL;
 
-	blkcg = task_blkio_cgroup(current);
 	tg = throtl_find_tg(td, blkcg);
 	if (tg)
 		return tg;
 
+	if (!css_tryget(&blkcg->css))
+		return NULL;
+
 	/*
 	 * Need to allocate a group. Allocation of group also needs allocation
 	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
@@ -331,6 +333,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	/* Group allocated and queue is still alive. take the lock */
 	rcu_read_lock();
 	spin_lock_irq(q->queue_lock);
+	css_put(&blkcg->css);
 
 	/* Make sure @q is still alive */
 	if (unlikely(blk_queue_bypass(q))) {
@@ -339,11 +342,6 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	}
 
 	/*
-	 * Initialize the new group. After sleeping, read the blkcg again.
-	 */
-	blkcg = task_blkio_cgroup(current);
-
-	/*
 	 * If some other thread already allocated the group while we were
 	 * not holding queue lock, free up the group
 	 */
@@ -1163,7 +1161,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	 * IO group
 	 */
 	spin_lock_irq(q->queue_lock);
-	tg = throtl_get_tg(td);
+	tg = throtl_get_tg(td, blkcg);
 	if (unlikely(!tg))
 		goto out_unlock;
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 6063c44..0f7a81f 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1122,17 +1122,19 @@ cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
  * Search for the cfq group current task belongs to. request_queue lock must
  * be held.
  */
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
+static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd,
+				      struct blkio_cgroup *blkcg)
 {
-	struct blkio_cgroup *blkcg;
 	struct cfq_group *cfqg = NULL, *__cfqg = NULL;
 	struct request_queue *q = cfqd->queue;
 
-	blkcg = task_blkio_cgroup(current);
 	cfqg = cfq_find_cfqg(cfqd, blkcg);
 	if (cfqg)
 		return cfqg;
 
+	if (!css_tryget(&blkcg->css))
+		return NULL;
+
 	/*
 	 * Need to allocate a group. Allocation of group also needs allocation
 	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
@@ -1142,16 +1144,14 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
 	 * around by the time we return. CFQ queue allocation code does
 	 * the same. It might be racy though.
 	 */
-
 	rcu_read_unlock();
 	spin_unlock_irq(q->queue_lock);
 
 	cfqg = cfq_alloc_cfqg(cfqd);
 
 	spin_lock_irq(q->queue_lock);
-
 	rcu_read_lock();
-	blkcg = task_blkio_cgroup(current);
+	css_put(&blkcg->css);
 
 	/*
 	 * If some other thread already allocated the group while we were
@@ -1278,7 +1278,8 @@ static bool cfq_clear_queue(struct request_queue *q)
 }
 
 #else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
+static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd,
+				      struct blkio_cgroup *blkcg)
 {
 	return &cfqd->root_group;
 }
@@ -2860,6 +2861,7 @@ static struct cfq_queue *
 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
 		     struct io_context *ioc, gfp_t gfp_mask)
 {
+	struct blkio_cgroup *blkcg;
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
 	struct cfq_io_cq *cic;
 	struct cfq_group *cfqg;
@@ -2867,7 +2869,9 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
 retry:
 	rcu_read_lock();
 
-	cfqg = cfq_get_cfqg(cfqd);
+	blkcg = task_blkio_cgroup(current);
+
+	cfqg = cfq_get_cfqg(cfqd, blkcg);
 	cic = cfq_cic_lookup(cfqd, ioc);
 	/* cic always exists here */
 	cfqq = cic_to_cfqq(cic, is_sync);
-- 
cgit v1.1


From ca32aefc7f2539ed88d42763330d54ee3e61769a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:03 -0800
Subject: blkcg: use q and plid instead of opaque void * for blkio_group
 association

blkio_group is the association between a block cgroup and a queue for
a given policy.  Using an opaque void * for the association makes
things confusing and hinders factoring of common code.  Use
request_queue * and, if necessary, policy id instead.

This will help block cgroup API cleanup.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 32 +++++++++++++++-----------------
 block/blk-cgroup.h   | 22 ++++++++++++----------
 block/blk-throttle.c | 50 +++++++++++++++++++++++---------------------------
 block/cfq-iosched.c  | 30 ++++++++++++++++--------------
 block/cfq.h          |  7 ++++---
 5 files changed, 70 insertions(+), 71 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 159aef5..5e50ca1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -129,7 +129,7 @@ blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
 		if (blkiop->plid != blkg->plid)
 			continue;
 		if (blkiop->ops.blkio_update_group_weight_fn)
-			blkiop->ops.blkio_update_group_weight_fn(blkg->key,
+			blkiop->ops.blkio_update_group_weight_fn(blkg->q,
 							blkg, weight);
 	}
 }
@@ -147,12 +147,12 @@ static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
 
 		if (fileid == BLKIO_THROTL_read_bps_device
 		    && blkiop->ops.blkio_update_group_read_bps_fn)
-			blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
+			blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
 								blkg, bps);
 
 		if (fileid == BLKIO_THROTL_write_bps_device
 		    && blkiop->ops.blkio_update_group_write_bps_fn)
-			blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
+			blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
 								blkg, bps);
 	}
 }
@@ -170,12 +170,12 @@ static inline void blkio_update_group_iops(struct blkio_group *blkg,
 
 		if (fileid == BLKIO_THROTL_read_iops_device
 		    && blkiop->ops.blkio_update_group_read_iops_fn)
-			blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
+			blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
 								blkg, iops);
 
 		if (fileid == BLKIO_THROTL_write_iops_device
 		    && blkiop->ops.blkio_update_group_write_iops_fn)
-			blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
+			blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
 								blkg,iops);
 	}
 }
@@ -478,14 +478,14 @@ int blkio_alloc_blkg_stats(struct blkio_group *blkg)
 EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
 
 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-		struct blkio_group *blkg, void *key, dev_t dev,
+		struct blkio_group *blkg, struct request_queue *q, dev_t dev,
 		enum blkio_policy_id plid)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&blkcg->lock, flags);
 	spin_lock_init(&blkg->stats_lock);
-	rcu_assign_pointer(blkg->key, key);
+	rcu_assign_pointer(blkg->q, q);
 	blkg->blkcg_id = css_id(&blkcg->css);
 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 	blkg->plid = plid;
@@ -531,18 +531,16 @@ int blkiocg_del_blkio_group(struct blkio_group *blkg)
 EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
 
 /* called under rcu_read_lock(). */
-struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
+struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
+					 struct request_queue *q,
+					 enum blkio_policy_id plid)
 {
 	struct blkio_group *blkg;
 	struct hlist_node *n;
-	void *__key;
 
-	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		__key = blkg->key;
-		if (__key == key)
+	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
+		if (blkg->q == q && blkg->plid == plid)
 			return blkg;
-	}
-
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
@@ -1582,7 +1580,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
 	unsigned long flags;
 	struct blkio_group *blkg;
-	void *key;
+	struct request_queue *q;
 	struct blkio_policy_type *blkiop;
 	struct blkio_policy_node *pn, *pntmp;
 
@@ -1597,7 +1595,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 
 		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
 					blkcg_node);
-		key = rcu_dereference(blkg->key);
+		q = rcu_dereference(blkg->q);
 		__blkiocg_del_blkio_group(blkg);
 
 		spin_unlock_irqrestore(&blkcg->lock, flags);
@@ -1611,7 +1609,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 		list_for_each_entry(blkiop, &blkio_list, list) {
 			if (blkiop->plid != blkg->plid)
 				continue;
-			blkiop->ops.blkio_unlink_group_fn(key, blkg);
+			blkiop->ops.blkio_unlink_group_fn(q, blkg);
 		}
 		spin_unlock(&blkio_list_lock);
 	} while (1);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index e5cfcbd..41c960b 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -153,8 +153,8 @@ struct blkio_group_stats_cpu {
 };
 
 struct blkio_group {
-	/* An rcu protected unique identifier for the group */
-	void *key;
+	/* Pointer to the associated request_queue, RCU protected */
+	struct request_queue __rcu *q;
 	struct hlist_node blkcg_node;
 	unsigned short blkcg_id;
 	/* Store cgroup path */
@@ -202,17 +202,18 @@ extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg,
 extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg,
 				     dev_t dev);
 
-typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
+typedef void (blkio_unlink_group_fn)(struct request_queue *q,
+			struct blkio_group *blkg);
 typedef bool (blkio_clear_queue_fn)(struct request_queue *q);
-typedef void (blkio_update_group_weight_fn) (void *key,
+typedef void (blkio_update_group_weight_fn)(struct request_queue *q,
 			struct blkio_group *blkg, unsigned int weight);
-typedef void (blkio_update_group_read_bps_fn) (void * key,
+typedef void (blkio_update_group_read_bps_fn)(struct request_queue *q,
 			struct blkio_group *blkg, u64 read_bps);
-typedef void (blkio_update_group_write_bps_fn) (void *key,
+typedef void (blkio_update_group_write_bps_fn)(struct request_queue *q,
 			struct blkio_group *blkg, u64 write_bps);
-typedef void (blkio_update_group_read_iops_fn) (void *key,
+typedef void (blkio_update_group_read_iops_fn)(struct request_queue *q,
 			struct blkio_group *blkg, unsigned int read_iops);
-typedef void (blkio_update_group_write_iops_fn) (void *key,
+typedef void (blkio_update_group_write_iops_fn)(struct request_queue *q,
 			struct blkio_group *blkg, unsigned int write_iops);
 
 struct blkio_policy_ops {
@@ -305,12 +306,13 @@ extern struct blkio_cgroup blkio_root_cgroup;
 extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
 extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
 extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-	struct blkio_group *blkg, void *key, dev_t dev,
+	struct blkio_group *blkg, struct request_queue *q, dev_t dev,
 	enum blkio_policy_id plid);
 extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
 extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
-						void *key);
+						struct request_queue *q,
+						enum blkio_policy_id plid);
 void blkiocg_update_timeslice_used(struct blkio_group *blkg,
 					unsigned long time,
 					unsigned long unaccounted_time);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c252df9..6613de7 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -252,7 +252,7 @@ static void throtl_init_add_tg_lists(struct throtl_data *td,
 	__throtl_tg_fill_dev_details(td, tg);
 
 	/* Add group onto cgroup list */
-	blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
+	blkiocg_add_blkio_group(blkcg, &tg->blkg, td->queue,
 				tg->blkg.dev, BLKIO_POLICY_THROTL);
 
 	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
@@ -288,7 +288,6 @@ static struct
 throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 {
 	struct throtl_grp *tg = NULL;
-	void *key = td;
 
 	/*
 	 * This is the common case when there are no blkio cgroups.
@@ -297,7 +296,8 @@ throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 	if (blkcg == &blkio_root_cgroup)
 		tg = td->root_tg;
 	else
-		tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
+		tg = tg_of_blkg(blkiocg_lookup_group(blkcg, td->queue,
+						     BLKIO_POLICY_THROTL));
 
 	__throtl_tg_fill_dev_details(td, tg);
 	return tg;
@@ -1012,22 +1012,22 @@ static bool throtl_release_tgs(struct throtl_data *td, bool release_root)
  * no new IO will come in this group. So get rid of this group as soon as
  * any pending IO in the group is finished.
  *
- * This function is called under rcu_read_lock(). key is the rcu protected
- * pointer. That means "key" is a valid throtl_data pointer as long as we are
- * rcu read lock.
+ * This function is called under rcu_read_lock(). @q is the rcu protected
+ * pointer. That means @q is a valid request_queue pointer as long as we
+ * are rcu read lock.
  *
- * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
+ * @q was fetched from blkio_group under blkio_cgroup->lock. That means
  * it should not be NULL as even if queue was going away, cgroup deltion
  * path got to it first.
  */
-void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
+void throtl_unlink_blkio_group(struct request_queue *q,
+			       struct blkio_group *blkg)
 {
 	unsigned long flags;
-	struct throtl_data *td = key;
 
-	spin_lock_irqsave(td->queue->queue_lock, flags);
-	throtl_destroy_tg(td, tg_of_blkg(blkg));
-	spin_unlock_irqrestore(td->queue->queue_lock, flags);
+	spin_lock_irqsave(q->queue_lock, flags);
+	throtl_destroy_tg(q->td, tg_of_blkg(blkg));
+	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 static bool throtl_clear_queue(struct request_queue *q)
@@ -1054,52 +1054,48 @@ static void throtl_update_blkio_group_common(struct throtl_data *td,
 }
 
 /*
- * For all update functions, key should be a valid pointer because these
+ * For all update functions, @q should be a valid pointer because these
  * update functions are called under blkcg_lock, that means, blkg is
- * valid and in turn key is valid. queue exit path can not race because
+ * valid and in turn @q is valid. queue exit path can not race because
  * of blkcg_lock
  *
  * Can not take queue lock in update functions as queue lock under blkcg_lock
  * is not allowed. Under other paths we take blkcg_lock under queue_lock.
  */
-static void throtl_update_blkio_group_read_bps(void *key,
+static void throtl_update_blkio_group_read_bps(struct request_queue *q,
 				struct blkio_group *blkg, u64 read_bps)
 {
-	struct throtl_data *td = key;
 	struct throtl_grp *tg = tg_of_blkg(blkg);
 
 	tg->bps[READ] = read_bps;
-	throtl_update_blkio_group_common(td, tg);
+	throtl_update_blkio_group_common(q->td, tg);
 }
 
-static void throtl_update_blkio_group_write_bps(void *key,
+static void throtl_update_blkio_group_write_bps(struct request_queue *q,
 				struct blkio_group *blkg, u64 write_bps)
 {
-	struct throtl_data *td = key;
 	struct throtl_grp *tg = tg_of_blkg(blkg);
 
 	tg->bps[WRITE] = write_bps;
-	throtl_update_blkio_group_common(td, tg);
+	throtl_update_blkio_group_common(q->td, tg);
 }
 
-static void throtl_update_blkio_group_read_iops(void *key,
+static void throtl_update_blkio_group_read_iops(struct request_queue *q,
 			struct blkio_group *blkg, unsigned int read_iops)
 {
-	struct throtl_data *td = key;
 	struct throtl_grp *tg = tg_of_blkg(blkg);
 
 	tg->iops[READ] = read_iops;
-	throtl_update_blkio_group_common(td, tg);
+	throtl_update_blkio_group_common(q->td, tg);
 }
 
-static void throtl_update_blkio_group_write_iops(void *key,
+static void throtl_update_blkio_group_write_iops(struct request_queue *q,
 			struct blkio_group *blkg, unsigned int write_iops)
 {
-	struct throtl_data *td = key;
 	struct throtl_grp *tg = tg_of_blkg(blkg);
 
 	tg->iops[WRITE] = write_iops;
-	throtl_update_blkio_group_common(td, tg);
+	throtl_update_blkio_group_common(q->td, tg);
 }
 
 static void throtl_shutdown_wq(struct request_queue *q)
@@ -1306,7 +1302,7 @@ void blk_throtl_exit(struct request_queue *q)
 	spin_unlock_irq(q->queue_lock);
 
 	/*
-	 * Wait for tg->blkg->key accessors to exit their grace periods.
+	 * Wait for tg->blkg->q accessors to exit their grace periods.
 	 * Do this wait only if there are other undestroyed groups out
 	 * there (other than root group). This can happen if cgroup deletion
 	 * path claimed the responsibility of cleaning up a group before
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 0f7a81f..37e2da9 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1020,7 +1020,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
 	return NULL;
 }
 
-static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
+static void cfq_update_blkio_group_weight(struct request_queue *q,
+					  struct blkio_group *blkg,
 					  unsigned int weight)
 {
 	struct cfq_group *cfqg = cfqg_of_blkg(blkg);
@@ -1043,10 +1044,10 @@ static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
 	if (bdi->dev) {
 		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
 		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
-					(void *)cfqd, MKDEV(major, minor));
+					cfqd->queue, MKDEV(major, minor));
 	} else
 		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
-					(void *)cfqd, 0);
+					cfqd->queue, 0);
 
 	cfqd->nr_blkcg_linked_grps++;
 	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
@@ -1097,7 +1098,6 @@ static struct cfq_group *
 cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
 {
 	struct cfq_group *cfqg = NULL;
-	void *key = cfqd;
 	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
 	unsigned int major, minor;
 
@@ -1108,7 +1108,8 @@ cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
 	if (blkcg == &blkio_root_cgroup)
 		cfqg = &cfqd->root_group;
 	else
-		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, cfqd->queue,
+							 BLKIO_POLICY_PROP));
 
 	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
 		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
@@ -1247,21 +1248,22 @@ static bool cfq_release_cfq_groups(struct cfq_data *cfqd)
  * any pending IO in the group is finished.
  *
  * This function is called under rcu_read_lock(). key is the rcu protected
- * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu
- * read lock.
+ * pointer. That means @q is a valid request_queue pointer as long as we
+ * are rcu read lock.
  *
- * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
+ * @q was fetched from blkio_group under blkio_cgroup->lock. That means
  * it should not be NULL as even if elevator was exiting, cgroup deltion
  * path got to it first.
  */
-static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
+static void cfq_unlink_blkio_group(struct request_queue *q,
+				   struct blkio_group *blkg)
 {
-	unsigned long  flags;
-	struct cfq_data *cfqd = key;
+	struct cfq_data *cfqd = q->elevator->elevator_data;
+	unsigned long flags;
 
-	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
+	spin_lock_irqsave(q->queue_lock, flags);
 	cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
-	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
+	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 static struct elevator_type iosched_cfq;
@@ -3718,7 +3720,7 @@ static int cfq_init_queue(struct request_queue *q)
 	rcu_read_lock();
 
 	cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
-					(void *)cfqd, 0);
+				    cfqd->queue, 0);
 	rcu_read_unlock();
 	cfqd->nr_blkcg_linked_grps++;
 
diff --git a/block/cfq.h b/block/cfq.h
index 2a15592..343b78a 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -68,8 +68,9 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
 }
 
 static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-			struct blkio_group *blkg, void *key, dev_t dev) {
-	blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP);
+		struct blkio_group *blkg, struct request_queue *q, dev_t dev)
+{
+	blkiocg_add_blkio_group(blkcg, blkg, q, dev, BLKIO_POLICY_PROP);
 }
 
 static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
@@ -105,7 +106,7 @@ static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {}
 
 static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-			struct blkio_group *blkg, void *key, dev_t dev) {}
+		struct blkio_group *blkg, struct request_queue *q, dev_t dev) {}
 static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
 {
 	return 0;
-- 
cgit v1.1


From 035d10b2fa7e5f7e9bf9465dbc39c35affd5ac32 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:04 -0800
Subject: blkcg: add blkio_policy[] array and allow one policy per policy ID

Block cgroup policies are maintained in a linked list and,
theoretically, multiple policies sharing the same policy ID are
allowed.

This patch temporarily restricts registration to one policy per plid
and adds the blkio_policy[] array, which indexes registered policy
types by plid.  Both the restriction and the blkio_policy[] array are
transitional and will be removed once API cleanup is complete.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 10 ++++++++++
 block/blk-cgroup.h |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5e50ca1..f1b08d3c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -29,6 +29,8 @@ static LIST_HEAD(blkio_list);
 struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
+static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];
+
 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
 						  struct cgroup *);
 static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
@@ -1694,7 +1696,11 @@ static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 void blkio_policy_register(struct blkio_policy_type *blkiop)
 {
 	spin_lock(&blkio_list_lock);
+
+	BUG_ON(blkio_policy[blkiop->plid]);
+	blkio_policy[blkiop->plid] = blkiop;
 	list_add_tail(&blkiop->list, &blkio_list);
+
 	spin_unlock(&blkio_list_lock);
 }
 EXPORT_SYMBOL_GPL(blkio_policy_register);
@@ -1702,7 +1708,11 @@ EXPORT_SYMBOL_GPL(blkio_policy_register);
 void blkio_policy_unregister(struct blkio_policy_type *blkiop)
 {
 	spin_lock(&blkio_list_lock);
+
+	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
+	blkio_policy[blkiop->plid] = NULL;
 	list_del_init(&blkiop->list);
+
 	spin_unlock(&blkio_list_lock);
 }
 EXPORT_SYMBOL_GPL(blkio_policy_unregister);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 41c960b..562fa55 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -19,6 +19,8 @@
 enum blkio_policy_id {
 	BLKIO_POLICY_PROP = 0,		/* Proportional Bandwidth division */
 	BLKIO_POLICY_THROTL,		/* Throttling */
+
+	BLKIO_NR_POLICIES,
 };
 
 /* Max limits for throttle policy */
-- 
cgit v1.1


From f51b802c17e2a21926b29911493f5e7ddf6eee87 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:05 -0800
Subject: blkcg: use the usual get blkg path for root blkio_group

For root blkg, blk_throtl_init() was using throtl_alloc_tg()
explicitly and cfq_init_queue() was manually initializing embedded
cfqd->root_group, adding unnecessarily different code paths to blkg
handling.

Make both use the usual blkio_group get functions - throtl_get_tg()
and cfq_get_cfqg() - for the root blkio_group too.  Note that
blk_throtl_init() callsite is pushed downwards in
blk_alloc_queue_node() so that @q is sufficiently initialized for
throtl_get_tg().

This simplifies root blkg handling noticeably for cfq and will allow
further modularization of blkcg API.

-v2: Vivek pointed out that using cfq_get_cfqg() won't work if
     CONFIG_CFQ_GROUP_IOSCHED is disabled.  Fix it by factoring out
     initialization of base part of cfqg into cfq_init_cfqg_base() and
     alloc/init/free explicitly if !CONFIG_CFQ_GROUP_IOSCHED.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c     |   6 +--
 block/blk-throttle.c |  18 ++++-----
 block/cfq-iosched.c  | 105 ++++++++++++++++++++++++++-------------------------
 3 files changed, 65 insertions(+), 64 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 7713c73..5a1b8cc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -540,9 +540,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (err)
 		goto fail_id;
 
-	if (blk_throtl_init(q))
-		goto fail_id;
-
 	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
 		    laptop_mode_timer_fn, (unsigned long) q);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
@@ -565,6 +562,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	 */
 	q->queue_lock = &q->__queue_lock;
 
+	if (blk_throtl_init(q))
+		goto fail_id;
+
 	return q;
 
 fail_id:
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 6613de7..aeeb798 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1252,7 +1252,6 @@ void blk_throtl_drain(struct request_queue *q)
 int blk_throtl_init(struct request_queue *q)
 {
 	struct throtl_data *td;
-	struct throtl_grp *tg;
 
 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
 	if (!td)
@@ -1265,19 +1264,20 @@ int blk_throtl_init(struct request_queue *q)
 
 	/* alloc and Init root group. */
 	td->queue = q;
-	tg = throtl_alloc_tg(td);
 
-	if (!tg) {
-		kfree(td);
-		return -ENOMEM;
-	}
+	rcu_read_lock();
+	spin_lock_irq(q->queue_lock);
 
-	td->root_tg = tg;
+	td->root_tg = throtl_get_tg(td, &blkio_root_cgroup);
 
-	rcu_read_lock();
-	throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
+	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
 
+	if (!td->root_tg) {
+		kfree(td);
+		return -ENOMEM;
+	}
+
 	/* Attach throtl data to request queue */
 	q->td = td;
 	return 0;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 37e2da9..1c3f41b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -229,7 +229,7 @@ struct cfq_data {
 	struct request_queue *queue;
 	/* Root service tree for cfq_groups */
 	struct cfq_rb_root grp_service_tree;
-	struct cfq_group root_group;
+	struct cfq_group *root_group;
 
 	/*
 	 * The priority currently being served
@@ -1012,6 +1012,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
 }
 
+/**
+ * cfq_init_cfqg_base - initialize base part of a cfq_group
+ * @cfqg: cfq_group to initialize
+ *
+ * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
+ * is enabled or not.
+ */
+static void cfq_init_cfqg_base(struct cfq_group *cfqg)
+{
+	struct cfq_rb_root *st;
+	int i, j;
+
+	for_each_cfqg_st(cfqg, i, j, st)
+		*st = CFQ_RB_ROOT;
+	RB_CLEAR_NODE(&cfqg->rb_node);
+
+	cfqg->ttime.last_end_request = jiffies;
+}
+
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
 {
@@ -1063,19 +1082,14 @@ static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
  */
 static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
 {
-	struct cfq_group *cfqg = NULL;
-	int i, j, ret;
-	struct cfq_rb_root *st;
+	struct cfq_group *cfqg;
+	int ret;
 
 	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
 	if (!cfqg)
 		return NULL;
 
-	for_each_cfqg_st(cfqg, i, j, st)
-		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->rb_node);
-
-	cfqg->ttime.last_end_request = jiffies;
+	cfq_init_cfqg_base(cfqg);
 
 	/*
 	 * Take the initial reference that will be released on destroy
@@ -1106,7 +1120,7 @@ cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
 	 * Avoid lookup in this case
 	 */
 	if (blkcg == &blkio_root_cgroup)
-		cfqg = &cfqd->root_group;
+		cfqg = cfqd->root_group;
 	else
 		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, cfqd->queue,
 							 BLKIO_POLICY_PROP));
@@ -1166,7 +1180,7 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd,
 	}
 
 	if (!cfqg)
-		cfqg = &cfqd->root_group;
+		cfqg = cfqd->root_group;
 
 	cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
 	return cfqg;
@@ -1182,7 +1196,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 {
 	/* Currently, all async queues are mapped to root group */
 	if (!cfq_cfqq_sync(cfqq))
-		cfqg = &cfqq->cfqd->root_group;
+		cfqg = cfqq->cfqd->root_group;
 
 	cfqq->cfqg = cfqg;
 	/* cfqq reference on cfqg */
@@ -1283,7 +1297,7 @@ static bool cfq_clear_queue(struct request_queue *q)
 static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd,
 				      struct blkio_cgroup *blkcg)
 {
-	return &cfqd->root_group;
+	return cfqd->root_group;
 }
 
 static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
@@ -3671,9 +3685,8 @@ static void cfq_exit_queue(struct elevator_queue *e)
 	if (wait)
 		synchronize_rcu();
 
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	/* Free up per cpu stats for root group */
-	free_percpu(cfqd->root_group.blkg.stats_cpu);
+#ifndef CONFIG_CFQ_GROUP_IOSCHED
+	kfree(cfqd->root_group);
 #endif
 	kfree(cfqd);
 }
@@ -3681,52 +3694,40 @@ static void cfq_exit_queue(struct elevator_queue *e)
 static int cfq_init_queue(struct request_queue *q)
 {
 	struct cfq_data *cfqd;
-	int i, j;
-	struct cfq_group *cfqg;
-	struct cfq_rb_root *st;
+	int i;
 
 	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
 	if (!cfqd)
 		return -ENOMEM;
 
+	cfqd->queue = q;
+	q->elevator->elevator_data = cfqd;
+
 	/* Init root service tree */
 	cfqd->grp_service_tree = CFQ_RB_ROOT;
 
-	/* Init root group */
-	cfqg = &cfqd->root_group;
-	for_each_cfqg_st(cfqg, i, j, st)
-		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->rb_node);
-
-	/* Give preference to root group over other groups */
-	cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
-
+	/* Init root group and prefer root group over other groups by default */
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-	/*
-	 * Set root group reference to 2. One reference will be dropped when
-	 * all groups on cfqd->cfqg_list are being deleted during queue exit.
-	 * Other reference will remain there as we don't want to delete this
-	 * group as it is statically allocated and gets destroyed when
-	 * throtl_data goes away.
-	 */
-	cfqg->ref = 2;
+	rcu_read_lock();
+	spin_lock_irq(q->queue_lock);
 
-	if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
-		kfree(cfqg);
+	cfqd->root_group = cfq_get_cfqg(cfqd, &blkio_root_cgroup);
+
+	spin_unlock_irq(q->queue_lock);
+	rcu_read_unlock();
+#else
+	cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
+					GFP_KERNEL, cfqd->queue->node);
+	if (cfqd->root_group)
+		cfq_init_cfqg_base(cfqd->root_group);
+#endif
+	if (!cfqd->root_group) {
 		kfree(cfqd);
 		return -ENOMEM;
 	}
 
-	rcu_read_lock();
+	cfqd->root_group->weight = 2*BLKIO_WEIGHT_DEFAULT;
 
-	cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
-				    cfqd->queue, 0);
-	rcu_read_unlock();
-	cfqd->nr_blkcg_linked_grps++;
-
-	/* Add group on cfqd->cfqg_list */
-	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
-#endif
 	/*
 	 * Not strictly needed (since RB_ROOT just clears the node and we
 	 * zeroed cfqd on alloc), but better be safe in case someone decides
@@ -3738,14 +3739,14 @@ static int cfq_init_queue(struct request_queue *q)
 	/*
 	 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
 	 * Grab a permanent reference to it, so that the normal code flow
-	 * will not attempt to free it.
+	 * will not attempt to free it.  oom_cfqq is linked to root_group
+	 * but shouldn't hold a reference as it'll never be unlinked.  Lose
+	 * the reference from linking right away.
 	 */
 	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
 	cfqd->oom_cfqq.ref++;
-	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
-
-	cfqd->queue = q;
-	q->elevator->elevator_data = cfqd;
+	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
+	cfq_put_cfqg(cfqd->root_group);
 
 	init_timer(&cfqd->idle_slice_timer);
 	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
-- 
cgit v1.1


From cd1604fab4f95f7cfc227d3955fd7ae14da61f38 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:06 -0800
Subject: blkcg: factor out blkio_group creation

Currently both blk-throttle and cfq-iosched implement their own
blkio_group creation code in throtl_get_tg() and cfq_get_cfqg().  This
patch factors out the common code into blkg_lookup_create(), which
returns ERR_PTR value so that transitional failures due to queue
bypass can be distinguished from other failures.

* New blkio_policy_ops methods blkio_alloc_group_fn() and
  blkio_link_group_fn() are added.  Both are transitional and will be
  removed once the blkg management code is fully moved into
  blk-cgroup.c.

* blkio_alloc_group_fn() allocates policy-specific blkg which is
  usually a larger data structure with blkg as the first entry and
  initializes it.  Note that initialization of blkg proper, including
  percpu stats, is the responsibility of blk-cgroup proper.

  Note that default config (weight, bps...) initialization is done
  from this method; otherwise, we end up violating locking order
  between blkcg and q locks via blkcg_get_CONF() functions.

* blkio_link_group_fn() is called under queue_lock and is responsible
  for linking the blkg to the queue.  The blkcg side is handled by
  blk-cgroup proper.

* The common blkg creation function is named blkg_lookup_create() and
  blkiocg_lookup_group() is renamed to blkg_lookup() for consistency.
  Also, throtl / cfq related functions are similarly [re]named for
  consistency.

This simplifies blkcg policy implementations and enables further
cleanup.

-v2: Vivek noticed that blkg_lookup_create() incorrectly tested
     blk_queue_dead() instead of blk_queue_bypass(), which led users
     of the function to create a new blkg on a bypassing queue.
     This is a bug introduced while relocating bypass patches before
     this one.  Fixed.

-v3: ERR_PTR patch folded into this one.  @for_root added to
     blkg_lookup_create() to allow creating root group on a bypassed
     queue during elevator switch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 117 +++++++++++++++++++++++++++-----------
 block/blk-cgroup.h   |  30 +++++-----
 block/blk-throttle.c | 155 +++++++++++++++++----------------------------------
 block/cfq-iosched.c  | 131 +++++++++++++------------------------------
 block/cfq.h          |   8 ---
 5 files changed, 193 insertions(+), 248 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index f1b08d3c..bc98914 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -465,38 +465,93 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
-/*
- * This function allocates the per cpu stats for blkio_group. Should be called
- * from sleepable context as alloc_per_cpu() requires that.
- */
-int blkio_alloc_blkg_stats(struct blkio_group *blkg)
+struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
+				       struct request_queue *q,
+				       enum blkio_policy_id plid,
+				       bool for_root)
+	__releases(q->queue_lock) __acquires(q->queue_lock)
 {
-	/* Allocate memory for per cpu stats */
-	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
-	if (!blkg->stats_cpu)
-		return -ENOMEM;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
+	struct blkio_policy_type *pol = blkio_policy[plid];
+	struct blkio_group *blkg, *new_blkg;
 
-void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-		struct blkio_group *blkg, struct request_queue *q, dev_t dev,
-		enum blkio_policy_id plid)
-{
-	unsigned long flags;
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	lockdep_assert_held(q->queue_lock);
 
-	spin_lock_irqsave(&blkcg->lock, flags);
-	spin_lock_init(&blkg->stats_lock);
-	rcu_assign_pointer(blkg->q, q);
-	blkg->blkcg_id = css_id(&blkcg->css);
+	/*
+	 * This could be the first entry point of blkcg implementation and
+	 * we shouldn't allow anything to go through for a bypassing queue.
+	 * The following can be removed if blkg lookup is guaranteed to
+	 * fail on a bypassing queue.
+	 */
+	if (unlikely(blk_queue_bypass(q)) && !for_root)
+		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
+
+	blkg = blkg_lookup(blkcg, q, plid);
+	if (blkg)
+		return blkg;
+
+	if (!css_tryget(&blkcg->css))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Allocate and initialize.
+	 *
+	 * FIXME: The following is broken.  Percpu memory allocation
+	 * requires %GFP_KERNEL context and can't be performed from IO
+	 * path.  Allocation here should inherently be atomic and the
+	 * following lock dancing can be removed once the broken percpu
+	 * allocation is fixed.
+	 */
+	spin_unlock_irq(q->queue_lock);
+	rcu_read_unlock();
+
+	new_blkg = pol->ops.blkio_alloc_group_fn(q, blkcg);
+	if (new_blkg) {
+		new_blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
+
+		spin_lock_init(&new_blkg->stats_lock);
+		rcu_assign_pointer(new_blkg->q, q);
+		new_blkg->blkcg_id = css_id(&blkcg->css);
+		new_blkg->plid = plid;
+		cgroup_path(blkcg->css.cgroup, new_blkg->path,
+			    sizeof(new_blkg->path));
+	}
+
+	rcu_read_lock();
+	spin_lock_irq(q->queue_lock);
+	css_put(&blkcg->css);
+
+	/* did bypass get turned on inbetween? */
+	if (unlikely(blk_queue_bypass(q)) && !for_root) {
+		blkg = ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
+		goto out;
+	}
+
+	/* did someone beat us to it? */
+	blkg = blkg_lookup(blkcg, q, plid);
+	if (unlikely(blkg))
+		goto out;
+
+	/* did alloc fail? */
+	if (unlikely(!new_blkg || !new_blkg->stats_cpu)) {
+		blkg = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	/* insert */
+	spin_lock(&blkcg->lock);
+	swap(blkg, new_blkg);
 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
-	blkg->plid = plid;
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-	/* Need to take css reference ? */
-	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
-	blkg->dev = dev;
+	pol->ops.blkio_link_group_fn(q, blkg);
+	spin_unlock(&blkcg->lock);
+out:
+	if (new_blkg) {
+		free_percpu(new_blkg->stats_cpu);
+		kfree(new_blkg);
+	}
+	return blkg;
 }
-EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
+EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
 static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
 {
@@ -533,9 +588,9 @@ int blkiocg_del_blkio_group(struct blkio_group *blkg)
 EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
 
 /* called under rcu_read_lock(). */
-struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
-					 struct request_queue *q,
-					 enum blkio_policy_id plid)
+struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
+				struct request_queue *q,
+				enum blkio_policy_id plid)
 {
 	struct blkio_group *blkg;
 	struct hlist_node *n;
@@ -545,7 +600,7 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
 			return blkg;
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
+EXPORT_SYMBOL_GPL(blkg_lookup);
 
 void blkg_destroy_all(struct request_queue *q)
 {
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 562fa55..2600ae7 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -204,6 +204,10 @@ extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg,
 extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg,
 				     dev_t dev);
 
+typedef struct blkio_group *(blkio_alloc_group_fn)(struct request_queue *q,
+						   struct blkio_cgroup *blkcg);
+typedef void (blkio_link_group_fn)(struct request_queue *q,
+			struct blkio_group *blkg);
 typedef void (blkio_unlink_group_fn)(struct request_queue *q,
 			struct blkio_group *blkg);
 typedef bool (blkio_clear_queue_fn)(struct request_queue *q);
@@ -219,6 +223,8 @@ typedef void (blkio_update_group_write_iops_fn)(struct request_queue *q,
 			struct blkio_group *blkg, unsigned int write_iops);
 
 struct blkio_policy_ops {
+	blkio_alloc_group_fn *blkio_alloc_group_fn;
+	blkio_link_group_fn *blkio_link_group_fn;
 	blkio_unlink_group_fn *blkio_unlink_group_fn;
 	blkio_clear_queue_fn *blkio_clear_queue_fn;
 	blkio_update_group_weight_fn *blkio_update_group_weight_fn;
@@ -307,14 +313,14 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
 extern struct blkio_cgroup blkio_root_cgroup;
 extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
 extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
-extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-	struct blkio_group *blkg, struct request_queue *q, dev_t dev,
-	enum blkio_policy_id plid);
-extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
 extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
-extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
-						struct request_queue *q,
-						enum blkio_policy_id plid);
+extern struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
+				       struct request_queue *q,
+				       enum blkio_policy_id plid);
+struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
+				       struct request_queue *q,
+				       enum blkio_policy_id plid,
+				       bool for_root);
 void blkiocg_update_timeslice_used(struct blkio_group *blkg,
 					unsigned long time,
 					unsigned long unaccounted_time);
@@ -335,17 +341,11 @@ cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
 static inline struct blkio_cgroup *
 task_blkio_cgroup(struct task_struct *tsk) { return NULL; }
 
-static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-		struct blkio_group *blkg, void *key, dev_t dev,
-		enum blkio_policy_id plid) {}
-
-static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; }
-
 static inline int
 blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
 
-static inline struct blkio_group *
-blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
+static inline struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
+					      void *key) { return NULL; }
 static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
 						unsigned long time,
 						unsigned long unaccounted_time)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index aeeb798..2ae637b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -181,17 +181,25 @@ static void throtl_put_tg(struct throtl_grp *tg)
 	call_rcu(&tg->rcu_head, throtl_free_tg);
 }
 
-static void throtl_init_group(struct throtl_grp *tg)
+static struct blkio_group *throtl_alloc_blkio_group(struct request_queue *q,
+						    struct blkio_cgroup *blkcg)
 {
+	struct throtl_grp *tg;
+
+	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, q->node);
+	if (!tg)
+		return NULL;
+
 	INIT_HLIST_NODE(&tg->tg_node);
 	RB_CLEAR_NODE(&tg->rb_node);
 	bio_list_init(&tg->bio_lists[0]);
 	bio_list_init(&tg->bio_lists[1]);
 	tg->limits_changed = false;
 
-	/* Practically unlimited BW */
-	tg->bps[0] = tg->bps[1] = -1;
-	tg->iops[0] = tg->iops[1] = -1;
+	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
+	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
+	tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
+	tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
 
 	/*
 	 * Take the initial reference that will be released on destroy
@@ -200,14 +208,8 @@ static void throtl_init_group(struct throtl_grp *tg)
 	 * exit or cgroup deletion path depending on who is exiting first.
 	 */
 	atomic_set(&tg->ref, 1);
-}
 
-/* Should be called with rcu read lock held (needed for blkcg) */
-static void
-throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
-{
-	hlist_add_head(&tg->tg_node, &td->tg_list);
-	td->nr_undestroyed_grps++;
+	return &tg->blkg;
 }
 
 static void
@@ -246,119 +248,62 @@ throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
 	spin_unlock_irq(td->queue->queue_lock);
 }
 
-static void throtl_init_add_tg_lists(struct throtl_data *td,
-			struct throtl_grp *tg, struct blkio_cgroup *blkcg)
+static void throtl_link_blkio_group(struct request_queue *q,
+				    struct blkio_group *blkg)
 {
-	__throtl_tg_fill_dev_details(td, tg);
-
-	/* Add group onto cgroup list */
-	blkiocg_add_blkio_group(blkcg, &tg->blkg, td->queue,
-				tg->blkg.dev, BLKIO_POLICY_THROTL);
-
-	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
-	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
-	tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
-	tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
-
-	throtl_add_group_to_td_list(td, tg);
-}
-
-/* Should be called without queue lock and outside of rcu period */
-static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
-{
-	struct throtl_grp *tg = NULL;
-	int ret;
-
-	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
-	if (!tg)
-		return NULL;
-
-	ret = blkio_alloc_blkg_stats(&tg->blkg);
+	struct throtl_data *td = q->td;
+	struct throtl_grp *tg = tg_of_blkg(blkg);
 
-	if (ret) {
-		kfree(tg);
-		return NULL;
-	}
+	__throtl_tg_fill_dev_details(td, tg);
 
-	throtl_init_group(tg);
-	return tg;
+	hlist_add_head(&tg->tg_node, &td->tg_list);
+	td->nr_undestroyed_grps++;
 }
 
 static struct
-throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
+throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 {
 	struct throtl_grp *tg = NULL;
 
 	/*
 	 * This is the common case when there are no blkio cgroups.
- 	 * Avoid lookup in this case
- 	 */
+	 * Avoid lookup in this case
+	 */
 	if (blkcg == &blkio_root_cgroup)
 		tg = td->root_tg;
 	else
-		tg = tg_of_blkg(blkiocg_lookup_group(blkcg, td->queue,
-						     BLKIO_POLICY_THROTL));
+		tg = tg_of_blkg(blkg_lookup(blkcg, td->queue,
+					    BLKIO_POLICY_THROTL));
 
 	__throtl_tg_fill_dev_details(td, tg);
 	return tg;
 }
 
-static struct throtl_grp *throtl_get_tg(struct throtl_data *td,
-					struct blkio_cgroup *blkcg)
+static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
+						  struct blkio_cgroup *blkcg)
 {
-	struct throtl_grp *tg = NULL, *__tg = NULL;
 	struct request_queue *q = td->queue;
-
-	/* no throttling for dead queue */
-	if (unlikely(blk_queue_bypass(q)))
-		return NULL;
-
-	tg = throtl_find_tg(td, blkcg);
-	if (tg)
-		return tg;
-
-	if (!css_tryget(&blkcg->css))
-		return NULL;
-
-	/*
-	 * Need to allocate a group. Allocation of group also needs allocation
-	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
-	 * we need to drop rcu lock and queue_lock before we call alloc.
-	 */
-	spin_unlock_irq(q->queue_lock);
-	rcu_read_unlock();
-
-	tg = throtl_alloc_tg(td);
-
-	/* Group allocated and queue is still alive. take the lock */
-	rcu_read_lock();
-	spin_lock_irq(q->queue_lock);
-	css_put(&blkcg->css);
-
-	/* Make sure @q is still alive */
-	if (unlikely(blk_queue_bypass(q))) {
-		kfree(tg);
-		return NULL;
-	}
+	struct throtl_grp *tg = NULL;
 
 	/*
-	 * If some other thread already allocated the group while we were
-	 * not holding queue lock, free up the group
+	 * This is the common case when there are no blkio cgroups.
+	 * Avoid lookup in this case
 	 */
-	__tg = throtl_find_tg(td, blkcg);
+	if (blkcg == &blkio_root_cgroup) {
+		tg = td->root_tg;
+	} else {
+		struct blkio_group *blkg;
 
-	if (__tg) {
-		kfree(tg);
-		return __tg;
-	}
+		blkg = blkg_lookup_create(blkcg, q, BLKIO_POLICY_THROTL, false);
 
-	/* Group allocation failed. Account the IO to root group */
-	if (!tg) {
-		tg = td->root_tg;
-		return tg;
+		/* if %NULL and @q is alive, fall back to root_tg */
+		if (!IS_ERR(blkg))
+			tg = tg_of_blkg(blkg);
+		else if (!blk_queue_dead(q))
+			tg = td->root_tg;
 	}
 
-	throtl_init_add_tg_lists(td, tg, blkcg);
+	__throtl_tg_fill_dev_details(td, tg);
 	return tg;
 }
 
@@ -1107,6 +1052,8 @@ static void throtl_shutdown_wq(struct request_queue *q)
 
 static struct blkio_policy_type blkio_policy_throtl = {
 	.ops = {
+		.blkio_alloc_group_fn = throtl_alloc_blkio_group,
+		.blkio_link_group_fn = throtl_link_blkio_group,
 		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
 		.blkio_clear_queue_fn = throtl_clear_queue,
 		.blkio_update_group_read_bps_fn =
@@ -1141,7 +1088,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	 */
 	rcu_read_lock();
 	blkcg = task_blkio_cgroup(current);
-	tg = throtl_find_tg(td, blkcg);
+	tg = throtl_lookup_tg(td, blkcg);
 	if (tg) {
 		throtl_tg_fill_dev_details(td, tg);
 
@@ -1157,7 +1104,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	 * IO group
 	 */
 	spin_lock_irq(q->queue_lock);
-	tg = throtl_get_tg(td, blkcg);
+	tg = throtl_lookup_create_tg(td, blkcg);
 	if (unlikely(!tg))
 		goto out_unlock;
 
@@ -1252,6 +1199,7 @@ void blk_throtl_drain(struct request_queue *q)
 int blk_throtl_init(struct request_queue *q)
 {
 	struct throtl_data *td;
+	struct blkio_group *blkg;
 
 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
 	if (!td)
@@ -1262,13 +1210,17 @@ int blk_throtl_init(struct request_queue *q)
 	td->limits_changed = false;
 	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
 
-	/* alloc and Init root group. */
+	q->td = td;
 	td->queue = q;
 
+	/* alloc and init root group. */
 	rcu_read_lock();
 	spin_lock_irq(q->queue_lock);
 
-	td->root_tg = throtl_get_tg(td, &blkio_root_cgroup);
+	blkg = blkg_lookup_create(&blkio_root_cgroup, q, BLKIO_POLICY_THROTL,
+				  true);
+	if (!IS_ERR(blkg))
+		td->root_tg = tg_of_blkg(blkg);
 
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
@@ -1277,9 +1229,6 @@ int blk_throtl_init(struct request_queue *q)
 		kfree(td);
 		return -ENOMEM;
 	}
-
-	/* Attach throtl data to request queue */
-	q->td = td;
 	return 0;
 }
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 1c3f41b..acef564 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1048,10 +1048,12 @@ static void cfq_update_blkio_group_weight(struct request_queue *q,
 	cfqg->needs_update = true;
 }
 
-static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
-			struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
+static void cfq_link_blkio_group(struct request_queue *q,
+				 struct blkio_group *blkg)
 {
-	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+	struct cfq_data *cfqd = q->elevator->elevator_data;
+	struct backing_dev_info *bdi = &q->backing_dev_info;
+	struct cfq_group *cfqg = cfqg_of_blkg(blkg);
 	unsigned int major, minor;
 
 	/*
@@ -1062,34 +1064,26 @@ static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
 	 */
 	if (bdi->dev) {
 		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
-					cfqd->queue, MKDEV(major, minor));
-	} else
-		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
-					cfqd->queue, 0);
+		blkg->dev = MKDEV(major, minor);
+	}
 
 	cfqd->nr_blkcg_linked_grps++;
-	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
 
 	/* Add group on cfqd list */
 	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
 }
 
-/*
- * Should be called from sleepable context. No request queue lock as per
- * cpu stats are allocated dynamically and alloc_percpu needs to be called
- * from sleepable context.
- */
-static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
+static struct blkio_group *cfq_alloc_blkio_group(struct request_queue *q,
+						 struct blkio_cgroup *blkcg)
 {
 	struct cfq_group *cfqg;
-	int ret;
 
-	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
+	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, q->node);
 	if (!cfqg)
 		return NULL;
 
 	cfq_init_cfqg_base(cfqg);
+	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
 
 	/*
 	 * Take the initial reference that will be released on destroy
@@ -1099,90 +1093,38 @@ static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
 	 */
 	cfqg->ref = 1;
 
-	ret = blkio_alloc_blkg_stats(&cfqg->blkg);
-	if (ret) {
-		kfree(cfqg);
-		return NULL;
-	}
-
-	return cfqg;
-}
-
-static struct cfq_group *
-cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
-{
-	struct cfq_group *cfqg = NULL;
-	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
-	unsigned int major, minor;
-
-	/*
-	 * This is the common case when there are no blkio cgroups.
-	 * Avoid lookup in this case
-	 */
-	if (blkcg == &blkio_root_cgroup)
-		cfqg = cfqd->root_group;
-	else
-		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, cfqd->queue,
-							 BLKIO_POLICY_PROP));
-
-	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfqg->blkg.dev = MKDEV(major, minor);
-	}
-
-	return cfqg;
+	return &cfqg->blkg;
 }
 
 /*
  * Search for the cfq group current task belongs to. request_queue lock must
  * be held.
  */
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd,
-				      struct blkio_cgroup *blkcg)
+static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
+						struct blkio_cgroup *blkcg)
 {
-	struct cfq_group *cfqg = NULL, *__cfqg = NULL;
 	struct request_queue *q = cfqd->queue;
+	struct backing_dev_info *bdi = &q->backing_dev_info;
+	struct cfq_group *cfqg = NULL;
 
-	cfqg = cfq_find_cfqg(cfqd, blkcg);
-	if (cfqg)
-		return cfqg;
-
-	if (!css_tryget(&blkcg->css))
-		return NULL;
-
-	/*
-	 * Need to allocate a group. Allocation of group also needs allocation
-	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
-	 * we need to drop rcu lock and queue_lock before we call alloc.
-	 *
-	 * Not taking any queue reference here and assuming that queue is
-	 * around by the time we return. CFQ queue allocation code does
-	 * the same. It might be racy though.
-	 */
-	rcu_read_unlock();
-	spin_unlock_irq(q->queue_lock);
-
-	cfqg = cfq_alloc_cfqg(cfqd);
+	/* avoid lookup for the common case where there's no blkio cgroup */
+	if (blkcg == &blkio_root_cgroup) {
+		cfqg = cfqd->root_group;
+	} else {
+		struct blkio_group *blkg;
 
-	spin_lock_irq(q->queue_lock);
-	rcu_read_lock();
-	css_put(&blkcg->css);
+		blkg = blkg_lookup_create(blkcg, q, BLKIO_POLICY_PROP, false);
+		if (!IS_ERR(blkg))
+			cfqg = cfqg_of_blkg(blkg);
+	}
 
-	/*
-	 * If some other thread already allocated the group while we were
-	 * not holding queue lock, free up the group
-	 */
-	__cfqg = cfq_find_cfqg(cfqd, blkcg);
+	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+		unsigned int major, minor;
 
-	if (__cfqg) {
-		kfree(cfqg);
-		return __cfqg;
+		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+		cfqg->blkg.dev = MKDEV(major, minor);
 	}
 
-	if (!cfqg)
-		cfqg = cfqd->root_group;
-
-	cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
 	return cfqg;
 }
 
@@ -1294,8 +1236,8 @@ static bool cfq_clear_queue(struct request_queue *q)
 }
 
 #else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd,
-				      struct blkio_cgroup *blkcg)
+static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
+						struct blkio_cgroup *blkcg)
 {
 	return cfqd->root_group;
 }
@@ -2887,7 +2829,8 @@ retry:
 
 	blkcg = task_blkio_cgroup(current);
 
-	cfqg = cfq_get_cfqg(cfqd, blkcg);
+	cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
+
 	cic = cfq_cic_lookup(cfqd, ioc);
 	/* cic always exists here */
 	cfqq = cic_to_cfqq(cic, is_sync);
@@ -3694,6 +3637,7 @@ static void cfq_exit_queue(struct elevator_queue *e)
 static int cfq_init_queue(struct request_queue *q)
 {
 	struct cfq_data *cfqd;
+	struct blkio_group *blkg __maybe_unused;
 	int i;
 
 	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
@@ -3711,7 +3655,10 @@ static int cfq_init_queue(struct request_queue *q)
 	rcu_read_lock();
 	spin_lock_irq(q->queue_lock);
 
-	cfqd->root_group = cfq_get_cfqg(cfqd, &blkio_root_cgroup);
+	blkg = blkg_lookup_create(&blkio_root_cgroup, q, BLKIO_POLICY_PROP,
+				  true);
+	if (!IS_ERR(blkg))
+		cfqd->root_group = cfqg_of_blkg(blkg);
 
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
@@ -3897,6 +3844,8 @@ static struct elevator_type iosched_cfq = {
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static struct blkio_policy_type blkio_policy_cfq = {
 	.ops = {
+		.blkio_alloc_group_fn =		cfq_alloc_blkio_group,
+		.blkio_link_group_fn =		cfq_link_blkio_group,
 		.blkio_unlink_group_fn =	cfq_unlink_blkio_group,
 		.blkio_clear_queue_fn = cfq_clear_queue,
 		.blkio_update_group_weight_fn =	cfq_update_blkio_group_weight,
diff --git a/block/cfq.h b/block/cfq.h
index 343b78a..3987601 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -67,12 +67,6 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
 				direction, sync);
 }
 
-static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-		struct blkio_group *blkg, struct request_queue *q, dev_t dev)
-{
-	blkiocg_add_blkio_group(blkcg, blkg, q, dev, BLKIO_POLICY_PROP);
-}
-
 static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
 {
 	return blkiocg_del_blkio_group(blkg);
@@ -105,8 +99,6 @@ static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 				uint64_t bytes, bool direction, bool sync) {}
 static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {}
 
-static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-		struct blkio_group *blkg, struct request_queue *q, dev_t dev) {}
 static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
 {
 	return 0;
-- 
cgit v1.1


From e56da7e287967667474a58c4f60c286279e3f487 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:07 -0800
Subject: blkcg: don't allow or retain configuration of missing devices

blkcg is very peculiar in that it allows setting and remembering
configurations for non-existent devices by maintaining separate data
structures for configuration.

This behavior is completely out of the usual norms and outright
confusing; furthermore, it uses dev_t number to match the
configuration to devices, which is unpredictable to begin with and
becomes completely unusable if EXT_DEVT is fully used.

It is wholly unnecessary - we already have a fully functional userland
mechanism to program devices being hotplugged which has full access to
device identification, connection topology and filesystem information.

Add a new struct blkio_group_conf which contains all blkcg
configurations to blkio_group and let blkio_group, which can be
created iff the associated device exists and is removed when the
associated device goes away, carry all configurations.

Note that, after this patch, all newly created blkg's will always have
the default configuration (unlimited for throttling and blkcg's weight
for propio).

This patch makes blkio_policy_node meaningless but doesn't remove it.
The next patch will.

-v2: Updated to retry after short sleep if blkg lookup/creation failed
     due to the queue being temporarily bypassed as indicated by
     -EBUSY return.  Pointed out by Vivek.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 94 ++++++++++++++++++++++++++++++++++++++++------------
 block/blk-cgroup.h   |  9 +++++
 block/blk-throttle.c |  8 ++---
 block/cfq-iosched.c  |  2 +-
 4 files changed, 87 insertions(+), 26 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bc98914..fe8ce14 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -855,9 +855,12 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
 }
 
 static int blkio_policy_parse_and_set(char *buf,
-	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
+				      struct blkio_policy_node *newpn,
+				      enum blkio_policy_id plid, int fileid,
+				      struct blkio_cgroup *blkcg)
 {
 	struct gendisk *disk = NULL;
+	struct blkio_group *blkg = NULL;
 	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
 	unsigned long major, minor;
 	int i = 0, ret = -EINVAL;
@@ -903,11 +906,25 @@ static int blkio_policy_parse_and_set(char *buf,
 		goto out;
 
 	/* For rule removal, do not check for device presence. */
-	if (temp) {
-		disk = get_gendisk(dev, &part);
-		if (!disk || part) {
-			ret = -ENODEV;
-			goto out;
+	disk = get_gendisk(dev, &part);
+
+	if ((!disk || part) && temp) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	rcu_read_lock();
+
+	if (disk && !part) {
+		spin_lock_irq(disk->queue->queue_lock);
+		blkg = blkg_lookup_create(blkcg, disk->queue, plid, false);
+		spin_unlock_irq(disk->queue->queue_lock);
+
+		if (IS_ERR(blkg)) {
+			ret = PTR_ERR(blkg);
+			if (ret == -EBUSY)
+				goto out_unlock;
+			blkg = NULL;
 		}
 	}
 
@@ -917,25 +934,46 @@ static int blkio_policy_parse_and_set(char *buf,
 	case BLKIO_POLICY_PROP:
 		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
 		     temp > BLKIO_WEIGHT_MAX)
-			goto out;
+			goto out_unlock;
 
 		newpn->plid = plid;
 		newpn->fileid = fileid;
 		newpn->val.weight = temp;
+		if (blkg)
+			blkg->conf.weight = temp;
 		break;
 	case BLKIO_POLICY_THROTL:
 		switch(fileid) {
 		case BLKIO_THROTL_read_bps_device:
+			if (blkg)
+				blkg->conf.bps[READ] = temp;
+			newpn->plid = plid;
+			newpn->fileid = fileid;
+			newpn->val.bps = temp;
+			break;
 		case BLKIO_THROTL_write_bps_device:
+			if (blkg)
+				blkg->conf.bps[WRITE] = temp;
 			newpn->plid = plid;
 			newpn->fileid = fileid;
 			newpn->val.bps = temp;
 			break;
 		case BLKIO_THROTL_read_iops_device:
+			if (temp > THROTL_IOPS_MAX)
+				goto out_unlock;
+
+			if (blkg)
+				blkg->conf.iops[READ] = temp;
+			newpn->plid = plid;
+			newpn->fileid = fileid;
+			newpn->val.iops = (unsigned int)temp;
+			break;
 		case BLKIO_THROTL_write_iops_device:
 			if (temp > THROTL_IOPS_MAX)
-				goto out;
+				goto out_unlock;
 
+			if (blkg)
+				blkg->conf.iops[WRITE] = temp;
 			newpn->plid = plid;
 			newpn->fileid = fileid;
 			newpn->val.iops = (unsigned int)temp;
@@ -946,8 +984,21 @@ static int blkio_policy_parse_and_set(char *buf,
 		BUG();
 	}
 	ret = 0;
+out_unlock:
+	rcu_read_unlock();
 out:
 	put_disk(disk);
+
+	/*
+	 * If queue was bypassing, we should retry.  Do so after a short
+	 * msleep().  It isn't strictly necessary but queue can be
+	 * bypassing for some time and it's always nice to avoid busy
+	 * looping.
+	 */
+	if (ret == -EBUSY) {
+		msleep(10);
+		return restart_syscall();
+	}
 	return ret;
 }
 
@@ -1095,26 +1146,29 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
 static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
 		struct blkio_group *blkg, struct blkio_policy_node *pn)
 {
-	unsigned int weight, iops;
-	u64 bps;
+	struct blkio_group_conf *conf = &blkg->conf;
 
 	switch(pn->plid) {
 	case BLKIO_POLICY_PROP:
-		weight = pn->val.weight ? pn->val.weight :
-				blkcg->weight;
-		blkio_update_group_weight(blkg, weight);
+		blkio_update_group_weight(blkg, conf->weight ?: blkcg->weight);
 		break;
 	case BLKIO_POLICY_THROTL:
 		switch(pn->fileid) {
 		case BLKIO_THROTL_read_bps_device:
+			blkio_update_group_bps(blkg, conf->bps[READ] ?: -1,
+					       pn->fileid);
+			break;
 		case BLKIO_THROTL_write_bps_device:
-			bps = pn->val.bps ? pn->val.bps : (-1);
-			blkio_update_group_bps(blkg, bps, pn->fileid);
+			blkio_update_group_bps(blkg, conf->bps[WRITE] ?: -1,
+					       pn->fileid);
 			break;
 		case BLKIO_THROTL_read_iops_device:
+			blkio_update_group_iops(blkg, conf->iops[READ] ?: -1,
+						pn->fileid);
+			break;
 		case BLKIO_THROTL_write_iops_device:
-			iops = pn->val.iops ? pn->val.iops : (-1);
-			blkio_update_group_iops(blkg, iops, pn->fileid);
+			blkio_update_group_iops(blkg, conf->iops[WRITE] ?: -1,
+						pn->fileid);
 			break;
 		}
 		break;
@@ -1152,7 +1206,7 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
 	int ret = 0;
 	char *buf;
 	struct blkio_policy_node *newpn, *pn;
-	struct blkio_cgroup *blkcg;
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 	int keep_newpn = 0;
 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
 	int fileid = BLKIOFILE_ATTR(cft->private);
@@ -1167,12 +1221,10 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
 		goto free_buf;
 	}
 
-	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
+	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid, blkcg);
 	if (ret)
 		goto free_newpn;
 
-	blkcg = cgroup_to_blkio_cgroup(cgrp);
-
 	spin_lock_irq(&blkcg->lock);
 
 	pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 2600ae7..81efe71 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -154,6 +154,12 @@ struct blkio_group_stats_cpu {
 	struct u64_stats_sync syncp;
 };
 
+struct blkio_group_conf {
+	unsigned int weight;
+	unsigned int iops[2];
+	u64 bps[2];
+};
+
 struct blkio_group {
 	/* Pointer to the associated request_queue, RCU protected */
 	struct request_queue __rcu *q;
@@ -166,6 +172,9 @@ struct blkio_group {
 	/* policy which owns this blk group */
 	enum blkio_policy_id plid;
 
+	/* Configuration */
+	struct blkio_group_conf conf;
+
 	/* Need to serialize the stats in the case of reset/update */
 	spinlock_t stats_lock;
 	struct blkio_group_stats stats;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 2ae637b..791b107 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -196,10 +196,10 @@ static struct blkio_group *throtl_alloc_blkio_group(struct request_queue *q,
 	bio_list_init(&tg->bio_lists[1]);
 	tg->limits_changed = false;
 
-	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
-	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
-	tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
-	tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
+	tg->bps[READ] = -1;
+	tg->bps[WRITE] = -1;
+	tg->iops[READ] = -1;
+	tg->iops[WRITE] = -1;
 
 	/*
 	 * Take the initial reference that will be released on destroy
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index acef564..08d4fdd 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1083,7 +1083,7 @@ static struct blkio_group *cfq_alloc_blkio_group(struct request_queue *q,
 		return NULL;
 
 	cfq_init_cfqg_base(cfqg);
-	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+	cfqg->weight = blkcg->weight;
 
 	/*
 	 * Take the initial reference that will be released on destroy
-- 
cgit v1.1


From 4bfd482e73b30284cb21e10834ce729fa81aa256 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:08 -0800
Subject: blkcg: kill blkio_policy_node

Now that blkcg configuration lives in blkg's, blkio_policy_node is no
longer necessary.  Kill it.

blkio_policy_parse_and_set() now fails if invoked for a missing device
and functions to print out configurations are updated to print from
blkg's.

cftype_blkg_same_policy() is dropped along with other policy functions
for consistency.  Its one line is open coded in the only user -
blkio_read_blkg_stats().

-v2: Update to reflect the retry-on-bypass logic change of the
     previous patch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 437 ++++++++---------------------------------------------
 block/blk-cgroup.h |  32 ----
 2 files changed, 59 insertions(+), 410 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index fe8ce14..adf61c9 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -59,54 +59,6 @@ struct cgroup_subsys blkio_subsys = {
 };
 EXPORT_SYMBOL_GPL(blkio_subsys);
 
-static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
-					    struct blkio_policy_node *pn)
-{
-	list_add(&pn->node, &blkcg->policy_list);
-}
-
-static inline bool cftype_blkg_same_policy(struct cftype *cft,
-			struct blkio_group *blkg)
-{
-	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
-
-	if (blkg->plid == plid)
-		return 1;
-
-	return 0;
-}
-
-/* Determines if policy node matches cgroup file being accessed */
-static inline bool pn_matches_cftype(struct cftype *cft,
-			struct blkio_policy_node *pn)
-{
-	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
-	int fileid = BLKIOFILE_ATTR(cft->private);
-
-	return (plid == pn->plid && fileid == pn->fileid);
-}
-
-/* Must be called with blkcg->lock held */
-static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
-{
-	list_del(&pn->node);
-}
-
-/* Must be called with blkcg->lock held */
-static struct blkio_policy_node *
-blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
-		enum blkio_policy_id plid, int fileid)
-{
-	struct blkio_policy_node *pn;
-
-	list_for_each_entry(pn, &blkcg->policy_list, node) {
-		if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
-			return pn;
-	}
-
-	return NULL;
-}
-
 struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 {
 	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
@@ -854,10 +806,8 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
 	return disk_total;
 }
 
-static int blkio_policy_parse_and_set(char *buf,
-				      struct blkio_policy_node *newpn,
-				      enum blkio_policy_id plid, int fileid,
-				      struct blkio_cgroup *blkcg)
+static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
+				      int fileid, struct blkio_cgroup *blkcg)
 {
 	struct gendisk *disk = NULL;
 	struct blkio_group *blkg = NULL;
@@ -905,78 +855,51 @@ static int blkio_policy_parse_and_set(char *buf,
 	if (strict_strtoull(s[1], 10, &temp))
 		goto out;
 
-	/* For rule removal, do not check for device presence. */
 	disk = get_gendisk(dev, &part);
-
-	if ((!disk || part) && temp) {
-		ret = -ENODEV;
+	if (!disk || part)
 		goto out;
-	}
 
 	rcu_read_lock();
 
-	if (disk && !part) {
-		spin_lock_irq(disk->queue->queue_lock);
-		blkg = blkg_lookup_create(blkcg, disk->queue, plid, false);
-		spin_unlock_irq(disk->queue->queue_lock);
+	spin_lock_irq(disk->queue->queue_lock);
+	blkg = blkg_lookup_create(blkcg, disk->queue, plid, false);
+	spin_unlock_irq(disk->queue->queue_lock);
 
-		if (IS_ERR(blkg)) {
-			ret = PTR_ERR(blkg);
-			if (ret == -EBUSY)
-				goto out_unlock;
-			blkg = NULL;
-		}
+	if (IS_ERR(blkg)) {
+		ret = PTR_ERR(blkg);
+		goto out_unlock;
 	}
 
-	newpn->dev = dev;
-
 	switch (plid) {
 	case BLKIO_POLICY_PROP:
 		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
 		     temp > BLKIO_WEIGHT_MAX)
 			goto out_unlock;
 
-		newpn->plid = plid;
-		newpn->fileid = fileid;
-		newpn->val.weight = temp;
-		if (blkg)
-			blkg->conf.weight = temp;
+		blkg->conf.weight = temp;
+		blkio_update_group_weight(blkg, temp ?: blkcg->weight);
 		break;
 	case BLKIO_POLICY_THROTL:
 		switch(fileid) {
 		case BLKIO_THROTL_read_bps_device:
-			if (blkg)
-				blkg->conf.bps[READ] = temp;
-			newpn->plid = plid;
-			newpn->fileid = fileid;
-			newpn->val.bps = temp;
+			blkg->conf.bps[READ] = temp;
+			blkio_update_group_bps(blkg, temp ?: -1, fileid);
 			break;
 		case BLKIO_THROTL_write_bps_device:
-			if (blkg)
-				blkg->conf.bps[WRITE] = temp;
-			newpn->plid = plid;
-			newpn->fileid = fileid;
-			newpn->val.bps = temp;
+			blkg->conf.bps[WRITE] = temp;
+			blkio_update_group_bps(blkg, temp ?: -1, fileid);
 			break;
 		case BLKIO_THROTL_read_iops_device:
 			if (temp > THROTL_IOPS_MAX)
 				goto out_unlock;
-
-			if (blkg)
-				blkg->conf.iops[READ] = temp;
-			newpn->plid = plid;
-			newpn->fileid = fileid;
-			newpn->val.iops = (unsigned int)temp;
+			blkg->conf.iops[READ] = temp;
+			blkio_update_group_iops(blkg, temp ?: -1, fileid);
 			break;
 		case BLKIO_THROTL_write_iops_device:
 			if (temp > THROTL_IOPS_MAX)
 				goto out_unlock;
-
-			if (blkg)
-				blkg->conf.iops[WRITE] = temp;
-			newpn->plid = plid;
-			newpn->fileid = fileid;
-			newpn->val.iops = (unsigned int)temp;
+			blkg->conf.iops[WRITE] = temp;
+			blkio_update_group_iops(blkg, temp ?: -1, fileid);
 			break;
 		}
 		break;
@@ -1002,212 +925,12 @@ out:
 	return ret;
 }
 
-unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
-			      dev_t dev)
-{
-	struct blkio_policy_node *pn;
-	unsigned long flags;
-	unsigned int weight;
-
-	spin_lock_irqsave(&blkcg->lock, flags);
-
-	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
-				BLKIO_PROP_weight_device);
-	if (pn)
-		weight = pn->val.weight;
-	else
-		weight = blkcg->weight;
-
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-
-	return weight;
-}
-EXPORT_SYMBOL_GPL(blkcg_get_weight);
-
-uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
-{
-	struct blkio_policy_node *pn;
-	unsigned long flags;
-	uint64_t bps = -1;
-
-	spin_lock_irqsave(&blkcg->lock, flags);
-	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_read_bps_device);
-	if (pn)
-		bps = pn->val.bps;
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-
-	return bps;
-}
-
-uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
-{
-	struct blkio_policy_node *pn;
-	unsigned long flags;
-	uint64_t bps = -1;
-
-	spin_lock_irqsave(&blkcg->lock, flags);
-	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_write_bps_device);
-	if (pn)
-		bps = pn->val.bps;
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-
-	return bps;
-}
-
-unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
-{
-	struct blkio_policy_node *pn;
-	unsigned long flags;
-	unsigned int iops = -1;
-
-	spin_lock_irqsave(&blkcg->lock, flags);
-	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_read_iops_device);
-	if (pn)
-		iops = pn->val.iops;
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-
-	return iops;
-}
-
-unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
-{
-	struct blkio_policy_node *pn;
-	unsigned long flags;
-	unsigned int iops = -1;
-
-	spin_lock_irqsave(&blkcg->lock, flags);
-	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_write_iops_device);
-	if (pn)
-		iops = pn->val.iops;
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-
-	return iops;
-}
-
-/* Checks whether user asked for deleting a policy rule */
-static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
-{
-	switch(pn->plid) {
-	case BLKIO_POLICY_PROP:
-		if (pn->val.weight == 0)
-			return 1;
-		break;
-	case BLKIO_POLICY_THROTL:
-		switch(pn->fileid) {
-		case BLKIO_THROTL_read_bps_device:
-		case BLKIO_THROTL_write_bps_device:
-			if (pn->val.bps == 0)
-				return 1;
-			break;
-		case BLKIO_THROTL_read_iops_device:
-		case BLKIO_THROTL_write_iops_device:
-			if (pn->val.iops == 0)
-				return 1;
-		}
-		break;
-	default:
-		BUG();
-	}
-
-	return 0;
-}
-
-static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
-					struct blkio_policy_node *newpn)
-{
-	switch(oldpn->plid) {
-	case BLKIO_POLICY_PROP:
-		oldpn->val.weight = newpn->val.weight;
-		break;
-	case BLKIO_POLICY_THROTL:
-		switch(newpn->fileid) {
-		case BLKIO_THROTL_read_bps_device:
-		case BLKIO_THROTL_write_bps_device:
-			oldpn->val.bps = newpn->val.bps;
-			break;
-		case BLKIO_THROTL_read_iops_device:
-		case BLKIO_THROTL_write_iops_device:
-			oldpn->val.iops = newpn->val.iops;
-		}
-		break;
-	default:
-		BUG();
-	}
-}
-
-/*
- * Some rules/values in blkg have changed. Propagate those to respective
- * policies.
- */
-static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
-		struct blkio_group *blkg, struct blkio_policy_node *pn)
-{
-	struct blkio_group_conf *conf = &blkg->conf;
-
-	switch(pn->plid) {
-	case BLKIO_POLICY_PROP:
-		blkio_update_group_weight(blkg, conf->weight ?: blkcg->weight);
-		break;
-	case BLKIO_POLICY_THROTL:
-		switch(pn->fileid) {
-		case BLKIO_THROTL_read_bps_device:
-			blkio_update_group_bps(blkg, conf->bps[READ] ?: -1,
-					       pn->fileid);
-			break;
-		case BLKIO_THROTL_write_bps_device:
-			blkio_update_group_bps(blkg, conf->bps[WRITE] ?: -1,
-					       pn->fileid);
-			break;
-		case BLKIO_THROTL_read_iops_device:
-			blkio_update_group_iops(blkg, conf->iops[READ] ?: -1,
-						pn->fileid);
-			break;
-		case BLKIO_THROTL_write_iops_device:
-			blkio_update_group_iops(blkg, conf->iops[WRITE] ?: -1,
-						pn->fileid);
-			break;
-		}
-		break;
-	default:
-		BUG();
-	}
-}
-
-/*
- * A policy node rule has been updated. Propagate this update to all the
- * block groups which might be affected by this update.
- */
-static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
-				struct blkio_policy_node *pn)
-{
-	struct blkio_group *blkg;
-	struct hlist_node *n;
-
-	spin_lock(&blkio_list_lock);
-	spin_lock_irq(&blkcg->lock);
-
-	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		if (pn->dev != blkg->dev || pn->plid != blkg->plid)
-			continue;
-		blkio_update_blkg_policy(blkcg, blkg, pn);
-	}
-
-	spin_unlock_irq(&blkcg->lock);
-	spin_unlock(&blkio_list_lock);
-}
-
 static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
  				       const char *buffer)
 {
 	int ret = 0;
 	char *buf;
-	struct blkio_policy_node *newpn, *pn;
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
-	int keep_newpn = 0;
 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
 	int fileid = BLKIOFILE_ATTR(cft->private);
 
@@ -1215,69 +938,42 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
 	if (!buf)
 		return -ENOMEM;
 
-	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
-	if (!newpn) {
-		ret = -ENOMEM;
-		goto free_buf;
-	}
-
-	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid, blkcg);
-	if (ret)
-		goto free_newpn;
-
-	spin_lock_irq(&blkcg->lock);
-
-	pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
-	if (!pn) {
-		if (!blkio_delete_rule_command(newpn)) {
-			blkio_policy_insert_node(blkcg, newpn);
-			keep_newpn = 1;
-		}
-		spin_unlock_irq(&blkcg->lock);
-		goto update_io_group;
-	}
-
-	if (blkio_delete_rule_command(newpn)) {
-		blkio_policy_delete_node(pn);
-		kfree(pn);
-		spin_unlock_irq(&blkcg->lock);
-		goto update_io_group;
-	}
-	spin_unlock_irq(&blkcg->lock);
-
-	blkio_update_policy_rule(pn, newpn);
-
-update_io_group:
-	blkio_update_policy_node_blkg(blkcg, newpn);
-
-free_newpn:
-	if (!keep_newpn)
-		kfree(newpn);
-free_buf:
+	ret = blkio_policy_parse_and_set(buf, plid, fileid, blkcg);
 	kfree(buf);
 	return ret;
 }
 
-static void
-blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
+static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
+				   struct seq_file *m)
 {
-	switch(pn->plid) {
+	int fileid = BLKIOFILE_ATTR(cft->private);
+	int rw = WRITE;
+
+	switch (blkg->plid) {
 		case BLKIO_POLICY_PROP:
-			if (pn->fileid == BLKIO_PROP_weight_device)
-				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
-					MINOR(pn->dev), pn->val.weight);
+			if (blkg->conf.weight)
+				seq_printf(m, "%u:%u\t%u\n", MAJOR(blkg->dev),
+					MINOR(blkg->dev), blkg->conf.weight);
 			break;
 		case BLKIO_POLICY_THROTL:
-			switch(pn->fileid) {
+			switch (fileid) {
 			case BLKIO_THROTL_read_bps_device:
+				rw = READ;
 			case BLKIO_THROTL_write_bps_device:
-				seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
-					MINOR(pn->dev), pn->val.bps);
+				if (blkg->conf.bps[rw])
+					seq_printf(m, "%u:%u\t%llu\n",
+						   MAJOR(blkg->dev),
+						   MINOR(blkg->dev),
+						   blkg->conf.bps[rw]);
 				break;
 			case BLKIO_THROTL_read_iops_device:
+				rw = READ;
 			case BLKIO_THROTL_write_iops_device:
-				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
-					MINOR(pn->dev), pn->val.iops);
+				if (blkg->conf.iops[rw])
+					seq_printf(m, "%u:%u\t%u\n",
+						   MAJOR(blkg->dev),
+						   MINOR(blkg->dev),
+						   blkg->conf.iops[rw]);
 				break;
 			}
 			break;
@@ -1287,20 +983,17 @@ blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
 }
 
 /* cgroup files which read their data from policy nodes end up here */
-static void blkio_read_policy_node_files(struct cftype *cft,
-			struct blkio_cgroup *blkcg, struct seq_file *m)
+static void blkio_read_conf(struct cftype *cft, struct blkio_cgroup *blkcg,
+			    struct seq_file *m)
 {
-	struct blkio_policy_node *pn;
+	struct blkio_group *blkg;
+	struct hlist_node *n;
 
-	if (!list_empty(&blkcg->policy_list)) {
-		spin_lock_irq(&blkcg->lock);
-		list_for_each_entry(pn, &blkcg->policy_list, node) {
-			if (!pn_matches_cftype(cft, pn))
-				continue;
-			blkio_print_policy_node(m, pn);
-		}
-		spin_unlock_irq(&blkcg->lock);
-	}
+	spin_lock_irq(&blkcg->lock);
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
+		if (BLKIOFILE_POLICY(cft->private) == blkg->plid)
+			blkio_print_group_conf(cft, blkg, m);
+	spin_unlock_irq(&blkcg->lock);
 }
 
 static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
@@ -1316,7 +1009,7 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
 	case BLKIO_POLICY_PROP:
 		switch(name) {
 		case BLKIO_PROP_weight_device:
-			blkio_read_policy_node_files(cft, blkcg, m);
+			blkio_read_conf(cft, blkcg, m);
 			return 0;
 		default:
 			BUG();
@@ -1328,7 +1021,7 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
 		case BLKIO_THROTL_write_bps_device:
 		case BLKIO_THROTL_read_iops_device:
 		case BLKIO_THROTL_write_iops_device:
-			blkio_read_policy_node_files(cft, blkcg, m);
+			blkio_read_conf(cft, blkcg, m);
 			return 0;
 		default:
 			BUG();
@@ -1352,7 +1045,7 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
 		if (blkg->dev) {
-			if (!cftype_blkg_same_policy(cft, blkg))
+			if (BLKIOFILE_POLICY(cft->private) != blkg->plid)
 				continue;
 			if (pcpu)
 				cgroup_total += blkio_get_stat_cpu(blkg, cb,
@@ -1451,11 +1144,10 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
-static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
+static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
 {
 	struct blkio_group *blkg;
 	struct hlist_node *n;
-	struct blkio_policy_node *pn;
 
 	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
 		return -EINVAL;
@@ -1464,14 +1156,10 @@ static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
 	spin_lock_irq(&blkcg->lock);
 	blkcg->weight = (unsigned int)val;
 
-	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		pn = blkio_policy_search_node(blkcg, blkg->dev,
-				BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
-		if (pn)
-			continue;
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
+		if (blkg->plid == plid && !blkg->conf.weight)
+			blkio_update_group_weight(blkg, blkcg->weight);
 
-		blkio_update_group_weight(blkg, blkcg->weight);
-	}
 	spin_unlock_irq(&blkcg->lock);
 	spin_unlock(&blkio_list_lock);
 	return 0;
@@ -1510,7 +1198,7 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 	case BLKIO_POLICY_PROP:
 		switch(name) {
 		case BLKIO_PROP_weight:
-			return blkio_weight_write(blkcg, val);
+			return blkio_weight_write(blkcg, plid, val);
 		}
 		break;
 	default:
@@ -1691,7 +1379,6 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 	struct blkio_group *blkg;
 	struct request_queue *q;
 	struct blkio_policy_type *blkiop;
-	struct blkio_policy_node *pn, *pntmp;
 
 	rcu_read_lock();
 	do {
@@ -1723,11 +1410,6 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 		spin_unlock(&blkio_list_lock);
 	} while (1);
 
-	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
-		blkio_policy_delete_node(pn);
-		kfree(pn);
-	}
-
 	free_css_id(&blkio_subsys, &blkcg->css);
 	rcu_read_unlock();
 	if (blkcg != &blkio_root_cgroup)
@@ -1754,7 +1436,6 @@ done:
 	spin_lock_init(&blkcg->lock);
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 
-	INIT_LIST_HEAD(&blkcg->policy_list);
 	return &blkcg->css;
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 81efe71..9a5c68d 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -112,7 +112,6 @@ struct blkio_cgroup {
 	unsigned int weight;
 	spinlock_t lock;
 	struct hlist_head blkg_list;
-	struct list_head policy_list; /* list of blkio_policy_node */
 };
 
 struct blkio_group_stats {
@@ -182,37 +181,6 @@ struct blkio_group {
 	struct blkio_group_stats_cpu __percpu *stats_cpu;
 };
 
-struct blkio_policy_node {
-	struct list_head node;
-	dev_t dev;
-	/* This node belongs to max bw policy or porportional weight policy */
-	enum blkio_policy_id plid;
-	/* cgroup file to which this rule belongs to */
-	int fileid;
-
-	union {
-		unsigned int weight;
-		/*
-		 * Rate read/write in terms of bytes per second
-		 * Whether this rate represents read or write is determined
-		 * by file type "fileid".
-		 */
-		u64 bps;
-		unsigned int iops;
-	} val;
-};
-
-extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
-				     dev_t dev);
-extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
-				     dev_t dev);
-extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
-				     dev_t dev);
-extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg,
-				     dev_t dev);
-extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg,
-				     dev_t dev);
-
 typedef struct blkio_group *(blkio_alloc_group_fn)(struct request_queue *q,
 						   struct blkio_cgroup *blkcg);
 typedef void (blkio_link_group_fn)(struct request_queue *q,
-- 
cgit v1.1


From 7a4dd281ec66224f802093962d1d903d86b09560 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:09 -0800
Subject: blkcg: kill the mind-bending blkg->dev

blkg->dev is dev_t recording the device number of the block device for
the associated request_queue.  It is used to identify the associated
block device when printing out configuration or stats.

This is redundant to begin with.  A blkg is an association between a
cgroup and a request_queue and it of course is possible to reach
request_queue from blkg and synchronization conventions are in place
for safe q dereferencing, so this shouldn't be necessary from the
beginning.  Furthermore, it's initialized by sscanf()ing the device
name of backing_dev_info.  The mind boggles.

Anyways, if blkg is visible under rcu lock, we *know* that the
associated request_queue hasn't gone away yet and its bdi is
registered and alive - blkg can't be created for request_queue which
hasn't been fully initialized and it can't go away before blkg is
removed.

Let stat and conf read functions get device name from
blkg->q->backing_dev_info.dev and pass it down to printing functions
and remove blkg->dev.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 86 +++++++++++++++++++++++++++-------------------------
 block/blk-cgroup.h   |  2 --
 block/blk-throttle.c | 51 ++-----------------------------
 block/cfq-iosched.c  | 21 -------------
 4 files changed, 47 insertions(+), 113 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index adf61c9..8742af3 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -662,10 +662,10 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	return 0;
 }
 
-static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
-				int chars_left, bool diskname_only)
+static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
+			       char *str, int chars_left, bool diskname_only)
 {
-	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
+	snprintf(str, chars_left, "%s", dname);
 	chars_left -= strlen(str);
 	if (chars_left <= 0) {
 		printk(KERN_WARNING
@@ -696,9 +696,9 @@ static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
 }
 
 static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
-				struct cgroup_map_cb *cb, dev_t dev)
+				struct cgroup_map_cb *cb, const char *dname)
 {
-	blkio_get_key_name(0, dev, str, chars_left, true);
+	blkio_get_key_name(0, dname, str, chars_left, true);
 	cb->fill(cb, str, val);
 	return val;
 }
@@ -730,7 +730,8 @@ static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
 }
 
 static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
-		struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
+				   struct cgroup_map_cb *cb, const char *dname,
+				   enum stat_type_cpu type)
 {
 	uint64_t disk_total, val;
 	char key_str[MAX_KEY_LEN];
@@ -738,12 +739,14 @@ static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
 
 	if (type == BLKIO_STAT_CPU_SECTORS) {
 		val = blkio_read_stat_cpu(blkg, type, 0);
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb,
+				       dname);
 	}
 
 	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
 			sub_type++) {
-		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
+		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
+				   false);
 		val = blkio_read_stat_cpu(blkg, type, sub_type);
 		cb->fill(cb, key_str, val);
 	}
@@ -751,14 +754,16 @@ static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
 	disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
 			blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
 
-	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
+	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
+			   false);
 	cb->fill(cb, key_str, disk_total);
 	return disk_total;
 }
 
 /* This should be called with blkg->stats_lock held */
 static uint64_t blkio_get_stat(struct blkio_group *blkg,
-		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
+			       struct cgroup_map_cb *cb, const char *dname,
+			       enum stat_type type)
 {
 	uint64_t disk_total;
 	char key_str[MAX_KEY_LEN];
@@ -766,11 +771,11 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
 
 	if (type == BLKIO_STAT_TIME)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-					blkg->stats.time, cb, dev);
+					blkg->stats.time, cb, dname);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-					blkg->stats.unaccounted_time, cb, dev);
+				       blkg->stats.unaccounted_time, cb, dname);
 	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
 		uint64_t sum = blkg->stats.avg_queue_size_sum;
 		uint64_t samples = blkg->stats.avg_queue_size_samples;
@@ -778,30 +783,33 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
 			do_div(sum, samples);
 		else
 			sum = 0;
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+				       sum, cb, dname);
 	}
 	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-					blkg->stats.group_wait_time, cb, dev);
+				       blkg->stats.group_wait_time, cb, dname);
 	if (type == BLKIO_STAT_IDLE_TIME)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-					blkg->stats.idle_time, cb, dev);
+				       blkg->stats.idle_time, cb, dname);
 	if (type == BLKIO_STAT_EMPTY_TIME)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-					blkg->stats.empty_time, cb, dev);
+				       blkg->stats.empty_time, cb, dname);
 	if (type == BLKIO_STAT_DEQUEUE)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-					blkg->stats.dequeue, cb, dev);
+				       blkg->stats.dequeue, cb, dname);
 #endif
 
 	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
 			sub_type++) {
-		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
+		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
+				   false);
 		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
 	}
 	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
 			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
-	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
+	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
+			   false);
 	cb->fill(cb, key_str, disk_total);
 	return disk_total;
 }
@@ -946,14 +954,15 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
 static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
 				   struct seq_file *m)
 {
+	const char *dname = dev_name(blkg->q->backing_dev_info.dev);
 	int fileid = BLKIOFILE_ATTR(cft->private);
 	int rw = WRITE;
 
 	switch (blkg->plid) {
 		case BLKIO_POLICY_PROP:
 			if (blkg->conf.weight)
-				seq_printf(m, "%u:%u\t%u\n", MAJOR(blkg->dev),
-					MINOR(blkg->dev), blkg->conf.weight);
+				seq_printf(m, "%s\t%u\n",
+					   dname, blkg->conf.weight);
 			break;
 		case BLKIO_POLICY_THROTL:
 			switch (fileid) {
@@ -961,19 +970,15 @@ static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
 				rw = READ;
 			case BLKIO_THROTL_write_bps_device:
 				if (blkg->conf.bps[rw])
-					seq_printf(m, "%u:%u\t%llu\n",
-						   MAJOR(blkg->dev),
-						   MINOR(blkg->dev),
-						   blkg->conf.bps[rw]);
+					seq_printf(m, "%s\t%llu\n",
+						   dname, blkg->conf.bps[rw]);
 				break;
 			case BLKIO_THROTL_read_iops_device:
 				rw = READ;
 			case BLKIO_THROTL_write_iops_device:
 				if (blkg->conf.iops[rw])
-					seq_printf(m, "%u:%u\t%u\n",
-						   MAJOR(blkg->dev),
-						   MINOR(blkg->dev),
-						   blkg->conf.iops[rw]);
+					seq_printf(m, "%s\t%u\n",
+						   dname, blkg->conf.iops[rw]);
 				break;
 			}
 			break;
@@ -1044,18 +1049,17 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		if (blkg->dev) {
-			if (BLKIOFILE_POLICY(cft->private) != blkg->plid)
-				continue;
-			if (pcpu)
-				cgroup_total += blkio_get_stat_cpu(blkg, cb,
-						blkg->dev, type);
-			else {
-				spin_lock_irq(&blkg->stats_lock);
-				cgroup_total += blkio_get_stat(blkg, cb,
-						blkg->dev, type);
-				spin_unlock_irq(&blkg->stats_lock);
-			}
+		const char *dname = dev_name(blkg->q->backing_dev_info.dev);
+
+		if (BLKIOFILE_POLICY(cft->private) != blkg->plid)
+			continue;
+		if (pcpu)
+			cgroup_total += blkio_get_stat_cpu(blkg, cb, dname,
+							   type);
+		else {
+			spin_lock_irq(&blkg->stats_lock);
+			cgroup_total += blkio_get_stat(blkg, cb, dname, type);
+			spin_unlock_irq(&blkg->stats_lock);
 		}
 	}
 	if (show_total)
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 9a5c68d..7ebecf6 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -166,8 +166,6 @@ struct blkio_group {
 	unsigned short blkcg_id;
 	/* Store cgroup path */
 	char path[128];
-	/* The device MKDEV(major, minor), this group has been created for */
-	dev_t dev;
 	/* policy which owns this blk group */
 	enum blkio_policy_id plid;
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 791b107..52a4293 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -212,50 +212,12 @@ static struct blkio_group *throtl_alloc_blkio_group(struct request_queue *q,
 	return &tg->blkg;
 }
 
-static void
-__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
-{
-	struct backing_dev_info *bdi = &td->queue->backing_dev_info;
-	unsigned int major, minor;
-
-	if (!tg || tg->blkg.dev)
-		return;
-
-	/*
-	 * Fill in device details for a group which might not have been
-	 * filled at group creation time as queue was being instantiated
-	 * and driver had not attached a device yet
-	 */
-	if (bdi->dev && dev_name(bdi->dev)) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		tg->blkg.dev = MKDEV(major, minor);
-	}
-}
-
-/*
- * Should be called with without queue lock held. Here queue lock will be
- * taken rarely. It will be taken only once during life time of a group
- * if need be
- */
-static void
-throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
-{
-	if (!tg || tg->blkg.dev)
-		return;
-
-	spin_lock_irq(td->queue->queue_lock);
-	__throtl_tg_fill_dev_details(td, tg);
-	spin_unlock_irq(td->queue->queue_lock);
-}
-
 static void throtl_link_blkio_group(struct request_queue *q,
 				    struct blkio_group *blkg)
 {
 	struct throtl_data *td = q->td;
 	struct throtl_grp *tg = tg_of_blkg(blkg);
 
-	__throtl_tg_fill_dev_details(td, tg);
-
 	hlist_add_head(&tg->tg_node, &td->tg_list);
 	td->nr_undestroyed_grps++;
 }
@@ -263,20 +225,14 @@ static void throtl_link_blkio_group(struct request_queue *q,
 static struct
 throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 {
-	struct throtl_grp *tg = NULL;
-
 	/*
 	 * This is the common case when there are no blkio cgroups.
 	 * Avoid lookup in this case
 	 */
 	if (blkcg == &blkio_root_cgroup)
-		tg = td->root_tg;
-	else
-		tg = tg_of_blkg(blkg_lookup(blkcg, td->queue,
-					    BLKIO_POLICY_THROTL));
+		return td->root_tg;
 
-	__throtl_tg_fill_dev_details(td, tg);
-	return tg;
+	return tg_of_blkg(blkg_lookup(blkcg, td->queue, BLKIO_POLICY_THROTL));
 }
 
 static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
@@ -303,7 +259,6 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
 			tg = td->root_tg;
 	}
 
-	__throtl_tg_fill_dev_details(td, tg);
 	return tg;
 }
 
@@ -1090,8 +1045,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	blkcg = task_blkio_cgroup(current);
 	tg = throtl_lookup_tg(td, blkcg);
 	if (tg) {
-		throtl_tg_fill_dev_details(td, tg);
-
 		if (tg_no_rule_group(tg, rw)) {
 			blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
 					rw, rw_is_sync(bio->bi_rw));
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 08d4fdd..f67d109 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1052,20 +1052,7 @@ static void cfq_link_blkio_group(struct request_queue *q,
 				 struct blkio_group *blkg)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct backing_dev_info *bdi = &q->backing_dev_info;
 	struct cfq_group *cfqg = cfqg_of_blkg(blkg);
-	unsigned int major, minor;
-
-	/*
-	 * Add group onto cgroup list. It might happen that bdi->dev is
-	 * not initialized yet. Initialize this new group without major
-	 * and minor info and this info will be filled in once a new thread
-	 * comes for IO.
-	 */
-	if (bdi->dev) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		blkg->dev = MKDEV(major, minor);
-	}
 
 	cfqd->nr_blkcg_linked_grps++;
 
@@ -1104,7 +1091,6 @@ static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
 						struct blkio_cgroup *blkcg)
 {
 	struct request_queue *q = cfqd->queue;
-	struct backing_dev_info *bdi = &q->backing_dev_info;
 	struct cfq_group *cfqg = NULL;
 
 	/* avoid lookup for the common case where there's no blkio cgroup */
@@ -1118,13 +1104,6 @@ static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
 			cfqg = cfqg_of_blkg(blkg);
 	}
 
-	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
-		unsigned int major, minor;
-
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfqg->blkg.dev = MKDEV(major, minor);
-	}
-
 	return cfqg;
 }
 
-- 
cgit v1.1


From 92616b5b3a7c7fa8148df82e7ff6183056f2bfc8 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Mon, 5 Mar 2012 13:15:10 -0800
Subject: blkcg: skip blkg printing if q isn't associated with disk

blk-cgroup printing code currently assumes that there is a device/disk
associated with every queue in the system, but modules like floppy
can instantiate request queues without registering a disk, which can
lead to an oops.

Skip queues/blkgs which don't have a dev/disk associated with them.

-tj: Factored out backing_dev_info check into blkg_dev_name().

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8742af3..7694236 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -951,13 +951,24 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
 	return ret;
 }
 
+static const char *blkg_dev_name(struct blkio_group *blkg)
+{
+	/* some drivers (floppy) instantiate a queue w/o disk registered */
+	if (blkg->q->backing_dev_info.dev)
+		return dev_name(blkg->q->backing_dev_info.dev);
+	return NULL;
+}
+
 static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
 				   struct seq_file *m)
 {
-	const char *dname = dev_name(blkg->q->backing_dev_info.dev);
+	const char *dname = blkg_dev_name(blkg);
 	int fileid = BLKIOFILE_ATTR(cft->private);
 	int rw = WRITE;
 
+	if (!dname)
+		return;
+
 	switch (blkg->plid) {
 		case BLKIO_POLICY_PROP:
 			if (blkg->conf.weight)
@@ -1049,9 +1060,9 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		const char *dname = dev_name(blkg->q->backing_dev_info.dev);
+		const char *dname = blkg_dev_name(blkg);
 
-		if (BLKIOFILE_POLICY(cft->private) != blkg->plid)
+		if (!dname || BLKIOFILE_POLICY(cft->private) != blkg->plid)
 			continue;
 		if (pcpu)
 			cgroup_total += blkio_get_stat_cpu(blkg, cb, dname,
-- 
cgit v1.1


From 7ee9c5620504906e98451dc9a1945b2b9e892cb8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:11 -0800
Subject: blkcg: let blkio_group point to blkio_cgroup directly

Currently, blkg points to the associated blkcg via its css_id.  This
unnecessarily complicates dereferencing blkcg.  Let blkg hold a
reference to the associated blkcg and point directly to it and disable
css_id on blkio_subsys.

This change requires splitting blkiocg_destroy() into
blkiocg_pre_destroy() and blkiocg_destroy() so that all blkg's can be
destroyed and all the blkcg references held by them dropped during
cgroup removal.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 43 ++++++++++++++++++++++++-------------------
 block/blk-cgroup.h   |  2 +-
 block/blk-throttle.c |  3 +++
 block/cfq-iosched.c  |  4 ++++
 4 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 7694236..d42d826 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -37,6 +37,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
 			      struct cgroup_taskset *);
 static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
 			   struct cgroup_taskset *);
+static int blkiocg_pre_destroy(struct cgroup_subsys *, struct cgroup *);
 static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
 
@@ -51,10 +52,10 @@ struct cgroup_subsys blkio_subsys = {
 	.create = blkiocg_create,
 	.can_attach = blkiocg_can_attach,
 	.attach = blkiocg_attach,
+	.pre_destroy = blkiocg_pre_destroy,
 	.destroy = blkiocg_destroy,
 	.populate = blkiocg_populate,
 	.subsys_id = blkio_subsys_id,
-	.use_id = 1,
 	.module = THIS_MODULE,
 };
 EXPORT_SYMBOL_GPL(blkio_subsys);
@@ -442,6 +443,7 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 	if (blkg)
 		return blkg;
 
+	/* blkg holds a reference to blkcg */
 	if (!css_tryget(&blkcg->css))
 		return ERR_PTR(-EINVAL);
 
@@ -463,15 +465,16 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 
 		spin_lock_init(&new_blkg->stats_lock);
 		rcu_assign_pointer(new_blkg->q, q);
-		new_blkg->blkcg_id = css_id(&blkcg->css);
+		new_blkg->blkcg = blkcg;
 		new_blkg->plid = plid;
 		cgroup_path(blkcg->css.cgroup, new_blkg->path,
 			    sizeof(new_blkg->path));
+	} else {
+		css_put(&blkcg->css);
 	}
 
 	rcu_read_lock();
 	spin_lock_irq(q->queue_lock);
-	css_put(&blkcg->css);
 
 	/* did bypass get turned on inbetween? */
 	if (unlikely(blk_queue_bypass(q)) && !for_root) {
@@ -500,6 +503,7 @@ out:
 	if (new_blkg) {
 		free_percpu(new_blkg->stats_cpu);
 		kfree(new_blkg);
+		css_put(&blkcg->css);
 	}
 	return blkg;
 }
@@ -508,7 +512,6 @@ EXPORT_SYMBOL_GPL(blkg_lookup_create);
 static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
 {
 	hlist_del_init_rcu(&blkg->blkcg_node);
-	blkg->blkcg_id = 0;
 }
 
 /*
@@ -517,24 +520,17 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
  */
 int blkiocg_del_blkio_group(struct blkio_group *blkg)
 {
-	struct blkio_cgroup *blkcg;
+	struct blkio_cgroup *blkcg = blkg->blkcg;
 	unsigned long flags;
-	struct cgroup_subsys_state *css;
 	int ret = 1;
 
-	rcu_read_lock();
-	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
-	if (css) {
-		blkcg = container_of(css, struct blkio_cgroup, css);
-		spin_lock_irqsave(&blkcg->lock, flags);
-		if (!hlist_unhashed(&blkg->blkcg_node)) {
-			__blkiocg_del_blkio_group(blkg);
-			ret = 0;
-		}
-		spin_unlock_irqrestore(&blkcg->lock, flags);
+	spin_lock_irqsave(&blkcg->lock, flags);
+	if (!hlist_unhashed(&blkg->blkcg_node)) {
+		__blkiocg_del_blkio_group(blkg);
+		ret = 0;
 	}
+	spin_unlock_irqrestore(&blkcg->lock, flags);
 
-	rcu_read_unlock();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
@@ -1387,7 +1383,8 @@ static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 				ARRAY_SIZE(blkio_files));
 }
 
-static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
+			       struct cgroup *cgroup)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
 	unsigned long flags;
@@ -1396,6 +1393,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 	struct blkio_policy_type *blkiop;
 
 	rcu_read_lock();
+
 	do {
 		spin_lock_irqsave(&blkcg->lock, flags);
 
@@ -1425,8 +1423,15 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 		spin_unlock(&blkio_list_lock);
 	} while (1);
 
-	free_css_id(&blkio_subsys, &blkcg->css);
 	rcu_read_unlock();
+
+	return 0;
+}
+
+static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+
 	if (blkcg != &blkio_root_cgroup)
 		kfree(blkcg);
 }
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 7ebecf6..ca1fc63 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -163,7 +163,7 @@ struct blkio_group {
 	/* Pointer to the associated request_queue, RCU protected */
 	struct request_queue __rcu *q;
 	struct hlist_node blkcg_node;
-	unsigned short blkcg_id;
+	struct blkio_cgroup *blkcg;
 	/* Store cgroup path */
 	char path[128];
 	/* policy which owns this blk group */
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 52a4293..fe6a442 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -169,6 +169,9 @@ static void throtl_put_tg(struct throtl_grp *tg)
 	if (!atomic_dec_and_test(&tg->ref))
 		return;
 
+	/* release the extra blkcg reference this blkg has been holding */
+	css_put(&tg->blkg.blkcg->css);
+
 	/*
 	 * A group is freed in rcu manner. But having an rcu lock does not
 	 * mean that one can access all the fields of blkg and assume these
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f67d109..9ef86fb 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1133,6 +1133,10 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
 	cfqg->ref--;
 	if (cfqg->ref)
 		return;
+
+	/* release the extra blkcg reference this blkg has been holding */
+	css_put(&cfqg->blkg.blkcg->css);
+
 	for_each_cfqg_st(cfqg, i, j, st)
 		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
 	free_percpu(cfqg->blkg.stats_cpu);
-- 
cgit v1.1


From 5efd611351d1a847c72d74fb12ff4bd187c0cb2c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:12 -0800
Subject: blkcg: add blkcg_{init|drain|exit}_queue()

Currently block core calls directly into blk-throttle for init, drain
and exit.  This patch adds blkcg_{init|drain|exit}_queue() which wraps
the blk-throttle functions.  This is to give more control and
visibility to blkcg core layer for proper layering.  Further patches
will add logic common to blkcg policies to the functions.

While at it, collapse blk_throtl_release() into blk_throtl_exit().
There's no reason to keep them separate.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 42 ++++++++++++++++++++++++++++++++++++++++++
 block/blk-cgroup.h   |  7 +++++++
 block/blk-core.c     |  7 ++++---
 block/blk-sysfs.c    |  4 ++--
 block/blk-throttle.c |  3 ---
 block/blk.h          |  2 --
 6 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d42d826..b302ce1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -20,6 +20,7 @@
 #include <linux/genhd.h>
 #include <linux/delay.h>
 #include "blk-cgroup.h"
+#include "blk.h"
 
 #define MAX_KEY_LEN 100
 
@@ -1459,6 +1460,47 @@ done:
 	return &blkcg->css;
 }
 
+/**
+ * blkcg_init_queue - initialize blkcg part of request queue
+ * @q: request_queue to initialize
+ *
+ * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
+ * part of new request_queue @q.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int blkcg_init_queue(struct request_queue *q)
+{
+	might_sleep();
+
+	return blk_throtl_init(q);
+}
+
+/**
+ * blkcg_drain_queue - drain blkcg part of request_queue
+ * @q: request_queue to drain
+ *
+ * Called from blk_drain_queue().  Responsible for draining blkcg part.
+ */
+void blkcg_drain_queue(struct request_queue *q)
+{
+	lockdep_assert_held(q->queue_lock);
+
+	blk_throtl_drain(q);
+}
+
+/**
+ * blkcg_exit_queue - exit and release blkcg part of request_queue
+ * @q: request_queue being released
+ *
+ * Called from blk_release_queue().  Responsible for exiting blkcg part.
+ */
+void blkcg_exit_queue(struct request_queue *q)
+{
+	blk_throtl_exit(q);
+}
+
 /*
  * We cannot support shared io contexts, as we have no mean to support
  * two tasks with the same ioc in two different groups without major rework
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ca1fc63..3bc1710 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -215,6 +215,10 @@ struct blkio_policy_type {
 	enum blkio_policy_id plid;
 };
 
+extern int blkcg_init_queue(struct request_queue *q);
+extern void blkcg_drain_queue(struct request_queue *q);
+extern void blkcg_exit_queue(struct request_queue *q);
+
 /* Blkio controller policy registration */
 extern void blkio_policy_register(struct blkio_policy_type *);
 extern void blkio_policy_unregister(struct blkio_policy_type *);
@@ -233,6 +237,9 @@ struct blkio_group {
 struct blkio_policy_type {
 };
 
+static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
+static inline void blkcg_drain_queue(struct request_queue *q) { }
+static inline void blkcg_exit_queue(struct request_queue *q) { }
 static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
 static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
 static inline void blkg_destroy_all(struct request_queue *q) { }
diff --git a/block/blk-core.c b/block/blk-core.c
index 5a1b8cc..c3434c6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,6 +34,7 @@
 #include <trace/events/block.h>
 
 #include "blk.h"
+#include "blk-cgroup.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -280,7 +281,7 @@ EXPORT_SYMBOL(blk_stop_queue);
  *
  *     This function does not cancel any asynchronous activity arising
  *     out of elevator or throttling code. That would require elevaotor_exit()
- *     and blk_throtl_exit() to be called with queue lock initialized.
+ *     and blkcg_exit_queue() to be called with queue lock initialized.
  *
  */
 void blk_sync_queue(struct request_queue *q)
@@ -372,7 +373,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
 		if (q->elevator)
 			elv_drain_elevator(q);
 
-		blk_throtl_drain(q);
+		blkcg_drain_queue(q);
 
 		/*
 		 * This function might be called on a queue which failed
@@ -562,7 +563,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	 */
 	q->queue_lock = &q->__queue_lock;
 
-	if (blk_throtl_init(q))
+	if (blkcg_init_queue(q))
 		goto fail_id;
 
 	return q;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index cf15001..00cdc98 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -9,6 +9,7 @@
 #include <linux/blktrace_api.h>
 
 #include "blk.h"
+#include "blk-cgroup.h"
 
 struct queue_sysfs_entry {
 	struct attribute attr;
@@ -486,7 +487,7 @@ static void blk_release_queue(struct kobject *kobj)
 		elevator_exit(q->elevator);
 	}
 
-	blk_throtl_exit(q);
+	blkcg_exit_queue(q);
 
 	if (rl->rq_pool)
 		mempool_destroy(rl->rq_pool);
@@ -494,7 +495,6 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
-	blk_throtl_release(q);
 	blk_trace_shutdown(q);
 
 	bdi_destroy(&q->backing_dev_info);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index fe6a442..ac6d0fe 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1226,10 +1226,7 @@ void blk_throtl_exit(struct request_queue *q)
 	 * it.
 	 */
 	throtl_shutdown_wq(q);
-}
 
-void blk_throtl_release(struct request_queue *q)
-{
 	kfree(q->td);
 }
 
diff --git a/block/blk.h b/block/blk.h
index 7422f31..de15f92 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -236,7 +236,6 @@ extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
 extern void blk_throtl_drain(struct request_queue *q);
 extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
-extern void blk_throtl_release(struct request_queue *q);
 #else /* CONFIG_BLK_DEV_THROTTLING */
 static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
@@ -245,7 +244,6 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 static inline void blk_throtl_drain(struct request_queue *q) { }
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
-static inline void blk_throtl_release(struct request_queue *q) { }
 #endif /* CONFIG_BLK_DEV_THROTTLING */
 
 #endif /* BLK_INTERNAL_H */
-- 
cgit v1.1


From 923adde1be1df57cebd80c563058e503376645e8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:13 -0800
Subject: blkcg: clear all request_queues on blkcg policy [un]registrations

Keep track of all request_queues which have blkcg initialized and, before
making changes to blkcg policies, turn on bypass and invoke
blkg_destroy_all() on all of them.

This is to prepare for moving blkg management into blkcg core.  Note
that this uses more brute force than necessary.  Finer grained shoot
down will be implemented later and given that policy [un]registration
almost never happens on running systems (blk-throtl can't be built as
a module and cfq usually is the builtin default iosched), this
shouldn't be a problem for the time being.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c     | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/blkdev.h |  3 +++
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b302ce1..266c070 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -27,6 +27,9 @@
 static DEFINE_SPINLOCK(blkio_list_lock);
 static LIST_HEAD(blkio_list);
 
+static DEFINE_MUTEX(all_q_mutex);
+static LIST_HEAD(all_q_list);
+
 struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
@@ -1472,9 +1475,20 @@ done:
  */
 int blkcg_init_queue(struct request_queue *q)
 {
+	int ret;
+
 	might_sleep();
 
-	return blk_throtl_init(q);
+	ret = blk_throtl_init(q);
+	if (ret)
+		return ret;
+
+	mutex_lock(&all_q_mutex);
+	INIT_LIST_HEAD(&q->all_q_node);
+	list_add_tail(&q->all_q_node, &all_q_list);
+	mutex_unlock(&all_q_mutex);
+
+	return 0;
 }
 
 /**
@@ -1498,6 +1512,10 @@ void blkcg_drain_queue(struct request_queue *q)
  */
 void blkcg_exit_queue(struct request_queue *q)
 {
+	mutex_lock(&all_q_mutex);
+	list_del_init(&q->all_q_node);
+	mutex_unlock(&all_q_mutex);
+
 	blk_throtl_exit(q);
 }
 
@@ -1543,8 +1561,33 @@ static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	}
 }
 
+static void blkcg_bypass_start(void)
+	__acquires(&all_q_mutex)
+{
+	struct request_queue *q;
+
+	mutex_lock(&all_q_mutex);
+
+	list_for_each_entry(q, &all_q_list, all_q_node) {
+		blk_queue_bypass_start(q);
+		blkg_destroy_all(q);
+	}
+}
+
+static void blkcg_bypass_end(void)
+	__releases(&all_q_mutex)
+{
+	struct request_queue *q;
+
+	list_for_each_entry(q, &all_q_list, all_q_node)
+		blk_queue_bypass_end(q);
+
+	mutex_unlock(&all_q_mutex);
+}
+
 void blkio_policy_register(struct blkio_policy_type *blkiop)
 {
+	blkcg_bypass_start();
 	spin_lock(&blkio_list_lock);
 
 	BUG_ON(blkio_policy[blkiop->plid]);
@@ -1552,11 +1595,13 @@ void blkio_policy_register(struct blkio_policy_type *blkiop)
 	list_add_tail(&blkiop->list, &blkio_list);
 
 	spin_unlock(&blkio_list_lock);
+	blkcg_bypass_end();
 }
 EXPORT_SYMBOL_GPL(blkio_policy_register);
 
 void blkio_policy_unregister(struct blkio_policy_type *blkiop)
 {
+	blkcg_bypass_start();
 	spin_lock(&blkio_list_lock);
 
 	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
@@ -1564,5 +1609,6 @@ void blkio_policy_unregister(struct blkio_policy_type *blkiop)
 	list_del_init(&blkiop->list);
 
 	spin_unlock(&blkio_list_lock);
+	blkcg_bypass_end();
 }
 EXPORT_SYMBOL_GPL(blkio_policy_unregister);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 315db1d..e8c0bbd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -397,6 +397,9 @@ struct request_queue {
 	struct bsg_class_device bsg_dev;
 #endif
 
+#ifdef CONFIG_BLK_CGROUP
+	struct list_head	all_q_node;
+#endif
 #ifdef CONFIG_BLK_DEV_THROTTLING
 	/* Throttle data */
 	struct throtl_data *td;
-- 
cgit v1.1


From 0381411e4b1a52cee134eb73750e5e3cc1155d09 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:14 -0800
Subject: blkcg: let blkcg core handle policy private data allocation

Currently, blkg's are embedded in blkcg policy private data structures
and thus allocated and freed by the policies.  This leads to duplicate
code in policies, hinders implementing common parts in blkcg core with
strong semantics, and forces duplicate blkg's for the same cgroup-q
association.

This patch introduces struct blkg_policy_data which is a separate data
structure chained from blkg.  Each policy specifies the amount of
private data it needs in its blkio_policy_type->pdata_size and blkcg
core takes care of allocating it along with the blkg.  The private
data can be accessed using blkg_to_pdata(), and the blkg can be
determined from pdata using pdata_to_blkg().  The
blkio_alloc_group_fn() method is accordingly updated to
blkio_init_group_fn().

For consistency, tg_of_blkg() and cfqg_of_blkg() are replaced with
blkg_to_tg() and blkg_to_cfqg() respectively, and functions to map in
the reverse direction are added.

Except that policy specific data now lives in a separate data
structure from blkg, this patch doesn't introduce any functional
difference.

This will be used to unify blkg's for different policies.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   |  86 +++++++++++++++++++++++++++++++++----------
 block/blk-cgroup.h   |  53 ++++++++++++++++++++++++--
 block/blk-throttle.c |  79 +++++++++++++++++++--------------------
 block/cfq-iosched.c  | 102 ++++++++++++++++++++++++++-------------------------
 4 files changed, 209 insertions(+), 111 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 266c070..1436749 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -422,6 +422,70 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
+/**
+ * blkg_free - free a blkg
+ * @blkg: blkg to free
+ *
+ * Free @blkg which may be partially allocated.
+ */
+static void blkg_free(struct blkio_group *blkg)
+{
+	if (blkg) {
+		free_percpu(blkg->stats_cpu);
+		kfree(blkg->pd);
+		kfree(blkg);
+	}
+}
+
+/**
+ * blkg_alloc - allocate a blkg
+ * @blkcg: block cgroup the new blkg is associated with
+ * @q: request_queue the new blkg is associated with
+ * @pol: policy the new blkg is associated with
+ *
+ * Allocate a new blkg assocating @blkcg and @q for @pol.
+ *
+ * FIXME: Should be called with queue locked but currently isn't due to
+ *        percpu stat breakage.
+ */
+static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
+				      struct request_queue *q,
+				      struct blkio_policy_type *pol)
+{
+	struct blkio_group *blkg;
+
+	/* alloc and init base part */
+	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
+	if (!blkg)
+		return NULL;
+
+	spin_lock_init(&blkg->stats_lock);
+	rcu_assign_pointer(blkg->q, q);
+	blkg->blkcg = blkcg;
+	blkg->plid = pol->plid;
+	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
+
+	/* alloc per-policy data */
+	blkg->pd = kzalloc_node(sizeof(*blkg->pd) + pol->pdata_size, GFP_ATOMIC,
+				q->node);
+	if (!blkg->pd) {
+		blkg_free(blkg);
+		return NULL;
+	}
+
+	/* broken, read comment in the callsite */
+	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
+	if (!blkg->stats_cpu) {
+		blkg_free(blkg);
+		return NULL;
+	}
+
+	/* attach pd to blkg and invoke per-policy init */
+	blkg->pd->blkg = blkg;
+	pol->ops.blkio_init_group_fn(blkg);
+	return blkg;
+}
+
 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 				       struct request_queue *q,
 				       enum blkio_policy_id plid,
@@ -463,19 +527,7 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
 
-	new_blkg = pol->ops.blkio_alloc_group_fn(q, blkcg);
-	if (new_blkg) {
-		new_blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
-
-		spin_lock_init(&new_blkg->stats_lock);
-		rcu_assign_pointer(new_blkg->q, q);
-		new_blkg->blkcg = blkcg;
-		new_blkg->plid = plid;
-		cgroup_path(blkcg->css.cgroup, new_blkg->path,
-			    sizeof(new_blkg->path));
-	} else {
-		css_put(&blkcg->css);
-	}
+	new_blkg = blkg_alloc(blkcg, q, pol);
 
 	rcu_read_lock();
 	spin_lock_irq(q->queue_lock);
@@ -492,7 +544,7 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 		goto out;
 
 	/* did alloc fail? */
-	if (unlikely(!new_blkg || !new_blkg->stats_cpu)) {
+	if (unlikely(!new_blkg)) {
 		blkg = ERR_PTR(-ENOMEM);
 		goto out;
 	}
@@ -504,11 +556,7 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 	pol->ops.blkio_link_group_fn(q, blkg);
 	spin_unlock(&blkcg->lock);
 out:
-	if (new_blkg) {
-		free_percpu(new_blkg->stats_cpu);
-		kfree(new_blkg);
-		css_put(&blkcg->css);
-	}
+	blkg_free(new_blkg);
 	return blkg;
 }
 EXPORT_SYMBOL_GPL(blkg_lookup_create);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 3bc1710..9537819 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -159,6 +159,15 @@ struct blkio_group_conf {
 	u64 bps[2];
 };
 
+/* per-blkg per-policy data */
+struct blkg_policy_data {
+	/* the blkg this per-policy data belongs to */
+	struct blkio_group *blkg;
+
+	/* pol->pdata_size bytes of private data used by policy impl */
+	char pdata[] __aligned(__alignof__(unsigned long long));
+};
+
 struct blkio_group {
 	/* Pointer to the associated request_queue, RCU protected */
 	struct request_queue __rcu *q;
@@ -177,10 +186,11 @@ struct blkio_group {
 	struct blkio_group_stats stats;
 	/* Per cpu stats pointer */
 	struct blkio_group_stats_cpu __percpu *stats_cpu;
+
+	struct blkg_policy_data *pd;
 };
 
-typedef struct blkio_group *(blkio_alloc_group_fn)(struct request_queue *q,
-						   struct blkio_cgroup *blkcg);
+typedef void (blkio_init_group_fn)(struct blkio_group *blkg);
 typedef void (blkio_link_group_fn)(struct request_queue *q,
 			struct blkio_group *blkg);
 typedef void (blkio_unlink_group_fn)(struct request_queue *q,
@@ -198,7 +208,7 @@ typedef void (blkio_update_group_write_iops_fn)(struct request_queue *q,
 			struct blkio_group *blkg, unsigned int write_iops);
 
 struct blkio_policy_ops {
-	blkio_alloc_group_fn *blkio_alloc_group_fn;
+	blkio_init_group_fn *blkio_init_group_fn;
 	blkio_link_group_fn *blkio_link_group_fn;
 	blkio_unlink_group_fn *blkio_unlink_group_fn;
 	blkio_clear_queue_fn *blkio_clear_queue_fn;
@@ -213,6 +223,7 @@ struct blkio_policy_type {
 	struct list_head list;
 	struct blkio_policy_ops ops;
 	enum blkio_policy_id plid;
+	size_t pdata_size;		/* policy specific private data size */
 };
 
 extern int blkcg_init_queue(struct request_queue *q);
@@ -224,6 +235,38 @@ extern void blkio_policy_register(struct blkio_policy_type *);
 extern void blkio_policy_unregister(struct blkio_policy_type *);
 extern void blkg_destroy_all(struct request_queue *q);
 
+/**
+ * blkg_to_pdata - get policy private data
+ * @blkg: blkg of interest
+ * @pol: policy of interest
+ *
+ * Return pointer to private data associated with the @blkg-@pol pair.
+ */
+static inline void *blkg_to_pdata(struct blkio_group *blkg,
+			      struct blkio_policy_type *pol)
+{
+	return blkg ? blkg->pd->pdata : NULL;
+}
+
+/**
+ * pdata_to_blkg - get blkg associated with policy private data
+ * @pdata: policy private data of interest
+ * @pol: policy @pdata is for
+ *
+ * @pdata is policy private data for @pol.  Determine the blkg it's
+ * associated with.
+ */
+static inline struct blkio_group *pdata_to_blkg(void *pdata,
+						struct blkio_policy_type *pol)
+{
+	if (pdata) {
+		struct blkg_policy_data *pd =
+			container_of(pdata, struct blkg_policy_data, pdata);
+		return pd->blkg;
+	}
+	return NULL;
+}
+
 static inline char *blkg_path(struct blkio_group *blkg)
 {
 	return blkg->path;
@@ -244,6 +287,10 @@ static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
 static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
 static inline void blkg_destroy_all(struct request_queue *q) { }
 
+static inline void *blkg_to_pdata(struct blkio_group *blkg,
+				struct blkio_policy_type *pol) { return NULL; }
+static inline struct blkio_group *pdata_to_blkg(void *pdata,
+				struct blkio_policy_type *pol) { return NULL; }
 static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
 
 #endif
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index ac6d0fe..9c8a124 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -21,6 +21,8 @@ static int throtl_quantum = 32;
 /* Throttling is performed over 100ms slice and after that slice is renewed */
 static unsigned long throtl_slice = HZ/10;	/* 100 ms */
 
+static struct blkio_policy_type blkio_policy_throtl;
+
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
 static void throtl_schedule_delayed_work(struct throtl_data *td,
@@ -52,7 +54,6 @@ struct throtl_grp {
 	 */
 	unsigned long disptime;
 
-	struct blkio_group blkg;
 	atomic_t ref;
 	unsigned int flags;
 
@@ -108,6 +109,16 @@ struct throtl_data
 	int limits_changed;
 };
 
+static inline struct throtl_grp *blkg_to_tg(struct blkio_group *blkg)
+{
+	return blkg_to_pdata(blkg, &blkio_policy_throtl);
+}
+
+static inline struct blkio_group *tg_to_blkg(struct throtl_grp *tg)
+{
+	return pdata_to_blkg(tg, &blkio_policy_throtl);
+}
+
 enum tg_state_flags {
 	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
 };
@@ -130,19 +141,11 @@ THROTL_TG_FNS(on_rr);
 
 #define throtl_log_tg(td, tg, fmt, args...)				\
 	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
-				blkg_path(&(tg)->blkg), ##args);      	\
+			  blkg_path(tg_to_blkg(tg)), ##args);		\
 
 #define throtl_log(td, fmt, args...)	\
 	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
 
-static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
-{
-	if (blkg)
-		return container_of(blkg, struct throtl_grp, blkg);
-
-	return NULL;
-}
-
 static inline unsigned int total_nr_queued(struct throtl_data *td)
 {
 	return td->nr_queued[0] + td->nr_queued[1];
@@ -156,21 +159,24 @@ static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
 
 static void throtl_free_tg(struct rcu_head *head)
 {
-	struct throtl_grp *tg;
+	struct throtl_grp *tg = container_of(head, struct throtl_grp, rcu_head);
+	struct blkio_group *blkg = tg_to_blkg(tg);
 
-	tg = container_of(head, struct throtl_grp, rcu_head);
-	free_percpu(tg->blkg.stats_cpu);
-	kfree(tg);
+	free_percpu(blkg->stats_cpu);
+	kfree(blkg->pd);
+	kfree(blkg);
 }
 
 static void throtl_put_tg(struct throtl_grp *tg)
 {
+	struct blkio_group *blkg = tg_to_blkg(tg);
+
 	BUG_ON(atomic_read(&tg->ref) <= 0);
 	if (!atomic_dec_and_test(&tg->ref))
 		return;
 
 	/* release the extra blkcg reference this blkg has been holding */
-	css_put(&tg->blkg.blkcg->css);
+	css_put(&blkg->blkcg->css);
 
 	/*
 	 * A group is freed in rcu manner. But having an rcu lock does not
@@ -184,14 +190,9 @@ static void throtl_put_tg(struct throtl_grp *tg)
 	call_rcu(&tg->rcu_head, throtl_free_tg);
 }
 
-static struct blkio_group *throtl_alloc_blkio_group(struct request_queue *q,
-						    struct blkio_cgroup *blkcg)
+static void throtl_init_blkio_group(struct blkio_group *blkg)
 {
-	struct throtl_grp *tg;
-
-	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, q->node);
-	if (!tg)
-		return NULL;
+	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	INIT_HLIST_NODE(&tg->tg_node);
 	RB_CLEAR_NODE(&tg->rb_node);
@@ -211,15 +212,13 @@ static struct blkio_group *throtl_alloc_blkio_group(struct request_queue *q,
 	 * exit or cgroup deletion path depending on who is exiting first.
 	 */
 	atomic_set(&tg->ref, 1);
-
-	return &tg->blkg;
 }
 
 static void throtl_link_blkio_group(struct request_queue *q,
 				    struct blkio_group *blkg)
 {
 	struct throtl_data *td = q->td;
-	struct throtl_grp *tg = tg_of_blkg(blkg);
+	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	hlist_add_head(&tg->tg_node, &td->tg_list);
 	td->nr_undestroyed_grps++;
@@ -235,7 +234,7 @@ throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 	if (blkcg == &blkio_root_cgroup)
 		return td->root_tg;
 
-	return tg_of_blkg(blkg_lookup(blkcg, td->queue, BLKIO_POLICY_THROTL));
+	return blkg_to_tg(blkg_lookup(blkcg, td->queue, BLKIO_POLICY_THROTL));
 }
 
 static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
@@ -257,7 +256,7 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
 
 		/* if %NULL and @q is alive, fall back to root_tg */
 		if (!IS_ERR(blkg))
-			tg = tg_of_blkg(blkg);
+			tg = blkg_to_tg(blkg);
 		else if (!blk_queue_dead(q))
 			tg = td->root_tg;
 	}
@@ -639,7 +638,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	tg->bytes_disp[rw] += bio->bi_size;
 	tg->io_disp[rw]++;
 
-	blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
+	blkiocg_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, rw, sync);
 }
 
 static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -901,7 +900,7 @@ static bool throtl_release_tgs(struct throtl_data *td, bool release_root)
 		 * it from cgroup list, then it will take care of destroying
 		 * cfqg also.
 		 */
-		if (!blkiocg_del_blkio_group(&tg->blkg))
+		if (!blkiocg_del_blkio_group(tg_to_blkg(tg)))
 			throtl_destroy_tg(td, tg);
 		else
 			empty = false;
@@ -929,7 +928,7 @@ void throtl_unlink_blkio_group(struct request_queue *q,
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	throtl_destroy_tg(q->td, tg_of_blkg(blkg));
+	throtl_destroy_tg(q->td, blkg_to_tg(blkg));
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
@@ -968,7 +967,7 @@ static void throtl_update_blkio_group_common(struct throtl_data *td,
 static void throtl_update_blkio_group_read_bps(struct request_queue *q,
 				struct blkio_group *blkg, u64 read_bps)
 {
-	struct throtl_grp *tg = tg_of_blkg(blkg);
+	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	tg->bps[READ] = read_bps;
 	throtl_update_blkio_group_common(q->td, tg);
@@ -977,7 +976,7 @@ static void throtl_update_blkio_group_read_bps(struct request_queue *q,
 static void throtl_update_blkio_group_write_bps(struct request_queue *q,
 				struct blkio_group *blkg, u64 write_bps)
 {
-	struct throtl_grp *tg = tg_of_blkg(blkg);
+	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	tg->bps[WRITE] = write_bps;
 	throtl_update_blkio_group_common(q->td, tg);
@@ -986,7 +985,7 @@ static void throtl_update_blkio_group_write_bps(struct request_queue *q,
 static void throtl_update_blkio_group_read_iops(struct request_queue *q,
 			struct blkio_group *blkg, unsigned int read_iops)
 {
-	struct throtl_grp *tg = tg_of_blkg(blkg);
+	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	tg->iops[READ] = read_iops;
 	throtl_update_blkio_group_common(q->td, tg);
@@ -995,7 +994,7 @@ static void throtl_update_blkio_group_read_iops(struct request_queue *q,
 static void throtl_update_blkio_group_write_iops(struct request_queue *q,
 			struct blkio_group *blkg, unsigned int write_iops)
 {
-	struct throtl_grp *tg = tg_of_blkg(blkg);
+	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	tg->iops[WRITE] = write_iops;
 	throtl_update_blkio_group_common(q->td, tg);
@@ -1010,7 +1009,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
 
 static struct blkio_policy_type blkio_policy_throtl = {
 	.ops = {
-		.blkio_alloc_group_fn = throtl_alloc_blkio_group,
+		.blkio_init_group_fn = throtl_init_blkio_group,
 		.blkio_link_group_fn = throtl_link_blkio_group,
 		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
 		.blkio_clear_queue_fn = throtl_clear_queue,
@@ -1024,6 +1023,7 @@ static struct blkio_policy_type blkio_policy_throtl = {
 					throtl_update_blkio_group_write_iops,
 	},
 	.plid = BLKIO_POLICY_THROTL,
+	.pdata_size = sizeof(struct throtl_grp),
 };
 
 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
@@ -1049,8 +1049,9 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	tg = throtl_lookup_tg(td, blkcg);
 	if (tg) {
 		if (tg_no_rule_group(tg, rw)) {
-			blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
-					rw, rw_is_sync(bio->bi_rw));
+			blkiocg_update_dispatch_stats(tg_to_blkg(tg),
+						      bio->bi_size, rw,
+						      rw_is_sync(bio->bi_rw));
 			goto out_unlock_rcu;
 		}
 	}
@@ -1176,7 +1177,7 @@ int blk_throtl_init(struct request_queue *q)
 	blkg = blkg_lookup_create(&blkio_root_cgroup, q, BLKIO_POLICY_THROTL,
 				  true);
 	if (!IS_ERR(blkg))
-		td->root_tg = tg_of_blkg(blkg);
+		td->root_tg = blkg_to_tg(blkg);
 
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
@@ -1207,7 +1208,7 @@ void blk_throtl_exit(struct request_queue *q)
 	spin_unlock_irq(q->queue_lock);
 
 	/*
-	 * Wait for tg->blkg->q accessors to exit their grace periods.
+	 * Wait for tg_to_blkg(tg)->q accessors to exit their grace periods.
 	 * Do this wait only if there are other undestroyed groups out
 	 * there (other than root group). This can happen if cgroup deletion
 	 * path claimed the responsibility of cleaning up a group before
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9ef86fb..c7449db 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -17,6 +17,8 @@
 #include "blk.h"
 #include "cfq.h"
 
+static struct blkio_policy_type blkio_policy_cfq;
+
 /*
  * tunables
  */
@@ -206,7 +208,6 @@ struct cfq_group {
 	unsigned long saved_workload_slice;
 	enum wl_type_t saved_workload;
 	enum wl_prio_t saved_serving_prio;
-	struct blkio_group blkg;
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	struct hlist_node cfqd_node;
 	int ref;
@@ -310,6 +311,16 @@ struct cfq_data {
 	unsigned int nr_blkcg_linked_grps;
 };
 
+static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg)
+{
+	return blkg_to_pdata(blkg, &blkio_policy_cfq);
+}
+
+static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg)
+{
+	return pdata_to_blkg(cfqg, &blkio_policy_cfq);
+}
+
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
@@ -374,11 +385,11 @@ CFQ_CFQQ_FNS(wait_busy);
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
 	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
 			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
-			blkg_path(&(cfqq)->cfqg->blkg), ##args)
+			blkg_path(cfqg_to_blkg((cfqq)->cfqg)), ##args)
 
 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)				\
 	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
-				blkg_path(&(cfqg)->blkg), ##args)       \
+			blkg_path(cfqg_to_blkg((cfqg))), ##args)	\
 
 #else
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
@@ -935,7 +946,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
 	cfq_group_service_tree_del(st, cfqg);
 	cfqg->saved_workload_slice = 0;
-	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
+	cfq_blkiocg_update_dequeue_stats(cfqg_to_blkg(cfqg), 1);
 }
 
 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
@@ -1007,9 +1018,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		     "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
 		     used_sl, cfqq->slice_dispatch, charge,
 		     iops_mode(cfqd), cfqq->nr_sectors);
-	cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
+	cfq_blkiocg_update_timeslice_used(cfqg_to_blkg(cfqg), used_sl,
 					  unaccounted_sl);
-	cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
+	cfq_blkiocg_set_start_empty_time(cfqg_to_blkg(cfqg));
 }
 
 /**
@@ -1032,18 +1043,12 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
-{
-	if (blkg)
-		return container_of(blkg, struct cfq_group, blkg);
-	return NULL;
-}
-
 static void cfq_update_blkio_group_weight(struct request_queue *q,
 					  struct blkio_group *blkg,
 					  unsigned int weight)
 {
-	struct cfq_group *cfqg = cfqg_of_blkg(blkg);
+	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+
 	cfqg->new_weight = weight;
 	cfqg->needs_update = true;
 }
@@ -1052,7 +1057,7 @@ static void cfq_link_blkio_group(struct request_queue *q,
 				 struct blkio_group *blkg)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_group *cfqg = cfqg_of_blkg(blkg);
+	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 
 	cfqd->nr_blkcg_linked_grps++;
 
@@ -1060,17 +1065,12 @@ static void cfq_link_blkio_group(struct request_queue *q,
 	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
 }
 
-static struct blkio_group *cfq_alloc_blkio_group(struct request_queue *q,
-						 struct blkio_cgroup *blkcg)
+static void cfq_init_blkio_group(struct blkio_group *blkg)
 {
-	struct cfq_group *cfqg;
-
-	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, q->node);
-	if (!cfqg)
-		return NULL;
+	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 
 	cfq_init_cfqg_base(cfqg);
-	cfqg->weight = blkcg->weight;
+	cfqg->weight = blkg->blkcg->weight;
 
 	/*
 	 * Take the initial reference that will be released on destroy
@@ -1079,8 +1079,6 @@ static struct blkio_group *cfq_alloc_blkio_group(struct request_queue *q,
 	 * or cgroup deletion path depending on who is exiting first.
 	 */
 	cfqg->ref = 1;
-
-	return &cfqg->blkg;
 }
 
 /*
@@ -1101,7 +1099,7 @@ static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
 
 		blkg = blkg_lookup_create(blkcg, q, BLKIO_POLICY_PROP, false);
 		if (!IS_ERR(blkg))
-			cfqg = cfqg_of_blkg(blkg);
+			cfqg = blkg_to_cfqg(blkg);
 	}
 
 	return cfqg;
@@ -1126,6 +1124,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 
 static void cfq_put_cfqg(struct cfq_group *cfqg)
 {
+	struct blkio_group *blkg = cfqg_to_blkg(cfqg);
 	struct cfq_rb_root *st;
 	int i, j;
 
@@ -1135,12 +1134,13 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
 		return;
 
 	/* release the extra blkcg reference this blkg has been holding */
-	css_put(&cfqg->blkg.blkcg->css);
+	css_put(&blkg->blkcg->css);
 
 	for_each_cfqg_st(cfqg, i, j, st)
 		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
-	free_percpu(cfqg->blkg.stats_cpu);
-	kfree(cfqg);
+	free_percpu(blkg->stats_cpu);
+	kfree(blkg->pd);
+	kfree(blkg);
 }
 
 static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
@@ -1172,7 +1172,7 @@ static bool cfq_release_cfq_groups(struct cfq_data *cfqd)
 		 * it from cgroup list, then it will take care of destroying
 		 * cfqg also.
 		 */
-		if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
+		if (!cfq_blkiocg_del_blkio_group(cfqg_to_blkg(cfqg)))
 			cfq_destroy_cfqg(cfqd, cfqg);
 		else
 			empty = false;
@@ -1201,7 +1201,7 @@ static void cfq_unlink_blkio_group(struct request_queue *q,
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
+	cfq_destroy_cfqg(cfqd, blkg_to_cfqg(blkg));
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
@@ -1504,12 +1504,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
 {
 	elv_rb_del(&cfqq->sort_list, rq);
 	cfqq->queued[rq_is_sync(rq)]--;
-	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
+	cfq_blkiocg_update_io_remove_stats(cfqg_to_blkg(RQ_CFQG(rq)),
 					rq_data_dir(rq), rq_is_sync(rq));
 	cfq_add_rq_rb(rq);
-	cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
-			&cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
-			rq_is_sync(rq));
+	cfq_blkiocg_update_io_add_stats(cfqg_to_blkg(RQ_CFQG(rq)),
+					cfqg_to_blkg(cfqq->cfqd->serving_group),
+					rq_data_dir(rq), rq_is_sync(rq));
 }
 
 static struct request *
@@ -1565,7 +1565,7 @@ static void cfq_remove_request(struct request *rq)
 	cfq_del_rq_rb(rq);
 
 	cfqq->cfqd->rq_queued--;
-	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
+	cfq_blkiocg_update_io_remove_stats(cfqg_to_blkg(RQ_CFQG(rq)),
 					rq_data_dir(rq), rq_is_sync(rq));
 	if (rq->cmd_flags & REQ_PRIO) {
 		WARN_ON(!cfqq->prio_pending);
@@ -1601,7 +1601,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
 static void cfq_bio_merged(struct request_queue *q, struct request *req,
 				struct bio *bio)
 {
-	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
+	cfq_blkiocg_update_io_merged_stats(cfqg_to_blkg(RQ_CFQG(req)),
 					bio_data_dir(bio), cfq_bio_sync(bio));
 }
 
@@ -1624,7 +1624,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
 	if (cfqq->next_rq == next)
 		cfqq->next_rq = rq;
 	cfq_remove_request(next);
-	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
+	cfq_blkiocg_update_io_merged_stats(cfqg_to_blkg(RQ_CFQG(rq)),
 					rq_data_dir(next), rq_is_sync(next));
 
 	cfqq = RQ_CFQQ(next);
@@ -1666,7 +1666,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	del_timer(&cfqd->idle_slice_timer);
-	cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
+	cfq_blkiocg_update_idle_time_stats(cfqg_to_blkg(cfqq->cfqg));
 }
 
 static void __cfq_set_active_queue(struct cfq_data *cfqd,
@@ -1675,7 +1675,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
 	if (cfqq) {
 		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
 				cfqd->serving_prio, cfqd->serving_type);
-		cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
+		cfq_blkiocg_update_avg_queue_size_stats(cfqg_to_blkg(cfqq->cfqg));
 		cfqq->slice_start = 0;
 		cfqq->dispatch_start = jiffies;
 		cfqq->allocated_slice = 0;
@@ -2023,7 +2023,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 		sl = cfqd->cfq_slice_idle;
 
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
-	cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
+	cfq_blkiocg_update_set_idle_time_stats(cfqg_to_blkg(cfqq->cfqg));
 	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
 			group_idle ? 1 : 0);
 }
@@ -2046,8 +2046,9 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
 	cfqq->nr_sectors += blk_rq_sectors(rq);
-	cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
-					rq_data_dir(rq), rq_is_sync(rq));
+	cfq_blkiocg_update_dispatch_stats(cfqg_to_blkg(cfqq->cfqg),
+					  blk_rq_bytes(rq), rq_data_dir(rq),
+					  rq_is_sync(rq));
 }
 
 /*
@@ -3135,7 +3136,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 				__blk_run_queue(cfqd->queue);
 			} else {
 				cfq_blkiocg_update_idle_time_stats(
-						&cfqq->cfqg->blkg);
+						cfqg_to_blkg(cfqq->cfqg));
 				cfq_mark_cfqq_must_dispatch(cfqq);
 			}
 		}
@@ -3162,9 +3163,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
 	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
 	list_add_tail(&rq->queuelist, &cfqq->fifo);
 	cfq_add_rq_rb(rq);
-	cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
-			&cfqd->serving_group->blkg, rq_data_dir(rq),
-			rq_is_sync(rq));
+	cfq_blkiocg_update_io_add_stats(cfqg_to_blkg(RQ_CFQG(rq)),
+					cfqg_to_blkg(cfqd->serving_group),
+					rq_data_dir(rq), rq_is_sync(rq));
 	cfq_rq_enqueued(cfqd, cfqq, rq);
 }
 
@@ -3260,7 +3261,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	cfqd->rq_in_driver--;
 	cfqq->dispatched--;
 	(RQ_CFQG(rq))->dispatched--;
-	cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
+	cfq_blkiocg_update_completion_stats(cfqg_to_blkg(cfqq->cfqg),
 			rq_start_time_ns(rq), rq_io_start_time_ns(rq),
 			rq_data_dir(rq), rq_is_sync(rq));
 
@@ -3641,7 +3642,7 @@ static int cfq_init_queue(struct request_queue *q)
 	blkg = blkg_lookup_create(&blkio_root_cgroup, q, BLKIO_POLICY_PROP,
 				  true);
 	if (!IS_ERR(blkg))
-		cfqd->root_group = cfqg_of_blkg(blkg);
+		cfqd->root_group = blkg_to_cfqg(blkg);
 
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
@@ -3827,13 +3828,14 @@ static struct elevator_type iosched_cfq = {
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static struct blkio_policy_type blkio_policy_cfq = {
 	.ops = {
-		.blkio_alloc_group_fn =		cfq_alloc_blkio_group,
+		.blkio_init_group_fn =		cfq_init_blkio_group,
 		.blkio_link_group_fn =		cfq_link_blkio_group,
 		.blkio_unlink_group_fn =	cfq_unlink_blkio_group,
 		.blkio_clear_queue_fn = cfq_clear_queue,
 		.blkio_update_group_weight_fn =	cfq_update_blkio_group_weight,
 	},
 	.plid = BLKIO_POLICY_PROP,
+	.pdata_size = sizeof(struct cfq_group),
 };
 #endif
 
-- 
cgit v1.1


From 1adaf3dde37a8b9b59ea59c5f58fed7761178383 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:15 -0800
Subject: blkcg: move refcnt to blkcg core

Currently, blkcg policy implementations manage blkg refcnt, duplicating
mostly identical code in both policies.  This patch moves refcnt to
blkg and lets blkcg core handle refcnt and freeing of blkgs.

* cfq blkgs now also get freed via RCU.

* cfq blkgs lose RB_EMPTY_ROOT() sanity check on blkg free.  If
  necessary, we can add blkio_exit_group_fn() to resurrect this.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 24 ++++++++++++++++++++++
 block/blk-cgroup.h   | 35 +++++++++++++++++++++++++++++++
 block/blk-throttle.c | 58 ++++------------------------------------------------
 block/cfq-iosched.c  | 58 +++++++++-------------------------------------------
 4 files changed, 73 insertions(+), 102 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1436749..3b6a0e1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -463,6 +463,7 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 	rcu_assign_pointer(blkg->q, q);
 	blkg->blkcg = blkcg;
 	blkg->plid = pol->plid;
+	blkg->refcnt = 1;
 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
 
 	/* alloc per-policy data */
@@ -633,6 +634,29 @@ void blkg_destroy_all(struct request_queue *q)
 	}
 }
 
+static void blkg_rcu_free(struct rcu_head *rcu_head)
+{
+	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
+}
+
+void __blkg_release(struct blkio_group *blkg)
+{
+	/* release the extra blkcg reference this blkg has been holding */
+	css_put(&blkg->blkcg->css);
+
+	/*
+	 * A group is freed in rcu manner. But having an rcu lock does not
+	 * mean that one can access all the fields of blkg and assume these
+	 * are valid. For example, don't try to follow throtl_data and
+	 * request queue links.
+	 *
+	 * Having a reference to blkg under an rcu allows acess to only
+	 * values local to groups like group stats and group rate limits
+	 */
+	call_rcu(&blkg->rcu_head, blkg_rcu_free);
+}
+EXPORT_SYMBOL_GPL(__blkg_release);
+
 static void blkio_reset_stats_cpu(struct blkio_group *blkg)
 {
 	struct blkio_group_stats_cpu *stats_cpu;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 9537819..7da1068 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -177,6 +177,8 @@ struct blkio_group {
 	char path[128];
 	/* policy which owns this blk group */
 	enum blkio_policy_id plid;
+	/* reference count */
+	int refcnt;
 
 	/* Configuration */
 	struct blkio_group_conf conf;
@@ -188,6 +190,8 @@ struct blkio_group {
 	struct blkio_group_stats_cpu __percpu *stats_cpu;
 
 	struct blkg_policy_data *pd;
+
+	struct rcu_head rcu_head;
 };
 
 typedef void (blkio_init_group_fn)(struct blkio_group *blkg);
@@ -272,6 +276,35 @@ static inline char *blkg_path(struct blkio_group *blkg)
 	return blkg->path;
 }
 
+/**
+ * blkg_get - get a blkg reference
+ * @blkg: blkg to get
+ *
+ * The caller should be holding queue_lock and an existing reference.
+ */
+static inline void blkg_get(struct blkio_group *blkg)
+{
+	lockdep_assert_held(blkg->q->queue_lock);
+	WARN_ON_ONCE(!blkg->refcnt);
+	blkg->refcnt++;
+}
+
+void __blkg_release(struct blkio_group *blkg);
+
+/**
+ * blkg_put - put a blkg reference
+ * @blkg: blkg to put
+ *
+ * The caller should be holding queue_lock.
+ */
+static inline void blkg_put(struct blkio_group *blkg)
+{
+	lockdep_assert_held(blkg->q->queue_lock);
+	WARN_ON_ONCE(blkg->refcnt <= 0);
+	if (!--blkg->refcnt)
+		__blkg_release(blkg);
+}
+
 #else
 
 struct blkio_group {
@@ -292,6 +325,8 @@ static inline void *blkg_to_pdata(struct blkio_group *blkg,
 static inline struct blkio_group *pdata_to_blkg(void *pdata,
 				struct blkio_policy_type *pol) { return NULL; }
 static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
+static inline void blkg_get(struct blkio_group *blkg) { }
+static inline void blkg_put(struct blkio_group *blkg) { }
 
 #endif
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 9c8a124..153ba50 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -54,7 +54,6 @@ struct throtl_grp {
 	 */
 	unsigned long disptime;
 
-	atomic_t ref;
 	unsigned int flags;
 
 	/* Two lists for READ and WRITE */
@@ -80,8 +79,6 @@ struct throtl_grp {
 
 	/* Some throttle limits got updated for the group */
 	int limits_changed;
-
-	struct rcu_head rcu_head;
 };
 
 struct throtl_data
@@ -151,45 +148,6 @@ static inline unsigned int total_nr_queued(struct throtl_data *td)
 	return td->nr_queued[0] + td->nr_queued[1];
 }
 
-static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
-{
-	atomic_inc(&tg->ref);
-	return tg;
-}
-
-static void throtl_free_tg(struct rcu_head *head)
-{
-	struct throtl_grp *tg = container_of(head, struct throtl_grp, rcu_head);
-	struct blkio_group *blkg = tg_to_blkg(tg);
-
-	free_percpu(blkg->stats_cpu);
-	kfree(blkg->pd);
-	kfree(blkg);
-}
-
-static void throtl_put_tg(struct throtl_grp *tg)
-{
-	struct blkio_group *blkg = tg_to_blkg(tg);
-
-	BUG_ON(atomic_read(&tg->ref) <= 0);
-	if (!atomic_dec_and_test(&tg->ref))
-		return;
-
-	/* release the extra blkcg reference this blkg has been holding */
-	css_put(&blkg->blkcg->css);
-
-	/*
-	 * A group is freed in rcu manner. But having an rcu lock does not
-	 * mean that one can access all the fields of blkg and assume these
-	 * are valid. For example, don't try to follow throtl_data and
-	 * request queue links.
-	 *
-	 * Having a reference to blkg under an rcu allows acess to only
-	 * values local to groups like group stats and group rate limits
-	 */
-	call_rcu(&tg->rcu_head, throtl_free_tg);
-}
-
 static void throtl_init_blkio_group(struct blkio_group *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -204,14 +162,6 @@ static void throtl_init_blkio_group(struct blkio_group *blkg)
 	tg->bps[WRITE] = -1;
 	tg->iops[READ] = -1;
 	tg->iops[WRITE] = -1;
-
-	/*
-	 * Take the initial reference that will be released on destroy
-	 * This can be thought of a joint reference by cgroup and
-	 * request queue which will be dropped by either request queue
-	 * exit or cgroup deletion path depending on who is exiting first.
-	 */
-	atomic_set(&tg->ref, 1);
 }
 
 static void throtl_link_blkio_group(struct request_queue *q,
@@ -648,7 +598,7 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
 
 	bio_list_add(&tg->bio_lists[rw], bio);
 	/* Take a bio reference on tg */
-	throtl_ref_get_tg(tg);
+	blkg_get(tg_to_blkg(tg));
 	tg->nr_queued[rw]++;
 	td->nr_queued[rw]++;
 	throtl_enqueue_tg(td, tg);
@@ -681,8 +631,8 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
 
 	bio = bio_list_pop(&tg->bio_lists[rw]);
 	tg->nr_queued[rw]--;
-	/* Drop bio reference on tg */
-	throtl_put_tg(tg);
+	/* Drop bio reference on blkg */
+	blkg_put(tg_to_blkg(tg));
 
 	BUG_ON(td->nr_queued[rw] <= 0);
 	td->nr_queued[rw]--;
@@ -880,7 +830,7 @@ throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
 	 * Put the reference taken at the time of creation so that when all
 	 * queues are gone, group can be destroyed.
 	 */
-	throtl_put_tg(tg);
+	blkg_put(tg_to_blkg(tg));
 	td->nr_undestroyed_grps--;
 }
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c7449db..86980023 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -210,7 +210,6 @@ struct cfq_group {
 	enum wl_prio_t saved_serving_prio;
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	struct hlist_node cfqd_node;
-	int ref;
 #endif
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
@@ -1071,14 +1070,6 @@ static void cfq_init_blkio_group(struct blkio_group *blkg)
 
 	cfq_init_cfqg_base(cfqg);
 	cfqg->weight = blkg->blkcg->weight;
-
-	/*
-	 * Take the initial reference that will be released on destroy
-	 * This can be thought of a joint reference by cgroup and
-	 * elevator which will be dropped by either elevator exit
-	 * or cgroup deletion path depending on who is exiting first.
-	 */
-	cfqg->ref = 1;
 }
 
 /*
@@ -1105,12 +1096,6 @@ static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
 	return cfqg;
 }
 
-static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
-{
-	cfqg->ref++;
-	return cfqg;
-}
-
 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 {
 	/* Currently, all async queues are mapped to root group */
@@ -1119,28 +1104,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 
 	cfqq->cfqg = cfqg;
 	/* cfqq reference on cfqg */
-	cfqq->cfqg->ref++;
-}
-
-static void cfq_put_cfqg(struct cfq_group *cfqg)
-{
-	struct blkio_group *blkg = cfqg_to_blkg(cfqg);
-	struct cfq_rb_root *st;
-	int i, j;
-
-	BUG_ON(cfqg->ref <= 0);
-	cfqg->ref--;
-	if (cfqg->ref)
-		return;
-
-	/* release the extra blkcg reference this blkg has been holding */
-	css_put(&blkg->blkcg->css);
-
-	for_each_cfqg_st(cfqg, i, j, st)
-		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
-	free_percpu(blkg->stats_cpu);
-	kfree(blkg->pd);
-	kfree(blkg);
+	blkg_get(cfqg_to_blkg(cfqg));
 }
 
 static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
@@ -1157,7 +1121,7 @@ static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	 * Put the reference taken at the time of creation so that when all
 	 * queues are gone, group can be destroyed.
 	 */
-	cfq_put_cfqg(cfqg);
+	blkg_put(cfqg_to_blkg(cfqg));
 }
 
 static bool cfq_release_cfq_groups(struct cfq_data *cfqd)
@@ -1225,18 +1189,12 @@ static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
 	return cfqd->root_group;
 }
 
-static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
-{
-	return cfqg;
-}
-
 static inline void
 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
 	cfqq->cfqg = cfqg;
 }
 
 static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
-static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
 
 #endif /* GROUP_IOSCHED */
 
@@ -2630,7 +2588,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 	kmem_cache_free(cfq_pool, cfqq);
-	cfq_put_cfqg(cfqg);
+	blkg_put(cfqg_to_blkg(cfqg));
 }
 
 static void cfq_put_cooperator(struct cfq_queue *cfqq)
@@ -3382,7 +3340,7 @@ static void cfq_put_request(struct request *rq)
 		cfqq->allocated[rw]--;
 
 		/* Put down rq reference on cfqg */
-		cfq_put_cfqg(RQ_CFQG(rq));
+		blkg_put(cfqg_to_blkg(RQ_CFQG(rq)));
 		rq->elv.priv[0] = NULL;
 		rq->elv.priv[1] = NULL;
 
@@ -3477,8 +3435,9 @@ new_queue:
 	cfqq->allocated[rw]++;
 
 	cfqq->ref++;
+	blkg_get(cfqg_to_blkg(cfqq->cfqg));
 	rq->elv.priv[0] = cfqq;
-	rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg);
+	rq->elv.priv[1] = cfqq->cfqg;
 	spin_unlock_irq(q->queue_lock);
 	return 0;
 }
@@ -3676,8 +3635,11 @@ static int cfq_init_queue(struct request_queue *q)
 	 */
 	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
 	cfqd->oom_cfqq.ref++;
+
+	spin_lock_irq(q->queue_lock);
 	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
-	cfq_put_cfqg(cfqd->root_group);
+	blkg_put(cfqg_to_blkg(cfqd->root_group));
+	spin_unlock_irq(q->queue_lock);
 
 	init_timer(&cfqd->idle_slice_timer);
 	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
-- 
cgit v1.1


From 549d3aa872cd1aec1ee540fd93afd9611faa0def Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:16 -0800
Subject: blkcg: make blkg->pd an array and move configuration and stats into
 it

To prepare for unifying blkgs for different policies, make blkg->pd an
array with BLKIO_NR_POLICIES elements and move blkg->conf, ->stats,
and ->stats_cpu into blkg_policy_data.

This patch doesn't introduce any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 150 +++++++++++++++++++++++++++++++++--------------------
 block/blk-cgroup.h |  18 +++----
 2 files changed, 102 insertions(+), 66 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 3b6a0e1..0eb3998 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -184,12 +184,14 @@ static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
 static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
 						struct blkio_group *curr_blkg)
 {
-	if (blkio_blkg_waiting(&blkg->stats))
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+
+	if (blkio_blkg_waiting(&pd->stats))
 		return;
 	if (blkg == curr_blkg)
 		return;
-	blkg->stats.start_group_wait_time = sched_clock();
-	blkio_mark_blkg_waiting(&blkg->stats);
+	pd->stats.start_group_wait_time = sched_clock();
+	blkio_mark_blkg_waiting(&pd->stats);
 }
 
 /* This should be called with the blkg->stats_lock held. */
@@ -222,24 +224,26 @@ static void blkio_end_empty_time(struct blkio_group_stats *stats)
 
 void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
 {
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 	unsigned long flags;
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
-	BUG_ON(blkio_blkg_idling(&blkg->stats));
-	blkg->stats.start_idle_time = sched_clock();
-	blkio_mark_blkg_idling(&blkg->stats);
+	BUG_ON(blkio_blkg_idling(&pd->stats));
+	pd->stats.start_idle_time = sched_clock();
+	blkio_mark_blkg_idling(&pd->stats);
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
 
 void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
 {
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 	unsigned long flags;
 	unsigned long long now;
 	struct blkio_group_stats *stats;
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
-	stats = &blkg->stats;
+	stats = &pd->stats;
 	if (blkio_blkg_idling(stats)) {
 		now = sched_clock();
 		if (time_after64(now, stats->start_idle_time))
@@ -252,11 +256,12 @@ EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
 
 void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
 {
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 	unsigned long flags;
 	struct blkio_group_stats *stats;
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
-	stats = &blkg->stats;
+	stats = &pd->stats;
 	stats->avg_queue_size_sum +=
 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
@@ -268,11 +273,12 @@ EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
 
 void blkiocg_set_start_empty_time(struct blkio_group *blkg)
 {
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 	unsigned long flags;
 	struct blkio_group_stats *stats;
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
-	stats = &blkg->stats;
+	stats = &pd->stats;
 
 	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
@@ -299,7 +305,9 @@ EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
 void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
 			unsigned long dequeue)
 {
-	blkg->stats.dequeue += dequeue;
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+
+	pd->stats.dequeue += dequeue;
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
 #else
@@ -312,12 +320,13 @@ void blkiocg_update_io_add_stats(struct blkio_group *blkg,
 			struct blkio_group *curr_blkg, bool direction,
 			bool sync)
 {
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 	unsigned long flags;
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
-	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
+	blkio_add_stat(pd->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
 			sync);
-	blkio_end_empty_time(&blkg->stats);
+	blkio_end_empty_time(&pd->stats);
 	blkio_set_start_group_wait_time(blkg, curr_blkg);
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
@@ -326,10 +335,11 @@ EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
 void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
 						bool direction, bool sync)
 {
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 	unsigned long flags;
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
-	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
+	blkio_check_and_dec_stat(pd->stats.stat_arr[BLKIO_STAT_QUEUED],
 					direction, sync);
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
@@ -338,12 +348,13 @@ EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
 void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
 				unsigned long unaccounted_time)
 {
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 	unsigned long flags;
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
-	blkg->stats.time += time;
+	pd->stats.time += time;
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-	blkg->stats.unaccounted_time += unaccounted_time;
+	pd->stats.unaccounted_time += unaccounted_time;
 #endif
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
@@ -356,6 +367,7 @@ EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
 void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 				uint64_t bytes, bool direction, bool sync)
 {
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 	struct blkio_group_stats_cpu *stats_cpu;
 	unsigned long flags;
 
@@ -366,7 +378,7 @@ void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 	 */
 	local_irq_save(flags);
 
-	stats_cpu = this_cpu_ptr(blkg->stats_cpu);
+	stats_cpu = this_cpu_ptr(pd->stats_cpu);
 
 	u64_stats_update_begin(&stats_cpu->syncp);
 	stats_cpu->sectors += bytes >> 9;
@@ -382,12 +394,13 @@ EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
 void blkiocg_update_completion_stats(struct blkio_group *blkg,
 	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
 {
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 	struct blkio_group_stats *stats;
 	unsigned long flags;
 	unsigned long long now = sched_clock();
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
-	stats = &blkg->stats;
+	stats = &pd->stats;
 	if (time_after64(now, io_start_time))
 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
 				now - io_start_time, direction, sync);
@@ -402,6 +415,7 @@ EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
 void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
 					bool sync)
 {
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 	struct blkio_group_stats_cpu *stats_cpu;
 	unsigned long flags;
 
@@ -412,7 +426,7 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
 	 */
 	local_irq_save(flags);
 
-	stats_cpu = this_cpu_ptr(blkg->stats_cpu);
+	stats_cpu = this_cpu_ptr(pd->stats_cpu);
 
 	u64_stats_update_begin(&stats_cpu->syncp);
 	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
@@ -430,11 +444,17 @@ EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
  */
 static void blkg_free(struct blkio_group *blkg)
 {
-	if (blkg) {
-		free_percpu(blkg->stats_cpu);
-		kfree(blkg->pd);
-		kfree(blkg);
+	struct blkg_policy_data *pd;
+
+	if (!blkg)
+		return;
+
+	pd = blkg->pd[blkg->plid];
+	if (pd) {
+		free_percpu(pd->stats_cpu);
+		kfree(pd);
 	}
+	kfree(blkg);
 }
 
 /**
@@ -453,6 +473,7 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 				      struct blkio_policy_type *pol)
 {
 	struct blkio_group *blkg;
+	struct blkg_policy_data *pd;
 
 	/* alloc and init base part */
 	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
@@ -466,23 +487,26 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 	blkg->refcnt = 1;
 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
 
-	/* alloc per-policy data */
-	blkg->pd = kzalloc_node(sizeof(*blkg->pd) + pol->pdata_size, GFP_ATOMIC,
-				q->node);
-	if (!blkg->pd) {
+	/* alloc per-policy data and attach it to blkg */
+	pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
+			  q->node);
+	if (!pd) {
 		blkg_free(blkg);
 		return NULL;
 	}
 
+	blkg->pd[pol->plid] = pd;
+	pd->blkg = blkg;
+
 	/* broken, read comment in the callsite */
-	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
-	if (!blkg->stats_cpu) {
+
+	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
+	if (!pd->stats_cpu) {
 		blkg_free(blkg);
 		return NULL;
 	}
 
-	/* attach pd to blkg and invoke per-policy init */
-	blkg->pd->blkg = blkg;
+	/* invoke per-policy init */
 	pol->ops.blkio_init_group_fn(blkg);
 	return blkg;
 }
@@ -659,6 +683,7 @@ EXPORT_SYMBOL_GPL(__blkg_release);
 
 static void blkio_reset_stats_cpu(struct blkio_group *blkg)
 {
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 	struct blkio_group_stats_cpu *stats_cpu;
 	int i, j, k;
 	/*
@@ -673,7 +698,7 @@ static void blkio_reset_stats_cpu(struct blkio_group *blkg)
 	 * unless this becomes a real issue.
 	 */
 	for_each_possible_cpu(i) {
-		stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
+		stats_cpu = per_cpu_ptr(pd->stats_cpu, i);
 		stats_cpu->sectors = 0;
 		for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
 			for (k = 0; k < BLKIO_STAT_TOTAL; k++)
@@ -698,8 +723,10 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	blkcg = cgroup_to_blkio_cgroup(cgroup);
 	spin_lock_irq(&blkcg->lock);
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+
 		spin_lock(&blkg->stats_lock);
-		stats = &blkg->stats;
+		stats = &pd->stats;
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 		idling = blkio_blkg_idling(stats);
 		waiting = blkio_blkg_waiting(stats);
@@ -779,13 +806,14 @@ static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
 static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
 			enum stat_type_cpu type, enum stat_sub_type sub_type)
 {
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 	int cpu;
 	struct blkio_group_stats_cpu *stats_cpu;
 	u64 val = 0, tval;
 
 	for_each_possible_cpu(cpu) {
 		unsigned int start;
-		stats_cpu  = per_cpu_ptr(blkg->stats_cpu, cpu);
+		stats_cpu = per_cpu_ptr(pd->stats_cpu, cpu);
 
 		do {
 			start = u64_stats_fetch_begin(&stats_cpu->syncp);
@@ -837,20 +865,21 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
 			       struct cgroup_map_cb *cb, const char *dname,
 			       enum stat_type type)
 {
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 	uint64_t disk_total;
 	char key_str[MAX_KEY_LEN];
 	enum stat_sub_type sub_type;
 
 	if (type == BLKIO_STAT_TIME)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-					blkg->stats.time, cb, dname);
+					pd->stats.time, cb, dname);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-				       blkg->stats.unaccounted_time, cb, dname);
+				       pd->stats.unaccounted_time, cb, dname);
 	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
-		uint64_t sum = blkg->stats.avg_queue_size_sum;
-		uint64_t samples = blkg->stats.avg_queue_size_samples;
+		uint64_t sum = pd->stats.avg_queue_size_sum;
+		uint64_t samples = pd->stats.avg_queue_size_samples;
 		if (samples)
 			do_div(sum, samples);
 		else
@@ -860,26 +889,26 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
 	}
 	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-				       blkg->stats.group_wait_time, cb, dname);
+				       pd->stats.group_wait_time, cb, dname);
 	if (type == BLKIO_STAT_IDLE_TIME)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-				       blkg->stats.idle_time, cb, dname);
+				       pd->stats.idle_time, cb, dname);
 	if (type == BLKIO_STAT_EMPTY_TIME)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-				       blkg->stats.empty_time, cb, dname);
+				       pd->stats.empty_time, cb, dname);
 	if (type == BLKIO_STAT_DEQUEUE)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-				       blkg->stats.dequeue, cb, dname);
+				       pd->stats.dequeue, cb, dname);
 #endif
 
 	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
 			sub_type++) {
 		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
 				   false);
-		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
+		cb->fill(cb, key_str, pd->stats.stat_arr[type][sub_type]);
 	}
-	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
-			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
+	disk_total = pd->stats.stat_arr[type][BLKIO_STAT_READ] +
+			pd->stats.stat_arr[type][BLKIO_STAT_WRITE];
 	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
 			   false);
 	cb->fill(cb, key_str, disk_total);
@@ -891,6 +920,7 @@ static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
 {
 	struct gendisk *disk = NULL;
 	struct blkio_group *blkg = NULL;
+	struct blkg_policy_data *pd;
 	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
 	unsigned long major, minor;
 	int i = 0, ret = -EINVAL;
@@ -950,35 +980,37 @@ static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
 		goto out_unlock;
 	}
 
+	pd = blkg->pd[plid];
+
 	switch (plid) {
 	case BLKIO_POLICY_PROP:
 		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
 		     temp > BLKIO_WEIGHT_MAX)
 			goto out_unlock;
 
-		blkg->conf.weight = temp;
+		pd->conf.weight = temp;
 		blkio_update_group_weight(blkg, temp ?: blkcg->weight);
 		break;
 	case BLKIO_POLICY_THROTL:
 		switch(fileid) {
 		case BLKIO_THROTL_read_bps_device:
-			blkg->conf.bps[READ] = temp;
+			pd->conf.bps[READ] = temp;
 			blkio_update_group_bps(blkg, temp ?: -1, fileid);
 			break;
 		case BLKIO_THROTL_write_bps_device:
-			blkg->conf.bps[WRITE] = temp;
+			pd->conf.bps[WRITE] = temp;
 			blkio_update_group_bps(blkg, temp ?: -1, fileid);
 			break;
 		case BLKIO_THROTL_read_iops_device:
 			if (temp > THROTL_IOPS_MAX)
 				goto out_unlock;
-			blkg->conf.iops[READ] = temp;
+			pd->conf.iops[READ] = temp;
 			blkio_update_group_iops(blkg, temp ?: -1, fileid);
 			break;
 		case BLKIO_THROTL_write_iops_device:
 			if (temp > THROTL_IOPS_MAX)
 				goto out_unlock;
-			blkg->conf.iops[WRITE] = temp;
+			pd->conf.iops[WRITE] = temp;
 			blkio_update_group_iops(blkg, temp ?: -1, fileid);
 			break;
 		}
@@ -1034,6 +1066,7 @@ static const char *blkg_dev_name(struct blkio_group *blkg)
 static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
 				   struct seq_file *m)
 {
+	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 	const char *dname = blkg_dev_name(blkg);
 	int fileid = BLKIOFILE_ATTR(cft->private);
 	int rw = WRITE;
@@ -1043,25 +1076,25 @@ static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
 
 	switch (blkg->plid) {
 		case BLKIO_POLICY_PROP:
-			if (blkg->conf.weight)
+			if (pd->conf.weight)
 				seq_printf(m, "%s\t%u\n",
-					   dname, blkg->conf.weight);
+					   dname, pd->conf.weight);
 			break;
 		case BLKIO_POLICY_THROTL:
 			switch (fileid) {
 			case BLKIO_THROTL_read_bps_device:
 				rw = READ;
 			case BLKIO_THROTL_write_bps_device:
-				if (blkg->conf.bps[rw])
+				if (pd->conf.bps[rw])
 					seq_printf(m, "%s\t%llu\n",
-						   dname, blkg->conf.bps[rw]);
+						   dname, pd->conf.bps[rw]);
 				break;
 			case BLKIO_THROTL_read_iops_device:
 				rw = READ;
 			case BLKIO_THROTL_write_iops_device:
-				if (blkg->conf.iops[rw])
+				if (pd->conf.iops[rw])
 					seq_printf(m, "%s\t%u\n",
-						   dname, blkg->conf.iops[rw]);
+						   dname, pd->conf.iops[rw]);
 				break;
 			}
 			break;
@@ -1243,9 +1276,12 @@ static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
 	spin_lock_irq(&blkcg->lock);
 	blkcg->weight = (unsigned int)val;
 
-	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
-		if (blkg->plid == plid && !blkg->conf.weight)
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+
+		if (blkg->plid == plid && !pd->conf.weight)
 			blkio_update_group_weight(blkg, blkcg->weight);
+	}
 
 	spin_unlock_irq(&blkcg->lock);
 	spin_unlock(&blkio_list_lock);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 7da1068..5dffd43 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -164,6 +164,13 @@ struct blkg_policy_data {
 	/* the blkg this per-policy data belongs to */
 	struct blkio_group *blkg;
 
+	/* Configuration */
+	struct blkio_group_conf conf;
+
+	struct blkio_group_stats stats;
+	/* Per cpu stats pointer */
+	struct blkio_group_stats_cpu __percpu *stats_cpu;
+
 	/* pol->pdata_size bytes of private data used by policy impl */
 	char pdata[] __aligned(__alignof__(unsigned long long));
 };
@@ -180,16 +187,9 @@ struct blkio_group {
 	/* reference count */
 	int refcnt;
 
-	/* Configuration */
-	struct blkio_group_conf conf;
-
 	/* Need to serialize the stats in the case of reset/update */
 	spinlock_t stats_lock;
-	struct blkio_group_stats stats;
-	/* Per cpu stats pointer */
-	struct blkio_group_stats_cpu __percpu *stats_cpu;
-
-	struct blkg_policy_data *pd;
+	struct blkg_policy_data *pd[BLKIO_NR_POLICIES];
 
 	struct rcu_head rcu_head;
 };
@@ -249,7 +249,7 @@ extern void blkg_destroy_all(struct request_queue *q);
 static inline void *blkg_to_pdata(struct blkio_group *blkg,
 			      struct blkio_policy_type *pol)
 {
-	return blkg ? blkg->pd->pdata : NULL;
+	return blkg ? blkg->pd[pol->plid]->pdata : NULL;
 }
 
 /**
-- 
cgit v1.1


From c1768268f9424410761da57ea71107acae7b03cc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:17 -0800
Subject: blkcg: don't use blkg->plid in stat related functions

blkg is scheduled to be unified for all policies and thus there won't
be a one-to-one mapping from blkg to policy.  Update stat related
functions to take explicit @pol or @plid arguments and not use
blkg->plid.

This is painful for now, but most of the specific stat interface
functions will be replaced with a handful of generic helpers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 150 +++++++++++++++++++++++++++++----------------------
 block/blk-cgroup.h   |  80 ++++++++++++++++-----------
 block/blk-throttle.c |   4 +-
 block/cfq-iosched.c  |  44 +++++++++------
 block/cfq.h          |  96 ++++++++++++++++++++-------------
 5 files changed, 224 insertions(+), 150 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0eb3998..91f9824 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -78,14 +78,14 @@ struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
 }
 EXPORT_SYMBOL_GPL(task_blkio_cgroup);
 
-static inline void
-blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
+static inline void blkio_update_group_weight(struct blkio_group *blkg,
+					     int plid, unsigned int weight)
 {
 	struct blkio_policy_type *blkiop;
 
 	list_for_each_entry(blkiop, &blkio_list, list) {
 		/* If this policy does not own the blkg, do not send updates */
-		if (blkiop->plid != blkg->plid)
+		if (blkiop->plid != plid)
 			continue;
 		if (blkiop->ops.blkio_update_group_weight_fn)
 			blkiop->ops.blkio_update_group_weight_fn(blkg->q,
@@ -93,15 +93,15 @@ blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
 	}
 }
 
-static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
-				int fileid)
+static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid,
+					  u64 bps, int fileid)
 {
 	struct blkio_policy_type *blkiop;
 
 	list_for_each_entry(blkiop, &blkio_list, list) {
 
 		/* If this policy does not own the blkg, do not send updates */
-		if (blkiop->plid != blkg->plid)
+		if (blkiop->plid != plid)
 			continue;
 
 		if (fileid == BLKIO_THROTL_read_bps_device
@@ -117,14 +117,15 @@ static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
 }
 
 static inline void blkio_update_group_iops(struct blkio_group *blkg,
-			unsigned int iops, int fileid)
+					   int plid, unsigned int iops,
+					   int fileid)
 {
 	struct blkio_policy_type *blkiop;
 
 	list_for_each_entry(blkiop, &blkio_list, list) {
 
 		/* If this policy does not own the blkg, do not send updates */
-		if (blkiop->plid != blkg->plid)
+		if (blkiop->plid != plid)
 			continue;
 
 		if (fileid == BLKIO_THROTL_read_iops_device
@@ -182,9 +183,10 @@ static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 /* This should be called with the blkg->stats_lock held. */
 static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
-						struct blkio_group *curr_blkg)
+					    struct blkio_policy_type *pol,
+					    struct blkio_group *curr_blkg)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
 
 	if (blkio_blkg_waiting(&pd->stats))
 		return;
@@ -222,9 +224,10 @@ static void blkio_end_empty_time(struct blkio_group_stats *stats)
 	blkio_clear_blkg_empty(stats);
 }
 
-void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
+					struct blkio_policy_type *pol)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
 	unsigned long flags;
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
@@ -235,9 +238,10 @@ void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
 
-void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
+void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
+				    struct blkio_policy_type *pol)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
 	unsigned long flags;
 	unsigned long long now;
 	struct blkio_group_stats *stats;
@@ -254,9 +258,10 @@ void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
 
-void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
+void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
+					 struct blkio_policy_type *pol)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
 	unsigned long flags;
 	struct blkio_group_stats *stats;
 
@@ -271,9 +276,10 @@ void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
 
-void blkiocg_set_start_empty_time(struct blkio_group *blkg)
+void blkiocg_set_start_empty_time(struct blkio_group *blkg,
+				  struct blkio_policy_type *pol)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
 	unsigned long flags;
 	struct blkio_group_stats *stats;
 
@@ -303,39 +309,43 @@ void blkiocg_set_start_empty_time(struct blkio_group *blkg)
 EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
 
 void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-			unsigned long dequeue)
+				  struct blkio_policy_type *pol,
+				  unsigned long dequeue)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
 
 	pd->stats.dequeue += dequeue;
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
 #else
 static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
-					struct blkio_group *curr_blkg) {}
-static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
+					struct blkio_policy_type *pol,
+					struct blkio_group *curr_blkg) { }
+static inline void blkio_end_empty_time(struct blkio_group_stats *stats) { }
 #endif
 
 void blkiocg_update_io_add_stats(struct blkio_group *blkg,
-			struct blkio_group *curr_blkg, bool direction,
-			bool sync)
+				 struct blkio_policy_type *pol,
+				 struct blkio_group *curr_blkg, bool direction,
+				 bool sync)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
 	unsigned long flags;
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
 	blkio_add_stat(pd->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
 			sync);
 	blkio_end_empty_time(&pd->stats);
-	blkio_set_start_group_wait_time(blkg, curr_blkg);
+	blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
 
 void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-						bool direction, bool sync)
+				    struct blkio_policy_type *pol,
+				    bool direction, bool sync)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
 	unsigned long flags;
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
@@ -345,10 +355,12 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
 
-void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
-				unsigned long unaccounted_time)
+void blkiocg_update_timeslice_used(struct blkio_group *blkg,
+				   struct blkio_policy_type *pol,
+				   unsigned long time,
+				   unsigned long unaccounted_time)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
 	unsigned long flags;
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
@@ -365,9 +377,10 @@ EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
  * is valid.
  */
 void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-				uint64_t bytes, bool direction, bool sync)
+				   struct blkio_policy_type *pol,
+				   uint64_t bytes, bool direction, bool sync)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
 	struct blkio_group_stats_cpu *stats_cpu;
 	unsigned long flags;
 
@@ -392,9 +405,12 @@ void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
 
 void blkiocg_update_completion_stats(struct blkio_group *blkg,
-	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
+				     struct blkio_policy_type *pol,
+				     uint64_t start_time,
+				     uint64_t io_start_time, bool direction,
+				     bool sync)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
 	struct blkio_group_stats *stats;
 	unsigned long flags;
 	unsigned long long now = sched_clock();
@@ -412,10 +428,11 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg,
 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
 
 /*  Merged stats are per cpu.  */
-void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
-					bool sync)
+void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
+				    struct blkio_policy_type *pol,
+				    bool direction, bool sync)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
 	struct blkio_group_stats_cpu *stats_cpu;
 	unsigned long flags;
 
@@ -681,9 +698,9 @@ void __blkg_release(struct blkio_group *blkg)
 }
 EXPORT_SYMBOL_GPL(__blkg_release);
 
-static void blkio_reset_stats_cpu(struct blkio_group *blkg)
+static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[plid];
 	struct blkio_group_stats_cpu *stats_cpu;
 	int i, j, k;
 	/*
@@ -754,7 +771,7 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 		spin_unlock(&blkg->stats_lock);
 
 		/* Reset Per cpu stats which don't take blkg->stats_lock */
-		blkio_reset_stats_cpu(blkg);
+		blkio_reset_stats_cpu(blkg, blkg->plid);
 	}
 
 	spin_unlock_irq(&blkcg->lock);
@@ -803,10 +820,10 @@ static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
 }
 
 
-static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
+static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, int plid,
 			enum stat_type_cpu type, enum stat_sub_type sub_type)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[plid];
 	int cpu;
 	struct blkio_group_stats_cpu *stats_cpu;
 	u64 val = 0, tval;
@@ -829,7 +846,7 @@ static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
 	return val;
 }
 
-static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
+static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid,
 				   struct cgroup_map_cb *cb, const char *dname,
 				   enum stat_type_cpu type)
 {
@@ -838,7 +855,7 @@ static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
 	enum stat_sub_type sub_type;
 
 	if (type == BLKIO_STAT_CPU_SECTORS) {
-		val = blkio_read_stat_cpu(blkg, type, 0);
+		val = blkio_read_stat_cpu(blkg, plid, type, 0);
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb,
 				       dname);
 	}
@@ -847,12 +864,12 @@ static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
 			sub_type++) {
 		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
 				   false);
-		val = blkio_read_stat_cpu(blkg, type, sub_type);
+		val = blkio_read_stat_cpu(blkg, plid, type, sub_type);
 		cb->fill(cb, key_str, val);
 	}
 
-	disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
-			blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
+	disk_total = blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_READ) +
+		blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_WRITE);
 
 	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
 			   false);
@@ -861,11 +878,11 @@ static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
 }
 
 /* This should be called with blkg->stats_lock held */
-static uint64_t blkio_get_stat(struct blkio_group *blkg,
+static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid,
 			       struct cgroup_map_cb *cb, const char *dname,
 			       enum stat_type type)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+	struct blkg_policy_data *pd = blkg->pd[plid];
 	uint64_t disk_total;
 	char key_str[MAX_KEY_LEN];
 	enum stat_sub_type sub_type;
@@ -989,29 +1006,29 @@ static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
 			goto out_unlock;
 
 		pd->conf.weight = temp;
-		blkio_update_group_weight(blkg, temp ?: blkcg->weight);
+		blkio_update_group_weight(blkg, plid, temp ?: blkcg->weight);
 		break;
 	case BLKIO_POLICY_THROTL:
 		switch(fileid) {
 		case BLKIO_THROTL_read_bps_device:
 			pd->conf.bps[READ] = temp;
-			blkio_update_group_bps(blkg, temp ?: -1, fileid);
+			blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
 			break;
 		case BLKIO_THROTL_write_bps_device:
 			pd->conf.bps[WRITE] = temp;
-			blkio_update_group_bps(blkg, temp ?: -1, fileid);
+			blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
 			break;
 		case BLKIO_THROTL_read_iops_device:
 			if (temp > THROTL_IOPS_MAX)
 				goto out_unlock;
 			pd->conf.iops[READ] = temp;
-			blkio_update_group_iops(blkg, temp ?: -1, fileid);
+			blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
 			break;
 		case BLKIO_THROTL_write_iops_device:
 			if (temp > THROTL_IOPS_MAX)
 				goto out_unlock;
 			pd->conf.iops[WRITE] = temp;
-			blkio_update_group_iops(blkg, temp ?: -1, fileid);
+			blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
 			break;
 		}
 		break;
@@ -1066,15 +1083,16 @@ static const char *blkg_dev_name(struct blkio_group *blkg)
 static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
 				   struct seq_file *m)
 {
-	struct blkg_policy_data *pd = blkg->pd[blkg->plid];
-	const char *dname = blkg_dev_name(blkg);
+	int plid = BLKIOFILE_POLICY(cft->private);
 	int fileid = BLKIOFILE_ATTR(cft->private);
+	struct blkg_policy_data *pd = blkg->pd[plid];
+	const char *dname = blkg_dev_name(blkg);
 	int rw = WRITE;
 
 	if (!dname)
 		return;
 
-	switch (blkg->plid) {
+	switch (plid) {
 		case BLKIO_POLICY_PROP:
 			if (pd->conf.weight)
 				seq_printf(m, "%s\t%u\n",
@@ -1166,15 +1184,17 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
 		const char *dname = blkg_dev_name(blkg);
+		int plid = BLKIOFILE_POLICY(cft->private);
 
-		if (!dname || BLKIOFILE_POLICY(cft->private) != blkg->plid)
+		if (!dname || plid != blkg->plid)
 			continue;
-		if (pcpu)
-			cgroup_total += blkio_get_stat_cpu(blkg, cb, dname,
-							   type);
-		else {
+		if (pcpu) {
+			cgroup_total += blkio_get_stat_cpu(blkg, plid,
+							   cb, dname, type);
+		} else {
 			spin_lock_irq(&blkg->stats_lock);
-			cgroup_total += blkio_get_stat(blkg, cb, dname, type);
+			cgroup_total += blkio_get_stat(blkg, plid,
+						       cb, dname, type);
 			spin_unlock_irq(&blkg->stats_lock);
 		}
 	}
@@ -1280,7 +1300,7 @@ static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
 		struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 
 		if (blkg->plid == plid && !pd->conf.weight)
-			blkio_update_group_weight(blkg, blkcg->weight);
+			blkio_update_group_weight(blkg, plid, blkcg->weight);
 	}
 
 	spin_unlock_irq(&blkcg->lock);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 5dffd43..60e96b4 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -335,12 +335,17 @@ static inline void blkg_put(struct blkio_group *blkg) { }
 #define BLKIO_WEIGHT_DEFAULT	500
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg);
+void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
+					 struct blkio_policy_type *pol);
 void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-				unsigned long dequeue);
-void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg);
-void blkiocg_update_idle_time_stats(struct blkio_group *blkg);
-void blkiocg_set_start_empty_time(struct blkio_group *blkg);
+				  struct blkio_policy_type *pol,
+				  unsigned long dequeue);
+void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
+					struct blkio_policy_type *pol);
+void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
+				    struct blkio_policy_type *pol);
+void blkiocg_set_start_empty_time(struct blkio_group *blkg,
+				  struct blkio_policy_type *pol);
 
 #define BLKG_FLAG_FNS(name)						\
 static inline void blkio_mark_blkg_##name(				\
@@ -363,14 +368,16 @@ BLKG_FLAG_FNS(idling)
 BLKG_FLAG_FNS(empty)
 #undef BLKG_FLAG_FNS
 #else
-static inline void blkiocg_update_avg_queue_size_stats(
-						struct blkio_group *blkg) {}
+static inline void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol) { }
 static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-						unsigned long dequeue) {}
-static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
-{}
-static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
-static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
+			struct blkio_policy_type *pol, unsigned long dequeue) { }
+static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol) { }
+static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol) { }
+static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg,
+			struct blkio_policy_type *pol) { }
 #endif
 
 #ifdef CONFIG_BLK_CGROUP
@@ -386,18 +393,27 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 				       enum blkio_policy_id plid,
 				       bool for_root);
 void blkiocg_update_timeslice_used(struct blkio_group *blkg,
-					unsigned long time,
-					unsigned long unaccounted_time);
-void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
-						bool direction, bool sync);
+				   struct blkio_policy_type *pol,
+				   unsigned long time,
+				   unsigned long unaccounted_time);
+void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+				   struct blkio_policy_type *pol,
+				   uint64_t bytes, bool direction, bool sync);
 void blkiocg_update_completion_stats(struct blkio_group *blkg,
-	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
-void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
-					bool sync);
+				     struct blkio_policy_type *pol,
+				     uint64_t start_time,
+				     uint64_t io_start_time, bool direction,
+				     bool sync);
+void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
+				    struct blkio_policy_type *pol,
+				    bool direction, bool sync);
 void blkiocg_update_io_add_stats(struct blkio_group *blkg,
-		struct blkio_group *curr_blkg, bool direction, bool sync);
+				 struct blkio_policy_type *pol,
+				 struct blkio_group *curr_blkg, bool direction,
+				 bool sync);
 void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-					bool direction, bool sync);
+				    struct blkio_policy_type *pol,
+				    bool direction, bool sync);
 #else
 struct cgroup;
 static inline struct blkio_cgroup *
@@ -411,19 +427,23 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
 static inline struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 					      void *key) { return NULL; }
 static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
-						unsigned long time,
-						unsigned long unaccounted_time)
-{}
+			struct blkio_policy_type *pol, unsigned long time,
+			unsigned long unaccounted_time) { }
 static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-				uint64_t bytes, bool direction, bool sync) {}
+			struct blkio_policy_type *pol, uint64_t bytes,
+			bool direction, bool sync) { }
 static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
-		uint64_t start_time, uint64_t io_start_time, bool direction,
-		bool sync) {}
+			struct blkio_policy_type *pol, uint64_t start_time,
+			uint64_t io_start_time, bool direction, bool sync) { }
 static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
-						bool direction, bool sync) {}
+			struct blkio_policy_type *pol, bool direction,
+			bool sync) { }
 static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
-		struct blkio_group *curr_blkg, bool direction, bool sync) {}
+			struct blkio_policy_type *pol,
+			struct blkio_group *curr_blkg, bool direction,
+			bool sync) { }
 static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-						bool direction, bool sync) {}
+			struct blkio_policy_type *pol, bool direction,
+			bool sync) { }
 #endif
 #endif /* _BLK_CGROUP_H */
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 153ba50..b2fddaf 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -588,7 +588,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	tg->bytes_disp[rw] += bio->bi_size;
 	tg->io_disp[rw]++;
 
-	blkiocg_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, rw, sync);
+	blkiocg_update_dispatch_stats(tg_to_blkg(tg), &blkio_policy_throtl,
+				      bio->bi_size, rw, sync);
 }
 
 static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -1000,6 +1001,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	if (tg) {
 		if (tg_no_rule_group(tg, rw)) {
 			blkiocg_update_dispatch_stats(tg_to_blkg(tg),
+						      &blkio_policy_throtl,
 						      bio->bi_size, rw,
 						      rw_is_sync(bio->bi_rw));
 			goto out_unlock_rcu;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 86980023..11dd9d7 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -945,7 +945,8 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
 	cfq_group_service_tree_del(st, cfqg);
 	cfqg->saved_workload_slice = 0;
-	cfq_blkiocg_update_dequeue_stats(cfqg_to_blkg(cfqg), 1);
+	cfq_blkiocg_update_dequeue_stats(cfqg_to_blkg(cfqg),
+					 &blkio_policy_cfq, 1);
 }
 
 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
@@ -1017,9 +1018,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		     "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
 		     used_sl, cfqq->slice_dispatch, charge,
 		     iops_mode(cfqd), cfqq->nr_sectors);
-	cfq_blkiocg_update_timeslice_used(cfqg_to_blkg(cfqg), used_sl,
-					  unaccounted_sl);
-	cfq_blkiocg_set_start_empty_time(cfqg_to_blkg(cfqg));
+	cfq_blkiocg_update_timeslice_used(cfqg_to_blkg(cfqg), &blkio_policy_cfq,
+					  used_sl, unaccounted_sl);
+	cfq_blkiocg_set_start_empty_time(cfqg_to_blkg(cfqg), &blkio_policy_cfq);
 }
 
 /**
@@ -1463,9 +1464,11 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
 	elv_rb_del(&cfqq->sort_list, rq);
 	cfqq->queued[rq_is_sync(rq)]--;
 	cfq_blkiocg_update_io_remove_stats(cfqg_to_blkg(RQ_CFQG(rq)),
-					rq_data_dir(rq), rq_is_sync(rq));
+					   &blkio_policy_cfq, rq_data_dir(rq),
+					   rq_is_sync(rq));
 	cfq_add_rq_rb(rq);
 	cfq_blkiocg_update_io_add_stats(cfqg_to_blkg(RQ_CFQG(rq)),
+					&blkio_policy_cfq,
 					cfqg_to_blkg(cfqq->cfqd->serving_group),
 					rq_data_dir(rq), rq_is_sync(rq));
 }
@@ -1524,7 +1527,8 @@ static void cfq_remove_request(struct request *rq)
 
 	cfqq->cfqd->rq_queued--;
 	cfq_blkiocg_update_io_remove_stats(cfqg_to_blkg(RQ_CFQG(rq)),
-					rq_data_dir(rq), rq_is_sync(rq));
+					   &blkio_policy_cfq, rq_data_dir(rq),
+					   rq_is_sync(rq));
 	if (rq->cmd_flags & REQ_PRIO) {
 		WARN_ON(!cfqq->prio_pending);
 		cfqq->prio_pending--;
@@ -1560,7 +1564,8 @@ static void cfq_bio_merged(struct request_queue *q, struct request *req,
 				struct bio *bio)
 {
 	cfq_blkiocg_update_io_merged_stats(cfqg_to_blkg(RQ_CFQG(req)),
-					bio_data_dir(bio), cfq_bio_sync(bio));
+					   &blkio_policy_cfq, bio_data_dir(bio),
+					   cfq_bio_sync(bio));
 }
 
 static void
@@ -1583,7 +1588,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
 		cfqq->next_rq = rq;
 	cfq_remove_request(next);
 	cfq_blkiocg_update_io_merged_stats(cfqg_to_blkg(RQ_CFQG(rq)),
-					rq_data_dir(next), rq_is_sync(next));
+					   &blkio_policy_cfq, rq_data_dir(next),
+					   rq_is_sync(next));
 
 	cfqq = RQ_CFQQ(next);
 	/*
@@ -1624,7 +1630,8 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	del_timer(&cfqd->idle_slice_timer);
-	cfq_blkiocg_update_idle_time_stats(cfqg_to_blkg(cfqq->cfqg));
+	cfq_blkiocg_update_idle_time_stats(cfqg_to_blkg(cfqq->cfqg),
+					   &blkio_policy_cfq);
 }
 
 static void __cfq_set_active_queue(struct cfq_data *cfqd,
@@ -1633,7 +1640,8 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
 	if (cfqq) {
 		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
 				cfqd->serving_prio, cfqd->serving_type);
-		cfq_blkiocg_update_avg_queue_size_stats(cfqg_to_blkg(cfqq->cfqg));
+		cfq_blkiocg_update_avg_queue_size_stats(cfqg_to_blkg(cfqq->cfqg),
+							&blkio_policy_cfq);
 		cfqq->slice_start = 0;
 		cfqq->dispatch_start = jiffies;
 		cfqq->allocated_slice = 0;
@@ -1981,7 +1989,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 		sl = cfqd->cfq_slice_idle;
 
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
-	cfq_blkiocg_update_set_idle_time_stats(cfqg_to_blkg(cfqq->cfqg));
+	cfq_blkiocg_update_set_idle_time_stats(cfqg_to_blkg(cfqq->cfqg),
+					       &blkio_policy_cfq);
 	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
 			group_idle ? 1 : 0);
 }
@@ -2005,8 +2014,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
 	cfqq->nr_sectors += blk_rq_sectors(rq);
 	cfq_blkiocg_update_dispatch_stats(cfqg_to_blkg(cfqq->cfqg),
-					  blk_rq_bytes(rq), rq_data_dir(rq),
-					  rq_is_sync(rq));
+					  &blkio_policy_cfq, blk_rq_bytes(rq),
+					  rq_data_dir(rq), rq_is_sync(rq));
 }
 
 /*
@@ -3094,7 +3103,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 				__blk_run_queue(cfqd->queue);
 			} else {
 				cfq_blkiocg_update_idle_time_stats(
-						cfqg_to_blkg(cfqq->cfqg));
+						cfqg_to_blkg(cfqq->cfqg),
+						&blkio_policy_cfq);
 				cfq_mark_cfqq_must_dispatch(cfqq);
 			}
 		}
@@ -3122,6 +3132,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
 	list_add_tail(&rq->queuelist, &cfqq->fifo);
 	cfq_add_rq_rb(rq);
 	cfq_blkiocg_update_io_add_stats(cfqg_to_blkg(RQ_CFQG(rq)),
+					&blkio_policy_cfq,
 					cfqg_to_blkg(cfqd->serving_group),
 					rq_data_dir(rq), rq_is_sync(rq));
 	cfq_rq_enqueued(cfqd, cfqq, rq);
@@ -3220,8 +3231,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	cfqq->dispatched--;
 	(RQ_CFQG(rq))->dispatched--;
 	cfq_blkiocg_update_completion_stats(cfqg_to_blkg(cfqq->cfqg),
-			rq_start_time_ns(rq), rq_io_start_time_ns(rq),
-			rq_data_dir(rq), rq_is_sync(rq));
+			&blkio_policy_cfq, rq_start_time_ns(rq),
+			rq_io_start_time_ns(rq), rq_data_dir(rq),
+			rq_is_sync(rq));
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
diff --git a/block/cfq.h b/block/cfq.h
index 3987601..5584e1b 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -4,67 +4,79 @@
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
-	struct blkio_group *curr_blkg, bool direction, bool sync)
+			struct blkio_policy_type *pol,
+			struct blkio_group *curr_blkg,
+			bool direction, bool sync)
 {
-	blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync);
+	blkiocg_update_io_add_stats(blkg, pol, curr_blkg, direction, sync);
 }
 
 static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-			unsigned long dequeue)
+			struct blkio_policy_type *pol, unsigned long dequeue)
 {
-	blkiocg_update_dequeue_stats(blkg, dequeue);
+	blkiocg_update_dequeue_stats(blkg, pol, dequeue);
 }
 
 static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
-			unsigned long time, unsigned long unaccounted_time)
+			struct blkio_policy_type *pol, unsigned long time,
+			unsigned long unaccounted_time)
 {
-	blkiocg_update_timeslice_used(blkg, time, unaccounted_time);
+	blkiocg_update_timeslice_used(blkg, pol, time, unaccounted_time);
 }
 
-static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg)
+static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg,
+			struct blkio_policy_type *pol)
 {
-	blkiocg_set_start_empty_time(blkg);
+	blkiocg_set_start_empty_time(blkg, pol);
 }
 
 static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-				bool direction, bool sync)
+			struct blkio_policy_type *pol, bool direction,
+			bool sync)
 {
-	blkiocg_update_io_remove_stats(blkg, direction, sync);
+	blkiocg_update_io_remove_stats(blkg, pol, direction, sync);
 }
 
 static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
-		bool direction, bool sync)
+			struct blkio_policy_type *pol, bool direction,
+			bool sync)
 {
-	blkiocg_update_io_merged_stats(blkg, direction, sync);
+	blkiocg_update_io_merged_stats(blkg, pol, direction, sync);
 }
 
-static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg)
+static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol)
 {
-	blkiocg_update_idle_time_stats(blkg);
+	blkiocg_update_idle_time_stats(blkg, pol);
 }
 
 static inline void
-cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
+cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol)
 {
-	blkiocg_update_avg_queue_size_stats(blkg);
+	blkiocg_update_avg_queue_size_stats(blkg, pol);
 }
 
 static inline void
-cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol)
 {
-	blkiocg_update_set_idle_time_stats(blkg);
+	blkiocg_update_set_idle_time_stats(blkg, pol);
 }
 
 static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-				uint64_t bytes, bool direction, bool sync)
+			struct blkio_policy_type *pol, uint64_t bytes,
+			bool direction, bool sync)
 {
-	blkiocg_update_dispatch_stats(blkg, bytes, direction, sync);
+	blkiocg_update_dispatch_stats(blkg, pol, bytes, direction, sync);
 }
 
-static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
+static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol, uint64_t start_time,
+			uint64_t io_start_time, bool direction, bool sync)
 {
-	blkiocg_update_completion_stats(blkg, start_time, io_start_time,
-				direction, sync);
+	blkiocg_update_completion_stats(blkg, pol, start_time, io_start_time,
+					direction, sync);
 }
 
 static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
@@ -74,30 +86,38 @@ static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
 
 #else /* CFQ_GROUP_IOSCHED */
 static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
-	struct blkio_group *curr_blkg, bool direction, bool sync) {}
-
+			struct blkio_policy_type *pol,
+			struct blkio_group *curr_blkg, bool direction,
+			bool sync) { }
 static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-			unsigned long dequeue) {}
-
+			struct blkio_policy_type *pol, unsigned long dequeue) { }
 static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
-			unsigned long time, unsigned long unaccounted_time) {}
-static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
+			struct blkio_policy_type *pol, unsigned long time,
+			unsigned long unaccounted_time) { }
+static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg,
+			struct blkio_policy_type *pol) { }
 static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-				bool direction, bool sync) {}
+			struct blkio_policy_type *pol, bool direction,
+			bool sync) { }
 static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
-		bool direction, bool sync) {}
-static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg)
-{
-}
+			struct blkio_policy_type *pol, bool direction,
+			bool sync) { }
+static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol) { }
 static inline void
-cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) {}
+cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
+					struct blkio_policy_type *pol) { }
 
 static inline void
-cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {}
+cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
+				       struct blkio_policy_type *pol) { }
 
 static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-				uint64_t bytes, bool direction, bool sync) {}
-static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {}
+			struct blkio_policy_type *pol, uint64_t bytes,
+			bool direction, bool sync) { }
+static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol, uint64_t start_time,
+			uint64_t io_start_time, bool direction, bool sync) { }
 
 static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
 {
-- 
cgit v1.1


From 4eef3049986e8397d5003916aed8cad6567a5e02 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:18 -0800
Subject: blkcg: move per-queue blkg list heads and counters to queue and blkg

Currently, specific policy implementations are responsible for
maintaining the list and number of blkgs.  This duplicates code
unnecessarily, and hinders factoring out common code and providing a
blkcg API with better defined semantics.

After this patch, request_queue hosts list heads and counters and blkg
has list nodes for both policies.  This patch only relocates the
necessary fields and the next patch will actually move management code
into blkcg core.

Note that request_queue->blkg_list[] and ->nr_blkgs[] are hardcoded to
have 2 elements.  This is to avoid include dependency and will be
removed by the next patch.

This patch doesn't introduce any behavior change.

-v2: Now unnecessary conditional on CONFIG_BLK_CGROUP_MODULE removed
     as pointed out by Vivek.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c     |  2 ++
 block/blk-cgroup.h     |  1 +
 block/blk-core.c       |  4 ++++
 block/blk-throttle.c   | 49 +++++++++++++++++++++++--------------------------
 block/cfq-iosched.c    | 47 +++++++++++++++++++----------------------------
 include/linux/blkdev.h |  5 +++++
 6 files changed, 54 insertions(+), 54 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 91f9824..e940972 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -499,6 +499,8 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 
 	spin_lock_init(&blkg->stats_lock);
 	rcu_assign_pointer(blkg->q, q);
+	INIT_LIST_HEAD(&blkg->q_node[0]);
+	INIT_LIST_HEAD(&blkg->q_node[1]);
 	blkg->blkcg = blkcg;
 	blkg->plid = pol->plid;
 	blkg->refcnt = 1;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 60e96b4..ae96f19 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -178,6 +178,7 @@ struct blkg_policy_data {
 struct blkio_group {
 	/* Pointer to the associated request_queue, RCU protected */
 	struct request_queue __rcu *q;
+	struct list_head q_node[BLKIO_NR_POLICIES];
 	struct hlist_node blkcg_node;
 	struct blkio_cgroup *blkcg;
 	/* Store cgroup path */
diff --git a/block/blk-core.c b/block/blk-core.c
index c3434c6..83a47fc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -547,6 +547,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	INIT_LIST_HEAD(&q->queue_head);
 	INIT_LIST_HEAD(&q->timeout_list);
 	INIT_LIST_HEAD(&q->icq_list);
+#ifdef CONFIG_BLK_CGROUP
+	INIT_LIST_HEAD(&q->blkg_list[0]);
+	INIT_LIST_HEAD(&q->blkg_list[1]);
+#endif
 	INIT_LIST_HEAD(&q->flush_queue[0]);
 	INIT_LIST_HEAD(&q->flush_queue[1]);
 	INIT_LIST_HEAD(&q->flush_data_in_flight);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b2fddaf..c15d383 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -41,9 +41,6 @@ struct throtl_rb_root {
 #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
 
 struct throtl_grp {
-	/* List of throtl groups on the request queue*/
-	struct hlist_node tg_node;
-
 	/* active throtl group service_tree member */
 	struct rb_node rb_node;
 
@@ -83,9 +80,6 @@ struct throtl_grp {
 
 struct throtl_data
 {
-	/* List of throtl groups */
-	struct hlist_head tg_list;
-
 	/* service tree for active throtl groups */
 	struct throtl_rb_root tg_service_tree;
 
@@ -152,7 +146,6 @@ static void throtl_init_blkio_group(struct blkio_group *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 
-	INIT_HLIST_NODE(&tg->tg_node);
 	RB_CLEAR_NODE(&tg->rb_node);
 	bio_list_init(&tg->bio_lists[0]);
 	bio_list_init(&tg->bio_lists[1]);
@@ -167,11 +160,9 @@ static void throtl_init_blkio_group(struct blkio_group *blkg)
 static void throtl_link_blkio_group(struct request_queue *q,
 				    struct blkio_group *blkg)
 {
-	struct throtl_data *td = q->td;
-	struct throtl_grp *tg = blkg_to_tg(blkg);
-
-	hlist_add_head(&tg->tg_node, &td->tg_list);
-	td->nr_undestroyed_grps++;
+	list_add(&blkg->q_node[BLKIO_POLICY_THROTL],
+		 &q->blkg_list[BLKIO_POLICY_THROTL]);
+	q->nr_blkgs[BLKIO_POLICY_THROTL]++;
 }
 
 static struct
@@ -711,8 +702,8 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
 
 static void throtl_process_limit_change(struct throtl_data *td)
 {
-	struct throtl_grp *tg;
-	struct hlist_node *pos, *n;
+	struct request_queue *q = td->queue;
+	struct blkio_group *blkg, *n;
 
 	if (!td->limits_changed)
 		return;
@@ -721,7 +712,10 @@ static void throtl_process_limit_change(struct throtl_data *td)
 
 	throtl_log(td, "limits changed");
 
-	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
+	list_for_each_entry_safe(blkg, n, &q->blkg_list[BLKIO_POLICY_THROTL],
+				 q_node[BLKIO_POLICY_THROTL]) {
+		struct throtl_grp *tg = blkg_to_tg(blkg);
+
 		if (!tg->limits_changed)
 			continue;
 
@@ -822,26 +816,31 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 static void
 throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
 {
+	struct blkio_group *blkg = tg_to_blkg(tg);
+
 	/* Something wrong if we are trying to remove same group twice */
-	BUG_ON(hlist_unhashed(&tg->tg_node));
+	WARN_ON_ONCE(list_empty(&blkg->q_node[BLKIO_POLICY_THROTL]));
 
-	hlist_del_init(&tg->tg_node);
+	list_del_init(&blkg->q_node[BLKIO_POLICY_THROTL]);
 
 	/*
 	 * Put the reference taken at the time of creation so that when all
 	 * queues are gone, group can be destroyed.
 	 */
 	blkg_put(tg_to_blkg(tg));
-	td->nr_undestroyed_grps--;
+	td->queue->nr_blkgs[BLKIO_POLICY_THROTL]--;
 }
 
 static bool throtl_release_tgs(struct throtl_data *td, bool release_root)
 {
-	struct hlist_node *pos, *n;
-	struct throtl_grp *tg;
+	struct request_queue *q = td->queue;
+	struct blkio_group *blkg, *n;
 	bool empty = true;
 
-	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
+	list_for_each_entry_safe(blkg, n, &q->blkg_list[BLKIO_POLICY_THROTL],
+				 q_node[BLKIO_POLICY_THROTL]) {
+		struct throtl_grp *tg = blkg_to_tg(blkg);
+
 		/* skip root? */
 		if (!release_root && tg == td->root_tg)
 			continue;
@@ -851,7 +850,7 @@ static bool throtl_release_tgs(struct throtl_data *td, bool release_root)
 		 * it from cgroup list, then it will take care of destroying
 		 * cfqg also.
 		 */
-		if (!blkiocg_del_blkio_group(tg_to_blkg(tg)))
+		if (!blkiocg_del_blkio_group(blkg))
 			throtl_destroy_tg(td, tg);
 		else
 			empty = false;
@@ -1114,7 +1113,6 @@ int blk_throtl_init(struct request_queue *q)
 	if (!td)
 		return -ENOMEM;
 
-	INIT_HLIST_HEAD(&td->tg_list);
 	td->tg_service_tree = THROTL_RB_ROOT;
 	td->limits_changed = false;
 	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
@@ -1144,7 +1142,7 @@ int blk_throtl_init(struct request_queue *q)
 void blk_throtl_exit(struct request_queue *q)
 {
 	struct throtl_data *td = q->td;
-	bool wait = false;
+	bool wait;
 
 	BUG_ON(!td);
 
@@ -1154,8 +1152,7 @@ void blk_throtl_exit(struct request_queue *q)
 	throtl_release_tgs(td, true);
 
 	/* If there are other groups */
-	if (td->nr_undestroyed_grps > 0)
-		wait = true;
+	wait = q->nr_blkgs[BLKIO_POLICY_THROTL];
 
 	spin_unlock_irq(q->queue_lock);
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 11dd9d7..e846803 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -208,9 +208,7 @@ struct cfq_group {
 	unsigned long saved_workload_slice;
 	enum wl_type_t saved_workload;
 	enum wl_prio_t saved_serving_prio;
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	struct hlist_node cfqd_node;
-#endif
+
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
 	struct cfq_ttime ttime;
@@ -302,12 +300,6 @@ struct cfq_data {
 	struct cfq_queue oom_cfqq;
 
 	unsigned long last_delayed_sync;
-
-	/* List of cfq groups being managed on this device*/
-	struct hlist_head cfqg_list;
-
-	/* Number of groups which are on blkcg->blkg_list */
-	unsigned int nr_blkcg_linked_grps;
 };
 
 static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg)
@@ -1056,13 +1048,9 @@ static void cfq_update_blkio_group_weight(struct request_queue *q,
 static void cfq_link_blkio_group(struct request_queue *q,
 				 struct blkio_group *blkg)
 {
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
-
-	cfqd->nr_blkcg_linked_grps++;
-
-	/* Add group on cfqd list */
-	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+	list_add(&blkg->q_node[BLKIO_POLICY_PROP],
+		 &q->blkg_list[BLKIO_POLICY_PROP]);
+	q->nr_blkgs[BLKIO_POLICY_PROP]++;
 }
 
 static void cfq_init_blkio_group(struct blkio_group *blkg)
@@ -1110,13 +1098,15 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 
 static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
+	struct blkio_group *blkg = cfqg_to_blkg(cfqg);
+
 	/* Something wrong if we are trying to remove same group twice */
-	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
+	BUG_ON(list_empty(&blkg->q_node[BLKIO_POLICY_PROP]));
 
-	hlist_del_init(&cfqg->cfqd_node);
+	list_del_init(&blkg->q_node[BLKIO_POLICY_PROP]);
 
-	BUG_ON(cfqd->nr_blkcg_linked_grps <= 0);
-	cfqd->nr_blkcg_linked_grps--;
+	BUG_ON(cfqd->queue->nr_blkgs[BLKIO_POLICY_PROP] <= 0);
+	cfqd->queue->nr_blkgs[BLKIO_POLICY_PROP]--;
 
 	/*
 	 * Put the reference taken at the time of creation so that when all
@@ -1127,18 +1117,19 @@ static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
 
 static bool cfq_release_cfq_groups(struct cfq_data *cfqd)
 {
-	struct hlist_node *pos, *n;
-	struct cfq_group *cfqg;
+	struct request_queue *q = cfqd->queue;
+	struct blkio_group *blkg, *n;
 	bool empty = true;
 
-	hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
+	list_for_each_entry_safe(blkg, n, &q->blkg_list[BLKIO_POLICY_PROP],
+				 q_node[BLKIO_POLICY_PROP]) {
 		/*
 		 * If cgroup removal path got to blk_group first and removed
 		 * it from cgroup list, then it will take care of destroying
 		 * cfqg also.
 		 */
-		if (!cfq_blkiocg_del_blkio_group(cfqg_to_blkg(cfqg)))
-			cfq_destroy_cfqg(cfqd, cfqg);
+		if (!cfq_blkiocg_del_blkio_group(blkg))
+			cfq_destroy_cfqg(cfqd, blkg_to_cfqg(blkg));
 		else
 			empty = false;
 	}
@@ -3558,13 +3549,13 @@ static void cfq_exit_queue(struct elevator_queue *e)
 	cfq_put_async_queues(cfqd);
 	cfq_release_cfq_groups(cfqd);
 
+#ifdef CONFIG_BLK_CGROUP
 	/*
 	 * If there are groups which we could not unlink from blkcg list,
 	 * wait for a rcu period for them to be freed.
 	 */
-	if (cfqd->nr_blkcg_linked_grps)
-		wait = true;
-
+	wait = q->nr_blkgs[BLKIO_POLICY_PROP];
+#endif
 	spin_unlock_irq(q->queue_lock);
 
 	cfq_shutdown_timer_wq(cfqd);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e8c0bbd..f4e35ed 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -362,6 +362,11 @@ struct request_queue {
 	struct list_head	timeout_list;
 
 	struct list_head	icq_list;
+#ifdef CONFIG_BLK_CGROUP
+	/* XXX: array size hardcoded to avoid include dependency (temporary) */
+	struct list_head	blkg_list[2];
+	int			nr_blkgs[2];
+#endif
 
 	struct queue_limits	limits;
 
-- 
cgit v1.1


From 03aa264ac15637b6f98374270bcdf31400965505 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:19 -0800
Subject: blkcg: let blkcg core manage per-queue blkg list and counter

With the previous patch moving blkg list heads and counters to
request_queue and blkg, the logic to manage them in both policies is
almost identical and can be moved to blkcg core.

This patch moves blkg link logic into blkg_lookup_create(), implements
common blkg unlink code in blkg_destroy(), and updates
blkg_destroy_all() so that it's policy specific and can skip root
group.  The updated blkg_destroy_all() is now used to both clear queue
for bypassing and elv switching, and release all blkgs on q exit.

This patch introduces a race window where policy [de]registration may
race against queue blkg clearing.  This can only be a problem on cfq
unload and shouldn't be a real problem in practice (and we have many
other places where this race already exists).  Future patches will
remove these unlikely races.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c     |  72 +++++++++++++++++++++++++++--------
 block/blk-cgroup.h     |  15 +++-----
 block/blk-throttle.c   |  99 +-----------------------------------------------
 block/cfq-iosched.c    | 100 +++----------------------------------------------
 block/elevator.c       |   5 ++-
 include/linux/blkdev.h |   4 +-
 6 files changed, 74 insertions(+), 221 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e940972..2ca9a15 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -596,8 +596,11 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 	/* insert */
 	spin_lock(&blkcg->lock);
 	swap(blkg, new_blkg);
+
 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
-	pol->ops.blkio_link_group_fn(q, blkg);
+	list_add(&blkg->q_node[plid], &q->blkg_list[plid]);
+	q->nr_blkgs[plid]++;
+
 	spin_unlock(&blkcg->lock);
 out:
 	blkg_free(new_blkg);
@@ -646,36 +649,69 @@ struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 }
 EXPORT_SYMBOL_GPL(blkg_lookup);
 
-void blkg_destroy_all(struct request_queue *q)
+static void blkg_destroy(struct blkio_group *blkg, enum blkio_policy_id plid)
+{
+	struct request_queue *q = blkg->q;
+
+	lockdep_assert_held(q->queue_lock);
+
+	/* Something wrong if we are trying to remove same group twice */
+	WARN_ON_ONCE(list_empty(&blkg->q_node[plid]));
+	list_del_init(&blkg->q_node[plid]);
+
+	WARN_ON_ONCE(q->nr_blkgs[plid] <= 0);
+	q->nr_blkgs[plid]--;
+
+	/*
+	 * Put the reference taken at the time of creation so that when all
+	 * queues are gone, group can be destroyed.
+	 */
+	blkg_put(blkg);
+}
+
+void blkg_destroy_all(struct request_queue *q, enum blkio_policy_id plid,
+		      bool destroy_root)
 {
-	struct blkio_policy_type *pol;
+	struct blkio_group *blkg, *n;
 
 	while (true) {
 		bool done = true;
 
-		spin_lock(&blkio_list_lock);
 		spin_lock_irq(q->queue_lock);
 
-		/*
-		 * clear_queue_fn() might return with non-empty group list
-		 * if it raced cgroup removal and lost.  cgroup removal is
-		 * guaranteed to make forward progress and retrying after a
-		 * while is enough.  This ugliness is scheduled to be
-		 * removed after locking update.
-		 */
-		list_for_each_entry(pol, &blkio_list, list)
-			if (!pol->ops.blkio_clear_queue_fn(q))
+		list_for_each_entry_safe(blkg, n, &q->blkg_list[plid],
+					 q_node[plid]) {
+			/* skip root? */
+			if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
+				continue;
+
+			/*
+			 * If cgroup removal path got to blk_group first
+			 * and removed it from cgroup list, then it will
+			 * take care of destroying cfqg also.
+			 */
+			if (!blkiocg_del_blkio_group(blkg))
+				blkg_destroy(blkg, plid);
+			else
 				done = false;
+		}
 
 		spin_unlock_irq(q->queue_lock);
-		spin_unlock(&blkio_list_lock);
 
+		/*
+		 * Group list may not be empty if we raced cgroup removal
+		 * and lost.  cgroup removal is guaranteed to make forward
+		 * progress and retrying after a while is enough.  This
+		 * ugliness is scheduled to be removed after locking
+		 * update.
+		 */
 		if (done)
 			break;
 
 		msleep(10);	/* just some random duration I like */
 	}
 }
+EXPORT_SYMBOL_GPL(blkg_destroy_all);
 
 static void blkg_rcu_free(struct rcu_head *rcu_head)
 {
@@ -1549,11 +1585,13 @@ static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
 		 * this event.
 		 */
 		spin_lock(&blkio_list_lock);
+		spin_lock_irqsave(q->queue_lock, flags);
 		list_for_each_entry(blkiop, &blkio_list, list) {
 			if (blkiop->plid != blkg->plid)
 				continue;
-			blkiop->ops.blkio_unlink_group_fn(q, blkg);
+			blkg_destroy(blkg, blkiop->plid);
 		}
+		spin_unlock_irqrestore(q->queue_lock, flags);
 		spin_unlock(&blkio_list_lock);
 	} while (1);
 
@@ -1695,12 +1733,14 @@ static void blkcg_bypass_start(void)
 	__acquires(&all_q_mutex)
 {
 	struct request_queue *q;
+	int i;
 
 	mutex_lock(&all_q_mutex);
 
 	list_for_each_entry(q, &all_q_list, all_q_node) {
 		blk_queue_bypass_start(q);
-		blkg_destroy_all(q);
+		for (i = 0; i < BLKIO_NR_POLICIES; i++)
+			blkg_destroy_all(q, i, false);
 	}
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ae96f19..83ce5fa 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -196,11 +196,6 @@ struct blkio_group {
 };
 
 typedef void (blkio_init_group_fn)(struct blkio_group *blkg);
-typedef void (blkio_link_group_fn)(struct request_queue *q,
-			struct blkio_group *blkg);
-typedef void (blkio_unlink_group_fn)(struct request_queue *q,
-			struct blkio_group *blkg);
-typedef bool (blkio_clear_queue_fn)(struct request_queue *q);
 typedef void (blkio_update_group_weight_fn)(struct request_queue *q,
 			struct blkio_group *blkg, unsigned int weight);
 typedef void (blkio_update_group_read_bps_fn)(struct request_queue *q,
@@ -214,9 +209,6 @@ typedef void (blkio_update_group_write_iops_fn)(struct request_queue *q,
 
 struct blkio_policy_ops {
 	blkio_init_group_fn *blkio_init_group_fn;
-	blkio_link_group_fn *blkio_link_group_fn;
-	blkio_unlink_group_fn *blkio_unlink_group_fn;
-	blkio_clear_queue_fn *blkio_clear_queue_fn;
 	blkio_update_group_weight_fn *blkio_update_group_weight_fn;
 	blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
 	blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
@@ -238,7 +230,8 @@ extern void blkcg_exit_queue(struct request_queue *q);
 /* Blkio controller policy registration */
 extern void blkio_policy_register(struct blkio_policy_type *);
 extern void blkio_policy_unregister(struct blkio_policy_type *);
-extern void blkg_destroy_all(struct request_queue *q);
+extern void blkg_destroy_all(struct request_queue *q,
+			     enum blkio_policy_id plid, bool destroy_root);
 
 /**
  * blkg_to_pdata - get policy private data
@@ -319,7 +312,9 @@ static inline void blkcg_drain_queue(struct request_queue *q) { }
 static inline void blkcg_exit_queue(struct request_queue *q) { }
 static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
 static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
-static inline void blkg_destroy_all(struct request_queue *q) { }
+static inline void blkg_destroy_all(struct request_queue *q,
+				    enum blkio_policy_id plid,
+				    bool destory_root) { }
 
 static inline void *blkg_to_pdata(struct blkio_group *blkg,
 				struct blkio_policy_type *pol) { return NULL; }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c15d383..1329412 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -157,14 +157,6 @@ static void throtl_init_blkio_group(struct blkio_group *blkg)
 	tg->iops[WRITE] = -1;
 }
 
-static void throtl_link_blkio_group(struct request_queue *q,
-				    struct blkio_group *blkg)
-{
-	list_add(&blkg->q_node[BLKIO_POLICY_THROTL],
-		 &q->blkg_list[BLKIO_POLICY_THROTL]);
-	q->nr_blkgs[BLKIO_POLICY_THROTL]++;
-}
-
 static struct
 throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 {
@@ -813,89 +805,6 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 	}
 }
 
-static void
-throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
-{
-	struct blkio_group *blkg = tg_to_blkg(tg);
-
-	/* Something wrong if we are trying to remove same group twice */
-	WARN_ON_ONCE(list_empty(&blkg->q_node[BLKIO_POLICY_THROTL]));
-
-	list_del_init(&blkg->q_node[BLKIO_POLICY_THROTL]);
-
-	/*
-	 * Put the reference taken at the time of creation so that when all
-	 * queues are gone, group can be destroyed.
-	 */
-	blkg_put(tg_to_blkg(tg));
-	td->queue->nr_blkgs[BLKIO_POLICY_THROTL]--;
-}
-
-static bool throtl_release_tgs(struct throtl_data *td, bool release_root)
-{
-	struct request_queue *q = td->queue;
-	struct blkio_group *blkg, *n;
-	bool empty = true;
-
-	list_for_each_entry_safe(blkg, n, &q->blkg_list[BLKIO_POLICY_THROTL],
-				 q_node[BLKIO_POLICY_THROTL]) {
-		struct throtl_grp *tg = blkg_to_tg(blkg);
-
-		/* skip root? */
-		if (!release_root && tg == td->root_tg)
-			continue;
-
-		/*
-		 * If cgroup removal path got to blk_group first and removed
-		 * it from cgroup list, then it will take care of destroying
-		 * cfqg also.
-		 */
-		if (!blkiocg_del_blkio_group(blkg))
-			throtl_destroy_tg(td, tg);
-		else
-			empty = false;
-	}
-	return empty;
-}
-
-/*
- * Blk cgroup controller notification saying that blkio_group object is being
- * delinked as associated cgroup object is going away. That also means that
- * no new IO will come in this group. So get rid of this group as soon as
- * any pending IO in the group is finished.
- *
- * This function is called under rcu_read_lock(). @q is the rcu protected
- * pointer. That means @q is a valid request_queue pointer as long as we
- * are rcu read lock.
- *
- * @q was fetched from blkio_group under blkio_cgroup->lock. That means
- * it should not be NULL as even if queue was going away, cgroup deltion
- * path got to it first.
- */
-void throtl_unlink_blkio_group(struct request_queue *q,
-			       struct blkio_group *blkg)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	throtl_destroy_tg(q->td, blkg_to_tg(blkg));
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static bool throtl_clear_queue(struct request_queue *q)
-{
-	lockdep_assert_held(q->queue_lock);
-
-	/*
-	 * Clear tgs but leave the root one alone.  This is necessary
-	 * because root_tg is expected to be persistent and safe because
-	 * blk-throtl can never be disabled while @q is alive.  This is a
-	 * kludge to prepare for unified blkg.  This whole function will be
-	 * removed soon.
-	 */
-	return throtl_release_tgs(q->td, false);
-}
-
 static void throtl_update_blkio_group_common(struct throtl_data *td,
 				struct throtl_grp *tg)
 {
@@ -960,9 +869,6 @@ static void throtl_shutdown_wq(struct request_queue *q)
 static struct blkio_policy_type blkio_policy_throtl = {
 	.ops = {
 		.blkio_init_group_fn = throtl_init_blkio_group,
-		.blkio_link_group_fn = throtl_link_blkio_group,
-		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
-		.blkio_clear_queue_fn = throtl_clear_queue,
 		.blkio_update_group_read_bps_fn =
 					throtl_update_blkio_group_read_bps,
 		.blkio_update_group_write_bps_fn =
@@ -1148,12 +1054,11 @@ void blk_throtl_exit(struct request_queue *q)
 
 	throtl_shutdown_wq(q);
 
-	spin_lock_irq(q->queue_lock);
-	throtl_release_tgs(td, true);
+	blkg_destroy_all(q, BLKIO_POLICY_THROTL, true);
 
 	/* If there are other groups */
+	spin_lock_irq(q->queue_lock);
 	wait = q->nr_blkgs[BLKIO_POLICY_THROTL];
-
 	spin_unlock_irq(q->queue_lock);
 
 	/*
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e846803..dc73690 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1045,14 +1045,6 @@ static void cfq_update_blkio_group_weight(struct request_queue *q,
 	cfqg->needs_update = true;
 }
 
-static void cfq_link_blkio_group(struct request_queue *q,
-				 struct blkio_group *blkg)
-{
-	list_add(&blkg->q_node[BLKIO_POLICY_PROP],
-		 &q->blkg_list[BLKIO_POLICY_PROP]);
-	q->nr_blkgs[BLKIO_POLICY_PROP]++;
-}
-
 static void cfq_init_blkio_group(struct blkio_group *blkg)
 {
 	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
@@ -1096,84 +1088,6 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 	blkg_get(cfqg_to_blkg(cfqg));
 }
 
-static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
-{
-	struct blkio_group *blkg = cfqg_to_blkg(cfqg);
-
-	/* Something wrong if we are trying to remove same group twice */
-	BUG_ON(list_empty(&blkg->q_node[BLKIO_POLICY_PROP]));
-
-	list_del_init(&blkg->q_node[BLKIO_POLICY_PROP]);
-
-	BUG_ON(cfqd->queue->nr_blkgs[BLKIO_POLICY_PROP] <= 0);
-	cfqd->queue->nr_blkgs[BLKIO_POLICY_PROP]--;
-
-	/*
-	 * Put the reference taken at the time of creation so that when all
-	 * queues are gone, group can be destroyed.
-	 */
-	blkg_put(cfqg_to_blkg(cfqg));
-}
-
-static bool cfq_release_cfq_groups(struct cfq_data *cfqd)
-{
-	struct request_queue *q = cfqd->queue;
-	struct blkio_group *blkg, *n;
-	bool empty = true;
-
-	list_for_each_entry_safe(blkg, n, &q->blkg_list[BLKIO_POLICY_PROP],
-				 q_node[BLKIO_POLICY_PROP]) {
-		/*
-		 * If cgroup removal path got to blk_group first and removed
-		 * it from cgroup list, then it will take care of destroying
-		 * cfqg also.
-		 */
-		if (!cfq_blkiocg_del_blkio_group(blkg))
-			cfq_destroy_cfqg(cfqd, blkg_to_cfqg(blkg));
-		else
-			empty = false;
-	}
-	return empty;
-}
-
-/*
- * Blk cgroup controller notification saying that blkio_group object is being
- * delinked as associated cgroup object is going away. That also means that
- * no new IO will come in this group. So get rid of this group as soon as
- * any pending IO in the group is finished.
- *
- * This function is called under rcu_read_lock(). key is the rcu protected
- * pointer. That means @q is a valid request_queue pointer as long as we
- * are rcu read lock.
- *
- * @q was fetched from blkio_group under blkio_cgroup->lock. That means
- * it should not be NULL as even if elevator was exiting, cgroup deltion
- * path got to it first.
- */
-static void cfq_unlink_blkio_group(struct request_queue *q,
-				   struct blkio_group *blkg)
-{
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	cfq_destroy_cfqg(cfqd, blkg_to_cfqg(blkg));
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static struct elevator_type iosched_cfq;
-
-static bool cfq_clear_queue(struct request_queue *q)
-{
-	lockdep_assert_held(q->queue_lock);
-
-	/* shoot down blkgs iff the current elevator is cfq */
-	if (!q->elevator || q->elevator->type != &iosched_cfq)
-		return true;
-
-	return cfq_release_cfq_groups(q->elevator->elevator_data);
-}
-
 #else /* GROUP_IOSCHED */
 static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
 						struct blkio_cgroup *blkcg)
@@ -1186,8 +1100,6 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
 	cfqq->cfqg = cfqg;
 }
 
-static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
-
 #endif /* GROUP_IOSCHED */
 
 /*
@@ -3547,17 +3459,20 @@ static void cfq_exit_queue(struct elevator_queue *e)
 		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
 
 	cfq_put_async_queues(cfqd);
-	cfq_release_cfq_groups(cfqd);
+
+	spin_unlock_irq(q->queue_lock);
+
+	blkg_destroy_all(q, BLKIO_POLICY_PROP, true);
 
 #ifdef CONFIG_BLK_CGROUP
 	/*
 	 * If there are groups which we could not unlink from blkcg list,
 	 * wait for a rcu period for them to be freed.
 	 */
+	spin_lock_irq(q->queue_lock);
 	wait = q->nr_blkgs[BLKIO_POLICY_PROP];
-#endif
 	spin_unlock_irq(q->queue_lock);
-
+#endif
 	cfq_shutdown_timer_wq(cfqd);
 
 	/*
@@ -3794,9 +3709,6 @@ static struct elevator_type iosched_cfq = {
 static struct blkio_policy_type blkio_policy_cfq = {
 	.ops = {
 		.blkio_init_group_fn =		cfq_init_blkio_group,
-		.blkio_link_group_fn =		cfq_link_blkio_group,
-		.blkio_unlink_group_fn =	cfq_unlink_blkio_group,
-		.blkio_clear_queue_fn = cfq_clear_queue,
 		.blkio_update_group_weight_fn =	cfq_update_blkio_group_weight,
 	},
 	.plid = BLKIO_POLICY_PROP,
diff --git a/block/elevator.c b/block/elevator.c
index 8c7561f..d4d39da 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -876,7 +876,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
 	struct elevator_queue *old = q->elevator;
 	bool registered = old->registered;
-	int err;
+	int i, err;
 
 	/*
 	 * Turn on BYPASS and drain all requests w/ elevator private data.
@@ -895,7 +895,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	ioc_clear_queue(q);
 	spin_unlock_irq(q->queue_lock);
 
-	blkg_destroy_all(q);
+	for (i = 0; i < BLKIO_NR_POLICIES; i++)
+		blkg_destroy_all(q, i, false);
 
 	/* allocate, init and register new elevator */
 	err = -ENOMEM;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f4e35ed..b4d1d4b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -364,8 +364,8 @@ struct request_queue {
 	struct list_head	icq_list;
 #ifdef CONFIG_BLK_CGROUP
 	/* XXX: array size hardcoded to avoid include dependency (temporary) */
-	struct list_head	blkg_list[2];
-	int			nr_blkgs[2];
+	struct list_head	blkg_list;
+	int			nr_blkgs;
 #endif
 
 	struct queue_limits	limits;
-- 
cgit v1.1


From e8989fae38d9831c72b20375a206a919ca468c52 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:20 -0800
Subject: blkcg: unify blkg's for blkcg policies

Currently, blkg is per cgroup-queue-policy combination.  This is
unnatural and leads to various convolutions in partially used
duplicate fields in blkg, config / stat access, and general management
of blkgs.

This patch makes blkg's per cgroup-queue and lets them serve all
policies.  blkgs are now created and destroyed by blkcg core proper.
This will allow further consolidation of common management logic into
blkcg core and API with better defined semantics and layering.

As a transitional step to untangle blkg management, elvswitch and
policy [de]registration, all blkgs except the root blkg are being shot
down during elvswitch and bypass.  This patch adds update_root_blkg_pd()
to update root blkg in place on policy change.  This is hacky and racy
but should be good enough as interim step until we get locking
simplified and switch over to proper in-place update for all blkgs.

-v2: Root blkgs need to be updated on elvswitch too and blkg_alloc()
     comment wasn't updated according to the function change.  Fixed.
     Both pointed out by Vivek.

-v3: v2 updated blkg_destroy_all() to invoke update_root_blkg_pd() for
     all policies.  This freed root pd during elvswitch before the
     last queue finished exiting and led to oops.  Directly invoke
     update_root_blkg_pd() only on BLKIO_POLICY_PROP from
     cfq_exit_queue().  This also is closer to what will be done with
     proper in-place blkg update.  Reported by Vivek.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 224 +++++++++++++++++++++++++++++++--------------------
 block/blk-cgroup.h   |  15 ++--
 block/blk-core.c     |   3 +-
 block/blk-sysfs.c    |   4 +-
 block/blk-throttle.c |   9 +--
 block/cfq-iosched.c  |   5 +-
 block/elevator.c     |   5 +-
 7 files changed, 154 insertions(+), 111 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2ca9a15..cad5f15 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -461,16 +461,20 @@ EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
  */
 static void blkg_free(struct blkio_group *blkg)
 {
-	struct blkg_policy_data *pd;
+	int i;
 
 	if (!blkg)
 		return;
 
-	pd = blkg->pd[blkg->plid];
-	if (pd) {
-		free_percpu(pd->stats_cpu);
-		kfree(pd);
+	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
+		struct blkg_policy_data *pd = blkg->pd[i];
+
+		if (pd) {
+			free_percpu(pd->stats_cpu);
+			kfree(pd);
+		}
 	}
+
 	kfree(blkg);
 }
 
@@ -478,19 +482,17 @@ static void blkg_free(struct blkio_group *blkg)
  * blkg_alloc - allocate a blkg
  * @blkcg: block cgroup the new blkg is associated with
  * @q: request_queue the new blkg is associated with
- * @pol: policy the new blkg is associated with
  *
- * Allocate a new blkg assocating @blkcg and @q for @pol.
+ * Allocate a new blkg assocating @blkcg and @q.
  *
  * FIXME: Should be called with queue locked but currently isn't due to
  *        percpu stat breakage.
  */
 static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
-				      struct request_queue *q,
-				      struct blkio_policy_type *pol)
+				      struct request_queue *q)
 {
 	struct blkio_group *blkg;
-	struct blkg_policy_data *pd;
+	int i;
 
 	/* alloc and init base part */
 	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
@@ -499,34 +501,45 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 
 	spin_lock_init(&blkg->stats_lock);
 	rcu_assign_pointer(blkg->q, q);
-	INIT_LIST_HEAD(&blkg->q_node[0]);
-	INIT_LIST_HEAD(&blkg->q_node[1]);
+	INIT_LIST_HEAD(&blkg->q_node);
 	blkg->blkcg = blkcg;
-	blkg->plid = pol->plid;
 	blkg->refcnt = 1;
 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
 
-	/* alloc per-policy data and attach it to blkg */
-	pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
-			  q->node);
-	if (!pd) {
-		blkg_free(blkg);
-		return NULL;
-	}
+	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
+		struct blkio_policy_type *pol = blkio_policy[i];
+		struct blkg_policy_data *pd;
 
-	blkg->pd[pol->plid] = pd;
-	pd->blkg = blkg;
+		if (!pol)
+			continue;
+
+		/* alloc per-policy data and attach it to blkg */
+		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
+				  q->node);
+		if (!pd) {
+			blkg_free(blkg);
+			return NULL;
+		}
 
-	/* broken, read comment in the callsite */
+		blkg->pd[i] = pd;
+		pd->blkg = blkg;
 
-	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
-	if (!pd->stats_cpu) {
-		blkg_free(blkg);
-		return NULL;
+		/* broken, read comment in the callsite */
+		pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
+		if (!pd->stats_cpu) {
+			blkg_free(blkg);
+			return NULL;
+		}
 	}
 
 	/* invoke per-policy init */
-	pol->ops.blkio_init_group_fn(blkg);
+	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
+		struct blkio_policy_type *pol = blkio_policy[i];
+
+		if (pol)
+			pol->ops.blkio_init_group_fn(blkg);
+	}
+
 	return blkg;
 }
 
@@ -536,7 +549,6 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 				       bool for_root)
 	__releases(q->queue_lock) __acquires(q->queue_lock)
 {
-	struct blkio_policy_type *pol = blkio_policy[plid];
 	struct blkio_group *blkg, *new_blkg;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
@@ -551,7 +563,7 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 	if (unlikely(blk_queue_bypass(q)) && !for_root)
 		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
 
-	blkg = blkg_lookup(blkcg, q, plid);
+	blkg = blkg_lookup(blkcg, q);
 	if (blkg)
 		return blkg;
 
@@ -571,7 +583,7 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
 
-	new_blkg = blkg_alloc(blkcg, q, pol);
+	new_blkg = blkg_alloc(blkcg, q);
 
 	rcu_read_lock();
 	spin_lock_irq(q->queue_lock);
@@ -583,7 +595,7 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 	}
 
 	/* did someone beat us to it? */
-	blkg = blkg_lookup(blkcg, q, plid);
+	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(blkg))
 		goto out;
 
@@ -598,8 +610,8 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 	swap(blkg, new_blkg);
 
 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
-	list_add(&blkg->q_node[plid], &q->blkg_list[plid]);
-	q->nr_blkgs[plid]++;
+	list_add(&blkg->q_node, &q->blkg_list);
+	q->nr_blkgs++;
 
 	spin_unlock(&blkcg->lock);
 out:
@@ -636,31 +648,30 @@ EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
 
 /* called under rcu_read_lock(). */
 struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
-				struct request_queue *q,
-				enum blkio_policy_id plid)
+				struct request_queue *q)
 {
 	struct blkio_group *blkg;
 	struct hlist_node *n;
 
 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
-		if (blkg->q == q && blkg->plid == plid)
+		if (blkg->q == q)
 			return blkg;
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(blkg_lookup);
 
-static void blkg_destroy(struct blkio_group *blkg, enum blkio_policy_id plid)
+static void blkg_destroy(struct blkio_group *blkg)
 {
 	struct request_queue *q = blkg->q;
 
 	lockdep_assert_held(q->queue_lock);
 
 	/* Something wrong if we are trying to remove same group twice */
-	WARN_ON_ONCE(list_empty(&blkg->q_node[plid]));
-	list_del_init(&blkg->q_node[plid]);
+	WARN_ON_ONCE(list_empty(&blkg->q_node));
+	list_del_init(&blkg->q_node);
 
-	WARN_ON_ONCE(q->nr_blkgs[plid] <= 0);
-	q->nr_blkgs[plid]--;
+	WARN_ON_ONCE(q->nr_blkgs <= 0);
+	q->nr_blkgs--;
 
 	/*
 	 * Put the reference taken at the time of creation so that when all
@@ -669,8 +680,40 @@ static void blkg_destroy(struct blkio_group *blkg, enum blkio_policy_id plid)
 	blkg_put(blkg);
 }
 
-void blkg_destroy_all(struct request_queue *q, enum blkio_policy_id plid,
-		      bool destroy_root)
+/*
+ * XXX: This updates blkg policy data in-place for root blkg, which is
+ * necessary across elevator switch and policy registration as root blkgs
+ * aren't shot down.  This broken and racy implementation is temporary.
+ * Eventually, blkg shoot down will be replaced by proper in-place update.
+ */
+void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
+{
+	struct blkio_policy_type *pol = blkio_policy[plid];
+	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
+	struct blkg_policy_data *pd;
+
+	if (!blkg)
+		return;
+
+	kfree(blkg->pd[plid]);
+	blkg->pd[plid] = NULL;
+
+	if (!pol)
+		return;
+
+	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
+	WARN_ON_ONCE(!pd);
+
+	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
+	WARN_ON_ONCE(!pd->stats_cpu);
+
+	blkg->pd[plid] = pd;
+	pd->blkg = blkg;
+	pol->ops.blkio_init_group_fn(blkg);
+}
+EXPORT_SYMBOL_GPL(update_root_blkg_pd);
+
+void blkg_destroy_all(struct request_queue *q, bool destroy_root)
 {
 	struct blkio_group *blkg, *n;
 
@@ -679,8 +722,7 @@ void blkg_destroy_all(struct request_queue *q, enum blkio_policy_id plid,
 
 		spin_lock_irq(q->queue_lock);
 
-		list_for_each_entry_safe(blkg, n, &q->blkg_list[plid],
-					 q_node[plid]) {
+		list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
 			/* skip root? */
 			if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
 				continue;
@@ -691,7 +733,7 @@ void blkg_destroy_all(struct request_queue *q, enum blkio_policy_id plid,
 			 * take care of destroying cfqg also.
 			 */
 			if (!blkiocg_del_blkio_group(blkg))
-				blkg_destroy(blkg, plid);
+				blkg_destroy(blkg);
 			else
 				done = false;
 		}
@@ -776,43 +818,49 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 #endif
 
 	blkcg = cgroup_to_blkio_cgroup(cgroup);
+	spin_lock(&blkio_list_lock);
 	spin_lock_irq(&blkcg->lock);
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+		struct blkio_policy_type *pol;
 
-		spin_lock(&blkg->stats_lock);
-		stats = &pd->stats;
+		list_for_each_entry(pol, &blkio_list, list) {
+			struct blkg_policy_data *pd = blkg->pd[pol->plid];
+
+			spin_lock(&blkg->stats_lock);
+			stats = &pd->stats;
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-		idling = blkio_blkg_idling(stats);
-		waiting = blkio_blkg_waiting(stats);
-		empty = blkio_blkg_empty(stats);
+			idling = blkio_blkg_idling(stats);
+			waiting = blkio_blkg_waiting(stats);
+			empty = blkio_blkg_empty(stats);
 #endif
-		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
-			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
-		memset(stats, 0, sizeof(struct blkio_group_stats));
-		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
-			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
+			for (i = 0; i < BLKIO_STAT_TOTAL; i++)
+				queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
+			memset(stats, 0, sizeof(struct blkio_group_stats));
+			for (i = 0; i < BLKIO_STAT_TOTAL; i++)
+				stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-		if (idling) {
-			blkio_mark_blkg_idling(stats);
-			stats->start_idle_time = now;
-		}
-		if (waiting) {
-			blkio_mark_blkg_waiting(stats);
-			stats->start_group_wait_time = now;
-		}
-		if (empty) {
-			blkio_mark_blkg_empty(stats);
-			stats->start_empty_time = now;
-		}
+			if (idling) {
+				blkio_mark_blkg_idling(stats);
+				stats->start_idle_time = now;
+			}
+			if (waiting) {
+				blkio_mark_blkg_waiting(stats);
+				stats->start_group_wait_time = now;
+			}
+			if (empty) {
+				blkio_mark_blkg_empty(stats);
+				stats->start_empty_time = now;
+			}
 #endif
-		spin_unlock(&blkg->stats_lock);
+			spin_unlock(&blkg->stats_lock);
 
-		/* Reset Per cpu stats which don't take blkg->stats_lock */
-		blkio_reset_stats_cpu(blkg, blkg->plid);
+			/* Reset Per cpu stats which don't take blkg->stats_lock */
+			blkio_reset_stats_cpu(blkg, pol->plid);
+		}
 	}
 
 	spin_unlock_irq(&blkcg->lock);
+	spin_unlock(&blkio_list_lock);
 	return 0;
 }
 
@@ -1168,8 +1216,7 @@ static void blkio_read_conf(struct cftype *cft, struct blkio_cgroup *blkcg,
 
 	spin_lock_irq(&blkcg->lock);
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
-		if (BLKIOFILE_POLICY(cft->private) == blkg->plid)
-			blkio_print_group_conf(cft, blkg, m);
+		blkio_print_group_conf(cft, blkg, m);
 	spin_unlock_irq(&blkcg->lock);
 }
 
@@ -1224,7 +1271,7 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
 		const char *dname = blkg_dev_name(blkg);
 		int plid = BLKIOFILE_POLICY(cft->private);
 
-		if (!dname || plid != blkg->plid)
+		if (!dname)
 			continue;
 		if (pcpu) {
 			cgroup_total += blkio_get_stat_cpu(blkg, plid,
@@ -1335,9 +1382,9 @@ static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
 	blkcg->weight = (unsigned int)val;
 
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		struct blkg_policy_data *pd = blkg->pd[blkg->plid];
+		struct blkg_policy_data *pd = blkg->pd[plid];
 
-		if (blkg->plid == plid && !pd->conf.weight)
+		if (!pd->conf.weight)
 			blkio_update_group_weight(blkg, plid, blkcg->weight);
 	}
 
@@ -1560,7 +1607,6 @@ static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
 	unsigned long flags;
 	struct blkio_group *blkg;
 	struct request_queue *q;
-	struct blkio_policy_type *blkiop;
 
 	rcu_read_lock();
 
@@ -1586,11 +1632,7 @@ static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
 		 */
 		spin_lock(&blkio_list_lock);
 		spin_lock_irqsave(q->queue_lock, flags);
-		list_for_each_entry(blkiop, &blkio_list, list) {
-			if (blkiop->plid != blkg->plid)
-				continue;
-			blkg_destroy(blkg, blkiop->plid);
-		}
+		blkg_destroy(blkg);
 		spin_unlock_irqrestore(q->queue_lock, flags);
 		spin_unlock(&blkio_list_lock);
 	} while (1);
@@ -1684,6 +1726,8 @@ void blkcg_exit_queue(struct request_queue *q)
 	list_del_init(&q->all_q_node);
 	mutex_unlock(&all_q_mutex);
 
+	blkg_destroy_all(q, true);
+
 	blk_throtl_exit(q);
 }
 
@@ -1733,14 +1777,12 @@ static void blkcg_bypass_start(void)
 	__acquires(&all_q_mutex)
 {
 	struct request_queue *q;
-	int i;
 
 	mutex_lock(&all_q_mutex);
 
 	list_for_each_entry(q, &all_q_list, all_q_node) {
 		blk_queue_bypass_start(q);
-		for (i = 0; i < BLKIO_NR_POLICIES; i++)
-			blkg_destroy_all(q, i, false);
+		blkg_destroy_all(q, false);
 	}
 }
 
@@ -1757,6 +1799,8 @@ static void blkcg_bypass_end(void)
 
 void blkio_policy_register(struct blkio_policy_type *blkiop)
 {
+	struct request_queue *q;
+
 	blkcg_bypass_start();
 	spin_lock(&blkio_list_lock);
 
@@ -1765,12 +1809,16 @@ void blkio_policy_register(struct blkio_policy_type *blkiop)
 	list_add_tail(&blkiop->list, &blkio_list);
 
 	spin_unlock(&blkio_list_lock);
+	list_for_each_entry(q, &all_q_list, all_q_node)
+		update_root_blkg_pd(q, blkiop->plid);
 	blkcg_bypass_end();
 }
 EXPORT_SYMBOL_GPL(blkio_policy_register);
 
 void blkio_policy_unregister(struct blkio_policy_type *blkiop)
 {
+	struct request_queue *q;
+
 	blkcg_bypass_start();
 	spin_lock(&blkio_list_lock);
 
@@ -1779,6 +1827,8 @@ void blkio_policy_unregister(struct blkio_policy_type *blkiop)
 	list_del_init(&blkiop->list);
 
 	spin_unlock(&blkio_list_lock);
+	list_for_each_entry(q, &all_q_list, all_q_node)
+		update_root_blkg_pd(q, blkiop->plid);
 	blkcg_bypass_end();
 }
 EXPORT_SYMBOL_GPL(blkio_policy_unregister);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 83ce5fa..6e8ee86 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -178,13 +178,11 @@ struct blkg_policy_data {
 struct blkio_group {
 	/* Pointer to the associated request_queue, RCU protected */
 	struct request_queue __rcu *q;
-	struct list_head q_node[BLKIO_NR_POLICIES];
+	struct list_head q_node;
 	struct hlist_node blkcg_node;
 	struct blkio_cgroup *blkcg;
 	/* Store cgroup path */
 	char path[128];
-	/* policy which owns this blk group */
-	enum blkio_policy_id plid;
 	/* reference count */
 	int refcnt;
 
@@ -230,8 +228,9 @@ extern void blkcg_exit_queue(struct request_queue *q);
 /* Blkio controller policy registration */
 extern void blkio_policy_register(struct blkio_policy_type *);
 extern void blkio_policy_unregister(struct blkio_policy_type *);
-extern void blkg_destroy_all(struct request_queue *q,
-			     enum blkio_policy_id plid, bool destroy_root);
+extern void blkg_destroy_all(struct request_queue *q, bool destroy_root);
+extern void update_root_blkg_pd(struct request_queue *q,
+				enum blkio_policy_id plid);
 
 /**
  * blkg_to_pdata - get policy private data
@@ -313,8 +312,9 @@ static inline void blkcg_exit_queue(struct request_queue *q) { }
 static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
 static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
 static inline void blkg_destroy_all(struct request_queue *q,
-				    enum blkio_policy_id plid,
 				    bool destory_root) { }
+static inline void update_root_blkg_pd(struct request_queue *q,
+				       enum blkio_policy_id plid) { }
 
 static inline void *blkg_to_pdata(struct blkio_group *blkg,
 				struct blkio_policy_type *pol) { return NULL; }
@@ -382,8 +382,7 @@ extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
 extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
 extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
-				       struct request_queue *q,
-				       enum blkio_policy_id plid);
+				       struct request_queue *q);
 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 				       struct request_queue *q,
 				       enum blkio_policy_id plid,
diff --git a/block/blk-core.c b/block/blk-core.c
index 83a47fc..05693f4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -548,8 +548,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	INIT_LIST_HEAD(&q->timeout_list);
 	INIT_LIST_HEAD(&q->icq_list);
 #ifdef CONFIG_BLK_CGROUP
-	INIT_LIST_HEAD(&q->blkg_list[0]);
-	INIT_LIST_HEAD(&q->blkg_list[1]);
+	INIT_LIST_HEAD(&q->blkg_list);
 #endif
 	INIT_LIST_HEAD(&q->flush_queue[0]);
 	INIT_LIST_HEAD(&q->flush_queue[1]);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 00cdc98..aa41b47 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -480,6 +480,8 @@ static void blk_release_queue(struct kobject *kobj)
 
 	blk_sync_queue(q);
 
+	blkcg_exit_queue(q);
+
 	if (q->elevator) {
 		spin_lock_irq(q->queue_lock);
 		ioc_clear_queue(q);
@@ -487,8 +489,6 @@ static void blk_release_queue(struct kobject *kobj)
 		elevator_exit(q->elevator);
 	}
 
-	blkcg_exit_queue(q);
-
 	if (rl->rq_pool)
 		mempool_destroy(rl->rq_pool);
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 1329412..e35ee7a 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -167,7 +167,7 @@ throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 	if (blkcg == &blkio_root_cgroup)
 		return td->root_tg;
 
-	return blkg_to_tg(blkg_lookup(blkcg, td->queue, BLKIO_POLICY_THROTL));
+	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
 }
 
 static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
@@ -704,8 +704,7 @@ static void throtl_process_limit_change(struct throtl_data *td)
 
 	throtl_log(td, "limits changed");
 
-	list_for_each_entry_safe(blkg, n, &q->blkg_list[BLKIO_POLICY_THROTL],
-				 q_node[BLKIO_POLICY_THROTL]) {
+	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
 		struct throtl_grp *tg = blkg_to_tg(blkg);
 
 		if (!tg->limits_changed)
@@ -1054,11 +1053,9 @@ void blk_throtl_exit(struct request_queue *q)
 
 	throtl_shutdown_wq(q);
 
-	blkg_destroy_all(q, BLKIO_POLICY_THROTL, true);
-
 	/* If there are other groups */
 	spin_lock_irq(q->queue_lock);
-	wait = q->nr_blkgs[BLKIO_POLICY_THROTL];
+	wait = q->nr_blkgs;
 	spin_unlock_irq(q->queue_lock);
 
 	/*
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index dc73690..393eaa5 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3462,15 +3462,13 @@ static void cfq_exit_queue(struct elevator_queue *e)
 
 	spin_unlock_irq(q->queue_lock);
 
-	blkg_destroy_all(q, BLKIO_POLICY_PROP, true);
-
 #ifdef CONFIG_BLK_CGROUP
 	/*
 	 * If there are groups which we could not unlink from blkcg list,
 	 * wait for a rcu period for them to be freed.
 	 */
 	spin_lock_irq(q->queue_lock);
-	wait = q->nr_blkgs[BLKIO_POLICY_PROP];
+	wait = q->nr_blkgs;
 	spin_unlock_irq(q->queue_lock);
 #endif
 	cfq_shutdown_timer_wq(cfqd);
@@ -3492,6 +3490,7 @@ static void cfq_exit_queue(struct elevator_queue *e)
 #ifndef CONFIG_CFQ_GROUP_IOSCHED
 	kfree(cfqd->root_group);
 #endif
+	update_root_blkg_pd(q, BLKIO_POLICY_PROP);
 	kfree(cfqd);
 }
 
diff --git a/block/elevator.c b/block/elevator.c
index d4d39da..451654f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -876,7 +876,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
 	struct elevator_queue *old = q->elevator;
 	bool registered = old->registered;
-	int i, err;
+	int err;
 
 	/*
 	 * Turn on BYPASS and drain all requests w/ elevator private data.
@@ -895,8 +895,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	ioc_clear_queue(q);
 	spin_unlock_irq(q->queue_lock);
 
-	for (i = 0; i < BLKIO_NR_POLICIES; i++)
-		blkg_destroy_all(q, i, false);
+	blkg_destroy_all(q, false);
 
 	/* allocate, init and register new elevator */
 	err = -ENOMEM;
-- 
cgit v1.1


From 9f13ef678efd977487fc0c2e489f17c9a8c67a3e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:21 -0800
Subject: blkcg: use double locking instead of RCU for blkg synchronization

blkgs are chained from both blkcgs and request_queues and thus
subjected to two locks - blkcg->lock and q->queue_lock.  As both blkcg
and q can go away anytime, locking during removal is tricky.  It's
currently solved by wrapping removal inside RCU, which makes the
synchronization complex.  There are three locks to worry about - the
outer RCU, q lock and blkcg lock, and it leads to nasty subtle
complications like conditional synchronize_rcu() on queue exit paths.

For all other paths, blkcg lock is naturally nested inside q lock and
the only exception is blkcg removal path, which is a very cold path
and can be implemented as clumsy but conceptually-simple reverse
double lock dancing.

This patch updates blkg removal path such that blkgs are removed while
holding both q and blkcg locks, which is trivial for request queue
exit path - blkg_destroy_all().  The blkcg removal path,
blkiocg_pre_destroy(), implements reverse double lock dancing
essentially identical to ioc_release_fn().

This simplifies blkg locking - no half-dead blkgs to worry about.  Now
unnecessary RCU annotations will be removed by the next patch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 136 ++++++++++++++++++++---------------------------------
 block/blk-cgroup.h |   4 --
 block/cfq.h        |  10 ----
 3 files changed, 51 insertions(+), 99 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index cad5f15..e9e3b03 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -620,32 +620,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
-static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	hlist_del_init_rcu(&blkg->blkcg_node);
-}
-
-/*
- * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
- * indicating that blk_group was unhashed by the time we got to it.
- */
-int blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	struct blkio_cgroup *blkcg = blkg->blkcg;
-	unsigned long flags;
-	int ret = 1;
-
-	spin_lock_irqsave(&blkcg->lock, flags);
-	if (!hlist_unhashed(&blkg->blkcg_node)) {
-		__blkiocg_del_blkio_group(blkg);
-		ret = 0;
-	}
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
-
 /* called under rcu_read_lock(). */
 struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 				struct request_queue *q)
@@ -663,12 +637,16 @@ EXPORT_SYMBOL_GPL(blkg_lookup);
 static void blkg_destroy(struct blkio_group *blkg)
 {
 	struct request_queue *q = blkg->q;
+	struct blkio_cgroup *blkcg = blkg->blkcg;
 
 	lockdep_assert_held(q->queue_lock);
+	lockdep_assert_held(&blkcg->lock);
 
 	/* Something wrong if we are trying to remove same group twice */
 	WARN_ON_ONCE(list_empty(&blkg->q_node));
+	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
 	list_del_init(&blkg->q_node);
+	hlist_del_init_rcu(&blkg->blkcg_node);
 
 	WARN_ON_ONCE(q->nr_blkgs <= 0);
 	q->nr_blkgs--;
@@ -713,45 +691,33 @@ void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
 }
 EXPORT_SYMBOL_GPL(update_root_blkg_pd);
 
+/**
+ * blkg_destroy_all - destroy all blkgs associated with a request_queue
+ * @q: request_queue of interest
+ * @destroy_root: whether to destroy root blkg or not
+ *
+ * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
+ * destroyed; otherwise, root blkg is left alone.
+ */
 void blkg_destroy_all(struct request_queue *q, bool destroy_root)
 {
 	struct blkio_group *blkg, *n;
 
-	while (true) {
-		bool done = true;
-
-		spin_lock_irq(q->queue_lock);
-
-		list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
-			/* skip root? */
-			if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
-				continue;
-
-			/*
-			 * If cgroup removal path got to blk_group first
-			 * and removed it from cgroup list, then it will
-			 * take care of destroying cfqg also.
-			 */
-			if (!blkiocg_del_blkio_group(blkg))
-				blkg_destroy(blkg);
-			else
-				done = false;
-		}
+	spin_lock_irq(q->queue_lock);
 
-		spin_unlock_irq(q->queue_lock);
+	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
+		struct blkio_cgroup *blkcg = blkg->blkcg;
 
-		/*
-		 * Group list may not be empty if we raced cgroup removal
-		 * and lost.  cgroup removal is guaranteed to make forward
-		 * progress and retrying after a while is enough.  This
-		 * ugliness is scheduled to be removed after locking
-		 * update.
-		 */
-		if (done)
-			break;
+		/* skip root? */
+		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
+			continue;
 
-		msleep(10);	/* just some random duration I like */
+		spin_lock(&blkcg->lock);
+		blkg_destroy(blkg);
+		spin_unlock(&blkcg->lock);
 	}
+
+	spin_unlock_irq(q->queue_lock);
 }
 EXPORT_SYMBOL_GPL(blkg_destroy_all);
 
@@ -1600,45 +1566,45 @@ static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 				ARRAY_SIZE(blkio_files));
 }
 
+/**
+ * blkiocg_pre_destroy - cgroup pre_destroy callback
+ * @subsys: cgroup subsys
+ * @cgroup: cgroup of interest
+ *
+ * This function is called when @cgroup is about to go away and responsible
+ * for shooting down all blkgs associated with @cgroup.  blkgs should be
+ * removed while holding both q and blkcg locks.  As blkcg lock is nested
+ * inside q lock, this function performs reverse double lock dancing.
+ *
+ * This is the blkcg counterpart of ioc_release_fn().
+ */
 static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
 			       struct cgroup *cgroup)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
-	unsigned long flags;
-	struct blkio_group *blkg;
-	struct request_queue *q;
 
 	rcu_read_lock();
+	spin_lock_irq(&blkcg->lock);
 
-	do {
-		spin_lock_irqsave(&blkcg->lock, flags);
+	while (!hlist_empty(&blkcg->blkg_list)) {
+		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
+						struct blkio_group, blkcg_node);
+		struct request_queue *q = rcu_dereference(blkg->q);
 
-		if (hlist_empty(&blkcg->blkg_list)) {
-			spin_unlock_irqrestore(&blkcg->lock, flags);
-			break;
+		if (spin_trylock(q->queue_lock)) {
+			blkg_destroy(blkg);
+			spin_unlock(q->queue_lock);
+		} else {
+			spin_unlock_irq(&blkcg->lock);
+			rcu_read_unlock();
+			cpu_relax();
+			rcu_read_lock();
+			spin_lock(&blkcg->lock);
 		}
+	}
 
-		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
-					blkcg_node);
-		q = rcu_dereference(blkg->q);
-		__blkiocg_del_blkio_group(blkg);
-
-		spin_unlock_irqrestore(&blkcg->lock, flags);
-
-		/*
-		 * This blkio_group is being unlinked as associated cgroup is
-		 * going away. Let all the IO controlling policies know about
-		 * this event.
-		 */
-		spin_lock(&blkio_list_lock);
-		spin_lock_irqsave(q->queue_lock, flags);
-		blkg_destroy(blkg);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-		spin_unlock(&blkio_list_lock);
-	} while (1);
-
+	spin_unlock_irq(&blkcg->lock);
 	rcu_read_unlock();
-
 	return 0;
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 6e8ee86..df73040 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -380,7 +380,6 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg,
 extern struct blkio_cgroup blkio_root_cgroup;
 extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
 extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
-extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 				       struct request_queue *q);
 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
@@ -416,9 +415,6 @@ cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
 static inline struct blkio_cgroup *
 task_blkio_cgroup(struct task_struct *tsk) { return NULL; }
 
-static inline int
-blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
-
 static inline struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 					      void *key) { return NULL; }
 static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
diff --git a/block/cfq.h b/block/cfq.h
index 5584e1b..c8b15ef 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -79,11 +79,6 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
 					direction, sync);
 }
 
-static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	return blkiocg_del_blkio_group(blkg);
-}
-
 #else /* CFQ_GROUP_IOSCHED */
 static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
 			struct blkio_policy_type *pol,
@@ -119,10 +114,5 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
 			struct blkio_policy_type *pol, uint64_t start_time,
 			uint64_t io_start_time, bool direction, bool sync) { }
 
-static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	return 0;
-}
-
 #endif /* CFQ_GROUP_IOSCHED */
 #endif
-- 
cgit v1.1


From c875f4d0250a1f070fa26087a73bdd8f54c48100 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:22 -0800
Subject: blkcg: drop unnecessary RCU locking

Now that blkg additions / removals are always done under both q and
blkcg locks, the only places RCU locking is necessary are
blkg_lookup[_create]() for lookup w/o blkcg lock.  This patch drops
unnecessary RCU locking, replacing it with plain blkcg locking as
necessary.

* blkiocg_pre_destroy() already performs proper locking and doesn't need
  RCU.  Dropped.

* blkio_read_blkg_stats() now uses blkcg->lock instead of RCU read
  lock.  This isn't a hot path.

* Now unnecessary synchronize_rcu() from queue exit paths removed.
  This makes q->nr_blkgs unnecessary.  Dropped.

* RCU annotation on blkg->q removed.

-v2: Vivek pointed out that blkg_lookup_create() still needs to be
     called under rcu_read_lock().  Updated.

-v3: After the update, stats_lock locking in blkio_read_blkg_stats()
     shouldn't be using _irq variant as it otherwise ends up enabling
     irq while blkcg->lock is locked.  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c     | 24 +++++++++---------------
 block/blk-cgroup.h     |  4 ++--
 block/blk-throttle.c   | 33 +--------------------------------
 block/cfq-iosched.c    | 24 ------------------------
 include/linux/blkdev.h |  1 -
 5 files changed, 12 insertions(+), 74 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e9e3b03..27d39a8 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -500,7 +500,7 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 		return NULL;
 
 	spin_lock_init(&blkg->stats_lock);
-	rcu_assign_pointer(blkg->q, q);
+	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
 	blkg->blkcg = blkcg;
 	blkg->refcnt = 1;
@@ -611,7 +611,6 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 
 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 	list_add(&blkg->q_node, &q->blkg_list);
-	q->nr_blkgs++;
 
 	spin_unlock(&blkcg->lock);
 out:
@@ -648,9 +647,6 @@ static void blkg_destroy(struct blkio_group *blkg)
 	list_del_init(&blkg->q_node);
 	hlist_del_init_rcu(&blkg->blkcg_node);
 
-	WARN_ON_ONCE(q->nr_blkgs <= 0);
-	q->nr_blkgs--;
-
 	/*
 	 * Put the reference taken at the time of creation so that when all
 	 * queues are gone, group can be destroyed.
@@ -1232,8 +1228,9 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
 	struct hlist_node *n;
 	uint64_t cgroup_total = 0;
 
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
+	spin_lock_irq(&blkcg->lock);
+
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
 		const char *dname = blkg_dev_name(blkg);
 		int plid = BLKIOFILE_POLICY(cft->private);
 
@@ -1243,15 +1240,16 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
 			cgroup_total += blkio_get_stat_cpu(blkg, plid,
 							   cb, dname, type);
 		} else {
-			spin_lock_irq(&blkg->stats_lock);
+			spin_lock(&blkg->stats_lock);
 			cgroup_total += blkio_get_stat(blkg, plid,
 						       cb, dname, type);
-			spin_unlock_irq(&blkg->stats_lock);
+			spin_unlock(&blkg->stats_lock);
 		}
 	}
 	if (show_total)
 		cb->fill(cb, "Total", cgroup_total);
-	rcu_read_unlock();
+
+	spin_unlock_irq(&blkcg->lock);
 	return 0;
 }
 
@@ -1583,28 +1581,24 @@ static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
 
-	rcu_read_lock();
 	spin_lock_irq(&blkcg->lock);
 
 	while (!hlist_empty(&blkcg->blkg_list)) {
 		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
 						struct blkio_group, blkcg_node);
-		struct request_queue *q = rcu_dereference(blkg->q);
+		struct request_queue *q = blkg->q;
 
 		if (spin_trylock(q->queue_lock)) {
 			blkg_destroy(blkg);
 			spin_unlock(q->queue_lock);
 		} else {
 			spin_unlock_irq(&blkcg->lock);
-			rcu_read_unlock();
 			cpu_relax();
-			rcu_read_lock();
 			spin_lock(&blkcg->lock);
 		}
 	}
 
 	spin_unlock_irq(&blkcg->lock);
-	rcu_read_unlock();
 	return 0;
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index df73040..66eaefe 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -176,8 +176,8 @@ struct blkg_policy_data {
 };
 
 struct blkio_group {
-	/* Pointer to the associated request_queue, RCU protected */
-	struct request_queue __rcu *q;
+	/* Pointer to the associated request_queue */
+	struct request_queue *q;
 	struct list_head q_node;
 	struct hlist_node blkcg_node;
 	struct blkio_cgroup *blkcg;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e35ee7a..bfa5168 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1046,39 +1046,8 @@ int blk_throtl_init(struct request_queue *q)
 
 void blk_throtl_exit(struct request_queue *q)
 {
-	struct throtl_data *td = q->td;
-	bool wait;
-
-	BUG_ON(!td);
-
+	BUG_ON(!q->td);
 	throtl_shutdown_wq(q);
-
-	/* If there are other groups */
-	spin_lock_irq(q->queue_lock);
-	wait = q->nr_blkgs;
-	spin_unlock_irq(q->queue_lock);
-
-	/*
-	 * Wait for tg_to_blkg(tg)->q accessors to exit their grace periods.
-	 * Do this wait only if there are other undestroyed groups out
-	 * there (other than root group). This can happen if cgroup deletion
-	 * path claimed the responsibility of cleaning up a group before
-	 * queue cleanup code get to the group.
-	 *
-	 * Do not call synchronize_rcu() unconditionally as there are drivers
-	 * which create/delete request queue hundreds of times during scan/boot
-	 * and synchronize_rcu() can take significant time and slow down boot.
-	 */
-	if (wait)
-		synchronize_rcu();
-
-	/*
-	 * Just being safe to make sure after previous flush if some body did
-	 * update limits through cgroup and another work got queued, cancel
-	 * it.
-	 */
-	throtl_shutdown_wq(q);
-
 	kfree(q->td);
 }
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 393eaa5..9e386d9 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3449,7 +3449,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
 {
 	struct cfq_data *cfqd = e->elevator_data;
 	struct request_queue *q = cfqd->queue;
-	bool wait = false;
 
 	cfq_shutdown_timer_wq(cfqd);
 
@@ -3462,31 +3461,8 @@ static void cfq_exit_queue(struct elevator_queue *e)
 
 	spin_unlock_irq(q->queue_lock);
 
-#ifdef CONFIG_BLK_CGROUP
-	/*
-	 * If there are groups which we could not unlink from blkcg list,
-	 * wait for a rcu period for them to be freed.
-	 */
-	spin_lock_irq(q->queue_lock);
-	wait = q->nr_blkgs;
-	spin_unlock_irq(q->queue_lock);
-#endif
 	cfq_shutdown_timer_wq(cfqd);
 
-	/*
-	 * Wait for cfqg->blkg->key accessors to exit their grace periods.
-	 * Do this wait only if there are other unlinked groups out
-	 * there. This can happen if cgroup deletion path claimed the
-	 * responsibility of cleaning up a group before queue cleanup code
-	 * get to the group.
-	 *
-	 * Do not call synchronize_rcu() unconditionally as there are drivers
-	 * which create/delete request queue hundreds of times during scan/boot
-	 * and synchronize_rcu() can take significant time and slow down boot.
-	 */
-	if (wait)
-		synchronize_rcu();
-
 #ifndef CONFIG_CFQ_GROUP_IOSCHED
 	kfree(cfqd->root_group);
 #endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b4d1d4b..33f1b29 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -365,7 +365,6 @@ struct request_queue {
 #ifdef CONFIG_BLK_CGROUP
 	/* XXX: array size hardcoded to avoid include dependency (temporary) */
 	struct list_head	blkg_list;
-	int			nr_blkgs;
 #endif
 
 	struct queue_limits	limits;
-- 
cgit v1.1


From b679281a6410676a41b175c5a185150a1ae42f9d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:23 -0800
Subject: block: restructure get_request()

get_request() is structured a bit unusually in that failure path is
inlined in the usual flow with goto labels atop and inside it.
Relocate the error path to the end of the function.

This is to prepare for icq handling changes in get_request() and
doesn't introduce any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 60 +++++++++++++++++++++++++++-----------------------------
 1 file changed, 29 insertions(+), 31 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 05693f4..792a384 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -826,7 +826,7 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
 static struct request *get_request(struct request_queue *q, int rw_flags,
 				   struct bio *bio, gfp_t gfp_mask)
 {
-	struct request *rq = NULL;
+	struct request *rq;
 	struct request_list *rl = &q->rq;
 	struct elevator_type *et;
 	struct io_context *ioc;
@@ -878,7 +878,7 @@ retry:
 					 * process is not a "batcher", and not
 					 * exempted by the IO scheduler
 					 */
-					goto out;
+					return NULL;
 				}
 			}
 		}
@@ -891,7 +891,7 @@ retry:
 	 * allocated with any setting of ->nr_requests
 	 */
 	if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
-		goto out;
+		return NULL;
 
 	rl->count[is_sync]++;
 	rl->starved[is_sync] = 0;
@@ -921,36 +921,12 @@ retry:
 	if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) {
 		icq = ioc_create_icq(q, gfp_mask);
 		if (!icq)
-			goto fail_icq;
+			goto fail_alloc;
 	}
 
 	rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
-
-fail_icq:
-	if (unlikely(!rq)) {
-		/*
-		 * Allocation failed presumably due to memory. Undo anything
-		 * we might have messed up.
-		 *
-		 * Allocating task should really be put onto the front of the
-		 * wait queue, but this is pretty rare.
-		 */
-		spin_lock_irq(q->queue_lock);
-		freed_request(q, rw_flags);
-
-		/*
-		 * in the very unlikely event that allocation failed and no
-		 * requests for this direction was pending, mark us starved
-		 * so that freeing of a request in the other direction will
-		 * notice us. another possible fix would be to split the
-		 * rq mempool into READ and WRITE
-		 */
-rq_starved:
-		if (unlikely(rl->count[is_sync] == 0))
-			rl->starved[is_sync] = 1;
-
-		goto out;
-	}
+	if (unlikely(!rq))
+		goto fail_alloc;
 
 	/*
 	 * ioc may be NULL here, and ioc_batching will be false. That's
@@ -962,8 +938,30 @@ rq_starved:
 		ioc->nr_batch_requests--;
 
 	trace_block_getrq(q, bio, rw_flags & 1);
-out:
 	return rq;
+
+fail_alloc:
+	/*
+	 * Allocation failed presumably due to memory. Undo anything we
+	 * might have messed up.
+	 *
+	 * Allocating task should really be put onto the front of the wait
+	 * queue, but this is pretty rare.
+	 */
+	spin_lock_irq(q->queue_lock);
+	freed_request(q, rw_flags);
+
+	/*
+	 * in the very unlikely event that allocation failed and no
+	 * requests for this direction was pending, mark us starved so that
+	 * freeing of a request in the other direction will notice
+	 * us. another possible fix would be to split the rq mempool into
+	 * READ and WRITE
+	 */
+rq_starved:
+	if (unlikely(rl->count[is_sync] == 0))
+		rl->starved[is_sync] = 1;
+	return NULL;
 }
 
 /**
-- 
cgit v1.1


From 24acfc34fba0b4f62ef9d5c2616eb0faa802b606 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:24 -0800
Subject: block: interface update for ioc/icq creation functions

Make the following interface updates to prepare for future ioc related
changes.

* create_io_context() returning ioc only works for %current because it
  doesn't increment ref on the ioc.  Drop @task parameter from it and
  always assume %current.

* Make create_io_context_slowpath() return 0 or -errno and rename it
  to create_task_io_context().

* Make ioc_create_icq() take @ioc as parameter instead of assuming
  that of %current.  The caller, get_request(), is updated to create
  ioc explicitly and then pass it into ioc_create_icq().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c |  8 +++++---
 block/blk-ioc.c  | 22 ++++++++++------------
 block/blk.h      | 24 +++++++++++-------------
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 792a384..b2d0fcd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -855,7 +855,7 @@ retry:
 			 */
 			if (!ioc && !retried) {
 				spin_unlock_irq(q->queue_lock);
-				create_io_context(current, gfp_mask, q->node);
+				create_io_context(gfp_mask, q->node);
 				spin_lock_irq(q->queue_lock);
 				retried = true;
 				goto retry;
@@ -919,7 +919,9 @@ retry:
 
 	/* create icq if missing */
 	if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) {
-		icq = ioc_create_icq(q, gfp_mask);
+		ioc = create_io_context(gfp_mask, q->node);
+		if (ioc)
+			icq = ioc_create_icq(ioc, q, gfp_mask);
 		if (!icq)
 			goto fail_alloc;
 	}
@@ -1005,7 +1007,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 		 * up to a big batch of them for a small period time.
 		 * See ioc_batching, ioc_set_batching
 		 */
-		create_io_context(current, GFP_NOIO, q->node);
+		create_io_context(GFP_NOIO, q->node);
 		ioc_set_batching(q, current->io_context);
 
 		spin_lock_irq(q->queue_lock);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 92bf555..1092874 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -212,15 +212,14 @@ void ioc_clear_queue(struct request_queue *q)
 	}
 }
 
-void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
-				int node)
+int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
 {
 	struct io_context *ioc;
 
 	ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
 				    node);
 	if (unlikely(!ioc))
-		return;
+		return -ENOMEM;
 
 	/* initialize */
 	atomic_long_set(&ioc->refcount, 1);
@@ -244,6 +243,8 @@ void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
 	else
 		kmem_cache_free(iocontext_cachep, ioc);
 	task_unlock(task);
+
+	return 0;
 }
 
 /**
@@ -275,7 +276,7 @@ struct io_context *get_task_io_context(struct task_struct *task,
 			return ioc;
 		}
 		task_unlock(task);
-	} while (create_io_context(task, gfp_flags, node));
+	} while (!create_task_io_context(task, gfp_flags, node));
 
 	return NULL;
 }
@@ -319,26 +320,23 @@ EXPORT_SYMBOL(ioc_lookup_icq);
 
 /**
  * ioc_create_icq - create and link io_cq
+ * @ioc: io_context of interest
  * @q: request_queue of interest
  * @gfp_mask: allocation mask
  *
- * Make sure io_cq linking %current->io_context and @q exists.  If either
- * io_context and/or icq don't exist, they will be created using @gfp_mask.
+ * Make sure io_cq linking @ioc and @q exists.  If icq doesn't exist, they
+ * will be created using @gfp_mask.
  *
  * The caller is responsible for ensuring @ioc won't go away and @q is
  * alive and will stay alive until this function returns.
  */
-struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
+struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
+			     gfp_t gfp_mask)
 {
 	struct elevator_type *et = q->elevator->type;
-	struct io_context *ioc;
 	struct io_cq *icq;
 
 	/* allocate stuff */
-	ioc = create_io_context(current, gfp_mask, q->node);
-	if (!ioc)
-		return NULL;
-
 	icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
 				    q->node);
 	if (!icq)
diff --git a/block/blk.h b/block/blk.h
index de15f92..aa81afd 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -200,32 +200,30 @@ static inline int blk_do_io_stat(struct request *rq)
  */
 void get_io_context(struct io_context *ioc);
 struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
-struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask);
+struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
+			     gfp_t gfp_mask);
 void ioc_clear_queue(struct request_queue *q);
 
-void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask,
-				int node);
+int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
 
 /**
  * create_io_context - try to create task->io_context
- * @task: target task
  * @gfp_mask: allocation mask
  * @node: allocation node
  *
- * If @task->io_context is %NULL, allocate a new io_context and install it.
- * Returns the current @task->io_context which may be %NULL if allocation
- * failed.
+ * If %current->io_context is %NULL, allocate a new io_context and install
+ * it.  Returns the current %current->io_context which may be %NULL if
+ * allocation failed.
  *
  * Note that this function can't be called with IRQ disabled because
- * task_lock which protects @task->io_context is IRQ-unsafe.
+ * task_lock which protects %current->io_context is IRQ-unsafe.
  */
-static inline struct io_context *create_io_context(struct task_struct *task,
-						   gfp_t gfp_mask, int node)
+static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
 {
 	WARN_ON_ONCE(irqs_disabled());
-	if (unlikely(!task->io_context))
-		create_io_context_slowpath(task, gfp_mask, node);
-	return task->io_context;
+	if (unlikely(!current->io_context))
+		create_task_io_context(current, gfp_mask, node);
+	return current->io_context;
 }
 
 /*
-- 
cgit v1.1


From 3d48749d93a3dce732dd30a14002ab90ec4355f3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:25 -0800
Subject: block: ioc_task_link() can't fail

ioc_task_link() is used to share %current's ioc on clone.  If
%current->io_context is set, %current is guaranteed to have refcount
on the ioc and, thus, ioc_task_link() can't fail.

Replace error checking in ioc_task_link() with WARN_ON_ONCE() and make
it just increment refcount and nr_tasks.

-v2: Description typo fix (Vivek).

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/iocontext.h | 16 +++++-----------
 kernel/fork.c             |  5 ++---
 2 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 1a30180..81a8870 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -120,18 +120,12 @@ struct io_context {
 	struct work_struct release_work;
 };
 
-static inline struct io_context *ioc_task_link(struct io_context *ioc)
+static inline void ioc_task_link(struct io_context *ioc)
 {
-	/*
-	 * if ref count is zero, don't allow sharing (ioc is going away, it's
-	 * a race).
-	 */
-	if (ioc && atomic_long_inc_not_zero(&ioc->refcount)) {
-		atomic_inc(&ioc->nr_tasks);
-		return ioc;
-	}
-
-	return NULL;
+	WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0);
+	WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0);
+	atomic_long_inc(&ioc->refcount);
+	atomic_inc(&ioc->nr_tasks);
 }
 
 struct task_struct;
diff --git a/kernel/fork.c b/kernel/fork.c
index b77fd559..a1b6327 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -901,9 +901,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 	 * Share io context with parent, if CLONE_IO is set
 	 */
 	if (clone_flags & CLONE_IO) {
-		tsk->io_context = ioc_task_link(ioc);
-		if (unlikely(!tsk->io_context))
-			return -ENOMEM;
+		ioc_task_link(ioc);
+		tsk->io_context = ioc;
 	} else if (ioprio_valid(ioc->ioprio)) {
 		new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
 		if (unlikely(!new_ioc))
-- 
cgit v1.1


From f6e8d01bee036460e03bd4f6a79d014f98ba712e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:26 -0800
Subject: block: add io_context->active_ref

Currently ioc->nr_tasks is used to decide two things - whether an ioc
is done issuing IOs and whether it's shared by multiple tasks.  This
patch separates out the first into ioc->active_ref, which is acquired
and released using {get|put}_io_context_active() respectively.

This will be used to associate bio's with a given task.  This patch
doesn't introduce any visible behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ioc.c           | 36 +++++++++++++++++++++++++-----------
 block/cfq-iosched.c       |  4 ++--
 include/linux/iocontext.h | 22 ++++++++++++++++++++--
 3 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 1092874..439ec21 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -149,20 +149,20 @@ void put_io_context(struct io_context *ioc)
 }
 EXPORT_SYMBOL(put_io_context);
 
-/* Called by the exiting task */
-void exit_io_context(struct task_struct *task)
+/**
+ * put_io_context_active - put active reference on ioc
+ * @ioc: ioc of interest
+ *
+ * Undo get_io_context_active().  If active reference reaches zero after
+ * put, @ioc can never issue further IOs and ioscheds are notified.
+ */
+void put_io_context_active(struct io_context *ioc)
 {
-	struct io_context *ioc;
-	struct io_cq *icq;
 	struct hlist_node *n;
 	unsigned long flags;
+	struct io_cq *icq;
 
-	task_lock(task);
-	ioc = task->io_context;
-	task->io_context = NULL;
-	task_unlock(task);
-
-	if (!atomic_dec_and_test(&ioc->nr_tasks)) {
+	if (!atomic_dec_and_test(&ioc->active_ref)) {
 		put_io_context(ioc);
 		return;
 	}
@@ -191,6 +191,20 @@ retry:
 	put_io_context(ioc);
 }
 
+/* Called by the exiting task */
+void exit_io_context(struct task_struct *task)
+{
+	struct io_context *ioc;
+
+	task_lock(task);
+	ioc = task->io_context;
+	task->io_context = NULL;
+	task_unlock(task);
+
+	atomic_dec(&ioc->nr_tasks);
+	put_io_context_active(ioc);
+}
+
 /**
  * ioc_clear_queue - break any ioc association with the specified queue
  * @q: request_queue being cleared
@@ -223,7 +237,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
 
 	/* initialize */
 	atomic_long_set(&ioc->refcount, 1);
-	atomic_set(&ioc->nr_tasks, 1);
+	atomic_set(&ioc->active_ref, 1);
 	spin_lock_init(&ioc->lock);
 	INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
 	INIT_HLIST_HEAD(&ioc->icq_list);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9e386d9..9a4eac4 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1865,7 +1865,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	 * task has exited, don't wait
 	 */
 	cic = cfqd->active_cic;
-	if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks))
+	if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
 		return;
 
 	/*
@@ -2841,7 +2841,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
 	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
 		enable_idle = 0;
-	else if (!atomic_read(&cic->icq.ioc->nr_tasks) ||
+	else if (!atomic_read(&cic->icq.ioc->active_ref) ||
 		 !cfqd->cfq_slice_idle ||
 		 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
 		enable_idle = 0;
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 81a8870..6f1a260 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -100,6 +100,7 @@ struct io_cq {
  */
 struct io_context {
 	atomic_long_t refcount;
+	atomic_t active_ref;
 	atomic_t nr_tasks;
 
 	/* all the fields below are protected by this lock */
@@ -120,17 +121,34 @@ struct io_context {
 	struct work_struct release_work;
 };
 
-static inline void ioc_task_link(struct io_context *ioc)
+/**
+ * get_io_context_active - get active reference on ioc
+ * @ioc: ioc of interest
+ *
+ * Only iocs with active reference can issue new IOs.  This function
+ * acquires an active reference on @ioc.  The caller must already have an
+ * active reference on @ioc.
+ */
+static inline void get_io_context_active(struct io_context *ioc)
 {
 	WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0);
-	WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0);
+	WARN_ON_ONCE(atomic_read(&ioc->active_ref) <= 0);
 	atomic_long_inc(&ioc->refcount);
+	atomic_inc(&ioc->active_ref);
+}
+
+static inline void ioc_task_link(struct io_context *ioc)
+{
+	get_io_context_active(ioc);
+
+	WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0);
 	atomic_inc(&ioc->nr_tasks);
 }
 
 struct task_struct;
 #ifdef CONFIG_BLOCK
 void put_io_context(struct io_context *ioc);
+void put_io_context_active(struct io_context *ioc);
 void exit_io_context(struct task_struct *task);
 struct io_context *get_task_io_context(struct task_struct *task,
 				       gfp_t gfp_flags, int node);
-- 
cgit v1.1


From 852c788f8365062c8a383c5a93f7f7289977cb50 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:27 -0800
Subject: block: implement bio_associate_current()

IO scheduling and cgroup are tied to the issuing task via io_context
and cgroup of %current.  Unfortunately, there are cases where IOs need
to be routed via a different task which makes scheduling and cgroup
limit enforcement applied completely incorrectly.

For example, all bios delayed by blk-throttle end up being issued by a
delayed work item and get assigned the io_context of the worker task
which happens to serve the work item and dumped to the default block
cgroup.  This is doubly confusing as bios which aren't delayed end up
in the correct cgroup and makes using blk-throttle and cfq propio
together impossible.

Any code which punts IO issuing to another task is affected which is
getting more and more common (e.g. btrfs).  As both io_context and
cgroup are firmly tied to task including userland visible APIs to
manipulate them, it makes a lot of sense to match up tasks to bios.

This patch implements bio_associate_current() which associates the
specified bio with %current.  The bio will record the associated ioc
and blkcg at that point and block layer will use the recorded ones
regardless of which task actually ends up issuing the bio.  bio
release puts the associated ioc and blkcg.

It grabs and remembers ioc and blkcg instead of the task itself
because task may already be dead by the time the bio is issued making
ioc and blkcg inaccessible and those are all block layer cares about.

elevator_set_req_fn() is updated such that the bio for which elvdata is
being allocated is available to the elevator.

This doesn't update block cgroup policies yet.  Further patches will
implement the support.

-v2: #ifdef CONFIG_BLK_CGROUP added around bio->bi_ioc dereference in
     rq_ioc() to fix build breakage.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Kent Overstreet <koverstreet@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c          | 32 +++++++++++++++++++------
 block/cfq-iosched.c       |  3 ++-
 block/elevator.c          |  5 ++--
 fs/bio.c                  | 61 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/bio.h       |  8 +++++++
 include/linux/blk_types.h | 10 ++++++++
 include/linux/elevator.h  |  6 +++--
 7 files changed, 113 insertions(+), 12 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index b2d0fcd..991c1d6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -696,7 +696,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq)
 }
 
 static struct request *
-blk_alloc_request(struct request_queue *q, struct io_cq *icq,
+blk_alloc_request(struct request_queue *q, struct bio *bio, struct io_cq *icq,
 		  unsigned int flags, gfp_t gfp_mask)
 {
 	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
@@ -710,7 +710,7 @@ blk_alloc_request(struct request_queue *q, struct io_cq *icq,
 
 	if (flags & REQ_ELVPRIV) {
 		rq->elv.icq = icq;
-		if (unlikely(elv_set_request(q, rq, gfp_mask))) {
+		if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
 			mempool_free(rq, q->rq.rq_pool);
 			return NULL;
 		}
@@ -810,6 +810,22 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
 }
 
 /**
+ * rq_ioc - determine io_context for request allocation
+ * @bio: request being allocated is for this bio (can be %NULL)
+ *
+ * Determine io_context to use for request allocation for @bio.  May return
+ * %NULL if %current->io_context doesn't exist.
+ */
+static struct io_context *rq_ioc(struct bio *bio)
+{
+#ifdef CONFIG_BLK_CGROUP
+	if (bio && bio->bi_ioc)
+		return bio->bi_ioc;
+#endif
+	return current->io_context;
+}
+
+/**
  * get_request - get a free request
  * @q: request_queue to allocate request from
  * @rw_flags: RW and SYNC flags
@@ -836,7 +852,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	int may_queue;
 retry:
 	et = q->elevator->type;
-	ioc = current->io_context;
+	ioc = rq_ioc(bio);
 
 	if (unlikely(blk_queue_dead(q)))
 		return NULL;
@@ -919,14 +935,16 @@ retry:
 
 	/* create icq if missing */
 	if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) {
-		ioc = create_io_context(gfp_mask, q->node);
-		if (ioc)
-			icq = ioc_create_icq(ioc, q, gfp_mask);
+		create_io_context(gfp_mask, q->node);
+		ioc = rq_ioc(bio);
+		if (!ioc)
+			goto fail_alloc;
+		icq = ioc_create_icq(ioc, q, gfp_mask);
 		if (!icq)
 			goto fail_alloc;
 	}
 
-	rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
+	rq = blk_alloc_request(q, bio, icq, rw_flags, gfp_mask);
 	if (unlikely(!rq))
 		goto fail_alloc;
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9a4eac4..abac873 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3299,7 +3299,8 @@ split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
  * Allocate cfq data structures associated with this request.
  */
 static int
-cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
+cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
+		gfp_t gfp_mask)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
diff --git a/block/elevator.c b/block/elevator.c
index 451654f..be3ab6d 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -663,12 +663,13 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
 	return NULL;
 }
 
-int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
+int elv_set_request(struct request_queue *q, struct request *rq,
+		    struct bio *bio, gfp_t gfp_mask)
 {
 	struct elevator_queue *e = q->elevator;
 
 	if (e->type->ops.elevator_set_req_fn)
-		return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask);
+		return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask);
 	return 0;
 }
 
diff --git a/fs/bio.c b/fs/bio.c
index b980ecd..142214b 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -19,12 +19,14 @@
 #include <linux/swap.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/iocontext.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
+#include <linux/cgroup.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #include <trace/events/block.h>
@@ -418,6 +420,7 @@ void bio_put(struct bio *bio)
 	 * last put frees it
 	 */
 	if (atomic_dec_and_test(&bio->bi_cnt)) {
+		bio_disassociate_task(bio);
 		bio->bi_next = NULL;
 		bio->bi_destructor(bio);
 	}
@@ -1641,6 +1644,64 @@ bad:
 }
 EXPORT_SYMBOL(bioset_create);
 
+#ifdef CONFIG_BLK_CGROUP
+/**
+ * bio_associate_current - associate a bio with %current
+ * @bio: target bio
+ *
+ * Associate @bio with %current if it hasn't been associated yet.  Block
+ * layer will treat @bio as if it were issued by %current no matter which
+ * task actually issues it.
+ *
+ * This function takes an extra reference of @task's io_context and blkcg
+ * which will be put when @bio is released.  The caller must own @bio,
+ * ensure %current->io_context exists, and is responsible for synchronizing
+ * calls to this function.
+ */
+int bio_associate_current(struct bio *bio)
+{
+	struct io_context *ioc;
+	struct cgroup_subsys_state *css;
+
+	if (bio->bi_ioc)
+		return -EBUSY;
+
+	ioc = current->io_context;
+	if (!ioc)
+		return -ENOENT;
+
+	/* acquire active ref on @ioc and associate */
+	get_io_context_active(ioc);
+	bio->bi_ioc = ioc;
+
+	/* associate blkcg if exists */
+	rcu_read_lock();
+	css = task_subsys_state(current, blkio_subsys_id);
+	if (css && css_tryget(css))
+		bio->bi_css = css;
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/**
+ * bio_disassociate_task - undo bio_associate_current()
+ * @bio: target bio
+ */
+void bio_disassociate_task(struct bio *bio)
+{
+	if (bio->bi_ioc) {
+		put_io_context(bio->bi_ioc);
+		bio->bi_ioc = NULL;
+	}
+	if (bio->bi_css) {
+		css_put(bio->bi_css);
+		bio->bi_css = NULL;
+	}
+}
+
+#endif /* CONFIG_BLK_CGROUP */
+
 static void __init biovec_init_slabs(void)
 {
 	int i;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 129a9c0..692d3d5 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -268,6 +268,14 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set
 extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
 
+#ifdef CONFIG_BLK_CGROUP
+int bio_associate_current(struct bio *bio);
+void bio_disassociate_task(struct bio *bio);
+#else	/* CONFIG_BLK_CGROUP */
+static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
+static inline void bio_disassociate_task(struct bio *bio) { }
+#endif	/* CONFIG_BLK_CGROUP */
+
 /*
  * bio_set is used to allow other portions of the IO system to
  * allocate their own private memory pools for bio and iovec structures.
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4053cbd..0edb65d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -14,6 +14,8 @@ struct bio;
 struct bio_integrity_payload;
 struct page;
 struct block_device;
+struct io_context;
+struct cgroup_subsys_state;
 typedef void (bio_end_io_t) (struct bio *, int);
 typedef void (bio_destructor_t) (struct bio *);
 
@@ -66,6 +68,14 @@ struct bio {
 	bio_end_io_t		*bi_end_io;
 
 	void			*bi_private;
+#ifdef CONFIG_BLK_CGROUP
+	/*
+	 * Optional ioc and css associated with this bio.  Put on bio
+	 * release.  Read comment on top of bio_associate_current().
+	 */
+	struct io_context	*bi_ioc;
+	struct cgroup_subsys_state *bi_css;
+#endif
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	struct bio_integrity_payload *bi_integrity;  /* data integrity */
 #endif
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 97fb255..c03af76 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -28,7 +28,8 @@ typedef int (elevator_may_queue_fn) (struct request_queue *, int);
 
 typedef void (elevator_init_icq_fn) (struct io_cq *);
 typedef void (elevator_exit_icq_fn) (struct io_cq *);
-typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t);
+typedef int (elevator_set_req_fn) (struct request_queue *, struct request *,
+				   struct bio *, gfp_t);
 typedef void (elevator_put_req_fn) (struct request *);
 typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
 typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
@@ -129,7 +130,8 @@ extern void elv_unregister_queue(struct request_queue *q);
 extern int elv_may_queue(struct request_queue *, int);
 extern void elv_abort_queue(struct request_queue *);
 extern void elv_completed_request(struct request_queue *, struct request *);
-extern int elv_set_request(struct request_queue *, struct request *, gfp_t);
+extern int elv_set_request(struct request_queue *q, struct request *rq,
+			   struct bio *bio, gfp_t gfp_mask);
 extern void elv_put_request(struct request_queue *, struct request *);
 extern void elv_drain_elevator(struct request_queue *);
 
-- 
cgit v1.1


From 4f85cb96d9d2fbbb7160db855a6beee1baced5e5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:28 -0800
Subject: block: make block cgroup policies follow bio task association

Implement bio_blkio_cgroup() which returns the blkcg associated with
the bio if one exists, or %current's blkcg otherwise, and use it in
blk-throttle and cfq-iosched propio.  This makes both cgroup policies
honor task association for the bio instead of always assuming %current.

As nobody is using bio_associate_current() yet, this doesn't introduce
any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 11 +++++++++--
 block/blk-cgroup.h   |  4 ++--
 block/blk-throttle.c |  2 +-
 block/cfq-iosched.c  | 21 +++++++++++----------
 4 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 27d39a8..ee962f3 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -71,12 +71,19 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 }
 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
 
-struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
+static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
 {
 	return container_of(task_subsys_state(tsk, blkio_subsys_id),
 			    struct blkio_cgroup, css);
 }
-EXPORT_SYMBOL_GPL(task_blkio_cgroup);
+
+struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
+{
+	if (bio && bio->bi_css)
+		return container_of(bio->bi_css, struct blkio_cgroup, css);
+	return task_blkio_cgroup(current);
+}
+EXPORT_SYMBOL_GPL(bio_blkio_cgroup);
 
 static inline void blkio_update_group_weight(struct blkio_group *blkg,
 					     int plid, unsigned int weight)
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 66eaefe..98cd8533 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -379,7 +379,7 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg,
 #ifdef CONFIG_BLK_CGROUP
 extern struct blkio_cgroup blkio_root_cgroup;
 extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
-extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
+extern struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio);
 extern struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 				       struct request_queue *q);
 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
@@ -413,7 +413,7 @@ struct cgroup;
 static inline struct blkio_cgroup *
 cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
 static inline struct blkio_cgroup *
-task_blkio_cgroup(struct task_struct *tsk) { return NULL; }
+bio_blkio_cgroup(struct bio *bio) { return NULL; }
 
 static inline struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 					      void *key) { return NULL; }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index bfa5168..08b7ab2 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -900,7 +900,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	 * just update the dispatch stats in lockless manner and return.
 	 */
 	rcu_read_lock();
-	blkcg = task_blkio_cgroup(current);
+	blkcg = bio_blkio_cgroup(bio);
 	tg = throtl_lookup_tg(td, blkcg);
 	if (tg) {
 		if (tg_no_rule_group(tg, rw)) {
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index abac873..f2387b5 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -467,8 +467,9 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 }
 
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
-static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
-				       struct io_context *, gfp_t);
+static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
+				       struct io_context *ioc, struct bio *bio,
+				       gfp_t gfp_mask);
 
 static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
 {
@@ -2601,7 +2602,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 	cfq_clear_cfqq_prio_changed(cfqq);
 }
 
-static void changed_ioprio(struct cfq_io_cq *cic)
+static void changed_ioprio(struct cfq_io_cq *cic, struct bio *bio)
 {
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *cfqq;
@@ -2613,7 +2614,7 @@ static void changed_ioprio(struct cfq_io_cq *cic)
 	if (cfqq) {
 		struct cfq_queue *new_cfqq;
 		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc,
-						GFP_ATOMIC);
+					 bio, GFP_ATOMIC);
 		if (new_cfqq) {
 			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
 			cfq_put_queue(cfqq);
@@ -2671,7 +2672,7 @@ static void changed_cgroup(struct cfq_io_cq *cic)
 
 static struct cfq_queue *
 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
-		     struct io_context *ioc, gfp_t gfp_mask)
+		     struct io_context *ioc, struct bio *bio, gfp_t gfp_mask)
 {
 	struct blkio_cgroup *blkcg;
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
@@ -2681,7 +2682,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
 retry:
 	rcu_read_lock();
 
-	blkcg = task_blkio_cgroup(current);
+	blkcg = bio_blkio_cgroup(bio);
 
 	cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
 
@@ -2746,7 +2747,7 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
 
 static struct cfq_queue *
 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
-	      gfp_t gfp_mask)
+	      struct bio *bio, gfp_t gfp_mask)
 {
 	const int ioprio = task_ioprio(ioc);
 	const int ioprio_class = task_ioprio_class(ioc);
@@ -2759,7 +2760,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
 	}
 
 	if (!cfqq)
-		cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
+		cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, bio, gfp_mask);
 
 	/*
 	 * pin the queue now that it's allocated, scheduler exit will prune it
@@ -3316,7 +3317,7 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
 	/* handle changed notifications */
 	changed = icq_get_changed(&cic->icq);
 	if (unlikely(changed & ICQ_IOPRIO_CHANGED))
-		changed_ioprio(cic);
+		changed_ioprio(cic, bio);
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	if (unlikely(changed & ICQ_CGROUP_CHANGED))
 		changed_cgroup(cic);
@@ -3325,7 +3326,7 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
 new_queue:
 	cfqq = cic_to_cfqq(cic, is_sync);
 	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-		cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask);
+		cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, bio, gfp_mask);
 		cic_set_cfqq(cic, cfqq, is_sync);
 	} else {
 		/*
-- 
cgit v1.1


From 671058fb2a2aac4e70f01b316b06bc59b98bd138 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Mar 2012 13:15:29 -0800
Subject: block: make blk-throttle preserve the issuing task on delayed bios

Make blk-throttle call bio_associate_current() on bios being delayed
such that they get issued to block layer with the original io_context.
This allows stacking blk-throttle and cfq-iosched propio policies.
bios will always be issued with the correct ioc and blkcg whether it
gets delayed by blk-throttle or not.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-throttle.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 08b7ab2..4ba1418 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -894,6 +894,9 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 		goto out;
 	}
 
+	/* bio_associate_current() needs ioc, try creating */
+	create_io_context(GFP_ATOMIC, q->node);
+
 	/*
 	 * A throtl_grp pointer retrieved under rcu can be used to access
 	 * basic fields like stats and io rates. If a group has no rules,
@@ -958,6 +961,7 @@ queue_bio:
 			tg->io_disp[rw], tg->iops[rw],
 			tg->nr_queued[READ], tg->nr_queued[WRITE]);
 
+	bio_associate_current(bio);
 	throtl_add_bio_tg(q->td, tg, bio);
 	throttled = true;
 
-- 
cgit v1.1


From 1cd9e039fc258f91fe38b97b3c622b13a3b8a795 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Thu, 8 Mar 2012 10:53:56 -0800
Subject: blkcg: alloc per cpu stats from worker thread in a delayed manner

Current per cpu stat allocation assumes GFP_KERNEL allocation flag. But in
IO path there are times when we want GFP_NOIO semantics. As there is no
way to pass the allocation flags to alloc_percpu(), this patch delays the
allocation of stats using a worker thread.

v2-> tejun suggested following changes. Changed the patch accordingly.
	- move alloc_node location in structure
	- reduce the size of names of some of the fields
	- Reduce the scope of locking of alloc_list_lock
	- Simplified stat_alloc_fn() by allocating stats for all
	  policies in one go and then assigning these to a group.

v3 -> Andrew suggested to put some comments in the code. Also raised
      concerns about trying to allocate infinitely in case of allocation
      failure. I have changed the logic to sleep for 10ms before retrying.
      That should take care of non-preemptible UP kernels.

v4 -> Tejun had more suggestions.
	- drop list_for_each_entry_all()
	- instead of msleep() use queue_delayed_work()
	- Some cleanups related to more compact coding.

v5-> tejun suggested more cleanups leading to more compact code.

tj: - Relocated pcpu_stats into blkio_stat_alloc_fn().
    - Minor comment update.
    - This also fixes suspicious RCU usage warning caused by invoking
      cgroup_path() from blkg_alloc() without holding RCU read lock.
      Now that blkg_alloc() doesn't require sleepable context, RCU
      read lock from blkg_lookup_create() is maintained throughout
      blkg_alloc().

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 129 ++++++++++++++++++++++++++++++++++++-----------------
 block/blk-cgroup.h |   2 +
 2 files changed, 91 insertions(+), 40 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ee962f3..622fb41 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -30,6 +30,13 @@ static LIST_HEAD(blkio_list);
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
 
+/* List of groups pending per cpu stats allocation */
+static DEFINE_SPINLOCK(alloc_list_lock);
+static LIST_HEAD(alloc_list);
+
+static void blkio_stat_alloc_fn(struct work_struct *);
+static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);
+
 struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
@@ -391,6 +398,10 @@ void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 	struct blkio_group_stats_cpu *stats_cpu;
 	unsigned long flags;
 
+	/* If per cpu stats are not allocated yet, don't do any accounting. */
+	if (pd->stats_cpu == NULL)
+		return;
+
 	/*
 	 * Disabling interrupts to provide mutual exclusion between two
 	 * writes on same cpu. It probably is not needed for 64bit. Not
@@ -443,6 +454,10 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
 	struct blkio_group_stats_cpu *stats_cpu;
 	unsigned long flags;
 
+	/* If per cpu stats are not allocated yet, don't do any accounting. */
+	if (pd->stats_cpu == NULL)
+		return;
+
 	/*
 	 * Disabling interrupts to provide mutual exclusion between two
 	 * writes on same cpu. It probably is not needed for 64bit. Not
@@ -460,6 +475,60 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
+/*
+ * Worker for allocating per cpu stat for blk groups. This is scheduled on
+ * the system_nrt_wq once there are some groups on the alloc_list waiting
+ * for allocation.
+ */
+static void blkio_stat_alloc_fn(struct work_struct *work)
+{
+	static void *pcpu_stats[BLKIO_NR_POLICIES];
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct blkio_group *blkg;
+	int i;
+	bool empty = false;
+
+alloc_stats:
+	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
+		if (pcpu_stats[i] != NULL)
+			continue;
+
+		pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);
+
+		/* Allocation failed. Try again after some time. */
+		if (pcpu_stats[i] == NULL) {
+			queue_delayed_work(system_nrt_wq, dwork,
+						msecs_to_jiffies(10));
+			return;
+		}
+	}
+
+	spin_lock_irq(&blkio_list_lock);
+	spin_lock(&alloc_list_lock);
+
+	/* cgroup got deleted or queue exited. */
+	if (!list_empty(&alloc_list)) {
+		blkg = list_first_entry(&alloc_list, struct blkio_group,
+						alloc_node);
+		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
+			struct blkg_policy_data *pd = blkg->pd[i];
+
+			if (blkio_policy[i] && pd && !pd->stats_cpu)
+				swap(pd->stats_cpu, pcpu_stats[i]);
+		}
+
+		list_del_init(&blkg->alloc_node);
+	}
+
+	empty = list_empty(&alloc_list);
+
+	spin_unlock(&alloc_list_lock);
+	spin_unlock_irq(&blkio_list_lock);
+
+	if (!empty)
+		goto alloc_stats;
+}
+
 /**
  * blkg_free - free a blkg
  * @blkg: blkg to free
@@ -491,9 +560,6 @@ static void blkg_free(struct blkio_group *blkg)
  * @q: request_queue the new blkg is associated with
  *
  * Allocate a new blkg assocating @blkcg and @q.
- *
- * FIXME: Should be called with queue locked but currently isn't due to
- *        percpu stat breakage.
  */
 static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 				      struct request_queue *q)
@@ -509,6 +575,7 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 	spin_lock_init(&blkg->stats_lock);
 	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
+	INIT_LIST_HEAD(&blkg->alloc_node);
 	blkg->blkcg = blkcg;
 	blkg->refcnt = 1;
 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
@@ -530,13 +597,6 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 
 		blkg->pd[i] = pd;
 		pd->blkg = blkg;
-
-		/* broken, read comment in the callsite */
-		pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
-		if (!pd->stats_cpu) {
-			blkg_free(blkg);
-			return NULL;
-		}
 	}
 
 	/* invoke per-policy init */
@@ -556,7 +616,7 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 				       bool for_root)
 	__releases(q->queue_lock) __acquires(q->queue_lock)
 {
-	struct blkio_group *blkg, *new_blkg;
+	struct blkio_group *blkg;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(q->queue_lock);
@@ -580,48 +640,27 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 
 	/*
 	 * Allocate and initialize.
-	 *
-	 * FIXME: The following is broken.  Percpu memory allocation
-	 * requires %GFP_KERNEL context and can't be performed from IO
-	 * path.  Allocation here should inherently be atomic and the
-	 * following lock dancing can be removed once the broken percpu
-	 * allocation is fixed.
 	 */
-	spin_unlock_irq(q->queue_lock);
-	rcu_read_unlock();
-
-	new_blkg = blkg_alloc(blkcg, q);
-
-	rcu_read_lock();
-	spin_lock_irq(q->queue_lock);
-
-	/* did bypass get turned on inbetween? */
-	if (unlikely(blk_queue_bypass(q)) && !for_root) {
-		blkg = ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
-		goto out;
-	}
-
-	/* did someone beat us to it? */
-	blkg = blkg_lookup(blkcg, q);
-	if (unlikely(blkg))
-		goto out;
+	blkg = blkg_alloc(blkcg, q);
 
 	/* did alloc fail? */
-	if (unlikely(!new_blkg)) {
+	if (unlikely(!blkg)) {
 		blkg = ERR_PTR(-ENOMEM);
 		goto out;
 	}
 
 	/* insert */
 	spin_lock(&blkcg->lock);
-	swap(blkg, new_blkg);
-
 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 	list_add(&blkg->q_node, &q->blkg_list);
-
 	spin_unlock(&blkcg->lock);
+
+	spin_lock(&alloc_list_lock);
+	list_add(&blkg->alloc_node, &alloc_list);
+	/* Queue per cpu stat allocation from worker thread. */
+	queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
+	spin_unlock(&alloc_list_lock);
 out:
-	blkg_free(new_blkg);
 	return blkg;
 }
 EXPORT_SYMBOL_GPL(blkg_lookup_create);
@@ -654,6 +693,10 @@ static void blkg_destroy(struct blkio_group *blkg)
 	list_del_init(&blkg->q_node);
 	hlist_del_init_rcu(&blkg->blkcg_node);
 
+	spin_lock(&alloc_list_lock);
+	list_del_init(&blkg->alloc_node);
+	spin_unlock(&alloc_list_lock);
+
 	/*
 	 * Put the reference taken at the time of creation so that when all
 	 * queues are gone, group can be destroyed.
@@ -752,6 +795,9 @@ static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
 	struct blkg_policy_data *pd = blkg->pd[plid];
 	struct blkio_group_stats_cpu *stats_cpu;
 	int i, j, k;
+
+	if (pd->stats_cpu == NULL)
+		return;
 	/*
 	 * Note: On 64 bit arch this should not be an issue. This has the
 	 * possibility of returning some inconsistent value on 32bit arch
@@ -883,6 +929,9 @@ static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, int plid,
 	struct blkio_group_stats_cpu *stats_cpu;
 	u64 val = 0, tval;
 
+	if (pd->stats_cpu == NULL)
+		return val;
+
 	for_each_possible_cpu(cpu) {
 		unsigned int start;
 		stats_cpu = per_cpu_ptr(pd->stats_cpu, cpu);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 98cd8533..1de32fe 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -190,6 +190,8 @@ struct blkio_group {
 	spinlock_t stats_lock;
 	struct blkg_policy_data *pd[BLKIO_NR_POLICIES];
 
+	/* List of blkg waiting for per cpu stats memory to be allocated */
+	struct list_head alloc_node;
 	struct rcu_head rcu_head;
 };
 
-- 
cgit v1.1


From 5fe224d2d5fbf8f020b30d0ba69fed7856923752 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 8 Mar 2012 10:53:57 -0800
Subject: blkcg: don't use percpu for merged stats

With recent plug merge updates, merged stats are no longer called for
plug merges and now only updated while holding queue_lock.  As
stats_lock is scheduled to be removed, there's no reason to use percpu
for merged stats.  Don't use percpu for merged stats.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 26 ++++++--------------------
 block/blk-cgroup.h |  6 +++---
 2 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 622fb41..6eedf3a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -451,27 +451,13 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
 				    bool direction, bool sync)
 {
 	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-	struct blkio_group_stats_cpu *stats_cpu;
+	struct blkio_group_stats *stats;
 	unsigned long flags;
 
-	/* If per cpu stats are not allocated yet, don't do any accounting. */
-	if (pd->stats_cpu == NULL)
-		return;
-
-	/*
-	 * Disabling interrupts to provide mutual exclusion between two
-	 * writes on same cpu. It probably is not needed for 64bit. Not
-	 * optimizing that case yet.
-	 */
-	local_irq_save(flags);
-
-	stats_cpu = this_cpu_ptr(pd->stats_cpu);
-
-	u64_stats_update_begin(&stats_cpu->syncp);
-	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
-				direction, sync);
-	u64_stats_update_end(&stats_cpu->syncp);
-	local_irq_restore(flags);
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	stats = &pd->stats;
+	blkio_add_stat(stats->stat_arr[BLKIO_STAT_MERGED], 1, direction, sync);
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
@@ -1342,7 +1328,7 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
 						BLKIO_STAT_WAIT_TIME, 1, 0);
 		case BLKIO_PROP_io_merged:
 			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_CPU_MERGED, 1, 1);
+						BLKIO_STAT_MERGED, 1, 0);
 		case BLKIO_PROP_io_queued:
 			return blkio_read_blkg_stats(blkcg, cft, cb,
 						BLKIO_STAT_QUEUED, 1, 0);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 1de32fe..6c8e3e3 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -29,10 +29,12 @@ enum blkio_policy_id {
 #ifdef CONFIG_BLK_CGROUP
 
 enum stat_type {
+	/* Number of IOs merged */
+	BLKIO_STAT_MERGED,
 	/* Total time spent (in ns) between request dispatch to the driver and
 	 * request completion for IOs doen by this cgroup. This may not be
 	 * accurate when NCQ is turned on. */
-	BLKIO_STAT_SERVICE_TIME = 0,
+	BLKIO_STAT_SERVICE_TIME,
 	/* Total time spent waiting in scheduler queue in ns */
 	BLKIO_STAT_WAIT_TIME,
 	/* Number of IOs queued up */
@@ -57,8 +59,6 @@ enum stat_type_cpu {
 	BLKIO_STAT_CPU_SERVICE_BYTES,
 	/* Total IOs serviced, post merge */
 	BLKIO_STAT_CPU_SERVICED,
-	/* Number of IOs merged */
-	BLKIO_STAT_CPU_MERGED,
 	BLKIO_STAT_CPU_NR
 };
 
-- 
cgit v1.1


From 997a026c80c3cc05f82e589aced1f0011c17d376 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 8 Mar 2012 10:53:58 -0800
Subject: blkcg: simplify stat reset

blkiocg_reset_stats() implements stat reset for blkio.reset_stats
cgroupfs file.  This feature is very unconventional and something
which shouldn't have been merged.  It's only useful when there's only
one user or tool looking at the stats.  As soon as multiple users
and/or tools are involved, it becomes useless as resetting disrupts
other usages.  There are very good reasons why all other stats expect
readers to read values at the start and end of a period and subtract
to determine delta over the period.

The implementation is rather complex - some fields shouldn't be
cleared and, for some reason, it saves those fields, resets the whole
structure and restores them.  Reset of percpu stats is also racy.  The
comment points to 64bit store atomicity as the reason but even without
that, stores of zero can simply race with other CPUs doing RMW and get
clobbered.

Simplify reset by

* Clear selectively instead of resetting and restoring.

* Grouping debug stat fields to be reset and using memset() over them.

* Not caring about stats_lock.

* Using memset() to reset percpu stats.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 80 +++++++++++++++++-------------------------------------
 block/blk-cgroup.h | 14 ++++++++--
 2 files changed, 37 insertions(+), 57 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 6eedf3a..759bc58 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -779,83 +779,53 @@ EXPORT_SYMBOL_GPL(__blkg_release);
 static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
 {
 	struct blkg_policy_data *pd = blkg->pd[plid];
-	struct blkio_group_stats_cpu *stats_cpu;
-	int i, j, k;
+	int cpu;
 
 	if (pd->stats_cpu == NULL)
 		return;
-	/*
-	 * Note: On 64 bit arch this should not be an issue. This has the
-	 * possibility of returning some inconsistent value on 32bit arch
-	 * as 64bit update on 32bit is non atomic. Taking care of this
-	 * corner case makes code very complicated, like sending IPIs to
-	 * cpus, taking care of stats of offline cpus etc.
-	 *
-	 * reset stats is anyway more of a debug feature and this sounds a
-	 * corner case. So I am not complicating the code yet until and
-	 * unless this becomes a real issue.
-	 */
-	for_each_possible_cpu(i) {
-		stats_cpu = per_cpu_ptr(pd->stats_cpu, i);
-		stats_cpu->sectors = 0;
-		for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
-			for (k = 0; k < BLKIO_STAT_TOTAL; k++)
-				stats_cpu->stat_arr_cpu[j][k] = 0;
+
+	for_each_possible_cpu(cpu) {
+		struct blkio_group_stats_cpu *sc =
+			per_cpu_ptr(pd->stats_cpu, cpu);
+
+		sc->sectors = 0;
+		memset(sc->stat_arr_cpu, 0, sizeof(sc->stat_arr_cpu));
 	}
 }
 
 static int
 blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 {
-	struct blkio_cgroup *blkcg;
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
 	struct blkio_group *blkg;
-	struct blkio_group_stats *stats;
 	struct hlist_node *n;
-	uint64_t queued[BLKIO_STAT_TOTAL];
 	int i;
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	bool idling, waiting, empty;
-	unsigned long long now = sched_clock();
-#endif
 
-	blkcg = cgroup_to_blkio_cgroup(cgroup);
 	spin_lock(&blkio_list_lock);
 	spin_lock_irq(&blkcg->lock);
+
+	/*
+	 * Note that stat reset is racy - it doesn't synchronize against
+	 * stat updates.  This is a debug feature which shouldn't exist
+	 * anyway.  If you get hit by a race, retry.
+	 */
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
 		struct blkio_policy_type *pol;
 
 		list_for_each_entry(pol, &blkio_list, list) {
 			struct blkg_policy_data *pd = blkg->pd[pol->plid];
-
-			spin_lock(&blkg->stats_lock);
-			stats = &pd->stats;
+			struct blkio_group_stats *stats = &pd->stats;
+
+			/* queued stats shouldn't be cleared */
+			for (i = 0; i < ARRAY_SIZE(stats->stat_arr); i++)
+				if (i != BLKIO_STAT_QUEUED)
+					memset(stats->stat_arr[i], 0,
+					       sizeof(stats->stat_arr[i]));
+			stats->time = 0;
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-			idling = blkio_blkg_idling(stats);
-			waiting = blkio_blkg_waiting(stats);
-			empty = blkio_blkg_empty(stats);
+			memset((void *)stats + BLKG_STATS_DEBUG_CLEAR_START, 0,
+			       BLKG_STATS_DEBUG_CLEAR_SIZE);
 #endif
-			for (i = 0; i < BLKIO_STAT_TOTAL; i++)
-				queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
-			memset(stats, 0, sizeof(struct blkio_group_stats));
-			for (i = 0; i < BLKIO_STAT_TOTAL; i++)
-				stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-			if (idling) {
-				blkio_mark_blkg_idling(stats);
-				stats->start_idle_time = now;
-			}
-			if (waiting) {
-				blkio_mark_blkg_waiting(stats);
-				stats->start_group_wait_time = now;
-			}
-			if (empty) {
-				blkio_mark_blkg_empty(stats);
-				stats->start_empty_time = now;
-			}
-#endif
-			spin_unlock(&blkg->stats_lock);
-
-			/* Reset Per cpu stats which don't take blkg->stats_lock */
 			blkio_reset_stats_cpu(blkg, pol->plid);
 		}
 	}
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 6c8e3e3..1fa3c5e 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -131,21 +131,31 @@ struct blkio_group_stats {
 
 	/* Total time spent waiting for it to be assigned a timeslice. */
 	uint64_t group_wait_time;
-	uint64_t start_group_wait_time;
 
 	/* Time spent idling for this blkio_group */
 	uint64_t idle_time;
-	uint64_t start_idle_time;
 	/*
 	 * Total time when we have requests queued and do not contain the
 	 * current active queue.
 	 */
 	uint64_t empty_time;
+
+	/* fields after this shouldn't be cleared on stat reset */
+	uint64_t start_group_wait_time;
+	uint64_t start_idle_time;
 	uint64_t start_empty_time;
 	uint16_t flags;
 #endif
 };
 
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+#define BLKG_STATS_DEBUG_CLEAR_START	\
+	offsetof(struct blkio_group_stats, unaccounted_time)
+#define BLKG_STATS_DEBUG_CLEAR_SIZE	\
+	(offsetof(struct blkio_group_stats, start_group_wait_time) - \
+	 BLKG_STATS_DEBUG_CLEAR_START)
+#endif
+
 /* Per cpu blkio group stats */
 struct blkio_group_stats_cpu {
 	uint64_t sectors;
-- 
cgit v1.1


From c4c76a05382c7d05e0b911daa58a827399e9ba1a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 8 Mar 2012 10:53:59 -0800
Subject: blkcg: restructure blkio_get_stat()

Restructure blkio_get_stat() to prepare for removal of stats_lock.

* Define BLKIO_STAT_ARR_NR explicitly to denote which stats have
  subtypes instead of using BLKIO_STAT_QUEUED.

* Separate out stat acquisition and printing.  After this, there are
  only two users of blkio_fill_stat().  Just open code it.

* The code was mixing MAX_KEY_LEN and MAX_KEY_LEN - 1.  There's no
  need to subtract one.  Use MAX_KEY_LEN consistently.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 100 +++++++++++++++++++++++++++--------------------------
 block/blk-cgroup.h |   6 +++-
 2 files changed, 56 insertions(+), 50 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 759bc58..80887bc 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -868,15 +868,6 @@ static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
 	}
 }
 
-static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
-				struct cgroup_map_cb *cb, const char *dname)
-{
-	blkio_get_key_name(0, dname, str, chars_left, true);
-	cb->fill(cb, str, val);
-	return val;
-}
-
-
 static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, int plid,
 			enum stat_type_cpu type, enum stat_sub_type sub_type)
 {
@@ -916,8 +907,9 @@ static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid,
 
 	if (type == BLKIO_STAT_CPU_SECTORS) {
 		val = blkio_read_stat_cpu(blkg, plid, type, 0);
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb,
-				       dname);
+		blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
+		cb->fill(cb, key_str, val);
+		return val;
 	}
 
 	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
@@ -942,50 +934,60 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid,
 			       struct cgroup_map_cb *cb, const char *dname,
 			       enum stat_type type)
 {
-	struct blkg_policy_data *pd = blkg->pd[plid];
-	uint64_t disk_total;
+	struct blkio_group_stats *stats = &blkg->pd[plid]->stats;
+	uint64_t v = 0, disk_total = 0;
 	char key_str[MAX_KEY_LEN];
-	enum stat_sub_type sub_type;
+	int st;
 
-	if (type == BLKIO_STAT_TIME)
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-					pd->stats.time, cb, dname);
+	if (type >= BLKIO_STAT_ARR_NR) {
+		switch (type) {
+		case BLKIO_STAT_TIME:
+			v = stats->time;
+			break;
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-				       pd->stats.unaccounted_time, cb, dname);
-	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
-		uint64_t sum = pd->stats.avg_queue_size_sum;
-		uint64_t samples = pd->stats.avg_queue_size_samples;
-		if (samples)
-			do_div(sum, samples);
-		else
-			sum = 0;
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-				       sum, cb, dname);
-	}
-	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-				       pd->stats.group_wait_time, cb, dname);
-	if (type == BLKIO_STAT_IDLE_TIME)
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-				       pd->stats.idle_time, cb, dname);
-	if (type == BLKIO_STAT_EMPTY_TIME)
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-				       pd->stats.empty_time, cb, dname);
-	if (type == BLKIO_STAT_DEQUEUE)
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-				       pd->stats.dequeue, cb, dname);
+		case BLKIO_STAT_UNACCOUNTED_TIME:
+			v = stats->unaccounted_time;
+			break;
+		case BLKIO_STAT_AVG_QUEUE_SIZE: {
+			uint64_t samples = stats->avg_queue_size_samples;
+
+			if (samples) {
+				v = stats->avg_queue_size_sum;
+				do_div(v, samples);
+			}
+			break;
+		}
+		case BLKIO_STAT_IDLE_TIME:
+			v = stats->idle_time;
+			break;
+		case BLKIO_STAT_EMPTY_TIME:
+			v = stats->empty_time;
+			break;
+		case BLKIO_STAT_DEQUEUE:
+			v = stats->dequeue;
+			break;
+		case BLKIO_STAT_GROUP_WAIT_TIME:
+			v = stats->group_wait_time;
+			break;
 #endif
+		default:
+			WARN_ON_ONCE(1);
+		}
 
-	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
-			sub_type++) {
-		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
-				   false);
-		cb->fill(cb, key_str, pd->stats.stat_arr[type][sub_type]);
+		blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
+		cb->fill(cb, key_str, v);
+		return v;
 	}
-	disk_total = pd->stats.stat_arr[type][BLKIO_STAT_READ] +
-			pd->stats.stat_arr[type][BLKIO_STAT_WRITE];
+
+	for (st = BLKIO_STAT_READ; st < BLKIO_STAT_TOTAL; st++) {
+		v = stats->stat_arr[type][st];
+
+		blkio_get_key_name(st, dname, key_str, MAX_KEY_LEN, false);
+		cb->fill(cb, key_str, v);
+		if (st == BLKIO_STAT_READ || st == BLKIO_STAT_WRITE)
+			disk_total += v;
+	}
+
 	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
 			   false);
 	cb->fill(cb, key_str, disk_total);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 1fa3c5e..8bdcf50 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -39,6 +39,7 @@ enum stat_type {
 	BLKIO_STAT_WAIT_TIME,
 	/* Number of IOs queued up */
 	BLKIO_STAT_QUEUED,
+
 	/* All the single valued stats go below this */
 	BLKIO_STAT_TIME,
 #ifdef CONFIG_DEBUG_BLK_CGROUP
@@ -52,6 +53,9 @@ enum stat_type {
 #endif
 };
 
+/* Types lower than this live in stat_arr and have subtypes */
+#define BLKIO_STAT_ARR_NR	(BLKIO_STAT_QUEUED + 1)
+
 /* Per cpu stats */
 enum stat_type_cpu {
 	BLKIO_STAT_CPU_SECTORS,
@@ -117,7 +121,7 @@ struct blkio_cgroup {
 struct blkio_group_stats {
 	/* total disk time and nr sectors dispatched by this group */
 	uint64_t time;
-	uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
+	uint64_t stat_arr[BLKIO_STAT_ARR_NR][BLKIO_STAT_TOTAL];
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	/* Time not charged to this cgroup */
 	uint64_t unaccounted_time;
-- 
cgit v1.1


From edf1b879e308d37b5b7c414476ab39f79650a253 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 8 Mar 2012 10:54:00 -0800
Subject: blkcg: remove blkio_group->stats_lock

With recent plug merge updates, all non-percpu stat updates happen
under queue_lock making stats_lock unnecessary to synchronize stat
updates.  The only synchronization necessary is stat reading, which
can be done using u64_stats_sync instead.

This patch removes blkio_group->stats_lock and adds
blkio_group_stats->syncp for reader synchronization.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 209 ++++++++++++++++++++++++++---------------------------
 block/blk-cgroup.h |   3 +-
 2 files changed, 103 insertions(+), 109 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 80887bc..b15a517 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -156,7 +156,7 @@ static inline void blkio_update_group_iops(struct blkio_group *blkg,
 
 /*
  * Add to the appropriate stat variable depending on the request type.
- * This should be called with the blkg->stats_lock held.
+ * This should be called with queue_lock held.
  */
 static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
 				bool sync)
@@ -174,7 +174,7 @@ static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
 /*
  * Decrements the appropriate stat variable if non-zero depending on the
  * request type. Panics on value being zero.
- * This should be called with the blkg->stats_lock held.
+ * This should be called with the queue_lock held.
  */
 static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
 {
@@ -195,7 +195,7 @@ static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
 }
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-/* This should be called with the blkg->stats_lock held. */
+/* This should be called with the queue_lock held. */
 static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
 					    struct blkio_policy_type *pol,
 					    struct blkio_group *curr_blkg)
@@ -210,7 +210,7 @@ static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
 	blkio_mark_blkg_waiting(&pd->stats);
 }
 
-/* This should be called with the blkg->stats_lock held. */
+/* This should be called with the queue_lock held. */
 static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
 {
 	unsigned long long now;
@@ -224,7 +224,7 @@ static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
 	blkio_clear_blkg_waiting(stats);
 }
 
-/* This should be called with the blkg->stats_lock held. */
+/* This should be called with the queue_lock held. */
 static void blkio_end_empty_time(struct blkio_group_stats *stats)
 {
 	unsigned long long now;
@@ -241,84 +241,74 @@ static void blkio_end_empty_time(struct blkio_group_stats *stats)
 void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
 					struct blkio_policy_type *pol)
 {
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-	unsigned long flags;
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
 
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	BUG_ON(blkio_blkg_idling(&pd->stats));
-	pd->stats.start_idle_time = sched_clock();
-	blkio_mark_blkg_idling(&pd->stats);
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+	lockdep_assert_held(blkg->q->queue_lock);
+	BUG_ON(blkio_blkg_idling(stats));
+
+	stats->start_idle_time = sched_clock();
+	blkio_mark_blkg_idling(stats);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
 
 void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
 				    struct blkio_policy_type *pol)
 {
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-	unsigned long flags;
-	unsigned long long now;
-	struct blkio_group_stats *stats;
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+
+	lockdep_assert_held(blkg->q->queue_lock);
 
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	stats = &pd->stats;
 	if (blkio_blkg_idling(stats)) {
-		now = sched_clock();
-		if (time_after64(now, stats->start_idle_time))
+		unsigned long long now = sched_clock();
+
+		if (time_after64(now, stats->start_idle_time)) {
+			u64_stats_update_begin(&stats->syncp);
 			stats->idle_time += now - stats->start_idle_time;
+			u64_stats_update_end(&stats->syncp);
+		}
 		blkio_clear_blkg_idling(stats);
 	}
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
 
 void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
 					 struct blkio_policy_type *pol)
 {
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-	unsigned long flags;
-	struct blkio_group_stats *stats;
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
 
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	stats = &pd->stats;
+	lockdep_assert_held(blkg->q->queue_lock);
+
+	u64_stats_update_begin(&stats->syncp);
 	stats->avg_queue_size_sum +=
 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
 	stats->avg_queue_size_samples++;
 	blkio_update_group_wait_time(stats);
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+	u64_stats_update_end(&stats->syncp);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
 
 void blkiocg_set_start_empty_time(struct blkio_group *blkg,
 				  struct blkio_policy_type *pol)
 {
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-	unsigned long flags;
-	struct blkio_group_stats *stats;
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
 
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	stats = &pd->stats;
+	lockdep_assert_held(blkg->q->queue_lock);
 
 	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
-			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
-		spin_unlock_irqrestore(&blkg->stats_lock, flags);
+			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE])
 		return;
-	}
 
 	/*
 	 * group is already marked empty. This can happen if cfqq got new
 	 * request in parent group and moved to this group while being added
 	 * to service tree. Just ignore the event and move on.
 	 */
-	if(blkio_blkg_empty(stats)) {
-		spin_unlock_irqrestore(&blkg->stats_lock, flags);
+	if (blkio_blkg_empty(stats))
 		return;
-	}
 
 	stats->start_empty_time = sched_clock();
 	blkio_mark_blkg_empty(stats);
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
 
@@ -328,6 +318,8 @@ void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
 {
 	struct blkg_policy_data *pd = blkg->pd[pol->plid];
 
+	lockdep_assert_held(blkg->q->queue_lock);
+
 	pd->stats.dequeue += dequeue;
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
@@ -343,15 +335,16 @@ void blkiocg_update_io_add_stats(struct blkio_group *blkg,
 				 struct blkio_group *curr_blkg, bool direction,
 				 bool sync)
 {
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-	unsigned long flags;
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+
+	lockdep_assert_held(blkg->q->queue_lock);
+
+	u64_stats_update_begin(&stats->syncp);
+	blkio_add_stat(stats->stat_arr[BLKIO_STAT_QUEUED], 1, direction, sync);
+	blkio_end_empty_time(stats);
+	u64_stats_update_end(&stats->syncp);
 
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	blkio_add_stat(pd->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
-			sync);
-	blkio_end_empty_time(&pd->stats);
 	blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
 
@@ -359,13 +352,14 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
 				    struct blkio_policy_type *pol,
 				    bool direction, bool sync)
 {
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-	unsigned long flags;
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
 
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	blkio_check_and_dec_stat(pd->stats.stat_arr[BLKIO_STAT_QUEUED],
-					direction, sync);
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+	lockdep_assert_held(blkg->q->queue_lock);
+
+	u64_stats_update_begin(&stats->syncp);
+	blkio_check_and_dec_stat(stats->stat_arr[BLKIO_STAT_QUEUED], direction,
+				 sync);
+	u64_stats_update_end(&stats->syncp);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
 
@@ -374,15 +368,16 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg,
 				   unsigned long time,
 				   unsigned long unaccounted_time)
 {
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-	unsigned long flags;
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+
+	lockdep_assert_held(blkg->q->queue_lock);
 
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	pd->stats.time += time;
+	u64_stats_update_begin(&stats->syncp);
+	stats->time += time;
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-	pd->stats.unaccounted_time += unaccounted_time;
+	stats->unaccounted_time += unaccounted_time;
 #endif
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+	u64_stats_update_end(&stats->syncp);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
 
@@ -428,20 +423,19 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg,
 				     uint64_t io_start_time, bool direction,
 				     bool sync)
 {
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-	struct blkio_group_stats *stats;
-	unsigned long flags;
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
 	unsigned long long now = sched_clock();
 
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	stats = &pd->stats;
+	lockdep_assert_held(blkg->q->queue_lock);
+
+	u64_stats_update_begin(&stats->syncp);
 	if (time_after64(now, io_start_time))
 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
 				now - io_start_time, direction, sync);
 	if (time_after64(io_start_time, start_time))
 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
 				io_start_time - start_time, direction, sync);
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+	u64_stats_update_end(&stats->syncp);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
 
@@ -450,14 +444,13 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
 				    struct blkio_policy_type *pol,
 				    bool direction, bool sync)
 {
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-	struct blkio_group_stats *stats;
-	unsigned long flags;
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
 
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	stats = &pd->stats;
+	lockdep_assert_held(blkg->q->queue_lock);
+
+	u64_stats_update_begin(&stats->syncp);
 	blkio_add_stat(stats->stat_arr[BLKIO_STAT_MERGED], 1, direction, sync);
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+	u64_stats_update_end(&stats->syncp);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
@@ -558,7 +551,6 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 	if (!blkg)
 		return NULL;
 
-	spin_lock_init(&blkg->stats_lock);
 	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
 	INIT_LIST_HEAD(&blkg->alloc_node);
@@ -929,7 +921,6 @@ static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid,
 	return disk_total;
 }
 
-/* This should be called with blkg->stats_lock held */
 static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid,
 			       struct cgroup_map_cb *cb, const char *dname,
 			       enum stat_type type)
@@ -937,42 +928,46 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid,
 	struct blkio_group_stats *stats = &blkg->pd[plid]->stats;
 	uint64_t v = 0, disk_total = 0;
 	char key_str[MAX_KEY_LEN];
+	unsigned int sync_start;
 	int st;
 
 	if (type >= BLKIO_STAT_ARR_NR) {
-		switch (type) {
-		case BLKIO_STAT_TIME:
-			v = stats->time;
-			break;
+		do {
+			sync_start = u64_stats_fetch_begin(&stats->syncp);
+			switch (type) {
+			case BLKIO_STAT_TIME:
+				v = stats->time;
+				break;
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-		case BLKIO_STAT_UNACCOUNTED_TIME:
-			v = stats->unaccounted_time;
-			break;
-		case BLKIO_STAT_AVG_QUEUE_SIZE: {
-			uint64_t samples = stats->avg_queue_size_samples;
+			case BLKIO_STAT_UNACCOUNTED_TIME:
+				v = stats->unaccounted_time;
+				break;
+			case BLKIO_STAT_AVG_QUEUE_SIZE: {
+				uint64_t samples = stats->avg_queue_size_samples;
 
-			if (samples) {
-				v = stats->avg_queue_size_sum;
-				do_div(v, samples);
+				if (samples) {
+					v = stats->avg_queue_size_sum;
+					do_div(v, samples);
+				}
+				break;
 			}
-			break;
-		}
-		case BLKIO_STAT_IDLE_TIME:
-			v = stats->idle_time;
-			break;
-		case BLKIO_STAT_EMPTY_TIME:
-			v = stats->empty_time;
-			break;
-		case BLKIO_STAT_DEQUEUE:
-			v = stats->dequeue;
-			break;
-		case BLKIO_STAT_GROUP_WAIT_TIME:
-			v = stats->group_wait_time;
-			break;
+			case BLKIO_STAT_IDLE_TIME:
+				v = stats->idle_time;
+				break;
+			case BLKIO_STAT_EMPTY_TIME:
+				v = stats->empty_time;
+				break;
+			case BLKIO_STAT_DEQUEUE:
+				v = stats->dequeue;
+				break;
+			case BLKIO_STAT_GROUP_WAIT_TIME:
+				v = stats->group_wait_time;
+				break;
 #endif
-		default:
-			WARN_ON_ONCE(1);
-		}
+			default:
+				WARN_ON_ONCE(1);
+			}
+		} while (u64_stats_fetch_retry(&stats->syncp, sync_start));
 
 		blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
 		cb->fill(cb, key_str, v);
@@ -980,7 +975,10 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid,
 	}
 
 	for (st = BLKIO_STAT_READ; st < BLKIO_STAT_TOTAL; st++) {
-		v = stats->stat_arr[type][st];
+		do {
+			sync_start = u64_stats_fetch_begin(&stats->syncp);
+			v = stats->stat_arr[type][st];
+		} while (u64_stats_fetch_retry(&stats->syncp, sync_start));
 
 		blkio_get_key_name(st, dname, key_str, MAX_KEY_LEN, false);
 		cb->fill(cb, key_str, v);
@@ -1250,15 +1248,12 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
 
 		if (!dname)
 			continue;
-		if (pcpu) {
+		if (pcpu)
 			cgroup_total += blkio_get_stat_cpu(blkg, plid,
 							   cb, dname, type);
-		} else {
-			spin_lock(&blkg->stats_lock);
+		else
 			cgroup_total += blkio_get_stat(blkg, plid,
 						       cb, dname, type);
-			spin_unlock(&blkg->stats_lock);
-		}
 	}
 	if (show_total)
 		cb->fill(cb, "Total", cgroup_total);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 8bdcf50..9df5ab0 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -119,6 +119,7 @@ struct blkio_cgroup {
 };
 
 struct blkio_group_stats {
+	struct u64_stats_sync syncp;
 	/* total disk time and nr sectors dispatched by this group */
 	uint64_t time;
 	uint64_t stat_arr[BLKIO_STAT_ARR_NR][BLKIO_STAT_TOTAL];
@@ -200,8 +201,6 @@ struct blkio_group {
 	/* reference count */
 	int refcnt;
 
-	/* Need to serialize the stats in the case of reset/update */
-	spinlock_t stats_lock;
 	struct blkg_policy_data *pd[BLKIO_NR_POLICIES];
 
 	/* List of blkg waiting for per cpu stats memory to be allocated */
-- 
cgit v1.1


From 9a9e8a26da4c2c5ddc60999bdea957935fb22b6b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 19 Mar 2012 15:10:56 -0700
Subject: blkcg: add blkcg->id

Add a 64bit unique id to blkcg.  This will be used by policies which
need a blkcg identity test to tell whether the associated blkcg has
changed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 3 +++
 block/blk-cgroup.h | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b15a517..30e0730 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include <linux/genhd.h>
 #include <linux/delay.h>
+#include <linux/atomic.h>
 #include "blk-cgroup.h"
 #include "blk.h"
 
@@ -1622,6 +1623,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 static struct cgroup_subsys_state *
 blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 {
+	static atomic64_t id_seq = ATOMIC64_INIT(0);
 	struct blkio_cgroup *blkcg;
 	struct cgroup *parent = cgroup->parent;
 
@@ -1635,6 +1637,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 		return ERR_PTR(-ENOMEM);
 
 	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
+	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
 done:
 	spin_lock_init(&blkcg->lock);
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 9df5ab0..1cb8f76 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -116,6 +116,9 @@ struct blkio_cgroup {
 	unsigned int weight;
 	spinlock_t lock;
 	struct hlist_head blkg_list;
+
+	/* for policies to test whether associated blkcg has changed */
+	uint64_t id;
 };
 
 struct blkio_group_stats {
-- 
cgit v1.1


From abede6da27d9bd62ea9512830c83e32b3ee1104c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 19 Mar 2012 15:10:57 -0700
Subject: cfq: pass around cfq_io_cq instead of io_context

Now that io_cq is managed by block core and guaranteed to exist for
any in-flight request, it is easier and carries more information to
pass around cfq_io_cq than io_context.

This patch updates cfq_init_prio_data(), cfq_find_alloc_queue() and
cfq_get_queue() to take @cic instead of @ioc.  This change removes a
duplicate cfq_cic_lookup() from cfq_find_alloc_queue().

This change enables the use of cic-cached ioprio in the next patch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c | 39 +++++++++++++++++----------------------
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f2387b5..9e8624e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -468,7 +468,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
-				       struct io_context *ioc, struct bio *bio,
+				       struct cfq_io_cq *cic, struct bio *bio,
 				       gfp_t gfp_mask);
 
 static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
@@ -2560,7 +2560,7 @@ static void cfq_exit_icq(struct io_cq *icq)
 	}
 }
 
-static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
+static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
 {
 	struct task_struct *tsk = current;
 	int ioprio_class;
@@ -2568,7 +2568,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 	if (!cfq_cfqq_prio_changed(cfqq))
 		return;
 
-	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
+	ioprio_class = IOPRIO_PRIO_CLASS(cic->icq.ioc->ioprio);
 	switch (ioprio_class) {
 	default:
 		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
@@ -2580,11 +2580,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 		cfqq->ioprio_class = task_nice_ioclass(tsk);
 		break;
 	case IOPRIO_CLASS_RT:
-		cfqq->ioprio = task_ioprio(ioc);
+		cfqq->ioprio = task_ioprio(cic->icq.ioc);
 		cfqq->ioprio_class = IOPRIO_CLASS_RT;
 		break;
 	case IOPRIO_CLASS_BE:
-		cfqq->ioprio = task_ioprio(ioc);
+		cfqq->ioprio = task_ioprio(cic->icq.ioc);
 		cfqq->ioprio_class = IOPRIO_CLASS_BE;
 		break;
 	case IOPRIO_CLASS_IDLE:
@@ -2613,8 +2613,8 @@ static void changed_ioprio(struct cfq_io_cq *cic, struct bio *bio)
 	cfqq = cic->cfqq[BLK_RW_ASYNC];
 	if (cfqq) {
 		struct cfq_queue *new_cfqq;
-		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc,
-					 bio, GFP_ATOMIC);
+		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
+					 GFP_ATOMIC);
 		if (new_cfqq) {
 			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
 			cfq_put_queue(cfqq);
@@ -2671,23 +2671,18 @@ static void changed_cgroup(struct cfq_io_cq *cic)
 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 
 static struct cfq_queue *
-cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
-		     struct io_context *ioc, struct bio *bio, gfp_t gfp_mask)
+cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
+		     struct bio *bio, gfp_t gfp_mask)
 {
 	struct blkio_cgroup *blkcg;
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
-	struct cfq_io_cq *cic;
 	struct cfq_group *cfqg;
 
 retry:
 	rcu_read_lock();
 
 	blkcg = bio_blkio_cgroup(bio);
-
 	cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
-
-	cic = cfq_cic_lookup(cfqd, ioc);
-	/* cic always exists here */
 	cfqq = cic_to_cfqq(cic, is_sync);
 
 	/*
@@ -2716,7 +2711,7 @@ retry:
 
 		if (cfqq) {
 			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
-			cfq_init_prio_data(cfqq, ioc);
+			cfq_init_prio_data(cfqq, cic);
 			cfq_link_cfqq_cfqg(cfqq, cfqg);
 			cfq_log_cfqq(cfqd, cfqq, "alloced");
 		} else
@@ -2746,11 +2741,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
 }
 
 static struct cfq_queue *
-cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
+cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
 	      struct bio *bio, gfp_t gfp_mask)
 {
-	const int ioprio = task_ioprio(ioc);
-	const int ioprio_class = task_ioprio_class(ioc);
+	const int ioprio = task_ioprio(cic->icq.ioc);
+	const int ioprio_class = task_ioprio_class(cic->icq.ioc);
 	struct cfq_queue **async_cfqq = NULL;
 	struct cfq_queue *cfqq = NULL;
 
@@ -2760,7 +2755,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
 	}
 
 	if (!cfqq)
-		cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, bio, gfp_mask);
+		cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
 
 	/*
 	 * pin the queue now that it's allocated, scheduler exit will prune it
@@ -3030,7 +3025,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
 	cfq_log_cfqq(cfqd, cfqq, "insert_request");
-	cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc);
+	cfq_init_prio_data(cfqq, RQ_CIC(rq));
 
 	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
 	list_add_tail(&rq->queuelist, &cfqq->fifo);
@@ -3234,7 +3229,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
 
 	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
 	if (cfqq) {
-		cfq_init_prio_data(cfqq, cic->icq.ioc);
+		cfq_init_prio_data(cfqq, cic);
 
 		return __cfq_may_queue(cfqq);
 	}
@@ -3326,7 +3321,7 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
 new_queue:
 	cfqq = cic_to_cfqq(cic, is_sync);
 	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-		cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, bio, gfp_mask);
+		cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
 		cic_set_cfqq(cic, cfqq, is_sync);
 	} else {
 		/*
-- 
cgit v1.1


From 598971bfbdfdc8701337dc1636c7919c44699914 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 19 Mar 2012 15:10:58 -0700
Subject: cfq: don't use icq_get_changed()

cfq caches the associated cfqq's for a given cic.  The cache needs to
be flushed if the cic's ioprio or blkcg has changed.  It is currently
done by requiring the changing action to set the respective
ICQ_*_CHANGED bit in the icq and testing it from cfq_set_request(),
which involves iterating through all the affected icqs.

All cfq wants to know is whether ioprio and/or blkcg have changed
since the last flush, which can be easily achieved by just remembering
the current ioprio and blkcg ID in cic.

This patch adds cic->{ioprio|blkcg_id}, updates all ioprio users to
use the remembered value instead, and updates the cfq_set_request()
path such that, instead of using icq_get_changed(), the current values
are compared against the remembered ones and the appropriate flush
action is triggered if they differ.  Condition tests are moved inside
both _changed functions, which are now named check_ioprio_changed()
and check_blkcg_changed().

ioprio.h::task_ioprio*() can't be used anymore and are replaced with
an open-coded IOPRIO_CLASS_NONE case in cfq_async_queue_prio().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c    | 63 ++++++++++++++++++++++++++++++++------------------
 include/linux/ioprio.h | 22 ++++--------------
 2 files changed, 45 insertions(+), 40 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9e8624e..7c3893d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -218,6 +218,10 @@ struct cfq_io_cq {
 	struct io_cq		icq;		/* must be the first member */
 	struct cfq_queue	*cfqq[2];
 	struct cfq_ttime	ttime;
+	int			ioprio;		/* the current ioprio */
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+	uint64_t		blkcg_id;	/* the current blkcg ID */
+#endif
 };
 
 /*
@@ -2568,7 +2572,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
 	if (!cfq_cfqq_prio_changed(cfqq))
 		return;
 
-	ioprio_class = IOPRIO_PRIO_CLASS(cic->icq.ioc->ioprio);
+	ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
 	switch (ioprio_class) {
 	default:
 		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
@@ -2580,11 +2584,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
 		cfqq->ioprio_class = task_nice_ioclass(tsk);
 		break;
 	case IOPRIO_CLASS_RT:
-		cfqq->ioprio = task_ioprio(cic->icq.ioc);
+		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
 		cfqq->ioprio_class = IOPRIO_CLASS_RT;
 		break;
 	case IOPRIO_CLASS_BE:
-		cfqq->ioprio = task_ioprio(cic->icq.ioc);
+		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
 		cfqq->ioprio_class = IOPRIO_CLASS_BE;
 		break;
 	case IOPRIO_CLASS_IDLE:
@@ -2602,12 +2606,17 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
 	cfq_clear_cfqq_prio_changed(cfqq);
 }
 
-static void changed_ioprio(struct cfq_io_cq *cic, struct bio *bio)
+static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
 {
+	int ioprio = cic->icq.ioc->ioprio;
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *cfqq;
 
-	if (unlikely(!cfqd))
+	/*
+	 * Check whether ioprio has changed.  The condition may trigger
+	 * spuriously on a newly created cic but there's no harm.
+	 */
+	if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
 		return;
 
 	cfqq = cic->cfqq[BLK_RW_ASYNC];
@@ -2624,6 +2633,8 @@ static void changed_ioprio(struct cfq_io_cq *cic, struct bio *bio)
 	cfqq = cic->cfqq[BLK_RW_SYNC];
 	if (cfqq)
 		cfq_mark_cfqq_prio_changed(cfqq);
+
+	cic->ioprio = ioprio;
 }
 
 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
@@ -2647,17 +2658,24 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void changed_cgroup(struct cfq_io_cq *cic)
+static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 {
-	struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
-	struct request_queue *q;
+	struct cfq_queue *sync_cfqq;
+	uint64_t id;
 
-	if (unlikely(!cfqd))
-		return;
+	rcu_read_lock();
+	id = bio_blkio_cgroup(bio)->id;
+	rcu_read_unlock();
 
-	q = cfqd->queue;
+	/*
+	 * Check whether blkcg has changed.  The condition may trigger
+	 * spuriously on a newly created cic but there's no harm.
+	 */
+	if (unlikely(!cfqd) || likely(cic->blkcg_id == id))
+		return;
 
+	sync_cfqq = cic_to_cfqq(cic, 1);
 	if (sync_cfqq) {
 		/*
 		 * Drop reference to sync queue. A new sync queue will be
@@ -2667,7 +2685,11 @@ static void changed_cgroup(struct cfq_io_cq *cic)
 		cic_set_cfqq(cic, NULL, 1);
 		cfq_put_queue(sync_cfqq);
 	}
+
+	cic->blkcg_id = id;
 }
+#else
+static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 
 static struct cfq_queue *
@@ -2731,6 +2753,9 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
 	switch (ioprio_class) {
 	case IOPRIO_CLASS_RT:
 		return &cfqd->async_cfqq[0][ioprio];
+	case IOPRIO_CLASS_NONE:
+		ioprio = IOPRIO_NORM;
+		/* fall through */
 	case IOPRIO_CLASS_BE:
 		return &cfqd->async_cfqq[1][ioprio];
 	case IOPRIO_CLASS_IDLE:
@@ -2744,8 +2769,8 @@ static struct cfq_queue *
 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
 	      struct bio *bio, gfp_t gfp_mask)
 {
-	const int ioprio = task_ioprio(cic->icq.ioc);
-	const int ioprio_class = task_ioprio_class(cic->icq.ioc);
+	const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
+	const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
 	struct cfq_queue **async_cfqq = NULL;
 	struct cfq_queue *cfqq = NULL;
 
@@ -3303,21 +3328,13 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
 	const int rw = rq_data_dir(rq);
 	const bool is_sync = rq_is_sync(rq);
 	struct cfq_queue *cfqq;
-	unsigned int changed;
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
 	spin_lock_irq(q->queue_lock);
 
-	/* handle changed notifications */
-	changed = icq_get_changed(&cic->icq);
-	if (unlikely(changed & ICQ_IOPRIO_CHANGED))
-		changed_ioprio(cic, bio);
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	if (unlikely(changed & ICQ_CGROUP_CHANGED))
-		changed_cgroup(cic);
-#endif
-
+	check_ioprio_changed(cic, bio);
+	check_blkcg_changed(cic, bio);
 new_queue:
 	cfqq = cic_to_cfqq(cic, is_sync);
 	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 76dad48..beb9ce1 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -42,26 +42,14 @@ enum {
 };
 
 /*
- * if process has set io priority explicitly, use that. if not, convert
- * the cpu scheduler nice value to an io priority
+ * Fallback BE priority
  */
 #define IOPRIO_NORM	(4)
-static inline int task_ioprio(struct io_context *ioc)
-{
-	if (ioprio_valid(ioc->ioprio))
-		return IOPRIO_PRIO_DATA(ioc->ioprio);
-
-	return IOPRIO_NORM;
-}
-
-static inline int task_ioprio_class(struct io_context *ioc)
-{
-	if (ioprio_valid(ioc->ioprio))
-		return IOPRIO_PRIO_CLASS(ioc->ioprio);
-
-	return IOPRIO_CLASS_BE;
-}
 
+/*
+ * if process has set io priority explicitly, use that. if not, convert
+ * the cpu scheduler nice value to an io priority
+ */
 static inline int task_nice_ioprio(struct task_struct *task)
 {
 	return (task_nice(task) + 20) / 5;
-- 
cgit v1.1


From 2b566fa55b9a94b53217c2818e6c5e5756eeb1a1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 19 Mar 2012 15:10:59 -0700
Subject: block: remove ioc_*_changed()

After the previous patch to cfq, there's no icq_get_changed() user
left.  This patch yanks out ioc_{ioprio|cgroup}_changed(),
icq_get_changed() and all related stuff.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c        | 19 -------------
 block/blk-ioc.c           | 68 -----------------------------------------------
 fs/ioprio.c               |  2 +-
 include/linux/iocontext.h |  7 -----
 4 files changed, 1 insertion(+), 95 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 30e0730..a74019b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -47,8 +47,6 @@ static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
 						  struct cgroup *);
 static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
 			      struct cgroup_taskset *);
-static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
-			   struct cgroup_taskset *);
 static int blkiocg_pre_destroy(struct cgroup_subsys *, struct cgroup *);
 static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
@@ -63,7 +61,6 @@ struct cgroup_subsys blkio_subsys = {
 	.name = "blkio",
 	.create = blkiocg_create,
 	.can_attach = blkiocg_can_attach,
-	.attach = blkiocg_attach,
 	.pre_destroy = blkiocg_pre_destroy,
 	.destroy = blkiocg_destroy,
 	.populate = blkiocg_populate,
@@ -1729,22 +1726,6 @@ static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	return ret;
 }
 
-static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-			   struct cgroup_taskset *tset)
-{
-	struct task_struct *task;
-	struct io_context *ioc;
-
-	cgroup_taskset_for_each(task, cgrp, tset) {
-		/* we don't lose anything even if ioc allocation fails */
-		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
-		if (ioc) {
-			ioc_cgroup_changed(ioc);
-			put_io_context(ioc);
-		}
-	}
-}
-
 static void blkcg_bypass_start(void)
 	__acquires(&all_q_mutex)
 {
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 439ec21..3f3dd51 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -388,74 +388,6 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
 	return icq;
 }
 
-void ioc_set_icq_flags(struct io_context *ioc, unsigned int flags)
-{
-	struct io_cq *icq;
-	struct hlist_node *n;
-
-	hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node)
-		icq->flags |= flags;
-}
-
-/**
- * ioc_ioprio_changed - notify ioprio change
- * @ioc: io_context of interest
- * @ioprio: new ioprio
- *
- * @ioc's ioprio has changed to @ioprio.  Set %ICQ_IOPRIO_CHANGED for all
- * icq's.  iosched is responsible for checking the bit and applying it on
- * request issue path.
- */
-void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&ioc->lock, flags);
-	ioc->ioprio = ioprio;
-	ioc_set_icq_flags(ioc, ICQ_IOPRIO_CHANGED);
-	spin_unlock_irqrestore(&ioc->lock, flags);
-}
-
-/**
- * ioc_cgroup_changed - notify cgroup change
- * @ioc: io_context of interest
- *
- * @ioc's cgroup has changed.  Set %ICQ_CGROUP_CHANGED for all icq's.
- * iosched is responsible for checking the bit and applying it on request
- * issue path.
- */
-void ioc_cgroup_changed(struct io_context *ioc)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&ioc->lock, flags);
-	ioc_set_icq_flags(ioc, ICQ_CGROUP_CHANGED);
-	spin_unlock_irqrestore(&ioc->lock, flags);
-}
-EXPORT_SYMBOL(ioc_cgroup_changed);
-
-/**
- * icq_get_changed - fetch and clear icq changed mask
- * @icq: icq of interest
- *
- * Fetch and clear ICQ_*_CHANGED bits from @icq.  Grabs and releases
- * @icq->ioc->lock.
- */
-unsigned icq_get_changed(struct io_cq *icq)
-{
-	unsigned int changed = 0;
-	unsigned long flags;
-
-	if (unlikely(icq->flags & ICQ_CHANGED_MASK)) {
-		spin_lock_irqsave(&icq->ioc->lock, flags);
-		changed = icq->flags & ICQ_CHANGED_MASK;
-		icq->flags &= ~ICQ_CHANGED_MASK;
-		spin_unlock_irqrestore(&icq->ioc->lock, flags);
-	}
-	return changed;
-}
-EXPORT_SYMBOL(icq_get_changed);
-
 static int __init blk_ioc_init(void)
 {
 	iocontext_cachep = kmem_cache_create("blkdev_ioc",
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 0f1b951..4864437 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -50,7 +50,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
 
 	ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
 	if (ioc) {
-		ioc_ioprio_changed(ioc, ioprio);
+		ioc->ioprio = ioprio;
 		put_io_context(ioc);
 	}
 
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 6f1a260..df38db2 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -6,11 +6,7 @@
 #include <linux/workqueue.h>
 
 enum {
-	ICQ_IOPRIO_CHANGED	= 1 << 0,
-	ICQ_CGROUP_CHANGED	= 1 << 1,
 	ICQ_EXITED		= 1 << 2,
-
-	ICQ_CHANGED_MASK	= ICQ_IOPRIO_CHANGED | ICQ_CGROUP_CHANGED,
 };
 
 /*
@@ -152,9 +148,6 @@ void put_io_context_active(struct io_context *ioc);
 void exit_io_context(struct task_struct *task);
 struct io_context *get_task_io_context(struct task_struct *task,
 				       gfp_t gfp_flags, int node);
-void ioc_ioprio_changed(struct io_context *ioc, int ioprio);
-void ioc_cgroup_changed(struct io_context *ioc);
-unsigned int icq_get_changed(struct io_cq *icq);
 #else
 struct io_context;
 static inline void put_io_context(struct io_context *ioc) { }
-- 
cgit v1.1


From eb7d8c07f9c5fca6190b0d328179551122d1b8a3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 23 Mar 2012 14:02:53 +0100
Subject: cfq: fix cfqg ref handling when BLK_CGROUP && !CFQ_GROUP_IOSCHED

When BLK_CGROUP is enabled but CFQ_GROUP_IOSCHED isn't, cfq ends up
calling blkg_get/put() on dummy cfqg leading to the following crash.

  BUG: unable to handle kernel NULL pointer dereference at 00000000000000b0
  IP: [<ffffffff813d44d8>] cfq_init_queue+0x258/0x430
  PGD 0
  Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
  CPU 0
  Modules linked in:

  Pid: 1, comm: swapper/0 Not tainted 3.3.0-rc6-work+ #125 Bochs Bochs
  RIP: 0010:[<ffffffff813d44d8>]  [<ffffffff813d44d8>] cfq_init_queue+0x258/0x430
  RSP: 0018:ffff88001f9dfd80  EFLAGS: 00010046
  RAX: ffff88001aefbbf0 RBX: ffff88001aeedbf0 RCX: 0000000000000100
  RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff820ffd40
  RBP: ffff88001f9dfdd0 R08: 0000000000000000 R09: 0000000000000001
  R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000
  R13: 0000000000000009 R14: ffff88001aefbc30 R15: 0000000000000003
  FS:  0000000000000000(0000) GS:ffff88001fc00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
  CR2: 00000000000000b0 CR3: 000000000206f000 CR4: 00000000000006f0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
  Process swapper/0 (pid: 1, threadinfo ffff88001f9de000, task ffff88001f9dc040)
  Stack:
   ffff88001aeedbf0 ffff88001aefbdb0 ffff88001aef1548 ffff88001aefbbf0
   ffff88001f9dfdd0 ffff88001aef1548 ffffffff820d6320 ffffffff8165ce30
   ffffffff82c555e0 ffff88001aeebbf0 ffff88001f9dfe00 ffffffff813b0507
  Call Trace:
   [<ffffffff813b0507>] elevator_init+0xd7/0x140
   [<ffffffff813b83d5>] blk_init_allocated_queue+0x125/0x150
   [<ffffffff813b94d3>] blk_init_queue_node+0x43/0x80
   [<ffffffff813b9523>] blk_init_queue+0x13/0x20
   [<ffffffff821aec00>] floppy_init+0x82/0xec7
   [<ffffffff810001d2>] do_one_initcall+0x42/0x170
   [<ffffffff821835fc>] kernel_init+0xcb/0x14f
   [<ffffffff81b40b24>] kernel_thread_helper+0x4/0x10
  Code: 00 e8 1d 9e 76 00 48 8b 43 48 48 85 c0 48 89 83 28 03 00 00 74 07 4c 8b a0 10 ff ff ff 8b 15 b0 2e d0 00 85 d2 0f 85 49 01 00 00 <41> 8b 84 24 b0 00 00 00 85 c0 0f 8e 8c 01 00 00 83 e8 01 85 c0
  RIP  [<ffffffff813d44d8>] cfq_init_queue+0x258/0x430

Because cfq's blkcg support has an on/off switch, CFQ_GROUP_IOSCHED,
separate from BLK_CGROUP, blkg access through cfqg needs to be
conditioned on it.

* Make blkg_to_cfqg() and cfqg_to_blkg() conditioned on
  CFQ_GROUP_IOSCHED.  If disabled, they always return %NULL.

* Introduce cfqg_get() and cfqg_put() conditioned on
  CFQ_GROUP_IOSCHED.  If disabled, they are noops.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c | 52 +++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 17 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 7c3893d..39c4330 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -306,16 +306,6 @@ struct cfq_data {
 	unsigned long last_delayed_sync;
 };
 
-static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg)
-{
-	return blkg_to_pdata(blkg, &blkio_policy_cfq);
-}
-
-static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg)
-{
-	return pdata_to_blkg(cfqg, &blkio_policy_cfq);
-}
-
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
@@ -377,6 +367,26 @@ CFQ_CFQQ_FNS(wait_busy);
 #undef CFQ_CFQQ_FNS
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
+static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg)
+{
+	return blkg_to_pdata(blkg, &blkio_policy_cfq);
+}
+
+static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg)
+{
+	return pdata_to_blkg(cfqg, &blkio_policy_cfq);
+}
+
+static inline void cfqg_get(struct cfq_group *cfqg)
+{
+	return blkg_get(cfqg_to_blkg(cfqg));
+}
+
+static inline void cfqg_put(struct cfq_group *cfqg)
+{
+	return blkg_put(cfqg_to_blkg(cfqg));
+}
+
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
 	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
 			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
@@ -386,11 +396,19 @@ CFQ_CFQQ_FNS(wait_busy);
 	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
 			blkg_path(cfqg_to_blkg((cfqg))), ##args)	\
 
-#else
+#else	/* CONFIG_CFQ_GROUP_IOSCHED */
+
+static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg) { return NULL; }
+static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg) { return NULL; }
+static inline void cfqg_get(struct cfq_group *cfqg) { }
+static inline void cfqg_put(struct cfq_group *cfqg) { }
+
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
 	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
-#endif
+
+#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
+
 #define cfq_log(cfqd, fmt, args...)	\
 	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
 
@@ -1090,7 +1108,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 
 	cfqq->cfqg = cfqg;
 	/* cfqq reference on cfqg */
-	blkg_get(cfqg_to_blkg(cfqg));
+	cfqg_get(cfqg);
 }
 
 #else /* GROUP_IOSCHED */
@@ -2505,7 +2523,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 	kmem_cache_free(cfq_pool, cfqq);
-	blkg_put(cfqg_to_blkg(cfqg));
+	cfqg_put(cfqg);
 }
 
 static void cfq_put_cooperator(struct cfq_queue *cfqq)
@@ -3276,7 +3294,7 @@ static void cfq_put_request(struct request *rq)
 		cfqq->allocated[rw]--;
 
 		/* Put down rq reference on cfqg */
-		blkg_put(cfqg_to_blkg(RQ_CFQG(rq)));
+		cfqg_put(RQ_CFQG(rq));
 		rq->elv.priv[0] = NULL;
 		rq->elv.priv[1] = NULL;
 
@@ -3364,7 +3382,7 @@ new_queue:
 	cfqq->allocated[rw]++;
 
 	cfqq->ref++;
-	blkg_get(cfqg_to_blkg(cfqq->cfqg));
+	cfqg_get(cfqq->cfqg);
 	rq->elv.priv[0] = cfqq;
 	rq->elv.priv[1] = cfqq->cfqg;
 	spin_unlock_irq(q->queue_lock);
@@ -3545,7 +3563,7 @@ static int cfq_init_queue(struct request_queue *q)
 
 	spin_lock_irq(q->queue_lock);
 	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
-	blkg_put(cfqg_to_blkg(cfqd->root_group));
+	cfqg_put(cfqd->root_group);
 	spin_unlock_irq(q->queue_lock);
 
 	init_timer(&cfqd->idle_slice_timer);
-- 
cgit v1.1


From a5567932fc926739e29e98487128080f40c61710 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 29 Mar 2012 20:57:08 +0200
Subject: blkcg: change a spin_lock() to spin_lock_irq()

Smatch complains that we re-enable IRQs twice.  It looks like we forgot
to disable them here on the spin_trylock() failure path.  This was added
in 9f13ef678e "blkcg: use double locking instead of RCU for blkg
synchronization".

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index a74019b..aa54c41 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1601,7 +1601,7 @@ static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
 		} else {
 			spin_unlock_irq(&blkcg->lock);
 			cpu_relax();
-			spin_lock(&blkcg->lock);
+			spin_lock_irq(&blkcg->lock);
 		}
 	}
 
-- 
cgit v1.1


From aaec55a002a29bf940588dc03253099a4cd543bf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:42 -0700
Subject: blkcg: remove unused @pol and @plid parameters

@pol to blkg_to_pdata() and @plid to blkg_lookup_create() are no
longer necessary.  Drop them.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c   | 3 +--
 block/blk-cgroup.h   | 8 ++------
 block/blk-throttle.c | 7 +++----
 block/cfq-iosched.c  | 7 +++----
 4 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4fdeb46..55ccbae 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -568,7 +568,6 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 
 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 				       struct request_queue *q,
-				       enum blkio_policy_id plid,
 				       bool for_root)
 	__releases(q->queue_lock) __acquires(q->queue_lock)
 {
@@ -1027,7 +1026,7 @@ static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
 	rcu_read_lock();
 
 	spin_lock_irq(disk->queue->queue_lock);
-	blkg = blkg_lookup_create(blkcg, disk->queue, plid, false);
+	blkg = blkg_lookup_create(blkcg, disk->queue, false);
 	spin_unlock_irq(disk->queue->queue_lock);
 
 	if (IS_ERR(blkg)) {
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 1cb8f76..1add3dc 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -266,13 +266,10 @@ static inline void *blkg_to_pdata(struct blkio_group *blkg,
 /**
  * pdata_to_blkg - get blkg associated with policy private data
  * @pdata: policy private data of interest
- * @pol: policy @pdata is for
  *
- * @pdata is policy private data for @pol.  Determine the blkg it's
- * associated with.
+ * @pdata is policy private data.  Determine the blkg it's associated with.
  */
-static inline struct blkio_group *pdata_to_blkg(void *pdata,
-						struct blkio_policy_type *pol)
+static inline struct blkio_group *pdata_to_blkg(void *pdata)
 {
 	if (pdata) {
 		struct blkg_policy_data *pd =
@@ -402,7 +399,6 @@ extern struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 				       struct request_queue *q);
 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 				       struct request_queue *q,
-				       enum blkio_policy_id plid,
 				       bool for_root);
 void blkiocg_update_timeslice_used(struct blkio_group *blkg,
 				   struct blkio_policy_type *pol,
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 4ba1418..1cc6c23d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -107,7 +107,7 @@ static inline struct throtl_grp *blkg_to_tg(struct blkio_group *blkg)
 
 static inline struct blkio_group *tg_to_blkg(struct throtl_grp *tg)
 {
-	return pdata_to_blkg(tg, &blkio_policy_throtl);
+	return pdata_to_blkg(tg);
 }
 
 enum tg_state_flags {
@@ -185,7 +185,7 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
 	} else {
 		struct blkio_group *blkg;
 
-		blkg = blkg_lookup_create(blkcg, q, BLKIO_POLICY_THROTL, false);
+		blkg = blkg_lookup_create(blkcg, q, false);
 
 		/* if %NULL and @q is alive, fall back to root_tg */
 		if (!IS_ERR(blkg))
@@ -1033,8 +1033,7 @@ int blk_throtl_init(struct request_queue *q)
 	rcu_read_lock();
 	spin_lock_irq(q->queue_lock);
 
-	blkg = blkg_lookup_create(&blkio_root_cgroup, q, BLKIO_POLICY_THROTL,
-				  true);
+	blkg = blkg_lookup_create(&blkio_root_cgroup, q, true);
 	if (!IS_ERR(blkg))
 		td->root_tg = blkg_to_tg(blkg);
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 39c4330..8cca6161 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -374,7 +374,7 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg)
 
 static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg)
 {
-	return pdata_to_blkg(cfqg, &blkio_policy_cfq);
+	return pdata_to_blkg(cfqg);
 }
 
 static inline void cfqg_get(struct cfq_group *cfqg)
@@ -1092,7 +1092,7 @@ static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
 	} else {
 		struct blkio_group *blkg;
 
-		blkg = blkg_lookup_create(blkcg, q, BLKIO_POLICY_PROP, false);
+		blkg = blkg_lookup_create(blkcg, q, false);
 		if (!IS_ERR(blkg))
 			cfqg = blkg_to_cfqg(blkg);
 	}
@@ -3523,8 +3523,7 @@ static int cfq_init_queue(struct request_queue *q)
 	rcu_read_lock();
 	spin_lock_irq(q->queue_lock);
 
-	blkg = blkg_lookup_create(&blkio_root_cgroup, q, BLKIO_POLICY_PROP,
-				  true);
+	blkg = blkg_lookup_create(&blkio_root_cgroup, q, true);
 	if (!IS_ERR(blkg))
 		cfqd->root_group = blkg_to_cfqg(blkg);
 
-- 
cgit v1.1


From 2aa4a1523b40a065bc3a31e20097ea7a618ec3de Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:42 -0700
Subject: blkcg: BLKIO_STAT_CPU_SECTORS doesn't have subcounters

BLKIO_STAT_CPU_SECTORS doesn't need read/write/sync/async subcounters
and is counted by blkio_group_stats_cpu->sectors; however, it still
holds a member in blkio_group_stats_cpu->stat_arr_cpu.

Rearrange stat_type_cpu and define BLKIO_STAT_CPU_ARR_NR and use it
for stat_arr_cpu[] size so that only SERVICE_BYTES and SERVICED have
subcounters.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 1add3dc..2060d81 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -58,14 +58,17 @@ enum stat_type {
 
 /* Per cpu stats */
 enum stat_type_cpu {
-	BLKIO_STAT_CPU_SECTORS,
 	/* Total bytes transferred */
 	BLKIO_STAT_CPU_SERVICE_BYTES,
 	/* Total IOs serviced, post merge */
 	BLKIO_STAT_CPU_SERVICED,
-	BLKIO_STAT_CPU_NR
+
+	/* All the single valued stats go below this */
+	BLKIO_STAT_CPU_SECTORS,
 };
 
+#define BLKIO_STAT_CPU_ARR_NR	(BLKIO_STAT_CPU_SERVICED + 1)
+
 enum stat_sub_type {
 	BLKIO_STAT_READ = 0,
 	BLKIO_STAT_WRITE,
@@ -167,7 +170,7 @@ struct blkio_group_stats {
 /* Per cpu blkio group stats */
 struct blkio_group_stats_cpu {
 	uint64_t sectors;
-	uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL];
+	uint64_t stat_arr_cpu[BLKIO_STAT_CPU_ARR_NR][BLKIO_STAT_TOTAL];
 	struct u64_stats_sync syncp;
 };
 
-- 
cgit v1.1


From edcb0722c654947908388df660791abd41e6617c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:42 -0700
Subject: blkcg: introduce blkg_stat and blkg_rwstat

blkcg uses u64_stats_sync to avoid reading wrong u64 statistic values
on 32bit archs and some stat counters have subtypes to distinguish
read/writes and sync/async IOs.  The stat code paths are confusing and
involve a lot of going back and forth between blkcg core and specific
policy implementations, and synchronization and subtype handling are
open coded in blkcg core.

This patch introduces struct blkg_stat and blkg_rwstat which, with
accompanying operations, encapsulate stat updating and accessing with
proper synchronization.

blkg_stat is a simple u64 counter with 64bit read-access protection.
blkg_rwstat is the one with rw and [a]sync subcounters and takes @rw
flags to distinguish IO subtypes (%REQ_WRITE and %REQ_SYNC) and
replaces stat_sub_type indexed arrays.

All counters in blkio_group_stats and blkio_group_stats_cpu are
replaced with either blkg_stat or blkg_rwstat along with all users.

This does add one u64_stats_sync per counter and increase stats_sync
operations but they're empty/noops on 64bit archs and blkcg doesn't
have too many counters, especially with DEBUG_BLK_CGROUP off.

While the resulting code isn't necessarily simpler at the
moment, this will enable further clean up of blkcg stats code.

- BLKIO_STAT_{READ|WRITE|SYNC|ASYNC|TOTAL} renamed to
  BLKG_RWSTAT_{READ|WRITE|SYNC|ASYNC|TOTAL}.

- blkg_stat_add() replaces blkio_add_stat() and
  blkio_check_and_dec_stat().  Note that BUG_ON() on underflow in the
  latter function no longer exists.  It's *way* better to have
  underflowed stat counters than oopsing.

- blkio_group_stats->dequeue is now a proper u64 stat counter instead
  of ulong.

- reset_stats() updated to clear each stat counter individually and
  BLKG_STATS_DEBUG_CLEAR_{START|SIZE} are removed.

- Some functions reconstruct rw flags from direction and sync
  booleans.  This will be removed by future patches.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c | 289 +++++++++++++++++++++++------------------------------
 block/blk-cgroup.h | 211 +++++++++++++++++++++++++++++---------
 2 files changed, 293 insertions(+), 207 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 55ccbae..09ac462 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -132,46 +132,6 @@ static inline void blkio_update_group_iops(struct blkio_group *blkg,
 	}
 }
 
-/*
- * Add to the appropriate stat variable depending on the request type.
- * This should be called with queue_lock held.
- */
-static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
-				bool sync)
-{
-	if (direction)
-		stat[BLKIO_STAT_WRITE] += add;
-	else
-		stat[BLKIO_STAT_READ] += add;
-	if (sync)
-		stat[BLKIO_STAT_SYNC] += add;
-	else
-		stat[BLKIO_STAT_ASYNC] += add;
-}
-
-/*
- * Decrements the appropriate stat variable if non-zero depending on the
- * request type. Panics on value being zero.
- * This should be called with the queue_lock held.
- */
-static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
-{
-	if (direction) {
-		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
-		stat[BLKIO_STAT_WRITE]--;
-	} else {
-		BUG_ON(stat[BLKIO_STAT_READ] == 0);
-		stat[BLKIO_STAT_READ]--;
-	}
-	if (sync) {
-		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
-		stat[BLKIO_STAT_SYNC]--;
-	} else {
-		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
-		stat[BLKIO_STAT_ASYNC]--;
-	}
-}
-
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 /* This should be called with the queue_lock held. */
 static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
@@ -198,7 +158,8 @@ static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
 
 	now = sched_clock();
 	if (time_after64(now, stats->start_group_wait_time))
-		stats->group_wait_time += now - stats->start_group_wait_time;
+		blkg_stat_add(&stats->group_wait_time,
+			      now - stats->start_group_wait_time);
 	blkio_clear_blkg_waiting(stats);
 }
 
@@ -212,7 +173,8 @@ static void blkio_end_empty_time(struct blkio_group_stats *stats)
 
 	now = sched_clock();
 	if (time_after64(now, stats->start_empty_time))
-		stats->empty_time += now - stats->start_empty_time;
+		blkg_stat_add(&stats->empty_time,
+			      now - stats->start_empty_time);
 	blkio_clear_blkg_empty(stats);
 }
 
@@ -239,11 +201,9 @@ void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
 	if (blkio_blkg_idling(stats)) {
 		unsigned long long now = sched_clock();
 
-		if (time_after64(now, stats->start_idle_time)) {
-			u64_stats_update_begin(&stats->syncp);
-			stats->idle_time += now - stats->start_idle_time;
-			u64_stats_update_end(&stats->syncp);
-		}
+		if (time_after64(now, stats->start_idle_time))
+			blkg_stat_add(&stats->idle_time,
+				      now - stats->start_idle_time);
 		blkio_clear_blkg_idling(stats);
 	}
 }
@@ -256,13 +216,10 @@ void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
 
 	lockdep_assert_held(blkg->q->queue_lock);
 
-	u64_stats_update_begin(&stats->syncp);
-	stats->avg_queue_size_sum +=
-			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
-			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
-	stats->avg_queue_size_samples++;
+	blkg_stat_add(&stats->avg_queue_size_sum,
+		      blkg_rwstat_sum(&stats->queued));
+	blkg_stat_add(&stats->avg_queue_size_samples, 1);
 	blkio_update_group_wait_time(stats);
-	u64_stats_update_end(&stats->syncp);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
 
@@ -273,8 +230,7 @@ void blkiocg_set_start_empty_time(struct blkio_group *blkg,
 
 	lockdep_assert_held(blkg->q->queue_lock);
 
-	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
-			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE])
+	if (blkg_rwstat_sum(&stats->queued))
 		return;
 
 	/*
@@ -298,7 +254,7 @@ void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
 
 	lockdep_assert_held(blkg->q->queue_lock);
 
-	pd->stats.dequeue += dequeue;
+	blkg_stat_add(&pd->stats.dequeue, dequeue);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
 #else
@@ -314,14 +270,12 @@ void blkiocg_update_io_add_stats(struct blkio_group *blkg,
 				 bool sync)
 {
 	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
 
 	lockdep_assert_held(blkg->q->queue_lock);
 
-	u64_stats_update_begin(&stats->syncp);
-	blkio_add_stat(stats->stat_arr[BLKIO_STAT_QUEUED], 1, direction, sync);
+	blkg_rwstat_add(&stats->queued, rw, 1);
 	blkio_end_empty_time(stats);
-	u64_stats_update_end(&stats->syncp);
-
 	blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
@@ -331,13 +285,11 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
 				    bool direction, bool sync)
 {
 	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
 
 	lockdep_assert_held(blkg->q->queue_lock);
 
-	u64_stats_update_begin(&stats->syncp);
-	blkio_check_and_dec_stat(stats->stat_arr[BLKIO_STAT_QUEUED], direction,
-				 sync);
-	u64_stats_update_end(&stats->syncp);
+	blkg_rwstat_add(&stats->queued, rw, -1);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
 
@@ -350,12 +302,10 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg,
 
 	lockdep_assert_held(blkg->q->queue_lock);
 
-	u64_stats_update_begin(&stats->syncp);
-	stats->time += time;
+	blkg_stat_add(&stats->time, time);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-	stats->unaccounted_time += unaccounted_time;
+	blkg_stat_add(&stats->unaccounted_time, unaccounted_time);
 #endif
-	u64_stats_update_end(&stats->syncp);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
 
@@ -367,6 +317,7 @@ void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 				   struct blkio_policy_type *pol,
 				   uint64_t bytes, bool direction, bool sync)
 {
+	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
 	struct blkg_policy_data *pd = blkg->pd[pol->plid];
 	struct blkio_group_stats_cpu *stats_cpu;
 	unsigned long flags;
@@ -384,13 +335,10 @@ void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 
 	stats_cpu = this_cpu_ptr(pd->stats_cpu);
 
-	u64_stats_update_begin(&stats_cpu->syncp);
-	stats_cpu->sectors += bytes >> 9;
-	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
-			1, direction, sync);
-	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
-			bytes, direction, sync);
-	u64_stats_update_end(&stats_cpu->syncp);
+	blkg_stat_add(&stats_cpu->sectors, bytes >> 9);
+	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
+	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
+
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
@@ -403,17 +351,15 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg,
 {
 	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
 	unsigned long long now = sched_clock();
+	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
 
 	lockdep_assert_held(blkg->q->queue_lock);
 
-	u64_stats_update_begin(&stats->syncp);
 	if (time_after64(now, io_start_time))
-		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
-				now - io_start_time, direction, sync);
+		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
 	if (time_after64(io_start_time, start_time))
-		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
-				io_start_time - start_time, direction, sync);
-	u64_stats_update_end(&stats->syncp);
+		blkg_rwstat_add(&stats->wait_time, rw,
+				io_start_time - start_time);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
 
@@ -423,12 +369,11 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
 				    bool direction, bool sync)
 {
 	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
 
 	lockdep_assert_held(blkg->q->queue_lock);
 
-	u64_stats_update_begin(&stats->syncp);
-	blkio_add_stat(stats->stat_arr[BLKIO_STAT_MERGED], 1, direction, sync);
-	u64_stats_update_end(&stats->syncp);
+	blkg_rwstat_add(&stats->merged, rw, 1);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
@@ -757,8 +702,9 @@ static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
 		struct blkio_group_stats_cpu *sc =
 			per_cpu_ptr(pd->stats_cpu, cpu);
 
-		sc->sectors = 0;
-		memset(sc->stat_arr_cpu, 0, sizeof(sc->stat_arr_cpu));
+		blkg_rwstat_reset(&sc->service_bytes);
+		blkg_rwstat_reset(&sc->serviced);
+		blkg_stat_reset(&sc->sectors);
 	}
 }
 
@@ -768,7 +714,6 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
 	struct blkio_group *blkg;
 	struct hlist_node *n;
-	int i;
 
 	spin_lock(&blkio_list_lock);
 	spin_lock_irq(&blkcg->lock);
@@ -786,14 +731,18 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 			struct blkio_group_stats *stats = &pd->stats;
 
 			/* queued stats shouldn't be cleared */
-			for (i = 0; i < ARRAY_SIZE(stats->stat_arr); i++)
-				if (i != BLKIO_STAT_QUEUED)
-					memset(stats->stat_arr[i], 0,
-					       sizeof(stats->stat_arr[i]));
-			stats->time = 0;
+			blkg_rwstat_reset(&stats->merged);
+			blkg_rwstat_reset(&stats->service_time);
+			blkg_rwstat_reset(&stats->wait_time);
+			blkg_stat_reset(&stats->time);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-			memset((void *)stats + BLKG_STATS_DEBUG_CLEAR_START, 0,
-			       BLKG_STATS_DEBUG_CLEAR_SIZE);
+			blkg_stat_reset(&stats->unaccounted_time);
+			blkg_stat_reset(&stats->avg_queue_size_sum);
+			blkg_stat_reset(&stats->avg_queue_size_samples);
+			blkg_stat_reset(&stats->dequeue);
+			blkg_stat_reset(&stats->group_wait_time);
+			blkg_stat_reset(&stats->idle_time);
+			blkg_stat_reset(&stats->empty_time);
 #endif
 			blkio_reset_stats_cpu(blkg, pol->plid);
 		}
@@ -804,7 +753,7 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	return 0;
 }
 
-static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
+static void blkio_get_key_name(enum blkg_rwstat_type type, const char *dname,
 			       char *str, int chars_left, bool diskname_only)
 {
 	snprintf(str, chars_left, "%s", dname);
@@ -817,19 +766,19 @@ static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
 	if (diskname_only)
 		return;
 	switch (type) {
-	case BLKIO_STAT_READ:
+	case BLKG_RWSTAT_READ:
 		strlcat(str, " Read", chars_left);
 		break;
-	case BLKIO_STAT_WRITE:
+	case BLKG_RWSTAT_WRITE:
 		strlcat(str, " Write", chars_left);
 		break;
-	case BLKIO_STAT_SYNC:
+	case BLKG_RWSTAT_SYNC:
 		strlcat(str, " Sync", chars_left);
 		break;
-	case BLKIO_STAT_ASYNC:
+	case BLKG_RWSTAT_ASYNC:
 		strlcat(str, " Async", chars_left);
 		break;
-	case BLKIO_STAT_TOTAL:
+	case BLKG_RWSTAT_TOTAL:
 		strlcat(str, " Total", chars_left);
 		break;
 	default:
@@ -838,29 +787,34 @@ static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
 }
 
 static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, int plid,
-			enum stat_type_cpu type, enum stat_sub_type sub_type)
+				    enum stat_type_cpu type,
+				    enum blkg_rwstat_type sub_type)
 {
 	struct blkg_policy_data *pd = blkg->pd[plid];
+	u64 val = 0;
 	int cpu;
-	struct blkio_group_stats_cpu *stats_cpu;
-	u64 val = 0, tval;
 
 	if (pd->stats_cpu == NULL)
 		return val;
 
 	for_each_possible_cpu(cpu) {
-		unsigned int start;
-		stats_cpu = per_cpu_ptr(pd->stats_cpu, cpu);
-
-		do {
-			start = u64_stats_fetch_begin(&stats_cpu->syncp);
-			if (type == BLKIO_STAT_CPU_SECTORS)
-				tval = stats_cpu->sectors;
-			else
-				tval = stats_cpu->stat_arr_cpu[type][sub_type];
-		} while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
-
-		val += tval;
+		struct blkio_group_stats_cpu *stats_cpu =
+			per_cpu_ptr(pd->stats_cpu, cpu);
+		struct blkg_rwstat rws;
+
+		switch (type) {
+		case BLKIO_STAT_CPU_SECTORS:
+			val += blkg_stat_read(&stats_cpu->sectors);
+			break;
+		case BLKIO_STAT_CPU_SERVICE_BYTES:
+			rws = blkg_rwstat_read(&stats_cpu->service_bytes);
+			val += rws.cnt[sub_type];
+			break;
+		case BLKIO_STAT_CPU_SERVICED:
+			rws = blkg_rwstat_read(&stats_cpu->serviced);
+			val += rws.cnt[sub_type];
+			break;
+		}
 	}
 
 	return val;
@@ -872,7 +826,7 @@ static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid,
 {
 	uint64_t disk_total, val;
 	char key_str[MAX_KEY_LEN];
-	enum stat_sub_type sub_type;
+	enum blkg_rwstat_type sub_type;
 
 	if (type == BLKIO_STAT_CPU_SECTORS) {
 		val = blkio_read_stat_cpu(blkg, plid, type, 0);
@@ -881,7 +835,7 @@ static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid,
 		return val;
 	}
 
-	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
+	for (sub_type = BLKG_RWSTAT_READ; sub_type < BLKG_RWSTAT_NR;
 			sub_type++) {
 		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
 				   false);
@@ -889,10 +843,10 @@ static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid,
 		cb->fill(cb, key_str, val);
 	}
 
-	disk_total = blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_READ) +
-		blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_WRITE);
+	disk_total = blkio_read_stat_cpu(blkg, plid, type, BLKG_RWSTAT_READ) +
+		blkio_read_stat_cpu(blkg, plid, type, BLKG_RWSTAT_WRITE);
 
-	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
+	blkio_get_key_name(BLKG_RWSTAT_TOTAL, dname, key_str, MAX_KEY_LEN,
 			   false);
 	cb->fill(cb, key_str, disk_total);
 	return disk_total;
@@ -905,65 +859,76 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid,
 	struct blkio_group_stats *stats = &blkg->pd[plid]->stats;
 	uint64_t v = 0, disk_total = 0;
 	char key_str[MAX_KEY_LEN];
-	unsigned int sync_start;
+	struct blkg_rwstat rws = { };
 	int st;
 
 	if (type >= BLKIO_STAT_ARR_NR) {
-		do {
-			sync_start = u64_stats_fetch_begin(&stats->syncp);
-			switch (type) {
-			case BLKIO_STAT_TIME:
-				v = stats->time;
-				break;
+		switch (type) {
+		case BLKIO_STAT_TIME:
+			v = blkg_stat_read(&stats->time);
+			break;
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-			case BLKIO_STAT_UNACCOUNTED_TIME:
-				v = stats->unaccounted_time;
-				break;
-			case BLKIO_STAT_AVG_QUEUE_SIZE: {
-				uint64_t samples = stats->avg_queue_size_samples;
+		case BLKIO_STAT_UNACCOUNTED_TIME:
+			v = blkg_stat_read(&stats->unaccounted_time);
+			break;
+		case BLKIO_STAT_AVG_QUEUE_SIZE: {
+			uint64_t samples;
 
-				if (samples) {
-					v = stats->avg_queue_size_sum;
-					do_div(v, samples);
-				}
-				break;
+			samples = blkg_stat_read(&stats->avg_queue_size_samples);
+			if (samples) {
+				v = blkg_stat_read(&stats->avg_queue_size_sum);
+				do_div(v, samples);
 			}
-			case BLKIO_STAT_IDLE_TIME:
-				v = stats->idle_time;
-				break;
-			case BLKIO_STAT_EMPTY_TIME:
-				v = stats->empty_time;
-				break;
-			case BLKIO_STAT_DEQUEUE:
-				v = stats->dequeue;
-				break;
-			case BLKIO_STAT_GROUP_WAIT_TIME:
-				v = stats->group_wait_time;
-				break;
+			break;
+		}
+		case BLKIO_STAT_IDLE_TIME:
+			v = blkg_stat_read(&stats->idle_time);
+			break;
+		case BLKIO_STAT_EMPTY_TIME:
+			v = blkg_stat_read(&stats->empty_time);
+			break;
+		case BLKIO_STAT_DEQUEUE:
+			v = blkg_stat_read(&stats->dequeue);
+			break;
+		case BLKIO_STAT_GROUP_WAIT_TIME:
+			v = blkg_stat_read(&stats->group_wait_time);
+			break;
 #endif
-			default:
-				WARN_ON_ONCE(1);
-			}
-		} while (u64_stats_fetch_retry(&stats->syncp, sync_start));
+		default:
+			WARN_ON_ONCE(1);
+		}
 
 		blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
 		cb->fill(cb, key_str, v);
 		return v;
 	}
 
-	for (st = BLKIO_STAT_READ; st < BLKIO_STAT_TOTAL; st++) {
-		do {
-			sync_start = u64_stats_fetch_begin(&stats->syncp);
-			v = stats->stat_arr[type][st];
-		} while (u64_stats_fetch_retry(&stats->syncp, sync_start));
+	switch (type) {
+	case BLKIO_STAT_MERGED:
+		rws = blkg_rwstat_read(&stats->merged);
+		break;
+	case BLKIO_STAT_SERVICE_TIME:
+		rws = blkg_rwstat_read(&stats->service_time);
+		break;
+	case BLKIO_STAT_WAIT_TIME:
+		rws = blkg_rwstat_read(&stats->wait_time);
+		break;
+	case BLKIO_STAT_QUEUED:
+		rws = blkg_rwstat_read(&stats->queued);
+		break;
+	default:
+		WARN_ON_ONCE(true);
+		break;
+	}
 
+	for (st = BLKG_RWSTAT_READ; st < BLKG_RWSTAT_NR; st++) {
 		blkio_get_key_name(st, dname, key_str, MAX_KEY_LEN, false);
-		cb->fill(cb, key_str, v);
-		if (st == BLKIO_STAT_READ || st == BLKIO_STAT_WRITE)
-			disk_total += v;
+		cb->fill(cb, key_str, rws.cnt[st]);
+		if (st == BLKG_RWSTAT_READ || st == BLKG_RWSTAT_WRITE)
+			disk_total += rws.cnt[st];
 	}
 
-	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
+	blkio_get_key_name(BLKG_RWSTAT_TOTAL, dname, key_str, MAX_KEY_LEN,
 			   false);
 	cb->fill(cb, key_str, disk_total);
 	return disk_total;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 2060d81..7578df3 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -69,12 +69,14 @@ enum stat_type_cpu {
 
 #define BLKIO_STAT_CPU_ARR_NR	(BLKIO_STAT_CPU_SERVICED + 1)
 
-enum stat_sub_type {
-	BLKIO_STAT_READ = 0,
-	BLKIO_STAT_WRITE,
-	BLKIO_STAT_SYNC,
-	BLKIO_STAT_ASYNC,
-	BLKIO_STAT_TOTAL
+enum blkg_rwstat_type {
+	BLKG_RWSTAT_READ,
+	BLKG_RWSTAT_WRITE,
+	BLKG_RWSTAT_SYNC,
+	BLKG_RWSTAT_ASYNC,
+
+	BLKG_RWSTAT_NR,
+	BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
 };
 
 /* blkg state flags */
@@ -124,54 +126,58 @@ struct blkio_cgroup {
 	uint64_t id;
 };
 
+struct blkg_stat {
+	struct u64_stats_sync		syncp;
+	uint64_t			cnt;
+};
+
+struct blkg_rwstat {
+	struct u64_stats_sync		syncp;
+	uint64_t			cnt[BLKG_RWSTAT_NR];
+};
+
 struct blkio_group_stats {
-	struct u64_stats_sync syncp;
+	/* number of ios merged */
+	struct blkg_rwstat		merged;
+	/* total time spent on device in ns, may not be accurate w/ queueing */
+	struct blkg_rwstat		service_time;
+	/* total time spent waiting in scheduler queue in ns */
+	struct blkg_rwstat		wait_time;
+	/* number of IOs queued up */
+	struct blkg_rwstat		queued;
 	/* total disk time and nr sectors dispatched by this group */
-	uint64_t time;
-	uint64_t stat_arr[BLKIO_STAT_ARR_NR][BLKIO_STAT_TOTAL];
+	struct blkg_stat		time;
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-	/* Time not charged to this cgroup */
-	uint64_t unaccounted_time;
-
-	/* Sum of number of IOs queued across all samples */
-	uint64_t avg_queue_size_sum;
-	/* Count of samples taken for average */
-	uint64_t avg_queue_size_samples;
-	/* How many times this group has been removed from service tree */
-	unsigned long dequeue;
-
-	/* Total time spent waiting for it to be assigned a timeslice. */
-	uint64_t group_wait_time;
-
-	/* Time spent idling for this blkio_group */
-	uint64_t idle_time;
-	/*
-	 * Total time when we have requests queued and do not contain the
-	 * current active queue.
-	 */
-	uint64_t empty_time;
-
+	/* time not charged to this cgroup */
+	struct blkg_stat		unaccounted_time;
+	/* sum of number of ios queued across all samples */
+	struct blkg_stat		avg_queue_size_sum;
+	/* count of samples taken for average */
+	struct blkg_stat		avg_queue_size_samples;
+	/* how many times this group has been removed from service tree */
+	struct blkg_stat		dequeue;
+	/* total time spent waiting for it to be assigned a timeslice. */
+	struct blkg_stat		group_wait_time;
+	/* time spent idling for this blkio_group */
+	struct blkg_stat		idle_time;
+	/* total time with empty current active q with other requests queued */
+	struct blkg_stat		empty_time;
 	/* fields after this shouldn't be cleared on stat reset */
-	uint64_t start_group_wait_time;
-	uint64_t start_idle_time;
-	uint64_t start_empty_time;
-	uint16_t flags;
+	uint64_t			start_group_wait_time;
+	uint64_t			start_idle_time;
+	uint64_t			start_empty_time;
+	uint16_t			flags;
 #endif
 };
 
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-#define BLKG_STATS_DEBUG_CLEAR_START	\
-	offsetof(struct blkio_group_stats, unaccounted_time)
-#define BLKG_STATS_DEBUG_CLEAR_SIZE	\
-	(offsetof(struct blkio_group_stats, start_group_wait_time) - \
-	 BLKG_STATS_DEBUG_CLEAR_START)
-#endif
-
 /* Per cpu blkio group stats */
 struct blkio_group_stats_cpu {
-	uint64_t sectors;
-	uint64_t stat_arr_cpu[BLKIO_STAT_CPU_ARR_NR][BLKIO_STAT_TOTAL];
-	struct u64_stats_sync syncp;
+	/* total bytes transferred */
+	struct blkg_rwstat		service_bytes;
+	/* total IOs serviced, post merge */
+	struct blkg_rwstat		serviced;
+	/* total sectors transferred */
+	struct blkg_stat		sectors;
 };
 
 struct blkio_group_conf {
@@ -316,6 +322,121 @@ static inline void blkg_put(struct blkio_group *blkg)
 		__blkg_release(blkg);
 }
 
+/**
+ * blkg_stat_add - add a value to a blkg_stat
+ * @stat: target blkg_stat
+ * @val: value to add
+ *
+ * Add @val to @stat.  The caller is responsible for synchronizing calls to
+ * this function.
+ */
+static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
+{
+	u64_stats_update_begin(&stat->syncp);
+	stat->cnt += val;
+	u64_stats_update_end(&stat->syncp);
+}
+
+/**
+ * blkg_stat_read - read the current value of a blkg_stat
+ * @stat: blkg_stat to read
+ *
+ * Read the current value of @stat.  This function can be called without
+ * synchronization and takes care of u64 atomicity.
+ */
+static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
+{
+	unsigned int start;
+	uint64_t v;
+
+	do {
+		start = u64_stats_fetch_begin(&stat->syncp);
+		v = stat->cnt;
+	} while (u64_stats_fetch_retry(&stat->syncp, start));
+
+	return v;
+}
+
+/**
+ * blkg_stat_reset - reset a blkg_stat
+ * @stat: blkg_stat to reset
+ */
+static inline void blkg_stat_reset(struct blkg_stat *stat)
+{
+	stat->cnt = 0;
+}
+
+/**
+ * blkg_rwstat_add - add a value to a blkg_rwstat
+ * @rwstat: target blkg_rwstat
+ * @rw: mask of REQ_{WRITE|SYNC}
+ * @val: value to add
+ *
+ * Add @val to @rwstat.  The counters are chosen according to @rw.  The
+ * caller is responsible for synchronizing calls to this function.
+ */
+static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
+				   int rw, uint64_t val)
+{
+	u64_stats_update_begin(&rwstat->syncp);
+
+	if (rw & REQ_WRITE)
+		rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+	else
+		rwstat->cnt[BLKG_RWSTAT_READ] += val;
+	if (rw & REQ_SYNC)
+		rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+	else
+		rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+
+	u64_stats_update_end(&rwstat->syncp);
+}
+
+/**
+ * blkg_rwstat_read - read the current values of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Read the current snapshot of @rwstat and return it as the return value.
+ * This function can be called without synchronization and takes care of
+ * u64 atomicity.
+ */
+static struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
+{
+	unsigned int start;
+	struct blkg_rwstat tmp;
+
+	do {
+		start = u64_stats_fetch_begin(&rwstat->syncp);
+		tmp = *rwstat;
+	} while (u64_stats_fetch_retry(&rwstat->syncp, start));
+
+	return tmp;
+}
+
+/**
+ * blkg_rwstat_sum - read the total count of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Return the total count of @rwstat regardless of the IO direction.  This
+ * function can be called without synchronization and takes care of u64
+ * atomicity.
+ */
+static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat)
+{
+	struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
+
+	return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+}
+
+/**
+ * blkg_rwstat_reset - reset a blkg_rwstat
+ * @rwstat: blkg_rwstat to reset
+ */
+static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
+{
+	memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
+}
+
 #else
 
 struct blkio_group {
-- 
cgit v1.1


From d3d32e69fa368e131b25ee68806aa3fd3916cec1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:42 -0700
Subject: blkcg: restructure statistics printing

blkcg stats handling is a mess.  None of the stats has much to do with
blkcg core but they are all implemented in blkcg core.  Code sharing
is achieved by mixing common code with hard-coded cases for each stat
counter.

This patch restructures statistics printing such that

* Common logic exists as helper functions and specific print functions
  use the helpers to implement specific cases.

* Printing functions serving multiple counters don't require hardcoded
  switching on specific counters.

* Printing uses read_seq_string callback (other methods will be phased
  out).

This change enables further cleanups and relocating stats code to the
policy implementation it belongs to.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c | 557 +++++++++++++++++++++++------------------------------
 block/blk-cgroup.h |  60 +-----
 2 files changed, 243 insertions(+), 374 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 09ac462..951e7f3 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -753,186 +753,227 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	return 0;
 }
 
-static void blkio_get_key_name(enum blkg_rwstat_type type, const char *dname,
-			       char *str, int chars_left, bool diskname_only)
-{
-	snprintf(str, chars_left, "%s", dname);
-	chars_left -= strlen(str);
-	if (chars_left <= 0) {
-		printk(KERN_WARNING
-			"Possibly incorrect cgroup stat display format");
-		return;
-	}
-	if (diskname_only)
-		return;
-	switch (type) {
-	case BLKG_RWSTAT_READ:
-		strlcat(str, " Read", chars_left);
-		break;
-	case BLKG_RWSTAT_WRITE:
-		strlcat(str, " Write", chars_left);
-		break;
-	case BLKG_RWSTAT_SYNC:
-		strlcat(str, " Sync", chars_left);
-		break;
-	case BLKG_RWSTAT_ASYNC:
-		strlcat(str, " Async", chars_left);
-		break;
-	case BLKG_RWSTAT_TOTAL:
-		strlcat(str, " Total", chars_left);
-		break;
-	default:
-		strlcat(str, " Invalid", chars_left);
-	}
+static const char *blkg_dev_name(struct blkio_group *blkg)
+{
+	/* some drivers (floppy) instantiate a queue w/o disk registered */
+	if (blkg->q->backing_dev_info.dev)
+		return dev_name(blkg->q->backing_dev_info.dev);
+	return NULL;
 }
 
-static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, int plid,
-				    enum stat_type_cpu type,
-				    enum blkg_rwstat_type sub_type)
+/**
+ * blkcg_print_blkgs - helper for printing per-blkg data
+ * @sf: seq_file to print to
+ * @blkcg: blkcg of interest
+ * @prfill: fill function to print out a blkg
+ * @pol: policy in question
+ * @data: data to be passed to @prfill
+ * @show_total: to print out sum of prfill return values or not
+ *
+ * This function invokes @prfill on each blkg of @blkcg if pd for the
+ * policy specified by @pol exists.  @prfill is invoked with @sf, the
+ * policy data and @data.  If @show_total is %true, the sum of the return
+ * values from @prfill is printed with "Total" label at the end.
+ *
+ * This is to be used to construct print functions for
+ * cftype->read_seq_string method.
+ */
+static void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
+			      u64 (*prfill)(struct seq_file *,
+					    struct blkg_policy_data *, int),
+			      int pol, int data, bool show_total)
 {
-	struct blkg_policy_data *pd = blkg->pd[plid];
-	u64 val = 0;
-	int cpu;
+	struct blkio_group *blkg;
+	struct hlist_node *n;
+	u64 total = 0;
 
-	if (pd->stats_cpu == NULL)
-		return val;
+	spin_lock_irq(&blkcg->lock);
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
+		if (blkg->pd[pol])
+			total += prfill(sf, blkg->pd[pol], data);
+	spin_unlock_irq(&blkcg->lock);
+
+	if (show_total)
+		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
+}
+
+/**
+ * __blkg_prfill_u64 - prfill helper for a single u64 value
+ * @sf: seq_file to print to
+ * @pd: policy data of interest
+ * @v: value to print
+ *
+ * Print @v to @sf for the device associated with @pd.
+ */
+static u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd,
+			     u64 v)
+{
+	const char *dname = blkg_dev_name(pd->blkg);
+
+	if (!dname)
+		return 0;
+
+	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
+	return v;
+}
+
+/**
+ * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
+ * @sf: seq_file to print to
+ * @pd: policy data of interest
+ * @rwstat: rwstat to print
+ *
+ * Print @rwstat to @sf for the device associated with @pd.
+ */
+static u64 __blkg_prfill_rwstat(struct seq_file *sf,
+				struct blkg_policy_data *pd,
+				const struct blkg_rwstat *rwstat)
+{
+	static const char *rwstr[] = {
+		[BLKG_RWSTAT_READ]	= "Read",
+		[BLKG_RWSTAT_WRITE]	= "Write",
+		[BLKG_RWSTAT_SYNC]	= "Sync",
+		[BLKG_RWSTAT_ASYNC]	= "Async",
+	};
+	const char *dname = blkg_dev_name(pd->blkg);
+	u64 v;
+	int i;
+
+	if (!dname)
+		return 0;
+
+	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
+			   (unsigned long long)rwstat->cnt[i]);
+
+	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
+	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
+	return v;
+}
+
+static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
+			    int off)
+{
+	return __blkg_prfill_u64(sf, pd,
+				 blkg_stat_read((void *)&pd->stats + off));
+}
+
+static u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+			      int off)
+{
+	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)&pd->stats + off);
+
+	return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/* print blkg_stat specified by BLKCG_STAT_PRIV() */
+static int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
+			    struct seq_file *sf)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat,
+			  BLKCG_STAT_POL(cft->private),
+			  BLKCG_STAT_OFF(cft->private), false);
+	return 0;
+}
+
+/* print blkg_rwstat specified by BLKCG_STAT_PRIV() */
+static int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
+			      struct seq_file *sf)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat,
+			  BLKCG_STAT_POL(cft->private),
+			  BLKCG_STAT_OFF(cft->private), true);
+	return 0;
+}
+
+static u64 blkg_prfill_cpu_stat(struct seq_file *sf,
+				struct blkg_policy_data *pd, int off)
+{
+	u64 v = 0;
+	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		struct blkio_group_stats_cpu *stats_cpu =
+		struct blkio_group_stats_cpu *sc =
 			per_cpu_ptr(pd->stats_cpu, cpu);
-		struct blkg_rwstat rws;
 
-		switch (type) {
-		case BLKIO_STAT_CPU_SECTORS:
-			val += blkg_stat_read(&stats_cpu->sectors);
-			break;
-		case BLKIO_STAT_CPU_SERVICE_BYTES:
-			rws = blkg_rwstat_read(&stats_cpu->service_bytes);
-			val += rws.cnt[sub_type];
-			break;
-		case BLKIO_STAT_CPU_SERVICED:
-			rws = blkg_rwstat_read(&stats_cpu->serviced);
-			val += rws.cnt[sub_type];
-			break;
-		}
+		v += blkg_stat_read((void *)sc + off);
 	}
 
-	return val;
+	return __blkg_prfill_u64(sf, pd, v);
 }
 
-static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid,
-				   struct cgroup_map_cb *cb, const char *dname,
-				   enum stat_type_cpu type)
+static u64 blkg_prfill_cpu_rwstat(struct seq_file *sf,
+				  struct blkg_policy_data *pd, int off)
 {
-	uint64_t disk_total, val;
-	char key_str[MAX_KEY_LEN];
-	enum blkg_rwstat_type sub_type;
+	struct blkg_rwstat rwstat = { }, tmp;
+	int i, cpu;
 
-	if (type == BLKIO_STAT_CPU_SECTORS) {
-		val = blkio_read_stat_cpu(blkg, plid, type, 0);
-		blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
-		cb->fill(cb, key_str, val);
-		return val;
-	}
+	for_each_possible_cpu(cpu) {
+		struct blkio_group_stats_cpu *sc =
+			per_cpu_ptr(pd->stats_cpu, cpu);
 
-	for (sub_type = BLKG_RWSTAT_READ; sub_type < BLKG_RWSTAT_NR;
-			sub_type++) {
-		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
-				   false);
-		val = blkio_read_stat_cpu(blkg, plid, type, sub_type);
-		cb->fill(cb, key_str, val);
+		tmp = blkg_rwstat_read((void *)sc + off);
+		for (i = 0; i < BLKG_RWSTAT_NR; i++)
+			rwstat.cnt[i] += tmp.cnt[i];
 	}
 
-	disk_total = blkio_read_stat_cpu(blkg, plid, type, BLKG_RWSTAT_READ) +
-		blkio_read_stat_cpu(blkg, plid, type, BLKG_RWSTAT_WRITE);
-
-	blkio_get_key_name(BLKG_RWSTAT_TOTAL, dname, key_str, MAX_KEY_LEN,
-			   false);
-	cb->fill(cb, key_str, disk_total);
-	return disk_total;
+	return __blkg_prfill_rwstat(sf, pd, &rwstat);
 }
 
-static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid,
-			       struct cgroup_map_cb *cb, const char *dname,
-			       enum stat_type type)
+/* print per-cpu blkg_stat specified by BLKCG_STAT_PRIV() */
+static int blkcg_print_cpu_stat(struct cgroup *cgrp, struct cftype *cft,
+				struct seq_file *sf)
 {
-	struct blkio_group_stats *stats = &blkg->pd[plid]->stats;
-	uint64_t v = 0, disk_total = 0;
-	char key_str[MAX_KEY_LEN];
-	struct blkg_rwstat rws = { };
-	int st;
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
-	if (type >= BLKIO_STAT_ARR_NR) {
-		switch (type) {
-		case BLKIO_STAT_TIME:
-			v = blkg_stat_read(&stats->time);
-			break;
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-		case BLKIO_STAT_UNACCOUNTED_TIME:
-			v = blkg_stat_read(&stats->unaccounted_time);
-			break;
-		case BLKIO_STAT_AVG_QUEUE_SIZE: {
-			uint64_t samples;
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_stat,
+			  BLKCG_STAT_POL(cft->private),
+			  BLKCG_STAT_OFF(cft->private), false);
+	return 0;
+}
 
-			samples = blkg_stat_read(&stats->avg_queue_size_samples);
-			if (samples) {
-				v = blkg_stat_read(&stats->avg_queue_size_sum);
-				do_div(v, samples);
-			}
-			break;
-		}
-		case BLKIO_STAT_IDLE_TIME:
-			v = blkg_stat_read(&stats->idle_time);
-			break;
-		case BLKIO_STAT_EMPTY_TIME:
-			v = blkg_stat_read(&stats->empty_time);
-			break;
-		case BLKIO_STAT_DEQUEUE:
-			v = blkg_stat_read(&stats->dequeue);
-			break;
-		case BLKIO_STAT_GROUP_WAIT_TIME:
-			v = blkg_stat_read(&stats->group_wait_time);
-			break;
-#endif
-		default:
-			WARN_ON_ONCE(1);
-		}
+/* print per-cpu blkg_rwstat specified by BLKCG_STAT_PRIV() */
+static int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
+				  struct seq_file *sf)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
-		blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
-		cb->fill(cb, key_str, v);
-		return v;
-	}
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_rwstat,
+			  BLKCG_STAT_POL(cft->private),
+			  BLKCG_STAT_OFF(cft->private), true);
+	return 0;
+}
 
-	switch (type) {
-	case BLKIO_STAT_MERGED:
-		rws = blkg_rwstat_read(&stats->merged);
-		break;
-	case BLKIO_STAT_SERVICE_TIME:
-		rws = blkg_rwstat_read(&stats->service_time);
-		break;
-	case BLKIO_STAT_WAIT_TIME:
-		rws = blkg_rwstat_read(&stats->wait_time);
-		break;
-	case BLKIO_STAT_QUEUED:
-		rws = blkg_rwstat_read(&stats->queued);
-		break;
-	default:
-		WARN_ON_ONCE(true);
-		break;
-	}
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+static u64 blkg_prfill_avg_queue_size(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	u64 samples = blkg_stat_read(&pd->stats.avg_queue_size_samples);
+	u64 v = 0;
 
-	for (st = BLKG_RWSTAT_READ; st < BLKG_RWSTAT_NR; st++) {
-		blkio_get_key_name(st, dname, key_str, MAX_KEY_LEN, false);
-		cb->fill(cb, key_str, rws.cnt[st]);
-		if (st == BLKG_RWSTAT_READ || st == BLKG_RWSTAT_WRITE)
-			disk_total += rws.cnt[st];
+	if (samples) {
+		v = blkg_stat_read(&pd->stats.avg_queue_size_sum);
+		do_div(v, samples);
 	}
+	__blkg_prfill_u64(sf, pd, v);
+	return 0;
+}
+
+/* print avg_queue_size */
+static int blkcg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
+				      struct seq_file *sf)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
-	blkio_get_key_name(BLKG_RWSTAT_TOTAL, dname, key_str, MAX_KEY_LEN,
-			   false);
-	cb->fill(cb, key_str, disk_total);
-	return disk_total;
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_avg_queue_size,
+			  BLKIO_POLICY_PROP, 0, false);
+	return 0;
 }
+#endif	/* CONFIG_DEBUG_BLK_CGROUP */
 
 static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
 				      int fileid, struct blkio_cgroup *blkcg)
@@ -1074,14 +1115,6 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
 	return ret;
 }
 
-static const char *blkg_dev_name(struct blkio_group *blkg)
-{
-	/* some drivers (floppy) instantiate a queue w/o disk registered */
-	if (blkg->q->backing_dev_info.dev)
-		return dev_name(blkg->q->backing_dev_info.dev);
-	return NULL;
-}
-
 static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
 				   struct seq_file *m)
 {
@@ -1174,116 +1207,6 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
-static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
-		struct cftype *cft, struct cgroup_map_cb *cb,
-		enum stat_type type, bool show_total, bool pcpu)
-{
-	struct blkio_group *blkg;
-	struct hlist_node *n;
-	uint64_t cgroup_total = 0;
-
-	spin_lock_irq(&blkcg->lock);
-
-	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		const char *dname = blkg_dev_name(blkg);
-		int plid = BLKIOFILE_POLICY(cft->private);
-
-		if (!dname)
-			continue;
-		if (pcpu)
-			cgroup_total += blkio_get_stat_cpu(blkg, plid,
-							   cb, dname, type);
-		else
-			cgroup_total += blkio_get_stat(blkg, plid,
-						       cb, dname, type);
-	}
-	if (show_total)
-		cb->fill(cb, "Total", cgroup_total);
-
-	spin_unlock_irq(&blkcg->lock);
-	return 0;
-}
-
-/* All map kind of cgroup file get serviced by this function */
-static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
-				struct cgroup_map_cb *cb)
-{
-	struct blkio_cgroup *blkcg;
-	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
-	int name = BLKIOFILE_ATTR(cft->private);
-
-	blkcg = cgroup_to_blkio_cgroup(cgrp);
-
-	switch(plid) {
-	case BLKIO_POLICY_PROP:
-		switch(name) {
-		case BLKIO_PROP_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_TIME, 0, 0);
-		case BLKIO_PROP_sectors:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_CPU_SECTORS, 0, 1);
-		case BLKIO_PROP_io_service_bytes:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
-		case BLKIO_PROP_io_serviced:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_CPU_SERVICED, 1, 1);
-		case BLKIO_PROP_io_service_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_SERVICE_TIME, 1, 0);
-		case BLKIO_PROP_io_wait_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_WAIT_TIME, 1, 0);
-		case BLKIO_PROP_io_merged:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_MERGED, 1, 0);
-		case BLKIO_PROP_io_queued:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_QUEUED, 1, 0);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-		case BLKIO_PROP_unaccounted_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-					BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
-		case BLKIO_PROP_dequeue:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_DEQUEUE, 0, 0);
-		case BLKIO_PROP_avg_queue_size:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-					BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
-		case BLKIO_PROP_group_wait_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-					BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
-		case BLKIO_PROP_idle_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_IDLE_TIME, 0, 0);
-		case BLKIO_PROP_empty_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_EMPTY_TIME, 0, 0);
-#endif
-		default:
-			BUG();
-		}
-		break;
-	case BLKIO_POLICY_THROTL:
-		switch(name){
-		case BLKIO_THROTL_io_service_bytes:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
-		case BLKIO_THROTL_io_serviced:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_CPU_SERVICED, 1, 1);
-		default:
-			BUG();
-		}
-		break;
-	default:
-		BUG();
-	}
-
-	return 0;
-}
-
 static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
 {
 	struct blkio_group *blkg;
@@ -1369,51 +1292,51 @@ struct cftype blkio_files[] = {
 	},
 	{
 		.name = "time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_time),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, time)),
+		.read_seq_string = blkcg_print_stat,
 	},
 	{
 		.name = "sectors",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_sectors),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats_cpu, sectors)),
+		.read_seq_string = blkcg_print_cpu_stat,
 	},
 	{
 		.name = "io_service_bytes",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_io_service_bytes),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats_cpu, service_bytes)),
+		.read_seq_string = blkcg_print_cpu_rwstat,
 	},
 	{
 		.name = "io_serviced",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_io_serviced),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats_cpu, serviced)),
+		.read_seq_string = blkcg_print_cpu_rwstat,
 	},
 	{
 		.name = "io_service_time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_io_service_time),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, service_time)),
+		.read_seq_string = blkcg_print_rwstat,
 	},
 	{
 		.name = "io_wait_time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_io_wait_time),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, wait_time)),
+		.read_seq_string = blkcg_print_rwstat,
 	},
 	{
 		.name = "io_merged",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_io_merged),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, merged)),
+		.read_seq_string = blkcg_print_rwstat,
 	},
 	{
 		.name = "io_queued",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_io_queued),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, queued)),
+		.read_seq_string = blkcg_print_rwstat,
 	},
 	{
 		.name = "reset_stats",
@@ -1457,54 +1380,52 @@ struct cftype blkio_files[] = {
 	},
 	{
 		.name = "throttle.io_service_bytes",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_io_service_bytes),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
+				offsetof(struct blkio_group_stats_cpu, service_bytes)),
+		.read_seq_string = blkcg_print_cpu_rwstat,
 	},
 	{
 		.name = "throttle.io_serviced",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_io_serviced),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
+				offsetof(struct blkio_group_stats_cpu, serviced)),
+		.read_seq_string = blkcg_print_cpu_rwstat,
 	},
 #endif /* CONFIG_BLK_DEV_THROTTLING */
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	{
 		.name = "avg_queue_size",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_avg_queue_size),
-		.read_map = blkiocg_file_read_map,
+		.read_seq_string = blkcg_print_avg_queue_size,
 	},
 	{
 		.name = "group_wait_time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_group_wait_time),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, group_wait_time)),
+		.read_seq_string = blkcg_print_stat,
 	},
 	{
 		.name = "idle_time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_idle_time),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, idle_time)),
+		.read_seq_string = blkcg_print_stat,
 	},
 	{
 		.name = "empty_time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_empty_time),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, empty_time)),
+		.read_seq_string = blkcg_print_stat,
 	},
 	{
 		.name = "dequeue",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_dequeue),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, dequeue)),
+		.read_seq_string = blkcg_print_stat,
 	},
 	{
 		.name = "unaccounted_time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_unaccounted_time),
-		.read_map = blkiocg_file_read_map,
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, unaccounted_time)),
+		.read_seq_string = blkcg_print_stat,
 	},
 #endif
 	{ }	/* terminate */
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 7578df3..7331d79 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -28,46 +28,10 @@ enum blkio_policy_id {
 
 #ifdef CONFIG_BLK_CGROUP
 
-enum stat_type {
-	/* Number of IOs merged */
-	BLKIO_STAT_MERGED,
-	/* Total time spent (in ns) between request dispatch to the driver and
-	 * request completion for IOs doen by this cgroup. This may not be
-	 * accurate when NCQ is turned on. */
-	BLKIO_STAT_SERVICE_TIME,
-	/* Total time spent waiting in scheduler queue in ns */
-	BLKIO_STAT_WAIT_TIME,
-	/* Number of IOs queued up */
-	BLKIO_STAT_QUEUED,
-
-	/* All the single valued stats go below this */
-	BLKIO_STAT_TIME,
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	/* Time not charged to this cgroup */
-	BLKIO_STAT_UNACCOUNTED_TIME,
-	BLKIO_STAT_AVG_QUEUE_SIZE,
-	BLKIO_STAT_IDLE_TIME,
-	BLKIO_STAT_EMPTY_TIME,
-	BLKIO_STAT_GROUP_WAIT_TIME,
-	BLKIO_STAT_DEQUEUE
-#endif
-};
-
-/* Types lower than this live in stat_arr and have subtypes */
-#define BLKIO_STAT_ARR_NR	(BLKIO_STAT_QUEUED + 1)
-
-/* Per cpu stats */
-enum stat_type_cpu {
-	/* Total bytes transferred */
-	BLKIO_STAT_CPU_SERVICE_BYTES,
-	/* Total IOs serviced, post merge */
-	BLKIO_STAT_CPU_SERVICED,
-
-	/* All the single valued stats go below this */
-	BLKIO_STAT_CPU_SECTORS,
-};
-
-#define BLKIO_STAT_CPU_ARR_NR	(BLKIO_STAT_CPU_SERVICED + 1)
+/* cft->private [un]packing for stat printing */
+#define BLKCG_STAT_PRIV(pol, off)	(((unsigned)(pol) << 16) | (off))
+#define BLKCG_STAT_POL(prv)		((unsigned)(prv) >> 16)
+#define BLKCG_STAT_OFF(prv)		((unsigned)(prv) & 0xffff)
 
 enum blkg_rwstat_type {
 	BLKG_RWSTAT_READ,
@@ -90,20 +54,6 @@ enum blkg_state_flags {
 enum blkcg_file_name_prop {
 	BLKIO_PROP_weight = 1,
 	BLKIO_PROP_weight_device,
-	BLKIO_PROP_io_service_bytes,
-	BLKIO_PROP_io_serviced,
-	BLKIO_PROP_time,
-	BLKIO_PROP_sectors,
-	BLKIO_PROP_unaccounted_time,
-	BLKIO_PROP_io_service_time,
-	BLKIO_PROP_io_wait_time,
-	BLKIO_PROP_io_merged,
-	BLKIO_PROP_io_queued,
-	BLKIO_PROP_avg_queue_size,
-	BLKIO_PROP_group_wait_time,
-	BLKIO_PROP_idle_time,
-	BLKIO_PROP_empty_time,
-	BLKIO_PROP_dequeue,
 };
 
 /* cgroup files owned by throttle policy */
@@ -112,8 +62,6 @@ enum blkcg_file_name_throtl {
 	BLKIO_THROTL_write_bps_device,
 	BLKIO_THROTL_read_iops_device,
 	BLKIO_THROTL_write_iops_device,
-	BLKIO_THROTL_io_service_bytes,
-	BLKIO_THROTL_io_serviced,
 };
 
 struct blkio_cgroup {
-- 
cgit v1.1


From 627f29f48158c3088cc55a25d8650cae455ff63b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:43 -0700
Subject: blkcg: drop blkiocg_file_write_u64()

blkiocg_file_write_u64() has only a single switch case.  Drop
blkiocg_file_write_u64(), rename blkio_weight_write() to
blkcg_set_weight() and use it directly as the .write_u64 callback.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c | 35 +++++++----------------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 951e7f3..13cae77 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1207,8 +1207,9 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
-static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
+static int blkcg_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 	struct blkio_group *blkg;
 	struct hlist_node *n;
 
@@ -1220,10 +1221,11 @@ static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
 	blkcg->weight = (unsigned int)val;
 
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		struct blkg_policy_data *pd = blkg->pd[plid];
+		struct blkg_policy_data *pd = blkg->pd[BLKIO_POLICY_PROP];
 
-		if (!pd->conf.weight)
-			blkio_update_group_weight(blkg, plid, blkcg->weight);
+		if (pd && !pd->conf.weight)
+			blkio_update_group_weight(blkg, BLKIO_POLICY_PROP,
+						  blkcg->weight);
 	}
 
 	spin_unlock_irq(&blkcg->lock);
@@ -1251,29 +1253,6 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
 	return 0;
 }
 
-static int
-blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
-{
-	struct blkio_cgroup *blkcg;
-	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
-	int name = BLKIOFILE_ATTR(cft->private);
-
-	blkcg = cgroup_to_blkio_cgroup(cgrp);
-
-	switch(plid) {
-	case BLKIO_POLICY_PROP:
-		switch(name) {
-		case BLKIO_PROP_weight:
-			return blkio_weight_write(blkcg, plid, val);
-		}
-		break;
-	default:
-		BUG();
-	}
-
-	return 0;
-}
-
 struct cftype blkio_files[] = {
 	{
 		.name = "weight_device",
@@ -1288,7 +1267,7 @@ struct cftype blkio_files[] = {
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
 				BLKIO_PROP_weight),
 		.read_u64 = blkiocg_file_read_u64,
-		.write_u64 = blkiocg_file_write_u64,
+		.write_u64 = blkcg_set_weight,
 	},
 	{
 		.name = "time",
-- 
cgit v1.1


From c4682aec9caaca1fcfd1dd4b59cef47af22cbdc6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:43 -0700
Subject: blkcg: restructure configuration printing

Similarly to the previous stat restructuring, this patch restructures
conf printing code such that,

* Conf printing uses the same helpers as stat.

* Printing function doesn't require hardcoded switching on the config
  being printed.  Note that this isn't complete yet for throttle
  confs.  The next patch will convert setting for these confs and will
  complete the transition.

* Printing uses read_seq_string callback (other methods will be phased
  out).

Note that blkio_group_conf.iops[2] is changed to u64 so that they can
be manipulated with the same functions.  This is transitional and will
go away later.

After this patch, per-device configurations - weight, bps and iops -
use __blkg_prfill_u64() for printing which uses white space as
delimiter instead of tab.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c | 156 +++++++++++++++++++----------------------------------
 block/blk-cgroup.h |   3 +-
 2 files changed, 55 insertions(+), 104 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 13cae77..a9723a8 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1115,95 +1115,28 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
 	return ret;
 }
 
-static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
-				   struct seq_file *m)
+/* for propio conf */
+static u64 blkg_prfill_weight_device(struct seq_file *sf,
+				     struct blkg_policy_data *pd, int off)
 {
-	int plid = BLKIOFILE_POLICY(cft->private);
-	int fileid = BLKIOFILE_ATTR(cft->private);
-	struct blkg_policy_data *pd = blkg->pd[plid];
-	const char *dname = blkg_dev_name(blkg);
-	int rw = WRITE;
-
-	if (!dname)
-		return;
-
-	switch (plid) {
-		case BLKIO_POLICY_PROP:
-			if (pd->conf.weight)
-				seq_printf(m, "%s\t%u\n",
-					   dname, pd->conf.weight);
-			break;
-		case BLKIO_POLICY_THROTL:
-			switch (fileid) {
-			case BLKIO_THROTL_read_bps_device:
-				rw = READ;
-			case BLKIO_THROTL_write_bps_device:
-				if (pd->conf.bps[rw])
-					seq_printf(m, "%s\t%llu\n",
-						   dname, pd->conf.bps[rw]);
-				break;
-			case BLKIO_THROTL_read_iops_device:
-				rw = READ;
-			case BLKIO_THROTL_write_iops_device:
-				if (pd->conf.iops[rw])
-					seq_printf(m, "%s\t%u\n",
-						   dname, pd->conf.iops[rw]);
-				break;
-			}
-			break;
-		default:
-			BUG();
-	}
+	if (!pd->conf.weight)
+		return 0;
+	return __blkg_prfill_u64(sf, pd, pd->conf.weight);
 }
 
-/* cgroup files which read their data from policy nodes end up here */
-static void blkio_read_conf(struct cftype *cft, struct blkio_cgroup *blkcg,
-			    struct seq_file *m)
+static int blkcg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
+				     struct seq_file *sf)
 {
-	struct blkio_group *blkg;
-	struct hlist_node *n;
-
-	spin_lock_irq(&blkcg->lock);
-	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
-		blkio_print_group_conf(cft, blkg, m);
-	spin_unlock_irq(&blkcg->lock);
+	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
+			  blkg_prfill_weight_device, BLKIO_POLICY_PROP, 0,
+			  false);
+	return 0;
 }
 
-static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
-				struct seq_file *m)
+static int blkcg_print_weight(struct cgroup *cgrp, struct cftype *cft,
+			      struct seq_file *sf)
 {
-	struct blkio_cgroup *blkcg;
-	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
-	int name = BLKIOFILE_ATTR(cft->private);
-
-	blkcg = cgroup_to_blkio_cgroup(cgrp);
-
-	switch(plid) {
-	case BLKIO_POLICY_PROP:
-		switch(name) {
-		case BLKIO_PROP_weight_device:
-			blkio_read_conf(cft, blkcg, m);
-			return 0;
-		default:
-			BUG();
-		}
-		break;
-	case BLKIO_POLICY_THROTL:
-		switch(name){
-		case BLKIO_THROTL_read_bps_device:
-		case BLKIO_THROTL_write_bps_device:
-		case BLKIO_THROTL_read_iops_device:
-		case BLKIO_THROTL_write_iops_device:
-			blkio_read_conf(cft, blkcg, m);
-			return 0;
-		default:
-			BUG();
-		}
-		break;
-	default:
-		BUG();
-	}
-
+	seq_printf(sf, "%u\n", cgroup_to_blkio_cgroup(cgrp)->weight);
 	return 0;
 }
 
@@ -1233,40 +1166,59 @@ static int blkcg_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
 	return 0;
 }
 
-static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
-	struct blkio_cgroup *blkcg;
-	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
-	int name = BLKIOFILE_ATTR(cft->private);
+/* for blk-throttle conf */
+#ifdef CONFIG_BLK_DEV_THROTTLING
+static u64 blkg_prfill_conf_u64(struct seq_file *sf,
+				struct blkg_policy_data *pd, int off)
+{
+	u64 v = *(u64 *)((void *)&pd->conf + off);
 
-	blkcg = cgroup_to_blkio_cgroup(cgrp);
+	if (!v)
+		return 0;
+	return __blkg_prfill_u64(sf, pd, v);
+}
 
-	switch(plid) {
-	case BLKIO_POLICY_PROP:
-		switch(name) {
-		case BLKIO_PROP_weight:
-			return (u64)blkcg->weight;
-		}
+static int blkcg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
+				struct seq_file *sf)
+{
+	int off;
+
+	switch (BLKIOFILE_ATTR(cft->private)) {
+	case BLKIO_THROTL_read_bps_device:
+		off = offsetof(struct blkio_group_conf, bps[READ]);
+		break;
+	case BLKIO_THROTL_write_bps_device:
+		off = offsetof(struct blkio_group_conf, bps[WRITE]);
+		break;
+	case BLKIO_THROTL_read_iops_device:
+		off = offsetof(struct blkio_group_conf, iops[READ]);
+		break;
+	case BLKIO_THROTL_write_iops_device:
+		off = offsetof(struct blkio_group_conf, iops[WRITE]);
 		break;
 	default:
-		BUG();
+		return -EINVAL;
 	}
+
+	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
+			  blkg_prfill_conf_u64, BLKIO_POLICY_THROTL,
+			  off, false);
 	return 0;
 }
+#endif
 
 struct cftype blkio_files[] = {
 	{
 		.name = "weight_device",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
 				BLKIO_PROP_weight_device),
-		.read_seq_string = blkiocg_file_read,
+		.read_seq_string = blkcg_print_weight_device,
 		.write_string = blkiocg_file_write,
 		.max_write_len = 256,
 	},
 	{
 		.name = "weight",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_weight),
-		.read_u64 = blkiocg_file_read_u64,
+		.read_seq_string = blkcg_print_weight,
 		.write_u64 = blkcg_set_weight,
 	},
 	{
@@ -1326,7 +1278,7 @@ struct cftype blkio_files[] = {
 		.name = "throttle.read_bps_device",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
 				BLKIO_THROTL_read_bps_device),
-		.read_seq_string = blkiocg_file_read,
+		.read_seq_string = blkcg_print_conf_u64,
 		.write_string = blkiocg_file_write,
 		.max_write_len = 256,
 	},
@@ -1335,7 +1287,7 @@ struct cftype blkio_files[] = {
 		.name = "throttle.write_bps_device",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
 				BLKIO_THROTL_write_bps_device),
-		.read_seq_string = blkiocg_file_read,
+		.read_seq_string = blkcg_print_conf_u64,
 		.write_string = blkiocg_file_write,
 		.max_write_len = 256,
 	},
@@ -1344,7 +1296,7 @@ struct cftype blkio_files[] = {
 		.name = "throttle.read_iops_device",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
 				BLKIO_THROTL_read_iops_device),
-		.read_seq_string = blkiocg_file_read,
+		.read_seq_string = blkcg_print_conf_u64,
 		.write_string = blkiocg_file_write,
 		.max_write_len = 256,
 	},
@@ -1353,7 +1305,7 @@ struct cftype blkio_files[] = {
 		.name = "throttle.write_iops_device",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
 				BLKIO_THROTL_write_iops_device),
-		.read_seq_string = blkiocg_file_read,
+		.read_seq_string = blkcg_print_conf_u64,
 		.write_string = blkiocg_file_write,
 		.max_write_len = 256,
 	},
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 7331d79..b67eefa 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -52,7 +52,6 @@ enum blkg_state_flags {
 
 /* cgroup files owned by proportional weight policy */
 enum blkcg_file_name_prop {
-	BLKIO_PROP_weight = 1,
 	BLKIO_PROP_weight_device,
 };
 
@@ -130,7 +129,7 @@ struct blkio_group_stats_cpu {
 
 struct blkio_group_conf {
 	unsigned int weight;
-	unsigned int iops[2];
+	u64 iops[2];
 	u64 bps[2];
 };
 
-- 
cgit v1.1


From 3a8b31d396b296df4b8594429d86d415d3409432 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:43 -0700
Subject: blkcg: restructure blkio_group configuration setting

As part of userland interface restructuring, this patch updates
per-blkio_group configuration setting.  Instead of funneling
everything through a master function which has hard-coded cases for
each config file it may handle, the common part is factored into
blkg_conf_prep() and blkg_conf_finish() and different configuration
setters are implemented using the helpers.

While this doesn't result in an immediate LOC reduction, it enables
further cleanups and a more modular implementation.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c | 274 ++++++++++++++++++++++++++++-------------------------
 block/blk-cgroup.h |  13 ---
 2 files changed, 147 insertions(+), 140 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index a9723a8..1e1ee2a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -43,12 +43,6 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
 static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];
 
-/* for encoding cft->private value on file */
-#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
-/* What policy owns the file, proportional or throttle */
-#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
-#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
-
 struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 {
 	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
@@ -86,7 +80,7 @@ static inline void blkio_update_group_weight(struct blkio_group *blkg,
 }
 
 static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid,
-					  u64 bps, int fileid)
+					  u64 bps, int rw)
 {
 	struct blkio_policy_type *blkiop;
 
@@ -96,21 +90,18 @@ static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid,
 		if (blkiop->plid != plid)
 			continue;
 
-		if (fileid == BLKIO_THROTL_read_bps_device
-		    && blkiop->ops.blkio_update_group_read_bps_fn)
+		if (rw == READ && blkiop->ops.blkio_update_group_read_bps_fn)
 			blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
 								blkg, bps);
 
-		if (fileid == BLKIO_THROTL_write_bps_device
-		    && blkiop->ops.blkio_update_group_write_bps_fn)
+		if (rw == WRITE && blkiop->ops.blkio_update_group_write_bps_fn)
 			blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
 								blkg, bps);
 	}
 }
 
-static inline void blkio_update_group_iops(struct blkio_group *blkg,
-					   int plid, unsigned int iops,
-					   int fileid)
+static inline void blkio_update_group_iops(struct blkio_group *blkg, int plid,
+					   u64 iops, int rw)
 {
 	struct blkio_policy_type *blkiop;
 
@@ -120,13 +111,11 @@ static inline void blkio_update_group_iops(struct blkio_group *blkg,
 		if (blkiop->plid != plid)
 			continue;
 
-		if (fileid == BLKIO_THROTL_read_iops_device
-		    && blkiop->ops.blkio_update_group_read_iops_fn)
+		if (rw == READ && blkiop->ops.blkio_update_group_read_iops_fn)
 			blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
 								blkg, iops);
 
-		if (fileid == BLKIO_THROTL_write_iops_device
-		    && blkiop->ops.blkio_update_group_write_iops_fn)
+		if (rw == WRITE && blkiop->ops.blkio_update_group_write_iops_fn)
 			blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
 								blkg,iops);
 	}
@@ -975,19 +964,40 @@ static int blkcg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
 }
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
 
-static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
-				      int fileid, struct blkio_cgroup *blkcg)
+struct blkg_conf_ctx {
+	struct gendisk		*disk;
+	struct blkio_group	*blkg;
+	u64			v;
+};
+
+/**
+ * blkg_conf_prep - parse and prepare for per-blkg config update
+ * @blkcg: target block cgroup
+ * @input: input string
+ * @ctx: blkg_conf_ctx to be filled
+ *
+ * Parse per-blkg config update from @input and initialize @ctx with the
+ * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
+ * value.  This function returns with RCU read locked and must be paired
+ * with blkg_conf_finish().
+ */
+static int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
+			  struct blkg_conf_ctx *ctx)
+	__acquires(rcu)
 {
-	struct gendisk *disk = NULL;
-	struct blkio_group *blkg = NULL;
-	struct blkg_policy_data *pd;
-	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
+	struct gendisk *disk;
+	struct blkio_group *blkg;
+	char *buf, *s[4], *p, *major_s, *minor_s;
 	unsigned long major, minor;
 	int i = 0, ret = -EINVAL;
 	int part;
 	dev_t dev;
 	u64 temp;
 
+	buf = kstrdup(input, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
 	memset(s, 0, sizeof(s));
 
 	while ((p = strsep(&buf, " ")) != NULL) {
@@ -1037,82 +1047,42 @@ static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
 
 	if (IS_ERR(blkg)) {
 		ret = PTR_ERR(blkg);
-		goto out_unlock;
-	}
-
-	pd = blkg->pd[plid];
-
-	switch (plid) {
-	case BLKIO_POLICY_PROP:
-		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
-		     temp > BLKIO_WEIGHT_MAX)
-			goto out_unlock;
-
-		pd->conf.weight = temp;
-		blkio_update_group_weight(blkg, plid, temp ?: blkcg->weight);
-		break;
-	case BLKIO_POLICY_THROTL:
-		switch(fileid) {
-		case BLKIO_THROTL_read_bps_device:
-			pd->conf.bps[READ] = temp;
-			blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
-			break;
-		case BLKIO_THROTL_write_bps_device:
-			pd->conf.bps[WRITE] = temp;
-			blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
-			break;
-		case BLKIO_THROTL_read_iops_device:
-			if (temp > THROTL_IOPS_MAX)
-				goto out_unlock;
-			pd->conf.iops[READ] = temp;
-			blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
-			break;
-		case BLKIO_THROTL_write_iops_device:
-			if (temp > THROTL_IOPS_MAX)
-				goto out_unlock;
-			pd->conf.iops[WRITE] = temp;
-			blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
-			break;
+		rcu_read_unlock();
+		put_disk(disk);
+		/*
+		 * If queue was bypassing, we should retry.  Do so after a
+		 * short msleep().  It isn't strictly necessary but queue
+		 * can be bypassing for some time and it's always nice to
+		 * avoid busy looping.
+		 */
+		if (ret == -EBUSY) {
+			msleep(10);
+			ret = restart_syscall();
 		}
-		break;
-	default:
-		BUG();
+		goto out;
 	}
+
+	ctx->disk = disk;
+	ctx->blkg = blkg;
+	ctx->v = temp;
 	ret = 0;
-out_unlock:
-	rcu_read_unlock();
 out:
-	put_disk(disk);
-
-	/*
-	 * If queue was bypassing, we should retry.  Do so after a short
-	 * msleep().  It isn't strictly necessary but queue can be
-	 * bypassing for some time and it's always nice to avoid busy
-	 * looping.
-	 */
-	if (ret == -EBUSY) {
-		msleep(10);
-		return restart_syscall();
-	}
+	kfree(buf);
 	return ret;
 }
 
-static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
- 				       const char *buffer)
+/**
+ * blkg_conf_finish - finish up per-blkg config update
+ * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep()
+ *
+ * Finish up after per-blkg config update.  This function must be paired
+ * with blkg_conf_prep().
+ */
+static void blkg_conf_finish(struct blkg_conf_ctx *ctx)
+	__releases(rcu)
 {
-	int ret = 0;
-	char *buf;
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
-	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
-	int fileid = BLKIOFILE_ATTR(cft->private);
-
-	buf = kstrdup(buffer, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	ret = blkio_policy_parse_and_set(buf, plid, fileid, blkcg);
-	kfree(buf);
-	return ret;
+	rcu_read_unlock();
+	put_disk(ctx->disk);
 }
 
 /* for propio conf */
@@ -1140,6 +1110,32 @@ static int blkcg_print_weight(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
+static int blkcg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
+				   const char *buf)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+	struct blkg_policy_data *pd;
+	struct blkg_conf_ctx ctx;
+	int ret;
+
+	ret = blkg_conf_prep(blkcg, buf, &ctx);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+	pd = ctx.blkg->pd[BLKIO_POLICY_PROP];
+	if (pd && (!ctx.v || (ctx.v >= BLKIO_WEIGHT_MIN &&
+			      ctx.v <= BLKIO_WEIGHT_MAX))) {
+		pd->conf.weight = ctx.v;
+		blkio_update_group_weight(ctx.blkg, BLKIO_POLICY_PROP,
+					  ctx.v ?: blkcg->weight);
+		ret = 0;
+	}
+
+	blkg_conf_finish(&ctx);
+	return ret;
+}
+
 static int blkcg_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
@@ -1181,39 +1177,67 @@ static u64 blkg_prfill_conf_u64(struct seq_file *sf,
 static int blkcg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
 				struct seq_file *sf)
 {
-	int off;
-
-	switch (BLKIOFILE_ATTR(cft->private)) {
-	case BLKIO_THROTL_read_bps_device:
-		off = offsetof(struct blkio_group_conf, bps[READ]);
-		break;
-	case BLKIO_THROTL_write_bps_device:
-		off = offsetof(struct blkio_group_conf, bps[WRITE]);
-		break;
-	case BLKIO_THROTL_read_iops_device:
-		off = offsetof(struct blkio_group_conf, iops[READ]);
-		break;
-	case BLKIO_THROTL_write_iops_device:
-		off = offsetof(struct blkio_group_conf, iops[WRITE]);
-		break;
-	default:
-		return -EINVAL;
-	}
-
 	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
 			  blkg_prfill_conf_u64, BLKIO_POLICY_THROTL,
-			  off, false);
+			  cft->private, false);
 	return 0;
 }
+
+static int blkcg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
+			      const char *buf, int rw,
+			      void (*update)(struct blkio_group *, int, u64, int))
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+	struct blkg_policy_data *pd;
+	struct blkg_conf_ctx ctx;
+	int ret;
+
+	ret = blkg_conf_prep(blkcg, buf, &ctx);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+	pd = ctx.blkg->pd[BLKIO_POLICY_THROTL];
+	if (pd) {
+		*(u64 *)((void *)&pd->conf + cft->private) = ctx.v;
+		update(ctx.blkg, BLKIO_POLICY_THROTL, ctx.v ?: -1, rw);
+		ret = 0;
+	}
+
+	blkg_conf_finish(&ctx);
+	return ret;
+}
+
+static int blkcg_set_conf_bps_r(struct cgroup *cgrp, struct cftype *cft,
+				const char *buf)
+{
+	return blkcg_set_conf_u64(cgrp, cft, buf, READ, blkio_update_group_bps);
+}
+
+static int blkcg_set_conf_bps_w(struct cgroup *cgrp, struct cftype *cft,
+				const char *buf)
+{
+	return blkcg_set_conf_u64(cgrp, cft, buf, WRITE, blkio_update_group_bps);
+}
+
+static int blkcg_set_conf_iops_r(struct cgroup *cgrp, struct cftype *cft,
+				 const char *buf)
+{
+	return blkcg_set_conf_u64(cgrp, cft, buf, READ, blkio_update_group_iops);
+}
+
+static int blkcg_set_conf_iops_w(struct cgroup *cgrp, struct cftype *cft,
+				 const char *buf)
+{
+	return blkcg_set_conf_u64(cgrp, cft, buf, WRITE, blkio_update_group_iops);
+}
 #endif
 
 struct cftype blkio_files[] = {
 	{
 		.name = "weight_device",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_weight_device),
 		.read_seq_string = blkcg_print_weight_device,
-		.write_string = blkiocg_file_write,
+		.write_string = blkcg_set_weight_device,
 		.max_write_len = 256,
 	},
 	{
@@ -1276,37 +1300,33 @@ struct cftype blkio_files[] = {
 #ifdef CONFIG_BLK_DEV_THROTTLING
 	{
 		.name = "throttle.read_bps_device",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_read_bps_device),
+		.private = offsetof(struct blkio_group_conf, bps[READ]),
 		.read_seq_string = blkcg_print_conf_u64,
-		.write_string = blkiocg_file_write,
+		.write_string = blkcg_set_conf_bps_r,
 		.max_write_len = 256,
 	},
 
 	{
 		.name = "throttle.write_bps_device",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_write_bps_device),
+		.private = offsetof(struct blkio_group_conf, bps[WRITE]),
 		.read_seq_string = blkcg_print_conf_u64,
-		.write_string = blkiocg_file_write,
+		.write_string = blkcg_set_conf_bps_w,
 		.max_write_len = 256,
 	},
 
 	{
 		.name = "throttle.read_iops_device",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_read_iops_device),
+		.private = offsetof(struct blkio_group_conf, iops[READ]),
 		.read_seq_string = blkcg_print_conf_u64,
-		.write_string = blkiocg_file_write,
+		.write_string = blkcg_set_conf_iops_r,
 		.max_write_len = 256,
 	},
 
 	{
 		.name = "throttle.write_iops_device",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_write_iops_device),
+		.private = offsetof(struct blkio_group_conf, iops[WRITE]),
 		.read_seq_string = blkcg_print_conf_u64,
-		.write_string = blkiocg_file_write,
+		.write_string = blkcg_set_conf_iops_w,
 		.max_write_len = 256,
 	},
 	{
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index b67eefa..108ffbf 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -50,19 +50,6 @@ enum blkg_state_flags {
 	BLKG_empty,
 };
 
-/* cgroup files owned by proportional weight policy */
-enum blkcg_file_name_prop {
-	BLKIO_PROP_weight_device,
-};
-
-/* cgroup files owned by throttle policy */
-enum blkcg_file_name_throtl {
-	BLKIO_THROTL_read_bps_device,
-	BLKIO_THROTL_write_bps_device,
-	BLKIO_THROTL_read_iops_device,
-	BLKIO_THROTL_write_iops_device,
-};
-
 struct blkio_cgroup {
 	struct cgroup_subsys_state css;
 	unsigned int weight;
-- 
cgit v1.1


From 726fa6945e6e5f0389bf47a790e1df734a67de54 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:43 -0700
Subject: blkcg: simplify blkg_conf_prep()

blkg_conf_prep() implements "MAJ:MIN VAL" parsing manually, which is
unnecessary.  Just use sscanf("%u:%u %llu").  This might not reject
some malformed input (extra input at the end) but we don't care.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c | 64 +++++++++---------------------------------------------
 1 file changed, 10 insertions(+), 54 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1e1ee2a..b07a501 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -987,57 +987,16 @@ static int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
 {
 	struct gendisk *disk;
 	struct blkio_group *blkg;
-	char *buf, *s[4], *p, *major_s, *minor_s;
-	unsigned long major, minor;
-	int i = 0, ret = -EINVAL;
-	int part;
-	dev_t dev;
-	u64 temp;
+	unsigned int major, minor;
+	unsigned long long v;
+	int part, ret;
 
-	buf = kstrdup(input, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	memset(s, 0, sizeof(s));
-
-	while ((p = strsep(&buf, " ")) != NULL) {
-		if (!*p)
-			continue;
-
-		s[i++] = p;
-
-		/* Prevent from inputing too many things */
-		if (i == 3)
-			break;
-	}
-
-	if (i != 2)
-		goto out;
-
-	p = strsep(&s[0], ":");
-	if (p != NULL)
-		major_s = p;
-	else
-		goto out;
-
-	minor_s = s[0];
-	if (!minor_s)
-		goto out;
-
-	if (strict_strtoul(major_s, 10, &major))
-		goto out;
-
-	if (strict_strtoul(minor_s, 10, &minor))
-		goto out;
-
-	dev = MKDEV(major, minor);
-
-	if (strict_strtoull(s[1], 10, &temp))
-		goto out;
+	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
+		return -EINVAL;
 
-	disk = get_gendisk(dev, &part);
+	disk = get_gendisk(MKDEV(major, minor), &part);
 	if (!disk || part)
-		goto out;
+		return -EINVAL;
 
 	rcu_read_lock();
 
@@ -1059,16 +1018,13 @@ static int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
 			msleep(10);
 			ret = restart_syscall();
 		}
-		goto out;
+		return ret;
 	}
 
 	ctx->disk = disk;
 	ctx->blkg = blkg;
-	ctx->v = temp;
-	ret = 0;
-out:
-	kfree(buf);
-	return ret;
+	ctx->v = v;
+	return 0;
 }
 
 /**
-- 
cgit v1.1


From 829fdb50004de78f1bd187e428d72edcd9721cb8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:43 -0700
Subject: blkcg: export conf/stat helpers to prepare for reorganization

conf/stat handling is about to be moved to policy implementation from
blkcg core.  Export conf/stat helpers from blkcg core so that
blk-throttle and cfq-iosched can use them.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c | 52 +++++++++++++++++++++++++---------------------------
 block/blk-cgroup.h | 27 +++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b07a501..53976f2 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -11,7 +11,6 @@
  * 	              Nauman Rafique <nauman@google.com>
  */
 #include <linux/ioprio.h>
-#include <linux/seq_file.h>
 #include <linux/kdev_t.h>
 #include <linux/module.h>
 #include <linux/err.h>
@@ -767,10 +766,9 @@ static const char *blkg_dev_name(struct blkio_group *blkg)
  * This is to be used to construct print functions for
  * cftype->read_seq_string method.
  */
-static void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
-			      u64 (*prfill)(struct seq_file *,
-					    struct blkg_policy_data *, int),
-			      int pol, int data, bool show_total)
+void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
+		       u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int),
+		       int pol, int data, bool show_total)
 {
 	struct blkio_group *blkg;
 	struct hlist_node *n;
@@ -785,6 +783,7 @@ static void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
 	if (show_total)
 		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
 }
+EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
 
 /**
  * __blkg_prfill_u64 - prfill helper for a single u64 value
@@ -794,8 +793,7 @@ static void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
  *
  * Print @v to @sf for the device assocaited with @pd.
  */
-static u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd,
-			     u64 v)
+u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
 {
 	const char *dname = blkg_dev_name(pd->blkg);
 
@@ -805,6 +803,7 @@ static u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd,
 	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
 	return v;
 }
+EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
 
 /**
  * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
@@ -814,9 +813,8 @@ static u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd,
  *
  * Print @rwstat to @sf for the device assocaited with @pd.
  */
-static u64 __blkg_prfill_rwstat(struct seq_file *sf,
-				struct blkg_policy_data *pd,
-				const struct blkg_rwstat *rwstat)
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+			 const struct blkg_rwstat *rwstat)
 {
 	static const char *rwstr[] = {
 		[BLKG_RWSTAT_READ]	= "Read",
@@ -856,8 +854,8 @@ static u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 }
 
 /* print blkg_stat specified by BLKCG_STAT_PRIV() */
-static int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
-			    struct seq_file *sf)
+int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
+		     struct seq_file *sf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
@@ -866,10 +864,11 @@ static int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
 			  BLKCG_STAT_OFF(cft->private), false);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(blkcg_print_stat);
 
 /* print blkg_rwstat specified by BLKCG_STAT_PRIV() */
-static int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
-			      struct seq_file *sf)
+int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
+		       struct seq_file *sf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
@@ -878,6 +877,7 @@ static int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
 			  BLKCG_STAT_OFF(cft->private), true);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(blkcg_print_rwstat);
 
 static u64 blkg_prfill_cpu_stat(struct seq_file *sf,
 				struct blkg_policy_data *pd, int off)
@@ -914,8 +914,8 @@ static u64 blkg_prfill_cpu_rwstat(struct seq_file *sf,
 }
 
 /* print per-cpu blkg_stat specified by BLKCG_STAT_PRIV() */
-static int blkcg_print_cpu_stat(struct cgroup *cgrp, struct cftype *cft,
-				struct seq_file *sf)
+int blkcg_print_cpu_stat(struct cgroup *cgrp, struct cftype *cft,
+			 struct seq_file *sf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
@@ -924,10 +924,11 @@ static int blkcg_print_cpu_stat(struct cgroup *cgrp, struct cftype *cft,
 			  BLKCG_STAT_OFF(cft->private), false);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(blkcg_print_cpu_stat);
 
 /* print per-cpu blkg_rwstat specified by BLKCG_STAT_PRIV() */
-static int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
-				  struct seq_file *sf)
+int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
+			   struct seq_file *sf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
@@ -936,6 +937,7 @@ static int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
 			  BLKCG_STAT_OFF(cft->private), true);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(blkcg_print_cpu_rwstat);
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 static u64 blkg_prfill_avg_queue_size(struct seq_file *sf,
@@ -964,12 +966,6 @@ static int blkcg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
 }
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
 
-struct blkg_conf_ctx {
-	struct gendisk		*disk;
-	struct blkio_group	*blkg;
-	u64			v;
-};
-
 /**
  * blkg_conf_prep - parse and prepare for per-blkg config update
  * @blkcg: target block cgroup
@@ -981,8 +977,8 @@ struct blkg_conf_ctx {
  * value.  This function returns with RCU read locked and must be paired
  * with blkg_conf_finish().
  */
-static int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
-			  struct blkg_conf_ctx *ctx)
+int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
+		   struct blkg_conf_ctx *ctx)
 	__acquires(rcu)
 {
 	struct gendisk *disk;
@@ -1026,6 +1022,7 @@ static int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
 	ctx->v = v;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(blkg_conf_prep);
 
 /**
  * blkg_conf_finish - finish up per-blkg config update
@@ -1034,12 +1031,13 @@ static int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
  * Finish up after per-blkg config update.  This function must be paired
  * with blkg_conf_prep().
  */
-static void blkg_conf_finish(struct blkg_conf_ctx *ctx)
+void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 	__releases(rcu)
 {
 	rcu_read_unlock();
 	put_disk(ctx->disk);
 }
+EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
 /* for propio conf */
 static u64 blkg_prfill_weight_device(struct seq_file *sf,
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 108ffbf..361ecfa 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -15,6 +15,7 @@
 
 #include <linux/cgroup.h>
 #include <linux/u64_stats_sync.h>
+#include <linux/seq_file.h>
 
 enum blkio_policy_id {
 	BLKIO_POLICY_PROP = 0,		/* Proportional Bandwidth division */
@@ -193,6 +194,32 @@ extern void blkg_destroy_all(struct request_queue *q, bool destroy_root);
 extern void update_root_blkg_pd(struct request_queue *q,
 				enum blkio_policy_id plid);
 
+void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
+		       u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int),
+		       int pol, int data, bool show_total);
+u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+			 const struct blkg_rwstat *rwstat);
+int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
+		     struct seq_file *sf);
+int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
+		       struct seq_file *sf);
+int blkcg_print_cpu_stat(struct cgroup *cgrp, struct cftype *cft,
+			 struct seq_file *sf);
+int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
+			   struct seq_file *sf);
+
+struct blkg_conf_ctx {
+	struct gendisk		*disk;
+	struct blkio_group	*blkg;
+	u64			v;
+};
+
+int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
+		   struct blkg_conf_ctx *ctx);
+void blkg_conf_finish(struct blkg_conf_ctx *ctx);
+
+
 /**
  * blkg_to_pdata - get policy private data
  * @blkg: blkg of interest
-- 
cgit v1.1


From 44ea53de46a8b01a65ae6217f47e00b516725190 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:43 -0700
Subject: blkcg: implement blkio_policy_type->cftypes

Add blkiop->cftypes which is added and removed together with the
policy.  This will be used to move conf/stat handling to the policies.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c | 6 ++++++
 block/blk-cgroup.h | 1 +
 2 files changed, 7 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 53976f2..4e714f8 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1538,6 +1538,9 @@ void blkio_policy_register(struct blkio_policy_type *blkiop)
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		update_root_blkg_pd(q, blkiop->plid);
 	blkcg_bypass_end();
+
+	if (blkiop->cftypes)
+		WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
 }
 EXPORT_SYMBOL_GPL(blkio_policy_register);
 
@@ -1545,6 +1548,9 @@ void blkio_policy_unregister(struct blkio_policy_type *blkiop)
 {
 	struct request_queue *q;
 
+	if (blkiop->cftypes)
+		cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);
+
 	blkcg_bypass_start();
 	spin_lock(&blkio_list_lock);
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 361ecfa..fa744d5 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -181,6 +181,7 @@ struct blkio_policy_type {
 	struct blkio_policy_ops ops;
 	enum blkio_policy_id plid;
 	size_t pdata_size;		/* policy specific private data size */
+	struct cftype *cftypes;		/* cgroup files for the policy */
 };
 
 extern int blkcg_init_queue(struct request_queue *q);
-- 
cgit v1.1


From 60c2bc2d5a12369deef395cda41638d7e6b6bf19 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:43 -0700
Subject: blkcg: move conf/stat file handling code to policies

blkcg conf/stat handling is convoluted in that details which belong to
specific policy implementations are all out in blkcg core and then
policies hook into core layer to access and manipulate confs and
stats.  This sadly achieves both inflexibility (confs/stats can't be
modified without messing with blkcg core) and complexity (all the
call-ins and call-backs).

The previous patches restructured conf and stat handling code such
that they can be separated out.  This patch relocates the file
handling part.  All conf/stat file handling code which belongs to
BLKIO_POLICY_PROP is moved to cfq-iosched.c and all
BLKIO_POLICY_THROTL code to blk-throttle.c.

The move is verbatim except for blkio_update_group_{weight|bps|iops}()
callbacks, which relay conf changes to policies.  The configuration
settings are handled in policies themselves so the relaying isn't
necessary.  Conf setting functions are modified to directly call
per-policy update functions and the relaying mechanism is dropped.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c   | 373 ---------------------------------------------------
 block/blk-cgroup.h   |  15 ---
 block/blk-throttle.c | 163 ++++++++++++++++++----
 block/cfq-iosched.c  | 202 +++++++++++++++++++++++++++-
 4 files changed, 333 insertions(+), 420 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4e714f8..b963fb4 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -63,63 +63,6 @@ struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(bio_blkio_cgroup);
 
-static inline void blkio_update_group_weight(struct blkio_group *blkg,
-					     int plid, unsigned int weight)
-{
-	struct blkio_policy_type *blkiop;
-
-	list_for_each_entry(blkiop, &blkio_list, list) {
-		/* If this policy does not own the blkg, do not send updates */
-		if (blkiop->plid != plid)
-			continue;
-		if (blkiop->ops.blkio_update_group_weight_fn)
-			blkiop->ops.blkio_update_group_weight_fn(blkg->q,
-							blkg, weight);
-	}
-}
-
-static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid,
-					  u64 bps, int rw)
-{
-	struct blkio_policy_type *blkiop;
-
-	list_for_each_entry(blkiop, &blkio_list, list) {
-
-		/* If this policy does not own the blkg, do not send updates */
-		if (blkiop->plid != plid)
-			continue;
-
-		if (rw == READ && blkiop->ops.blkio_update_group_read_bps_fn)
-			blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
-								blkg, bps);
-
-		if (rw == WRITE && blkiop->ops.blkio_update_group_write_bps_fn)
-			blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
-								blkg, bps);
-	}
-}
-
-static inline void blkio_update_group_iops(struct blkio_group *blkg, int plid,
-					   u64 iops, int rw)
-{
-	struct blkio_policy_type *blkiop;
-
-	list_for_each_entry(blkiop, &blkio_list, list) {
-
-		/* If this policy does not own the blkg, do not send updates */
-		if (blkiop->plid != plid)
-			continue;
-
-		if (rw == READ && blkiop->ops.blkio_update_group_read_iops_fn)
-			blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
-								blkg, iops);
-
-		if (rw == WRITE && blkiop->ops.blkio_update_group_write_iops_fn)
-			blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
-								blkg,iops);
-	}
-}
-
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 /* This should be called with the queue_lock held. */
 static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
@@ -939,33 +882,6 @@ int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
 }
 EXPORT_SYMBOL_GPL(blkcg_print_cpu_rwstat);
 
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-static u64 blkg_prfill_avg_queue_size(struct seq_file *sf,
-				      struct blkg_policy_data *pd, int off)
-{
-	u64 samples = blkg_stat_read(&pd->stats.avg_queue_size_samples);
-	u64 v = 0;
-
-	if (samples) {
-		v = blkg_stat_read(&pd->stats.avg_queue_size_sum);
-		do_div(v, samples);
-	}
-	__blkg_prfill_u64(sf, pd, v);
-	return 0;
-}
-
-/* print avg_queue_size */
-static int blkcg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
-				      struct seq_file *sf)
-{
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
-
-	blkcg_print_blkgs(sf, blkcg, blkg_prfill_avg_queue_size,
-			  BLKIO_POLICY_PROP, 0, false);
-	return 0;
-}
-#endif	/* CONFIG_DEBUG_BLK_CGROUP */
-
 /**
  * blkg_conf_prep - parse and prepare for per-blkg config update
  * @blkcg: target block cgroup
@@ -1039,300 +955,11 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
-/* for propio conf */
-static u64 blkg_prfill_weight_device(struct seq_file *sf,
-				     struct blkg_policy_data *pd, int off)
-{
-	if (!pd->conf.weight)
-		return 0;
-	return __blkg_prfill_u64(sf, pd, pd->conf.weight);
-}
-
-static int blkcg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
-				     struct seq_file *sf)
-{
-	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
-			  blkg_prfill_weight_device, BLKIO_POLICY_PROP, 0,
-			  false);
-	return 0;
-}
-
-static int blkcg_print_weight(struct cgroup *cgrp, struct cftype *cft,
-			      struct seq_file *sf)
-{
-	seq_printf(sf, "%u\n", cgroup_to_blkio_cgroup(cgrp)->weight);
-	return 0;
-}
-
-static int blkcg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
-				   const char *buf)
-{
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
-	struct blkg_policy_data *pd;
-	struct blkg_conf_ctx ctx;
-	int ret;
-
-	ret = blkg_conf_prep(blkcg, buf, &ctx);
-	if (ret)
-		return ret;
-
-	ret = -EINVAL;
-	pd = ctx.blkg->pd[BLKIO_POLICY_PROP];
-	if (pd && (!ctx.v || (ctx.v >= BLKIO_WEIGHT_MIN &&
-			      ctx.v <= BLKIO_WEIGHT_MAX))) {
-		pd->conf.weight = ctx.v;
-		blkio_update_group_weight(ctx.blkg, BLKIO_POLICY_PROP,
-					  ctx.v ?: blkcg->weight);
-		ret = 0;
-	}
-
-	blkg_conf_finish(&ctx);
-	return ret;
-}
-
-static int blkcg_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
-{
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
-	struct blkio_group *blkg;
-	struct hlist_node *n;
-
-	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
-		return -EINVAL;
-
-	spin_lock(&blkio_list_lock);
-	spin_lock_irq(&blkcg->lock);
-	blkcg->weight = (unsigned int)val;
-
-	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		struct blkg_policy_data *pd = blkg->pd[BLKIO_POLICY_PROP];
-
-		if (pd && !pd->conf.weight)
-			blkio_update_group_weight(blkg, BLKIO_POLICY_PROP,
-						  blkcg->weight);
-	}
-
-	spin_unlock_irq(&blkcg->lock);
-	spin_unlock(&blkio_list_lock);
-	return 0;
-}
-
-/* for blk-throttle conf */
-#ifdef CONFIG_BLK_DEV_THROTTLING
-static u64 blkg_prfill_conf_u64(struct seq_file *sf,
-				struct blkg_policy_data *pd, int off)
-{
-	u64 v = *(u64 *)((void *)&pd->conf + off);
-
-	if (!v)
-		return 0;
-	return __blkg_prfill_u64(sf, pd, v);
-}
-
-static int blkcg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
-				struct seq_file *sf)
-{
-	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
-			  blkg_prfill_conf_u64, BLKIO_POLICY_THROTL,
-			  cft->private, false);
-	return 0;
-}
-
-static int blkcg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
-			      const char *buf, int rw,
-			      void (*update)(struct blkio_group *, int, u64, int))
-{
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
-	struct blkg_policy_data *pd;
-	struct blkg_conf_ctx ctx;
-	int ret;
-
-	ret = blkg_conf_prep(blkcg, buf, &ctx);
-	if (ret)
-		return ret;
-
-	ret = -EINVAL;
-	pd = ctx.blkg->pd[BLKIO_POLICY_THROTL];
-	if (pd) {
-		*(u64 *)((void *)&pd->conf + cft->private) = ctx.v;
-		update(ctx.blkg, BLKIO_POLICY_THROTL, ctx.v ?: -1, rw);
-		ret = 0;
-	}
-
-	blkg_conf_finish(&ctx);
-	return ret;
-}
-
-static int blkcg_set_conf_bps_r(struct cgroup *cgrp, struct cftype *cft,
-				const char *buf)
-{
-	return blkcg_set_conf_u64(cgrp, cft, buf, READ, blkio_update_group_bps);
-}
-
-static int blkcg_set_conf_bps_w(struct cgroup *cgrp, struct cftype *cft,
-				const char *buf)
-{
-	return blkcg_set_conf_u64(cgrp, cft, buf, WRITE, blkio_update_group_bps);
-}
-
-static int blkcg_set_conf_iops_r(struct cgroup *cgrp, struct cftype *cft,
-				 const char *buf)
-{
-	return blkcg_set_conf_u64(cgrp, cft, buf, READ, blkio_update_group_iops);
-}
-
-static int blkcg_set_conf_iops_w(struct cgroup *cgrp, struct cftype *cft,
-				 const char *buf)
-{
-	return blkcg_set_conf_u64(cgrp, cft, buf, WRITE, blkio_update_group_iops);
-}
-#endif
-
 struct cftype blkio_files[] = {
 	{
-		.name = "weight_device",
-		.read_seq_string = blkcg_print_weight_device,
-		.write_string = blkcg_set_weight_device,
-		.max_write_len = 256,
-	},
-	{
-		.name = "weight",
-		.read_seq_string = blkcg_print_weight,
-		.write_u64 = blkcg_set_weight,
-	},
-	{
-		.name = "time",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, time)),
-		.read_seq_string = blkcg_print_stat,
-	},
-	{
-		.name = "sectors",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats_cpu, sectors)),
-		.read_seq_string = blkcg_print_cpu_stat,
-	},
-	{
-		.name = "io_service_bytes",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats_cpu, service_bytes)),
-		.read_seq_string = blkcg_print_cpu_rwstat,
-	},
-	{
-		.name = "io_serviced",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats_cpu, serviced)),
-		.read_seq_string = blkcg_print_cpu_rwstat,
-	},
-	{
-		.name = "io_service_time",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, service_time)),
-		.read_seq_string = blkcg_print_rwstat,
-	},
-	{
-		.name = "io_wait_time",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, wait_time)),
-		.read_seq_string = blkcg_print_rwstat,
-	},
-	{
-		.name = "io_merged",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, merged)),
-		.read_seq_string = blkcg_print_rwstat,
-	},
-	{
-		.name = "io_queued",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, queued)),
-		.read_seq_string = blkcg_print_rwstat,
-	},
-	{
 		.name = "reset_stats",
 		.write_u64 = blkiocg_reset_stats,
 	},
-#ifdef CONFIG_BLK_DEV_THROTTLING
-	{
-		.name = "throttle.read_bps_device",
-		.private = offsetof(struct blkio_group_conf, bps[READ]),
-		.read_seq_string = blkcg_print_conf_u64,
-		.write_string = blkcg_set_conf_bps_r,
-		.max_write_len = 256,
-	},
-
-	{
-		.name = "throttle.write_bps_device",
-		.private = offsetof(struct blkio_group_conf, bps[WRITE]),
-		.read_seq_string = blkcg_print_conf_u64,
-		.write_string = blkcg_set_conf_bps_w,
-		.max_write_len = 256,
-	},
-
-	{
-		.name = "throttle.read_iops_device",
-		.private = offsetof(struct blkio_group_conf, iops[READ]),
-		.read_seq_string = blkcg_print_conf_u64,
-		.write_string = blkcg_set_conf_iops_r,
-		.max_write_len = 256,
-	},
-
-	{
-		.name = "throttle.write_iops_device",
-		.private = offsetof(struct blkio_group_conf, iops[WRITE]),
-		.read_seq_string = blkcg_print_conf_u64,
-		.write_string = blkcg_set_conf_iops_w,
-		.max_write_len = 256,
-	},
-	{
-		.name = "throttle.io_service_bytes",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
-				offsetof(struct blkio_group_stats_cpu, service_bytes)),
-		.read_seq_string = blkcg_print_cpu_rwstat,
-	},
-	{
-		.name = "throttle.io_serviced",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
-				offsetof(struct blkio_group_stats_cpu, serviced)),
-		.read_seq_string = blkcg_print_cpu_rwstat,
-	},
-#endif /* CONFIG_BLK_DEV_THROTTLING */
-
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	{
-		.name = "avg_queue_size",
-		.read_seq_string = blkcg_print_avg_queue_size,
-	},
-	{
-		.name = "group_wait_time",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, group_wait_time)),
-		.read_seq_string = blkcg_print_stat,
-	},
-	{
-		.name = "idle_time",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, idle_time)),
-		.read_seq_string = blkcg_print_stat,
-	},
-	{
-		.name = "empty_time",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, empty_time)),
-		.read_seq_string = blkcg_print_stat,
-	},
-	{
-		.name = "dequeue",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, dequeue)),
-		.read_seq_string = blkcg_print_stat,
-	},
-	{
-		.name = "unaccounted_time",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, unaccounted_time)),
-		.read_seq_string = blkcg_print_stat,
-	},
-#endif
 	{ }	/* terminate */
 };
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index fa744d5..ba64b28 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -156,24 +156,9 @@ struct blkio_group {
 };
 
 typedef void (blkio_init_group_fn)(struct blkio_group *blkg);
-typedef void (blkio_update_group_weight_fn)(struct request_queue *q,
-			struct blkio_group *blkg, unsigned int weight);
-typedef void (blkio_update_group_read_bps_fn)(struct request_queue *q,
-			struct blkio_group *blkg, u64 read_bps);
-typedef void (blkio_update_group_write_bps_fn)(struct request_queue *q,
-			struct blkio_group *blkg, u64 write_bps);
-typedef void (blkio_update_group_read_iops_fn)(struct request_queue *q,
-			struct blkio_group *blkg, unsigned int read_iops);
-typedef void (blkio_update_group_write_iops_fn)(struct request_queue *q,
-			struct blkio_group *blkg, unsigned int write_iops);
 
 struct blkio_policy_ops {
 	blkio_init_group_fn *blkio_init_group_fn;
-	blkio_update_group_weight_fn *blkio_update_group_weight_fn;
-	blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
-	blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
-	blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
-	blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
 };
 
 struct blkio_policy_type {
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 1cc6c23d..fb6f257 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -804,6 +804,11 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 	}
 }
 
+/*
+ * Can not take queue lock in update functions as queue lock under
+ * blkcg_lock is not allowed. Under other paths we take blkcg_lock under
+ * queue_lock.
+ */
 static void throtl_update_blkio_group_common(struct throtl_data *td,
 				struct throtl_grp *tg)
 {
@@ -813,51 +818,158 @@ static void throtl_update_blkio_group_common(struct throtl_data *td,
 	throtl_schedule_delayed_work(td, 0);
 }
 
-/*
- * For all update functions, @q should be a valid pointer because these
- * update functions are called under blkcg_lock, that means, blkg is
- * valid and in turn @q is valid. queue exit path can not race because
- * of blkcg_lock
- *
- * Can not take queue lock in update functions as queue lock under blkcg_lock
- * is not allowed. Under other paths we take blkcg_lock under queue_lock.
- */
-static void throtl_update_blkio_group_read_bps(struct request_queue *q,
-				struct blkio_group *blkg, u64 read_bps)
+static u64 blkg_prfill_conf_u64(struct seq_file *sf,
+				struct blkg_policy_data *pd, int off)
+{
+	u64 v = *(u64 *)((void *)&pd->conf + off);
+
+	if (!v)
+		return 0;
+	return __blkg_prfill_u64(sf, pd, v);
+}
+
+static int blkcg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
+				struct seq_file *sf)
+{
+	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
+			  blkg_prfill_conf_u64, BLKIO_POLICY_THROTL,
+			  cft->private, false);
+	return 0;
+}
+
+static void throtl_update_blkio_group_read_bps(struct blkio_group *blkg,
+					       u64 read_bps)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	tg->bps[READ] = read_bps;
-	throtl_update_blkio_group_common(q->td, tg);
+	throtl_update_blkio_group_common(blkg->q->td, tg);
 }
 
-static void throtl_update_blkio_group_write_bps(struct request_queue *q,
-				struct blkio_group *blkg, u64 write_bps)
+static void throtl_update_blkio_group_write_bps(struct blkio_group *blkg,
+						u64 write_bps)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	tg->bps[WRITE] = write_bps;
-	throtl_update_blkio_group_common(q->td, tg);
+	throtl_update_blkio_group_common(blkg->q->td, tg);
 }
 
-static void throtl_update_blkio_group_read_iops(struct request_queue *q,
-			struct blkio_group *blkg, unsigned int read_iops)
+static void throtl_update_blkio_group_read_iops(struct blkio_group *blkg,
+						u64 read_iops)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	tg->iops[READ] = read_iops;
-	throtl_update_blkio_group_common(q->td, tg);
+	throtl_update_blkio_group_common(blkg->q->td, tg);
 }
 
-static void throtl_update_blkio_group_write_iops(struct request_queue *q,
-			struct blkio_group *blkg, unsigned int write_iops)
+static void throtl_update_blkio_group_write_iops(struct blkio_group *blkg,
+						 u64 write_iops)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	tg->iops[WRITE] = write_iops;
-	throtl_update_blkio_group_common(q->td, tg);
+	throtl_update_blkio_group_common(blkg->q->td, tg);
+}
+
+static int blkcg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
+			      const char *buf,
+			      void (*update)(struct blkio_group *, u64))
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+	struct blkg_policy_data *pd;
+	struct blkg_conf_ctx ctx;
+	int ret;
+
+	ret = blkg_conf_prep(blkcg, buf, &ctx);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+	pd = ctx.blkg->pd[BLKIO_POLICY_THROTL];
+	if (pd) {
+		*(u64 *)((void *)&pd->conf + cft->private) = ctx.v;
+		update(ctx.blkg, ctx.v ?: -1);
+		ret = 0;
+	}
+
+	blkg_conf_finish(&ctx);
+	return ret;
 }
 
+static int blkcg_set_conf_bps_r(struct cgroup *cgrp, struct cftype *cft,
+				const char *buf)
+{
+	return blkcg_set_conf_u64(cgrp, cft, buf,
+				  throtl_update_blkio_group_read_bps);
+}
+
+static int blkcg_set_conf_bps_w(struct cgroup *cgrp, struct cftype *cft,
+				const char *buf)
+{
+	return blkcg_set_conf_u64(cgrp, cft, buf,
+				  throtl_update_blkio_group_write_bps);
+}
+
+static int blkcg_set_conf_iops_r(struct cgroup *cgrp, struct cftype *cft,
+				 const char *buf)
+{
+	return blkcg_set_conf_u64(cgrp, cft, buf,
+				  throtl_update_blkio_group_read_iops);
+}
+
+static int blkcg_set_conf_iops_w(struct cgroup *cgrp, struct cftype *cft,
+				 const char *buf)
+{
+	return blkcg_set_conf_u64(cgrp, cft, buf,
+				  throtl_update_blkio_group_write_iops);
+}
+
+static struct cftype throtl_files[] = {
+	{
+		.name = "throttle.read_bps_device",
+		.private = offsetof(struct blkio_group_conf, bps[READ]),
+		.read_seq_string = blkcg_print_conf_u64,
+		.write_string = blkcg_set_conf_bps_r,
+		.max_write_len = 256,
+	},
+	{
+		.name = "throttle.write_bps_device",
+		.private = offsetof(struct blkio_group_conf, bps[WRITE]),
+		.read_seq_string = blkcg_print_conf_u64,
+		.write_string = blkcg_set_conf_bps_w,
+		.max_write_len = 256,
+	},
+	{
+		.name = "throttle.read_iops_device",
+		.private = offsetof(struct blkio_group_conf, iops[READ]),
+		.read_seq_string = blkcg_print_conf_u64,
+		.write_string = blkcg_set_conf_iops_r,
+		.max_write_len = 256,
+	},
+	{
+		.name = "throttle.write_iops_device",
+		.private = offsetof(struct blkio_group_conf, iops[WRITE]),
+		.read_seq_string = blkcg_print_conf_u64,
+		.write_string = blkcg_set_conf_iops_w,
+		.max_write_len = 256,
+	},
+	{
+		.name = "throttle.io_service_bytes",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
+				offsetof(struct blkio_group_stats_cpu, service_bytes)),
+		.read_seq_string = blkcg_print_cpu_rwstat,
+	},
+	{
+		.name = "throttle.io_serviced",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
+				offsetof(struct blkio_group_stats_cpu, serviced)),
+		.read_seq_string = blkcg_print_cpu_rwstat,
+	},
+	{ }	/* terminate */
+};
+
 static void throtl_shutdown_wq(struct request_queue *q)
 {
 	struct throtl_data *td = q->td;
@@ -868,17 +980,10 @@ static void throtl_shutdown_wq(struct request_queue *q)
 static struct blkio_policy_type blkio_policy_throtl = {
 	.ops = {
 		.blkio_init_group_fn = throtl_init_blkio_group,
-		.blkio_update_group_read_bps_fn =
-					throtl_update_blkio_group_read_bps,
-		.blkio_update_group_write_bps_fn =
-					throtl_update_blkio_group_write_bps,
-		.blkio_update_group_read_iops_fn =
-					throtl_update_blkio_group_read_iops,
-		.blkio_update_group_write_iops_fn =
-					throtl_update_blkio_group_write_iops,
 	},
 	.plid = BLKIO_POLICY_THROTL,
 	.pdata_size = sizeof(struct throtl_grp),
+	.cftypes = throtl_files,
 };
 
 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 8cca6161..119e061 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1058,8 +1058,7 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void cfq_update_blkio_group_weight(struct request_queue *q,
-					  struct blkio_group *blkg,
+static void cfq_update_blkio_group_weight(struct blkio_group *blkg,
 					  unsigned int weight)
 {
 	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
@@ -1111,6 +1110,203 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 	cfqg_get(cfqg);
 }
 
+static u64 blkg_prfill_weight_device(struct seq_file *sf,
+				     struct blkg_policy_data *pd, int off)
+{
+	if (!pd->conf.weight)
+		return 0;
+	return __blkg_prfill_u64(sf, pd, pd->conf.weight);
+}
+
+static int blkcg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
+				     struct seq_file *sf)
+{
+	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
+			  blkg_prfill_weight_device, BLKIO_POLICY_PROP, 0,
+			  false);
+	return 0;
+}
+
+static int blkcg_print_weight(struct cgroup *cgrp, struct cftype *cft,
+			      struct seq_file *sf)
+{
+	seq_printf(sf, "%u\n", cgroup_to_blkio_cgroup(cgrp)->weight);
+	return 0;
+}
+
+static int blkcg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
+				   const char *buf)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+	struct blkg_policy_data *pd;
+	struct blkg_conf_ctx ctx;
+	int ret;
+
+	ret = blkg_conf_prep(blkcg, buf, &ctx);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+	pd = ctx.blkg->pd[BLKIO_POLICY_PROP];
+	if (pd && (!ctx.v || (ctx.v >= BLKIO_WEIGHT_MIN &&
+			      ctx.v <= BLKIO_WEIGHT_MAX))) {
+		pd->conf.weight = ctx.v;
+		cfq_update_blkio_group_weight(ctx.blkg, ctx.v ?: blkcg->weight);
+		ret = 0;
+	}
+
+	blkg_conf_finish(&ctx);
+	return ret;
+}
+
+static int blkcg_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+	struct blkio_group *blkg;
+	struct hlist_node *n;
+
+	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
+		return -EINVAL;
+
+	spin_lock_irq(&blkcg->lock);
+	blkcg->weight = (unsigned int)val;
+
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		struct blkg_policy_data *pd = blkg->pd[BLKIO_POLICY_PROP];
+
+		if (pd && !pd->conf.weight)
+			cfq_update_blkio_group_weight(blkg, blkcg->weight);
+	}
+
+	spin_unlock_irq(&blkcg->lock);
+	return 0;
+}
+
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+static u64 blkg_prfill_avg_queue_size(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	u64 samples = blkg_stat_read(&pd->stats.avg_queue_size_samples);
+	u64 v = 0;
+
+	if (samples) {
+		v = blkg_stat_read(&pd->stats.avg_queue_size_sum);
+		do_div(v, samples);
+	}
+	__blkg_prfill_u64(sf, pd, v);
+	return 0;
+}
+
+/* print avg_queue_size */
+static int blkcg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
+				      struct seq_file *sf)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_avg_queue_size,
+			  BLKIO_POLICY_PROP, 0, false);
+	return 0;
+}
+#endif	/* CONFIG_DEBUG_BLK_CGROUP */
+
+static struct cftype cfq_blkcg_files[] = {
+	{
+		.name = "weight_device",
+		.read_seq_string = blkcg_print_weight_device,
+		.write_string = blkcg_set_weight_device,
+		.max_write_len = 256,
+	},
+	{
+		.name = "weight",
+		.read_seq_string = blkcg_print_weight,
+		.write_u64 = blkcg_set_weight,
+	},
+	{
+		.name = "time",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, time)),
+		.read_seq_string = blkcg_print_stat,
+	},
+	{
+		.name = "sectors",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats_cpu, sectors)),
+		.read_seq_string = blkcg_print_cpu_stat,
+	},
+	{
+		.name = "io_service_bytes",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats_cpu, service_bytes)),
+		.read_seq_string = blkcg_print_cpu_rwstat,
+	},
+	{
+		.name = "io_serviced",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats_cpu, serviced)),
+		.read_seq_string = blkcg_print_cpu_rwstat,
+	},
+	{
+		.name = "io_service_time",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, service_time)),
+		.read_seq_string = blkcg_print_rwstat,
+	},
+	{
+		.name = "io_wait_time",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, wait_time)),
+		.read_seq_string = blkcg_print_rwstat,
+	},
+	{
+		.name = "io_merged",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, merged)),
+		.read_seq_string = blkcg_print_rwstat,
+	},
+	{
+		.name = "io_queued",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, queued)),
+		.read_seq_string = blkcg_print_rwstat,
+	},
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	{
+		.name = "avg_queue_size",
+		.read_seq_string = blkcg_print_avg_queue_size,
+	},
+	{
+		.name = "group_wait_time",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, group_wait_time)),
+		.read_seq_string = blkcg_print_stat,
+	},
+	{
+		.name = "idle_time",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, idle_time)),
+		.read_seq_string = blkcg_print_stat,
+	},
+	{
+		.name = "empty_time",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, empty_time)),
+		.read_seq_string = blkcg_print_stat,
+	},
+	{
+		.name = "dequeue",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, dequeue)),
+		.read_seq_string = blkcg_print_stat,
+	},
+	{
+		.name = "unaccounted_time",
+		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
+				offsetof(struct blkio_group_stats, unaccounted_time)),
+		.read_seq_string = blkcg_print_stat,
+	},
+#endif	/* CONFIG_DEBUG_BLK_CGROUP */
+	{ }	/* terminate */
+};
 #else /* GROUP_IOSCHED */
 static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
 						struct blkio_cgroup *blkcg)
@@ -3715,10 +3911,10 @@ static struct elevator_type iosched_cfq = {
 static struct blkio_policy_type blkio_policy_cfq = {
 	.ops = {
 		.blkio_init_group_fn =		cfq_init_blkio_group,
-		.blkio_update_group_weight_fn =	cfq_update_blkio_group_weight,
 	},
 	.plid = BLKIO_POLICY_PROP,
 	.pdata_size = sizeof(struct cfq_group),
+	.cftypes = cfq_blkcg_files,
 };
 #endif
 
-- 
cgit v1.1


From 2ce4d50f9cfab40831eee5e51e950d5c4724994b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:43 -0700
Subject: cfq: collapse cfq.h into cfq-iosched.c

block/cfq.h contains some functions which interact with blkcg;
however, they are only part of cfq's blkcg interaction, and
cfq-iosched.c already has quite a few #ifdef CONFIG_CFQ_GROUP_IOSCHED
blocks.  With conf/stat handling being moved to specific policies,
having these relay functions isolated in cfq.h doesn't make much
sense.  Collapse cfq.h into cfq-iosched.c for now.  Let's split blkcg
support properly later if necessary.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/cfq-iosched.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++-
 block/cfq.h         | 118 ----------------------------------------------------
 2 files changed, 113 insertions(+), 119 deletions(-)
 delete mode 100644 block/cfq.h

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 119e061..2e13e9e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -15,7 +15,6 @@
 #include <linux/ioprio.h>
 #include <linux/blktrace_api.h>
 #include "blk.h"
-#include "cfq.h"
 
 static struct blkio_policy_type blkio_policy_cfq;
 
@@ -367,6 +366,9 @@ CFQ_CFQQ_FNS(wait_busy);
 #undef CFQ_CFQQ_FNS
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
+
+#include "blk-cgroup.h"
+
 static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg)
 {
 	return blkg_to_pdata(blkg, &blkio_policy_cfq);
@@ -396,6 +398,82 @@ static inline void cfqg_put(struct cfq_group *cfqg)
 	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
 			blkg_path(cfqg_to_blkg((cfqg))), ##args)	\
 
+static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol,
+			struct blkio_group *curr_blkg,
+			bool direction, bool sync)
+{
+	blkiocg_update_io_add_stats(blkg, pol, curr_blkg, direction, sync);
+}
+
+static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol, unsigned long dequeue)
+{
+	blkiocg_update_dequeue_stats(blkg, pol, dequeue);
+}
+
+static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
+			struct blkio_policy_type *pol, unsigned long time,
+			unsigned long unaccounted_time)
+{
+	blkiocg_update_timeslice_used(blkg, pol, time, unaccounted_time);
+}
+
+static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg,
+			struct blkio_policy_type *pol)
+{
+	blkiocg_set_start_empty_time(blkg, pol);
+}
+
+static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol, bool direction,
+			bool sync)
+{
+	blkiocg_update_io_remove_stats(blkg, pol, direction, sync);
+}
+
+static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol, bool direction,
+			bool sync)
+{
+	blkiocg_update_io_merged_stats(blkg, pol, direction, sync);
+}
+
+static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol)
+{
+	blkiocg_update_idle_time_stats(blkg, pol);
+}
+
+static inline void
+cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol)
+{
+	blkiocg_update_avg_queue_size_stats(blkg, pol);
+}
+
+static inline void
+cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol)
+{
+	blkiocg_update_set_idle_time_stats(blkg, pol);
+}
+
+static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol, uint64_t bytes,
+			bool direction, bool sync)
+{
+	blkiocg_update_dispatch_stats(blkg, pol, bytes, direction, sync);
+}
+
+static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol, uint64_t start_time,
+			uint64_t io_start_time, bool direction, bool sync)
+{
+	blkiocg_update_completion_stats(blkg, pol, start_time, io_start_time,
+					direction, sync);
+}
+
 #else	/* CONFIG_CFQ_GROUP_IOSCHED */
 
 static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg) { return NULL; }
@@ -407,6 +485,40 @@ static inline void cfqg_put(struct cfq_group *cfqg) { }
 	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
 
+static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol,
+			struct blkio_group *curr_blkg, bool direction,
+			bool sync) { }
+static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol, unsigned long dequeue) { }
+static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
+			struct blkio_policy_type *pol, unsigned long time,
+			unsigned long unaccounted_time) { }
+static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg,
+			struct blkio_policy_type *pol) { }
+static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol, bool direction,
+			bool sync) { }
+static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol, bool direction,
+			bool sync) { }
+static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol) { }
+static inline void
+cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
+					struct blkio_policy_type *pol) { }
+
+static inline void
+cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
+				       struct blkio_policy_type *pol) { }
+
+static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol, uint64_t bytes,
+			bool direction, bool sync) { }
+static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
+			struct blkio_policy_type *pol, uint64_t start_time,
+			uint64_t io_start_time, bool direction, bool sync) { }
+
 #endif	/* CONFIG_CFQ_GROUP_IOSCHED */
 
 #define cfq_log(cfqd, fmt, args...)	\
diff --git a/block/cfq.h b/block/cfq.h
deleted file mode 100644
index c8b15ef..0000000
--- a/block/cfq.h
+++ /dev/null
@@ -1,118 +0,0 @@
-#ifndef _CFQ_H
-#define _CFQ_H
-#include "blk-cgroup.h"
-
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol,
-			struct blkio_group *curr_blkg,
-			bool direction, bool sync)
-{
-	blkiocg_update_io_add_stats(blkg, pol, curr_blkg, direction, sync);
-}
-
-static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, unsigned long dequeue)
-{
-	blkiocg_update_dequeue_stats(blkg, pol, dequeue);
-}
-
-static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, unsigned long time,
-			unsigned long unaccounted_time)
-{
-	blkiocg_update_timeslice_used(blkg, pol, time, unaccounted_time);
-}
-
-static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg,
-			struct blkio_policy_type *pol)
-{
-	blkiocg_set_start_empty_time(blkg, pol);
-}
-
-static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, bool direction,
-			bool sync)
-{
-	blkiocg_update_io_remove_stats(blkg, pol, direction, sync);
-}
-
-static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, bool direction,
-			bool sync)
-{
-	blkiocg_update_io_merged_stats(blkg, pol, direction, sync);
-}
-
-static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol)
-{
-	blkiocg_update_idle_time_stats(blkg, pol);
-}
-
-static inline void
-cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol)
-{
-	blkiocg_update_avg_queue_size_stats(blkg, pol);
-}
-
-static inline void
-cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol)
-{
-	blkiocg_update_set_idle_time_stats(blkg, pol);
-}
-
-static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, uint64_t bytes,
-			bool direction, bool sync)
-{
-	blkiocg_update_dispatch_stats(blkg, pol, bytes, direction, sync);
-}
-
-static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, uint64_t start_time,
-			uint64_t io_start_time, bool direction, bool sync)
-{
-	blkiocg_update_completion_stats(blkg, pol, start_time, io_start_time,
-					direction, sync);
-}
-
-#else /* CFQ_GROUP_IOSCHED */
-static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol,
-			struct blkio_group *curr_blkg, bool direction,
-			bool sync) { }
-static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, unsigned long dequeue) { }
-static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, unsigned long time,
-			unsigned long unaccounted_time) { }
-static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg,
-			struct blkio_policy_type *pol) { }
-static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, bool direction,
-			bool sync) { }
-static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, bool direction,
-			bool sync) { }
-static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol) { }
-static inline void
-cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
-					struct blkio_policy_type *pol) { }
-
-static inline void
-cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
-				       struct blkio_policy_type *pol) { }
-
-static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, uint64_t bytes,
-			bool direction, bool sync) { }
-static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, uint64_t start_time,
-			uint64_t io_start_time, bool direction, bool sync) { }
-
-#endif /* CFQ_GROUP_IOSCHED */
-#endif
-- 
cgit v1.1


From 629ed0b10209ffc4e1d439e5508d52d5e3a090b8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:44 -0700
Subject: blkcg: move statistics update code to policies

As with the conf/stats file handling code, there's no reason for the
stat update code to live in blkcg core with policies calling into it
to update their stats.  The current organization is both inflexible
and complex.

This patch moves stat update code to specific policies.  All
blkiocg_update_*_stats() functions which deal with BLKIO_POLICY_PROP
stats are collapsed into their cfq_blkiocg_update_*_stats()
counterparts.  blkiocg_update_dispatch_stats() is used by both
policies, so it is duplicated as throtl_update_dispatch_stats() and
cfq_blkiocg_update_dispatch_stats().  This duplication will be cleaned
up later.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c   | 245 --------------------------------------------
 block/blk-cgroup.h   |  94 -----------------
 block/blk-throttle.c |  37 +++++--
 block/cfq-iosched.c  | 280 +++++++++++++++++++++++++++++++++++++++++----------
 4 files changed, 259 insertions(+), 397 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b963fb4..821a0a3 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -63,251 +63,6 @@ struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(bio_blkio_cgroup);
 
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-/* This should be called with the queue_lock held. */
-static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
-					    struct blkio_policy_type *pol,
-					    struct blkio_group *curr_blkg)
-{
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-
-	if (blkio_blkg_waiting(&pd->stats))
-		return;
-	if (blkg == curr_blkg)
-		return;
-	pd->stats.start_group_wait_time = sched_clock();
-	blkio_mark_blkg_waiting(&pd->stats);
-}
-
-/* This should be called with the queue_lock held. */
-static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
-{
-	unsigned long long now;
-
-	if (!blkio_blkg_waiting(stats))
-		return;
-
-	now = sched_clock();
-	if (time_after64(now, stats->start_group_wait_time))
-		blkg_stat_add(&stats->group_wait_time,
-			      now - stats->start_group_wait_time);
-	blkio_clear_blkg_waiting(stats);
-}
-
-/* This should be called with the queue_lock held. */
-static void blkio_end_empty_time(struct blkio_group_stats *stats)
-{
-	unsigned long long now;
-
-	if (!blkio_blkg_empty(stats))
-		return;
-
-	now = sched_clock();
-	if (time_after64(now, stats->start_empty_time))
-		blkg_stat_add(&stats->empty_time,
-			      now - stats->start_empty_time);
-	blkio_clear_blkg_empty(stats);
-}
-
-void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
-					struct blkio_policy_type *pol)
-{
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-
-	lockdep_assert_held(blkg->q->queue_lock);
-	BUG_ON(blkio_blkg_idling(stats));
-
-	stats->start_idle_time = sched_clock();
-	blkio_mark_blkg_idling(stats);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
-
-void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
-				    struct blkio_policy_type *pol)
-{
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	if (blkio_blkg_idling(stats)) {
-		unsigned long long now = sched_clock();
-
-		if (time_after64(now, stats->start_idle_time))
-			blkg_stat_add(&stats->idle_time,
-				      now - stats->start_idle_time);
-		blkio_clear_blkg_idling(stats);
-	}
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
-
-void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
-					 struct blkio_policy_type *pol)
-{
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	blkg_stat_add(&stats->avg_queue_size_sum,
-		      blkg_rwstat_sum(&stats->queued));
-	blkg_stat_add(&stats->avg_queue_size_samples, 1);
-	blkio_update_group_wait_time(stats);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
-
-void blkiocg_set_start_empty_time(struct blkio_group *blkg,
-				  struct blkio_policy_type *pol)
-{
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	if (blkg_rwstat_sum(&stats->queued))
-		return;
-
-	/*
-	 * group is already marked empty. This can happen if cfqq got new
-	 * request in parent group and moved to this group while being added
-	 * to service tree. Just ignore the event and move on.
-	 */
-	if (blkio_blkg_empty(stats))
-		return;
-
-	stats->start_empty_time = sched_clock();
-	blkio_mark_blkg_empty(stats);
-}
-EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
-
-void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-				  struct blkio_policy_type *pol,
-				  unsigned long dequeue)
-{
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	blkg_stat_add(&pd->stats.dequeue, dequeue);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
-#else
-static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
-					struct blkio_policy_type *pol,
-					struct blkio_group *curr_blkg) { }
-static inline void blkio_end_empty_time(struct blkio_group_stats *stats) { }
-#endif
-
-void blkiocg_update_io_add_stats(struct blkio_group *blkg,
-				 struct blkio_policy_type *pol,
-				 struct blkio_group *curr_blkg, bool direction,
-				 bool sync)
-{
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
-
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	blkg_rwstat_add(&stats->queued, rw, 1);
-	blkio_end_empty_time(stats);
-	blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
-
-void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-				    struct blkio_policy_type *pol,
-				    bool direction, bool sync)
-{
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
-
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	blkg_rwstat_add(&stats->queued, rw, -1);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
-
-void blkiocg_update_timeslice_used(struct blkio_group *blkg,
-				   struct blkio_policy_type *pol,
-				   unsigned long time,
-				   unsigned long unaccounted_time)
-{
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	blkg_stat_add(&stats->time, time);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	blkg_stat_add(&stats->unaccounted_time, unaccounted_time);
-#endif
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
-
-/*
- * should be called under rcu read lock or queue lock to make sure blkg pointer
- * is valid.
- */
-void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-				   struct blkio_policy_type *pol,
-				   uint64_t bytes, bool direction, bool sync)
-{
-	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-	struct blkio_group_stats_cpu *stats_cpu;
-	unsigned long flags;
-
-	/* If per cpu stats are not allocated yet, don't do any accounting. */
-	if (pd->stats_cpu == NULL)
-		return;
-
-	/*
-	 * Disabling interrupts to provide mutual exclusion between two
-	 * writes on same cpu. It probably is not needed for 64bit. Not
-	 * optimizing that case yet.
-	 */
-	local_irq_save(flags);
-
-	stats_cpu = this_cpu_ptr(pd->stats_cpu);
-
-	blkg_stat_add(&stats_cpu->sectors, bytes >> 9);
-	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
-	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
-
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
-
-void blkiocg_update_completion_stats(struct blkio_group *blkg,
-				     struct blkio_policy_type *pol,
-				     uint64_t start_time,
-				     uint64_t io_start_time, bool direction,
-				     bool sync)
-{
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-	unsigned long long now = sched_clock();
-	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
-
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	if (time_after64(now, io_start_time))
-		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
-	if (time_after64(io_start_time, start_time))
-		blkg_rwstat_add(&stats->wait_time, rw,
-				io_start_time - start_time);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
-
-/*  Merged stats are per cpu.  */
-void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
-				    struct blkio_policy_type *pol,
-				    bool direction, bool sync)
-{
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
-
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	blkg_rwstat_add(&stats->merged, rw, 1);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
-
 /*
  * Worker for allocating per cpu stat for blk groups. This is scheduled on
  * the system_nrt_wq once there are some groups on the alloc_list waiting
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ba64b28..0b0a176 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -44,13 +44,6 @@ enum blkg_rwstat_type {
 	BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
 };
 
-/* blkg state flags */
-enum blkg_state_flags {
-	BLKG_waiting = 0,
-	BLKG_idling,
-	BLKG_empty,
-};
-
 struct blkio_cgroup {
 	struct cgroup_subsys_state css;
 	unsigned int weight;
@@ -416,52 +409,6 @@ static inline void blkg_put(struct blkio_group *blkg) { }
 #define BLKIO_WEIGHT_MAX	1000
 #define BLKIO_WEIGHT_DEFAULT	500
 
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
-					 struct blkio_policy_type *pol);
-void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-				  struct blkio_policy_type *pol,
-				  unsigned long dequeue);
-void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
-					struct blkio_policy_type *pol);
-void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
-				    struct blkio_policy_type *pol);
-void blkiocg_set_start_empty_time(struct blkio_group *blkg,
-				  struct blkio_policy_type *pol);
-
-#define BLKG_FLAG_FNS(name)						\
-static inline void blkio_mark_blkg_##name(				\
-		struct blkio_group_stats *stats)			\
-{									\
-	stats->flags |= (1 << BLKG_##name);				\
-}									\
-static inline void blkio_clear_blkg_##name(				\
-		struct blkio_group_stats *stats)			\
-{									\
-	stats->flags &= ~(1 << BLKG_##name);				\
-}									\
-static inline int blkio_blkg_##name(struct blkio_group_stats *stats)	\
-{									\
-	return (stats->flags & (1 << BLKG_##name)) != 0;		\
-}									\
-
-BLKG_FLAG_FNS(waiting)
-BLKG_FLAG_FNS(idling)
-BLKG_FLAG_FNS(empty)
-#undef BLKG_FLAG_FNS
-#else
-static inline void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol) { }
-static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, unsigned long dequeue) { }
-static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol) { }
-static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol) { }
-static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg,
-			struct blkio_policy_type *pol) { }
-#endif
-
 #ifdef CONFIG_BLK_CGROUP
 extern struct blkio_cgroup blkio_root_cgroup;
 extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
@@ -471,28 +418,6 @@ extern struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 				       struct request_queue *q,
 				       bool for_root);
-void blkiocg_update_timeslice_used(struct blkio_group *blkg,
-				   struct blkio_policy_type *pol,
-				   unsigned long time,
-				   unsigned long unaccounted_time);
-void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-				   struct blkio_policy_type *pol,
-				   uint64_t bytes, bool direction, bool sync);
-void blkiocg_update_completion_stats(struct blkio_group *blkg,
-				     struct blkio_policy_type *pol,
-				     uint64_t start_time,
-				     uint64_t io_start_time, bool direction,
-				     bool sync);
-void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
-				    struct blkio_policy_type *pol,
-				    bool direction, bool sync);
-void blkiocg_update_io_add_stats(struct blkio_group *blkg,
-				 struct blkio_policy_type *pol,
-				 struct blkio_group *curr_blkg, bool direction,
-				 bool sync);
-void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-				    struct blkio_policy_type *pol,
-				    bool direction, bool sync);
 #else
 struct cgroup;
 static inline struct blkio_cgroup *
@@ -502,24 +427,5 @@ bio_blkio_cgroup(struct bio *bio) { return NULL; }
 
 static inline struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 					      void *key) { return NULL; }
-static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, unsigned long time,
-			unsigned long unaccounted_time) { }
-static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, uint64_t bytes,
-			bool direction, bool sync) { }
-static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, uint64_t start_time,
-			uint64_t io_start_time, bool direction, bool sync) { }
-static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, bool direction,
-			bool sync) { }
-static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol,
-			struct blkio_group *curr_blkg, bool direction,
-			bool sync) { }
-static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, bool direction,
-			bool sync) { }
 #endif
 #endif /* _BLK_CGROUP_H */
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index fb6f257..5d647ed 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -562,17 +562,42 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 	return 0;
 }
 
+static void throtl_update_dispatch_stats(struct blkio_group *blkg, u64 bytes,
+					 int rw)
+{
+	struct blkg_policy_data *pd = blkg->pd[BLKIO_POLICY_THROTL];
+	struct blkio_group_stats_cpu *stats_cpu;
+	unsigned long flags;
+
+	/* If per cpu stats are not allocated yet, don't do any accounting. */
+	if (pd->stats_cpu == NULL)
+		return;
+
+	/*
+	 * Disabling interrupts to provide mutual exclusion between two
+	 * writes on same cpu. It probably is not needed for 64bit. Not
+	 * optimizing that case yet.
+	 */
+	local_irq_save(flags);
+
+	stats_cpu = this_cpu_ptr(pd->stats_cpu);
+
+	blkg_stat_add(&stats_cpu->sectors, bytes >> 9);
+	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
+	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
+
+	local_irq_restore(flags);
+}
+
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
 	bool rw = bio_data_dir(bio);
-	bool sync = rw_is_sync(bio->bi_rw);
 
 	/* Charge the bio to the group */
 	tg->bytes_disp[rw] += bio->bi_size;
 	tg->io_disp[rw]++;
 
-	blkiocg_update_dispatch_stats(tg_to_blkg(tg), &blkio_policy_throtl,
-				      bio->bi_size, rw, sync);
+	throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
 }
 
 static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -1012,10 +1037,8 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	tg = throtl_lookup_tg(td, blkcg);
 	if (tg) {
 		if (tg_no_rule_group(tg, rw)) {
-			blkiocg_update_dispatch_stats(tg_to_blkg(tg),
-						      &blkio_policy_throtl,
-						      bio->bi_size, rw,
-						      rw_is_sync(bio->bi_rw));
+			throtl_update_dispatch_stats(tg_to_blkg(tg),
+						     bio->bi_size, bio->bi_rw);
 			goto out_unlock_rcu;
 		}
 	}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 2e13e9e..4991380 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -15,6 +15,7 @@
 #include <linux/ioprio.h>
 #include <linux/blktrace_api.h>
 #include "blk.h"
+#include "blk-cgroup.h"
 
 static struct blkio_policy_type blkio_policy_cfq;
 
@@ -365,9 +366,177 @@ CFQ_CFQQ_FNS(deep);
 CFQ_CFQQ_FNS(wait_busy);
 #undef CFQ_CFQQ_FNS
 
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
+#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 
-#include "blk-cgroup.h"
+/* blkg state flags */
+enum blkg_state_flags {
+	BLKG_waiting = 0,
+	BLKG_idling,
+	BLKG_empty,
+};
+
+#define BLKG_FLAG_FNS(name)						\
+static inline void blkio_mark_blkg_##name(				\
+		struct blkio_group_stats *stats)			\
+{									\
+	stats->flags |= (1 << BLKG_##name);				\
+}									\
+static inline void blkio_clear_blkg_##name(				\
+		struct blkio_group_stats *stats)			\
+{									\
+	stats->flags &= ~(1 << BLKG_##name);				\
+}									\
+static inline int blkio_blkg_##name(struct blkio_group_stats *stats)	\
+{									\
+	return (stats->flags & (1 << BLKG_##name)) != 0;		\
+}									\
+
+BLKG_FLAG_FNS(waiting)
+BLKG_FLAG_FNS(idling)
+BLKG_FLAG_FNS(empty)
+#undef BLKG_FLAG_FNS
+
+/* This should be called with the queue_lock held. */
+static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
+{
+	unsigned long long now;
+
+	if (!blkio_blkg_waiting(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_group_wait_time))
+		blkg_stat_add(&stats->group_wait_time,
+			      now - stats->start_group_wait_time);
+	blkio_clear_blkg_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+					    struct blkio_policy_type *pol,
+					    struct blkio_group *curr_blkg)
+{
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
+
+	if (blkio_blkg_waiting(&pd->stats))
+		return;
+	if (blkg == curr_blkg)
+		return;
+	pd->stats.start_group_wait_time = sched_clock();
+	blkio_mark_blkg_waiting(&pd->stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void blkio_end_empty_time(struct blkio_group_stats *stats)
+{
+	unsigned long long now;
+
+	if (!blkio_blkg_empty(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_empty_time))
+		blkg_stat_add(&stats->empty_time,
+			      now - stats->start_empty_time);
+	blkio_clear_blkg_empty(stats);
+}
+
+static void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+					     struct blkio_policy_type *pol,
+					     unsigned long dequeue)
+{
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
+
+	lockdep_assert_held(blkg->q->queue_lock);
+
+	blkg_stat_add(&pd->stats.dequeue, dequeue);
+}
+
+static void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg,
+					     struct blkio_policy_type *pol)
+{
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+
+	lockdep_assert_held(blkg->q->queue_lock);
+
+	if (blkg_rwstat_sum(&stats->queued))
+		return;
+
+	/*
+	 * group is already marked empty. This can happen if cfqq got new
+	 * request in parent group and moved to this group while being added
+	 * to service tree. Just ignore the event and move on.
+	 */
+	if (blkio_blkg_empty(stats))
+		return;
+
+	stats->start_empty_time = sched_clock();
+	blkio_mark_blkg_empty(stats);
+}
+
+static void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg,
+					       struct blkio_policy_type *pol)
+{
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+
+	lockdep_assert_held(blkg->q->queue_lock);
+
+	if (blkio_blkg_idling(stats)) {
+		unsigned long long now = sched_clock();
+
+		if (time_after64(now, stats->start_idle_time))
+			blkg_stat_add(&stats->idle_time,
+				      now - stats->start_idle_time);
+		blkio_clear_blkg_idling(stats);
+	}
+}
+
+static void cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
+						   struct blkio_policy_type *pol)
+{
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+
+	lockdep_assert_held(blkg->q->queue_lock);
+	BUG_ON(blkio_blkg_idling(stats));
+
+	stats->start_idle_time = sched_clock();
+	blkio_mark_blkg_idling(stats);
+}
+
+static void cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
+						    struct blkio_policy_type *pol)
+{
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+
+	lockdep_assert_held(blkg->q->queue_lock);
+
+	blkg_stat_add(&stats->avg_queue_size_sum,
+		      blkg_rwstat_sum(&stats->queued));
+	blkg_stat_add(&stats->avg_queue_size_samples, 1);
+	blkio_update_group_wait_time(stats);
+}
+
+#else	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+
+static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+					    struct blkio_policy_type *pol,
+					    struct blkio_group *curr_blkg) { }
+static void blkio_end_empty_time(struct blkio_group_stats *stats) { }
+static void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+					     struct blkio_policy_type *pol,
+					     unsigned long dequeue) { }
+static void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg,
+					     struct blkio_policy_type *pol) { }
+static void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg,
+					       struct blkio_policy_type *pol) { }
+static void cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
+						   struct blkio_policy_type *pol) { }
+static void cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
+						    struct blkio_policy_type *pol) { }
+
+#endif	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
 
 static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg)
 {
@@ -403,75 +572,98 @@ static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
 			struct blkio_group *curr_blkg,
 			bool direction, bool sync)
 {
-	blkiocg_update_io_add_stats(blkg, pol, curr_blkg, direction, sync);
-}
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
 
-static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, unsigned long dequeue)
-{
-	blkiocg_update_dequeue_stats(blkg, pol, dequeue);
+	lockdep_assert_held(blkg->q->queue_lock);
+
+	blkg_rwstat_add(&stats->queued, rw, 1);
+	blkio_end_empty_time(stats);
+	blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
 }
 
 static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
 			struct blkio_policy_type *pol, unsigned long time,
 			unsigned long unaccounted_time)
 {
-	blkiocg_update_timeslice_used(blkg, pol, time, unaccounted_time);
-}
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
 
-static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg,
-			struct blkio_policy_type *pol)
-{
-	blkiocg_set_start_empty_time(blkg, pol);
+	lockdep_assert_held(blkg->q->queue_lock);
+
+	blkg_stat_add(&stats->time, time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	blkg_stat_add(&stats->unaccounted_time, unaccounted_time);
+#endif
 }
 
 static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
 			struct blkio_policy_type *pol, bool direction,
 			bool sync)
 {
-	blkiocg_update_io_remove_stats(blkg, pol, direction, sync);
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
+
+	lockdep_assert_held(blkg->q->queue_lock);
+
+	blkg_rwstat_add(&stats->queued, rw, -1);
 }
 
 static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
 			struct blkio_policy_type *pol, bool direction,
 			bool sync)
 {
-	blkiocg_update_io_merged_stats(blkg, pol, direction, sync);
-}
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
 
-static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol)
-{
-	blkiocg_update_idle_time_stats(blkg, pol);
-}
+	lockdep_assert_held(blkg->q->queue_lock);
 
-static inline void
-cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol)
-{
-	blkiocg_update_avg_queue_size_stats(blkg, pol);
-}
-
-static inline void
-cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol)
-{
-	blkiocg_update_set_idle_time_stats(blkg, pol);
+	blkg_rwstat_add(&stats->merged, rw, 1);
 }
 
 static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 			struct blkio_policy_type *pol, uint64_t bytes,
 			bool direction, bool sync)
 {
-	blkiocg_update_dispatch_stats(blkg, pol, bytes, direction, sync);
+	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
+	struct blkg_policy_data *pd = blkg->pd[pol->plid];
+	struct blkio_group_stats_cpu *stats_cpu;
+	unsigned long flags;
+
+	/* If per cpu stats are not allocated yet, don't do any accounting. */
+	if (pd->stats_cpu == NULL)
+		return;
+
+	/*
+	 * Disabling interrupts to provide mutual exclusion between two
+	 * writes on same cpu. It probably is not needed for 64bit. Not
+	 * optimizing that case yet.
+	 */
+	local_irq_save(flags);
+
+	stats_cpu = this_cpu_ptr(pd->stats_cpu);
+
+	blkg_stat_add(&stats_cpu->sectors, bytes >> 9);
+	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
+	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
+
+	local_irq_restore(flags);
 }
 
 static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
 			struct blkio_policy_type *pol, uint64_t start_time,
 			uint64_t io_start_time, bool direction, bool sync)
 {
-	blkiocg_update_completion_stats(blkg, pol, start_time, io_start_time,
-					direction, sync);
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+	unsigned long long now = sched_clock();
+	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
+
+	lockdep_assert_held(blkg->q->queue_lock);
+
+	if (time_after64(now, io_start_time))
+		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
+	if (time_after64(io_start_time, start_time))
+		blkg_rwstat_add(&stats->wait_time, rw,
+				io_start_time - start_time);
 }
 
 #else	/* CONFIG_CFQ_GROUP_IOSCHED */
@@ -489,29 +681,15 @@ static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
 			struct blkio_policy_type *pol,
 			struct blkio_group *curr_blkg, bool direction,
 			bool sync) { }
-static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, unsigned long dequeue) { }
 static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
 			struct blkio_policy_type *pol, unsigned long time,
 			unsigned long unaccounted_time) { }
-static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg,
-			struct blkio_policy_type *pol) { }
 static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
 			struct blkio_policy_type *pol, bool direction,
 			bool sync) { }
 static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
 			struct blkio_policy_type *pol, bool direction,
 			bool sync) { }
-static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol) { }
-static inline void
-cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
-					struct blkio_policy_type *pol) { }
-
-static inline void
-cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
-				       struct blkio_policy_type *pol) { }
-
 static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 			struct blkio_policy_type *pol, uint64_t bytes,
 			bool direction, bool sync) { }
-- 
cgit v1.1


From 41b38b6d540f951c49315d8573e6f6195a6e736d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:44 -0700
Subject: blkcg: cfq doesn't need per-cpu dispatch stats

blkio_group_stats_cpu is used to count dispatch stats using per-cpu
counters.  This is used by both blk-throtl and cfq-iosched but the
sharing is rather silly.

* cfq-iosched doesn't need per-cpu dispatch stats.  cfq always updates
  those stats while holding queue_lock.

* blk-throtl needs per-cpu dispatch stats but only service_bytes and
  serviced.  It doesn't make use of sectors.

This patch makes cfq add and use global stats for service_bytes,
serviced and sectors, removes the per-cpu sectors counter and moves the
per-cpu stat printing code to blk-throttle.c.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c   | 63 ++--------------------------------------------------
 block/blk-cgroup.h   | 12 +++++-----
 block/blk-throttle.c | 31 +++++++++++++++++++++++++-
 block/cfq-iosched.c  | 37 +++++++++---------------------
 4 files changed, 48 insertions(+), 95 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 821a0a3..19ee29f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -390,7 +390,6 @@ static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
 
 		blkg_rwstat_reset(&sc->service_bytes);
 		blkg_rwstat_reset(&sc->serviced);
-		blkg_stat_reset(&sc->sectors);
 	}
 }
 
@@ -417,6 +416,8 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 			struct blkio_group_stats *stats = &pd->stats;
 
 			/* queued stats shouldn't be cleared */
+			blkg_rwstat_reset(&stats->service_bytes);
+			blkg_rwstat_reset(&stats->serviced);
 			blkg_rwstat_reset(&stats->merged);
 			blkg_rwstat_reset(&stats->service_time);
 			blkg_rwstat_reset(&stats->wait_time);
@@ -577,66 +578,6 @@ int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
 }
 EXPORT_SYMBOL_GPL(blkcg_print_rwstat);
 
-static u64 blkg_prfill_cpu_stat(struct seq_file *sf,
-				struct blkg_policy_data *pd, int off)
-{
-	u64 v = 0;
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct blkio_group_stats_cpu *sc =
-			per_cpu_ptr(pd->stats_cpu, cpu);
-
-		v += blkg_stat_read((void *)sc + off);
-	}
-
-	return __blkg_prfill_u64(sf, pd, v);
-}
-
-static u64 blkg_prfill_cpu_rwstat(struct seq_file *sf,
-				  struct blkg_policy_data *pd, int off)
-{
-	struct blkg_rwstat rwstat = { }, tmp;
-	int i, cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct blkio_group_stats_cpu *sc =
-			per_cpu_ptr(pd->stats_cpu, cpu);
-
-		tmp = blkg_rwstat_read((void *)sc + off);
-		for (i = 0; i < BLKG_RWSTAT_NR; i++)
-			rwstat.cnt[i] += tmp.cnt[i];
-	}
-
-	return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-
-/* print per-cpu blkg_stat specified by BLKCG_STAT_PRIV() */
-int blkcg_print_cpu_stat(struct cgroup *cgrp, struct cftype *cft,
-			 struct seq_file *sf)
-{
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
-
-	blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_stat,
-			  BLKCG_STAT_POL(cft->private),
-			  BLKCG_STAT_OFF(cft->private), false);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(blkcg_print_cpu_stat);
-
-/* print per-cpu blkg_rwstat specified by BLKCG_STAT_PRIV() */
-int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
-			   struct seq_file *sf)
-{
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
-
-	blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_rwstat,
-			  BLKCG_STAT_POL(cft->private),
-			  BLKCG_STAT_OFF(cft->private), true);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(blkcg_print_cpu_rwstat);
-
 /**
  * blkg_conf_prep - parse and prepare for per-blkg config update
  * @blkcg: target block cgroup
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 0b0a176..c82de47 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -65,6 +65,10 @@ struct blkg_rwstat {
 };
 
 struct blkio_group_stats {
+	/* total bytes transferred */
+	struct blkg_rwstat		service_bytes;
+	/* total IOs serviced, post merge */
+	struct blkg_rwstat		serviced;
 	/* number of ios merged */
 	struct blkg_rwstat		merged;
 	/* total time spent on device in ns, may not be accurate w/ queueing */
@@ -73,6 +77,8 @@ struct blkio_group_stats {
 	struct blkg_rwstat		wait_time;
 	/* number of IOs queued up */
 	struct blkg_rwstat		queued;
+	/* total sectors transferred */
+	struct blkg_stat		sectors;
 	/* total disk time and nr sectors dispatched by this group */
 	struct blkg_stat		time;
 #ifdef CONFIG_DEBUG_BLK_CGROUP
@@ -104,8 +110,6 @@ struct blkio_group_stats_cpu {
 	struct blkg_rwstat		service_bytes;
 	/* total IOs serviced, post merge */
 	struct blkg_rwstat		serviced;
-	/* total sectors transferred */
-	struct blkg_stat		sectors;
 };
 
 struct blkio_group_conf {
@@ -183,10 +187,6 @@ int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
 		     struct seq_file *sf);
 int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
 		       struct seq_file *sf);
-int blkcg_print_cpu_stat(struct cgroup *cgrp, struct cftype *cft,
-			 struct seq_file *sf);
-int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
-			   struct seq_file *sf);
 
 struct blkg_conf_ctx {
 	struct gendisk		*disk;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5d647ed..cb259bc 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -582,7 +582,6 @@ static void throtl_update_dispatch_stats(struct blkio_group *blkg, u64 bytes,
 
 	stats_cpu = this_cpu_ptr(pd->stats_cpu);
 
-	blkg_stat_add(&stats_cpu->sectors, bytes >> 9);
 	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
 	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
 
@@ -843,6 +842,36 @@ static void throtl_update_blkio_group_common(struct throtl_data *td,
 	throtl_schedule_delayed_work(td, 0);
 }
 
+static u64 blkg_prfill_cpu_rwstat(struct seq_file *sf,
+				  struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat rwstat = { }, tmp;
+	int i, cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct blkio_group_stats_cpu *sc =
+			per_cpu_ptr(pd->stats_cpu, cpu);
+
+		tmp = blkg_rwstat_read((void *)sc + off);
+		for (i = 0; i < BLKG_RWSTAT_NR; i++)
+			rwstat.cnt[i] += tmp.cnt[i];
+	}
+
+	return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/* print per-cpu blkg_rwstat specified by BLKCG_STAT_PRIV() */
+static int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
+				  struct seq_file *sf)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_rwstat,
+			  BLKCG_STAT_POL(cft->private),
+			  BLKCG_STAT_OFF(cft->private), true);
+	return 0;
+}
+
 static u64 blkg_prfill_conf_u64(struct seq_file *sf,
 				struct blkg_policy_data *pd, int off)
 {
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4991380..effd894 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -624,29 +624,12 @@ static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 			struct blkio_policy_type *pol, uint64_t bytes,
 			bool direction, bool sync)
 {
+	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
 	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-	struct blkio_group_stats_cpu *stats_cpu;
-	unsigned long flags;
 
-	/* If per cpu stats are not allocated yet, don't do any accounting. */
-	if (pd->stats_cpu == NULL)
-		return;
-
-	/*
-	 * Disabling interrupts to provide mutual exclusion between two
-	 * writes on same cpu. It probably is not needed for 64bit. Not
-	 * optimizing that case yet.
-	 */
-	local_irq_save(flags);
-
-	stats_cpu = this_cpu_ptr(pd->stats_cpu);
-
-	blkg_stat_add(&stats_cpu->sectors, bytes >> 9);
-	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
-	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
-
-	local_irq_restore(flags);
+	blkg_stat_add(&stats->sectors, bytes >> 9);
+	blkg_rwstat_add(&stats->serviced, rw, 1);
+	blkg_rwstat_add(&stats->service_bytes, rw, bytes);
 }
 
 static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
@@ -1520,20 +1503,20 @@ static struct cftype cfq_blkcg_files[] = {
 	{
 		.name = "sectors",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats_cpu, sectors)),
-		.read_seq_string = blkcg_print_cpu_stat,
+				offsetof(struct blkio_group_stats, sectors)),
+		.read_seq_string = blkcg_print_stat,
 	},
 	{
 		.name = "io_service_bytes",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats_cpu, service_bytes)),
-		.read_seq_string = blkcg_print_cpu_rwstat,
+				offsetof(struct blkio_group_stats, service_bytes)),
+		.read_seq_string = blkcg_print_rwstat,
 	},
 	{
 		.name = "io_serviced",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats_cpu, serviced)),
-		.read_seq_string = blkcg_print_cpu_rwstat,
+				offsetof(struct blkio_group_stats, serviced)),
+		.read_seq_string = blkcg_print_rwstat,
 	},
 	{
 		.name = "io_service_time",
-- 
cgit v1.1


From 9ade5ea4ce57d3596eaee6a57cd212a483674058 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:44 -0700
Subject: blkcg: add blkio_policy_ops operations for exit and stat reset

Add blkio_policy_ops->blkio_exit_group_fn() and
->blkio_reset_group_stats_fn().  These will be used to further
modularize blkcg policy implementation.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c | 16 ++++++++++++----
 block/blk-cgroup.h |  4 ++++
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 19ee29f..2e6fb7d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -131,12 +131,17 @@ static void blkg_free(struct blkio_group *blkg)
 		return;
 
 	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
+		struct blkio_policy_type *pol = blkio_policy[i];
 		struct blkg_policy_data *pd = blkg->pd[i];
 
-		if (pd) {
-			free_percpu(pd->stats_cpu);
-			kfree(pd);
-		}
+		if (!pd)
+			continue;
+
+		if (pol && pol->ops.blkio_exit_group_fn)
+			pol->ops.blkio_exit_group_fn(blkg);
+
+		free_percpu(pd->stats_cpu);
+		kfree(pd);
 	}
 
 	kfree(blkg);
@@ -432,6 +437,9 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 			blkg_stat_reset(&stats->empty_time);
 #endif
 			blkio_reset_stats_cpu(blkg, pol->plid);
+
+			if (pol->ops.blkio_reset_group_stats_fn)
+				pol->ops.blkio_reset_group_stats_fn(blkg);
 		}
 	}
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index c82de47..d0ee649 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -153,9 +153,13 @@ struct blkio_group {
 };
 
 typedef void (blkio_init_group_fn)(struct blkio_group *blkg);
+typedef void (blkio_exit_group_fn)(struct blkio_group *blkg);
+typedef void (blkio_reset_group_stats_fn)(struct blkio_group *blkg);
 
 struct blkio_policy_ops {
 	blkio_init_group_fn *blkio_init_group_fn;
+	blkio_exit_group_fn *blkio_exit_group_fn;
+	blkio_reset_group_stats_fn *blkio_reset_group_stats_fn;
 };
 
 struct blkio_policy_type {
-- 
cgit v1.1


From 155fead9b6347ead90e0b0396cb108a6ba6126c6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:44 -0700
Subject: blkcg: move blkio_group_stats to cfq-iosched.c

blkio_group_stats contains only fields used by cfq and has no reason
to be defined in blkcg core.

* Move blkio_group_stats to cfq-iosched.c and rename it to cfqg_stats.

* blkg_policy_data->stats is replaced with cfq_group->stats.
  blkg_prfill_[rw]stat() are updated to use offset against pd->pdata
  instead.

* All related macros / functions are renamed so that they have cfqg_
  prefix and the unnecessary @pol arguments are dropped.

* All stat functions now take cfq_group * instead of blkio_group *.

* The lockdep assertions on the queue lock are dropped.  The elevator
  runs under the queue lock by default, so there isn't much to be gained
  by adding lockdep assertions at the stat function level.

* cfqg_stats_reset() implemented for blkio_reset_group_stats_fn method
  so that cfqg->stats can be reset.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c  |  23 +--
 block/blk-cgroup.h  |  41 ------
 block/cfq-iosched.c | 407 ++++++++++++++++++++++++----------------------------
 3 files changed, 193 insertions(+), 278 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2e6fb7d..cfdda44 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -417,25 +417,6 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 		struct blkio_policy_type *pol;
 
 		list_for_each_entry(pol, &blkio_list, list) {
-			struct blkg_policy_data *pd = blkg->pd[pol->plid];
-			struct blkio_group_stats *stats = &pd->stats;
-
-			/* queued stats shouldn't be cleared */
-			blkg_rwstat_reset(&stats->service_bytes);
-			blkg_rwstat_reset(&stats->serviced);
-			blkg_rwstat_reset(&stats->merged);
-			blkg_rwstat_reset(&stats->service_time);
-			blkg_rwstat_reset(&stats->wait_time);
-			blkg_stat_reset(&stats->time);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-			blkg_stat_reset(&stats->unaccounted_time);
-			blkg_stat_reset(&stats->avg_queue_size_sum);
-			blkg_stat_reset(&stats->avg_queue_size_samples);
-			blkg_stat_reset(&stats->dequeue);
-			blkg_stat_reset(&stats->group_wait_time);
-			blkg_stat_reset(&stats->idle_time);
-			blkg_stat_reset(&stats->empty_time);
-#endif
 			blkio_reset_stats_cpu(blkg, pol->plid);
 
 			if (pol->ops.blkio_reset_group_stats_fn)
@@ -549,13 +530,13 @@ static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
 			    int off)
 {
 	return __blkg_prfill_u64(sf, pd,
-				 blkg_stat_read((void *)&pd->stats + off));
+				 blkg_stat_read((void *)pd->pdata + off));
 }
 
 static u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 			      int off)
 {
-	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)&pd->stats + off);
+	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->pdata + off);
 
 	return __blkg_prfill_rwstat(sf, pd, &rwstat);
 }
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index d0ee649..791570394 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -64,46 +64,6 @@ struct blkg_rwstat {
 	uint64_t			cnt[BLKG_RWSTAT_NR];
 };
 
-struct blkio_group_stats {
-	/* total bytes transferred */
-	struct blkg_rwstat		service_bytes;
-	/* total IOs serviced, post merge */
-	struct blkg_rwstat		serviced;
-	/* number of ios merged */
-	struct blkg_rwstat		merged;
-	/* total time spent on device in ns, may not be accurate w/ queueing */
-	struct blkg_rwstat		service_time;
-	/* total time spent waiting in scheduler queue in ns */
-	struct blkg_rwstat		wait_time;
-	/* number of IOs queued up */
-	struct blkg_rwstat		queued;
-	/* total sectors transferred */
-	struct blkg_stat		sectors;
-	/* total disk time and nr sectors dispatched by this group */
-	struct blkg_stat		time;
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	/* time not charged to this cgroup */
-	struct blkg_stat		unaccounted_time;
-	/* sum of number of ios queued across all samples */
-	struct blkg_stat		avg_queue_size_sum;
-	/* count of samples taken for average */
-	struct blkg_stat		avg_queue_size_samples;
-	/* how many times this group has been removed from service tree */
-	struct blkg_stat		dequeue;
-	/* total time spent waiting for it to be assigned a timeslice. */
-	struct blkg_stat		group_wait_time;
-	/* time spent idling for this blkio_group */
-	struct blkg_stat		idle_time;
-	/* total time with empty current active q with other requests queued */
-	struct blkg_stat		empty_time;
-	/* fields after this shouldn't be cleared on stat reset */
-	uint64_t			start_group_wait_time;
-	uint64_t			start_idle_time;
-	uint64_t			start_empty_time;
-	uint16_t			flags;
-#endif
-};
-
 /* Per cpu blkio group stats */
 struct blkio_group_stats_cpu {
 	/* total bytes transferred */
@@ -126,7 +86,6 @@ struct blkg_policy_data {
 	/* Configuration */
 	struct blkio_group_conf conf;
 
-	struct blkio_group_stats stats;
 	/* Per cpu stats pointer */
 	struct blkio_group_stats_cpu __percpu *stats_cpu;
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index effd894..a1f37df 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -173,6 +173,48 @@ enum wl_type_t {
 	SYNC_WORKLOAD = 2
 };
 
+struct cfqg_stats {
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+	/* total bytes transferred */
+	struct blkg_rwstat		service_bytes;
+	/* total IOs serviced, post merge */
+	struct blkg_rwstat		serviced;
+	/* number of ios merged */
+	struct blkg_rwstat		merged;
+	/* total time spent on device in ns, may not be accurate w/ queueing */
+	struct blkg_rwstat		service_time;
+	/* total time spent waiting in scheduler queue in ns */
+	struct blkg_rwstat		wait_time;
+	/* number of IOs queued up */
+	struct blkg_rwstat		queued;
+	/* total sectors transferred */
+	struct blkg_stat		sectors;
+	/* total disk time and nr sectors dispatched by this group */
+	struct blkg_stat		time;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	/* time not charged to this cgroup */
+	struct blkg_stat		unaccounted_time;
+	/* sum of number of ios queued across all samples */
+	struct blkg_stat		avg_queue_size_sum;
+	/* count of samples taken for average */
+	struct blkg_stat		avg_queue_size_samples;
+	/* how many times this group has been removed from service tree */
+	struct blkg_stat		dequeue;
+	/* total time spent waiting for it to be assigned a timeslice. */
+	struct blkg_stat		group_wait_time;
+	/* time spent idling for this blkio_group */
+	struct blkg_stat		idle_time;
+	/* total time with empty current active q with other requests queued */
+	struct blkg_stat		empty_time;
+	/* fields after this shouldn't be cleared on stat reset */
+	uint64_t			start_group_wait_time;
+	uint64_t			start_idle_time;
+	uint64_t			start_empty_time;
+	uint16_t			flags;
+#endif	/* CONFIG_DEBUG_BLK_CGROUP */
+#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
+};
+
 /* This is per cgroup per device grouping structure */
 struct cfq_group {
 	/* group service_tree member */
@@ -212,6 +254,7 @@ struct cfq_group {
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
 	struct cfq_ttime ttime;
+	struct cfqg_stats stats;
 };
 
 struct cfq_io_cq {
@@ -368,96 +411,84 @@ CFQ_CFQQ_FNS(wait_busy);
 
 #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 
-/* blkg state flags */
-enum blkg_state_flags {
-	BLKG_waiting = 0,
-	BLKG_idling,
-	BLKG_empty,
+/* cfqg stats flags */
+enum cfqg_stats_flags {
+	CFQG_stats_waiting = 0,
+	CFQG_stats_idling,
+	CFQG_stats_empty,
 };
 
-#define BLKG_FLAG_FNS(name)						\
-static inline void blkio_mark_blkg_##name(				\
-		struct blkio_group_stats *stats)			\
+#define CFQG_FLAG_FNS(name)						\
+static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)	\
 {									\
-	stats->flags |= (1 << BLKG_##name);				\
+	stats->flags |= (1 << CFQG_stats_##name);			\
 }									\
-static inline void blkio_clear_blkg_##name(				\
-		struct blkio_group_stats *stats)			\
+static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)	\
 {									\
-	stats->flags &= ~(1 << BLKG_##name);				\
+	stats->flags &= ~(1 << CFQG_stats_##name);			\
 }									\
-static inline int blkio_blkg_##name(struct blkio_group_stats *stats)	\
+static inline int cfqg_stats_##name(struct cfqg_stats *stats)		\
 {									\
-	return (stats->flags & (1 << BLKG_##name)) != 0;		\
+	return (stats->flags & (1 << CFQG_stats_##name)) != 0;		\
 }									\
 
-BLKG_FLAG_FNS(waiting)
-BLKG_FLAG_FNS(idling)
-BLKG_FLAG_FNS(empty)
-#undef BLKG_FLAG_FNS
+CFQG_FLAG_FNS(waiting)
+CFQG_FLAG_FNS(idling)
+CFQG_FLAG_FNS(empty)
+#undef CFQG_FLAG_FNS
 
 /* This should be called with the queue_lock held. */
-static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
+static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
 {
 	unsigned long long now;
 
-	if (!blkio_blkg_waiting(stats))
+	if (!cfqg_stats_waiting(stats))
 		return;
 
 	now = sched_clock();
 	if (time_after64(now, stats->start_group_wait_time))
 		blkg_stat_add(&stats->group_wait_time,
 			      now - stats->start_group_wait_time);
-	blkio_clear_blkg_waiting(stats);
+	cfqg_stats_clear_waiting(stats);
 }
 
 /* This should be called with the queue_lock held. */
-static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
-					    struct blkio_policy_type *pol,
-					    struct blkio_group *curr_blkg)
+static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
+						 struct cfq_group *curr_cfqg)
 {
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
+	struct cfqg_stats *stats = &cfqg->stats;
 
-	if (blkio_blkg_waiting(&pd->stats))
+	if (cfqg_stats_waiting(stats))
 		return;
-	if (blkg == curr_blkg)
+	if (cfqg == curr_cfqg)
 		return;
-	pd->stats.start_group_wait_time = sched_clock();
-	blkio_mark_blkg_waiting(&pd->stats);
+	stats->start_group_wait_time = sched_clock();
+	cfqg_stats_mark_waiting(stats);
 }
 
 /* This should be called with the queue_lock held. */
-static void blkio_end_empty_time(struct blkio_group_stats *stats)
+static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
 {
 	unsigned long long now;
 
-	if (!blkio_blkg_empty(stats))
+	if (!cfqg_stats_empty(stats))
 		return;
 
 	now = sched_clock();
 	if (time_after64(now, stats->start_empty_time))
 		blkg_stat_add(&stats->empty_time,
 			      now - stats->start_empty_time);
-	blkio_clear_blkg_empty(stats);
+	cfqg_stats_clear_empty(stats);
 }
 
-static void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-					     struct blkio_policy_type *pol,
-					     unsigned long dequeue)
+static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
 {
-	struct blkg_policy_data *pd = blkg->pd[pol->plid];
-
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	blkg_stat_add(&pd->stats.dequeue, dequeue);
+	blkg_stat_add(&cfqg->stats.dequeue, 1);
 }
 
-static void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg,
-					     struct blkio_policy_type *pol)
+static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
 {
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-
-	lockdep_assert_held(blkg->q->queue_lock);
+	struct cfqg_stats *stats = &cfqg->stats;
 
 	if (blkg_rwstat_sum(&stats->queued))
 		return;
@@ -467,72 +498,57 @@ static void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg,
 	 * request in parent group and moved to this group while being added
 	 * to service tree. Just ignore the event and move on.
 	 */
-	if (blkio_blkg_empty(stats))
+	if (cfqg_stats_empty(stats))
 		return;
 
 	stats->start_empty_time = sched_clock();
-	blkio_mark_blkg_empty(stats);
+	cfqg_stats_mark_empty(stats);
 }
 
-static void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg,
-					       struct blkio_policy_type *pol)
+static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
 {
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+	struct cfqg_stats *stats = &cfqg->stats;
 
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	if (blkio_blkg_idling(stats)) {
+	if (cfqg_stats_idling(stats)) {
 		unsigned long long now = sched_clock();
 
 		if (time_after64(now, stats->start_idle_time))
 			blkg_stat_add(&stats->idle_time,
 				      now - stats->start_idle_time);
-		blkio_clear_blkg_idling(stats);
+		cfqg_stats_clear_idling(stats);
 	}
 }
 
-static void cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
-						   struct blkio_policy_type *pol)
+static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
 {
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+	struct cfqg_stats *stats = &cfqg->stats;
 
-	lockdep_assert_held(blkg->q->queue_lock);
-	BUG_ON(blkio_blkg_idling(stats));
+	BUG_ON(cfqg_stats_idling(stats));
 
 	stats->start_idle_time = sched_clock();
-	blkio_mark_blkg_idling(stats);
+	cfqg_stats_mark_idling(stats);
 }
 
-static void cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
-						    struct blkio_policy_type *pol)
+static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
 {
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-
-	lockdep_assert_held(blkg->q->queue_lock);
+	struct cfqg_stats *stats = &cfqg->stats;
 
 	blkg_stat_add(&stats->avg_queue_size_sum,
 		      blkg_rwstat_sum(&stats->queued));
 	blkg_stat_add(&stats->avg_queue_size_samples, 1);
-	blkio_update_group_wait_time(stats);
+	cfqg_stats_update_group_wait_time(stats);
 }
 
 #else	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 
-static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
-					    struct blkio_policy_type *pol,
-					    struct blkio_group *curr_blkg) { }
-static void blkio_end_empty_time(struct blkio_group_stats *stats) { }
-static void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-					     struct blkio_policy_type *pol,
-					     unsigned long dequeue) { }
-static void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg,
-					     struct blkio_policy_type *pol) { }
-static void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg,
-					       struct blkio_policy_type *pol) { }
-static void cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
-						   struct blkio_policy_type *pol) { }
-static void cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
-						    struct blkio_policy_type *pol) { }
+static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
+						 struct cfq_group *curr_cfqg) { }
+static void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
+static void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
+static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
+static void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
+static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
+static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
 
 #endif	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 
@@ -567,80 +583,46 @@ static inline void cfqg_put(struct cfq_group *cfqg)
 	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
 			blkg_path(cfqg_to_blkg((cfqg))), ##args)	\
 
-static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol,
-			struct blkio_group *curr_blkg,
-			bool direction, bool sync)
+static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
+					    struct cfq_group *curr_cfqg, int rw)
 {
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
-
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	blkg_rwstat_add(&stats->queued, rw, 1);
-	blkio_end_empty_time(stats);
-	blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
+	blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
+	cfqg_stats_end_empty_time(&cfqg->stats);
+	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
 }
 
-static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, unsigned long time,
-			unsigned long unaccounted_time)
+static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
+			unsigned long time, unsigned long unaccounted_time)
 {
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	blkg_stat_add(&stats->time, time);
+	blkg_stat_add(&cfqg->stats.time, time);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-	blkg_stat_add(&stats->unaccounted_time, unaccounted_time);
+	blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
 #endif
 }
 
-static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, bool direction,
-			bool sync)
+static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
 {
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
-
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	blkg_rwstat_add(&stats->queued, rw, -1);
+	blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
 }
 
-static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, bool direction,
-			bool sync)
+static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
 {
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
-
-	lockdep_assert_held(blkg->q->queue_lock);
-
-	blkg_rwstat_add(&stats->merged, rw, 1);
+	blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
 }
 
-static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, uint64_t bytes,
-			bool direction, bool sync)
+static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
+					      uint64_t bytes, int rw)
 {
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
-	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
-
-	blkg_stat_add(&stats->sectors, bytes >> 9);
-	blkg_rwstat_add(&stats->serviced, rw, 1);
-	blkg_rwstat_add(&stats->service_bytes, rw, bytes);
+	blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
+	blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
+	blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
 }
 
-static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, uint64_t start_time,
-			uint64_t io_start_time, bool direction, bool sync)
+static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
+			uint64_t start_time, uint64_t io_start_time, int rw)
 {
-	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
+	struct cfqg_stats *stats = &cfqg->stats;
 	unsigned long long now = sched_clock();
-	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
-
-	lockdep_assert_held(blkg->q->queue_lock);
 
 	if (time_after64(now, io_start_time))
 		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
@@ -649,6 +631,29 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
 				io_start_time - start_time);
 }
 
+static void cfqg_stats_reset(struct blkio_group *blkg)
+{
+	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+	struct cfqg_stats *stats = &cfqg->stats;
+
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_reset(&stats->service_bytes);
+	blkg_rwstat_reset(&stats->serviced);
+	blkg_rwstat_reset(&stats->merged);
+	blkg_rwstat_reset(&stats->service_time);
+	blkg_rwstat_reset(&stats->wait_time);
+	blkg_stat_reset(&stats->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	blkg_stat_reset(&stats->unaccounted_time);
+	blkg_stat_reset(&stats->avg_queue_size_sum);
+	blkg_stat_reset(&stats->avg_queue_size_samples);
+	blkg_stat_reset(&stats->dequeue);
+	blkg_stat_reset(&stats->group_wait_time);
+	blkg_stat_reset(&stats->idle_time);
+	blkg_stat_reset(&stats->empty_time);
+#endif
+}
+
 #else	/* CONFIG_CFQ_GROUP_IOSCHED */
 
 static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg) { return NULL; }
@@ -660,25 +665,16 @@ static inline void cfqg_put(struct cfq_group *cfqg) { }
 	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
 
-static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol,
-			struct blkio_group *curr_blkg, bool direction,
-			bool sync) { }
-static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, unsigned long time,
-			unsigned long unaccounted_time) { }
-static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, bool direction,
-			bool sync) { }
-static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, bool direction,
-			bool sync) { }
-static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, uint64_t bytes,
-			bool direction, bool sync) { }
-static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
-			struct blkio_policy_type *pol, uint64_t start_time,
-			uint64_t io_start_time, bool direction, bool sync) { }
+static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
+			struct cfq_group *curr_cfqg, int rw) { }
+static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
+			unsigned long time, unsigned long unaccounted_time) { }
+static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
+static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
+static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
+					      uint64_t bytes, int rw) { }
+static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
+			uint64_t start_time, uint64_t io_start_time, int rw) { }
 
 #endif	/* CONFIG_CFQ_GROUP_IOSCHED */
 
@@ -1233,8 +1229,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
 	cfq_group_service_tree_del(st, cfqg);
 	cfqg->saved_workload_slice = 0;
-	cfq_blkiocg_update_dequeue_stats(cfqg_to_blkg(cfqg),
-					 &blkio_policy_cfq, 1);
+	cfqg_stats_update_dequeue(cfqg);
 }
 
 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
@@ -1306,9 +1301,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		     "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
 		     used_sl, cfqq->slice_dispatch, charge,
 		     iops_mode(cfqd), cfqq->nr_sectors);
-	cfq_blkiocg_update_timeslice_used(cfqg_to_blkg(cfqg), &blkio_policy_cfq,
-					  used_sl, unaccounted_sl);
-	cfq_blkiocg_set_start_empty_time(cfqg_to_blkg(cfqg), &blkio_policy_cfq);
+	cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
+	cfqg_stats_set_start_empty_time(cfqg);
 }
 
 /**
@@ -1456,14 +1450,15 @@ static int blkcg_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
 }
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-static u64 blkg_prfill_avg_queue_size(struct seq_file *sf,
+static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
 				      struct blkg_policy_data *pd, int off)
 {
-	u64 samples = blkg_stat_read(&pd->stats.avg_queue_size_samples);
+	struct cfq_group *cfqg = (void *)pd->pdata;
+	u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
 	u64 v = 0;
 
 	if (samples) {
-		v = blkg_stat_read(&pd->stats.avg_queue_size_sum);
+		v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
 		do_div(v, samples);
 	}
 	__blkg_prfill_u64(sf, pd, v);
@@ -1471,12 +1466,12 @@ static u64 blkg_prfill_avg_queue_size(struct seq_file *sf,
 }
 
 /* print avg_queue_size */
-static int blkcg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
-				      struct seq_file *sf)
+static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
+				     struct seq_file *sf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
-	blkcg_print_blkgs(sf, blkcg, blkg_prfill_avg_queue_size,
+	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
 			  BLKIO_POLICY_PROP, 0, false);
 	return 0;
 }
@@ -1497,84 +1492,84 @@ static struct cftype cfq_blkcg_files[] = {
 	{
 		.name = "time",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, time)),
+				offsetof(struct cfq_group, stats.time)),
 		.read_seq_string = blkcg_print_stat,
 	},
 	{
 		.name = "sectors",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, sectors)),
+				offsetof(struct cfq_group, stats.sectors)),
 		.read_seq_string = blkcg_print_stat,
 	},
 	{
 		.name = "io_service_bytes",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, service_bytes)),
+				offsetof(struct cfq_group, stats.service_bytes)),
 		.read_seq_string = blkcg_print_rwstat,
 	},
 	{
 		.name = "io_serviced",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, serviced)),
+				offsetof(struct cfq_group, stats.serviced)),
 		.read_seq_string = blkcg_print_rwstat,
 	},
 	{
 		.name = "io_service_time",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, service_time)),
+				offsetof(struct cfq_group, stats.service_time)),
 		.read_seq_string = blkcg_print_rwstat,
 	},
 	{
 		.name = "io_wait_time",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, wait_time)),
+				offsetof(struct cfq_group, stats.wait_time)),
 		.read_seq_string = blkcg_print_rwstat,
 	},
 	{
 		.name = "io_merged",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, merged)),
+				offsetof(struct cfq_group, stats.merged)),
 		.read_seq_string = blkcg_print_rwstat,
 	},
 	{
 		.name = "io_queued",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, queued)),
+				offsetof(struct cfq_group, stats.queued)),
 		.read_seq_string = blkcg_print_rwstat,
 	},
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	{
 		.name = "avg_queue_size",
-		.read_seq_string = blkcg_print_avg_queue_size,
+		.read_seq_string = cfqg_print_avg_queue_size,
 	},
 	{
 		.name = "group_wait_time",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, group_wait_time)),
+				offsetof(struct cfq_group, stats.group_wait_time)),
 		.read_seq_string = blkcg_print_stat,
 	},
 	{
 		.name = "idle_time",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, idle_time)),
+				offsetof(struct cfq_group, stats.idle_time)),
 		.read_seq_string = blkcg_print_stat,
 	},
 	{
 		.name = "empty_time",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, empty_time)),
+				offsetof(struct cfq_group, stats.empty_time)),
 		.read_seq_string = blkcg_print_stat,
 	},
 	{
 		.name = "dequeue",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, dequeue)),
+				offsetof(struct cfq_group, stats.dequeue)),
 		.read_seq_string = blkcg_print_stat,
 	},
 	{
 		.name = "unaccounted_time",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct blkio_group_stats, unaccounted_time)),
+				offsetof(struct cfq_group, stats.unaccounted_time)),
 		.read_seq_string = blkcg_print_stat,
 	},
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
@@ -1858,14 +1853,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
 {
 	elv_rb_del(&cfqq->sort_list, rq);
 	cfqq->queued[rq_is_sync(rq)]--;
-	cfq_blkiocg_update_io_remove_stats(cfqg_to_blkg(RQ_CFQG(rq)),
-					   &blkio_policy_cfq, rq_data_dir(rq),
-					   rq_is_sync(rq));
+	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
 	cfq_add_rq_rb(rq);
-	cfq_blkiocg_update_io_add_stats(cfqg_to_blkg(RQ_CFQG(rq)),
-					&blkio_policy_cfq,
-					cfqg_to_blkg(cfqq->cfqd->serving_group),
-					rq_data_dir(rq), rq_is_sync(rq));
+	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
+				 rq->cmd_flags);
 }
 
 static struct request *
@@ -1921,9 +1912,7 @@ static void cfq_remove_request(struct request *rq)
 	cfq_del_rq_rb(rq);
 
 	cfqq->cfqd->rq_queued--;
-	cfq_blkiocg_update_io_remove_stats(cfqg_to_blkg(RQ_CFQG(rq)),
-					   &blkio_policy_cfq, rq_data_dir(rq),
-					   rq_is_sync(rq));
+	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
 	if (rq->cmd_flags & REQ_PRIO) {
 		WARN_ON(!cfqq->prio_pending);
 		cfqq->prio_pending--;
@@ -1958,9 +1947,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
 static void cfq_bio_merged(struct request_queue *q, struct request *req,
 				struct bio *bio)
 {
-	cfq_blkiocg_update_io_merged_stats(cfqg_to_blkg(RQ_CFQG(req)),
-					   &blkio_policy_cfq, bio_data_dir(bio),
-					   cfq_bio_sync(bio));
+	cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);
 }
 
 static void
@@ -1982,9 +1969,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
 	if (cfqq->next_rq == next)
 		cfqq->next_rq = rq;
 	cfq_remove_request(next);
-	cfq_blkiocg_update_io_merged_stats(cfqg_to_blkg(RQ_CFQG(rq)),
-					   &blkio_policy_cfq, rq_data_dir(next),
-					   rq_is_sync(next));
+	cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
 
 	cfqq = RQ_CFQQ(next);
 	/*
@@ -2025,8 +2010,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	del_timer(&cfqd->idle_slice_timer);
-	cfq_blkiocg_update_idle_time_stats(cfqg_to_blkg(cfqq->cfqg),
-					   &blkio_policy_cfq);
+	cfqg_stats_update_idle_time(cfqq->cfqg);
 }
 
 static void __cfq_set_active_queue(struct cfq_data *cfqd,
@@ -2035,8 +2019,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
 	if (cfqq) {
 		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
 				cfqd->serving_prio, cfqd->serving_type);
-		cfq_blkiocg_update_avg_queue_size_stats(cfqg_to_blkg(cfqq->cfqg),
-							&blkio_policy_cfq);
+		cfqg_stats_update_avg_queue_size(cfqq->cfqg);
 		cfqq->slice_start = 0;
 		cfqq->dispatch_start = jiffies;
 		cfqq->allocated_slice = 0;
@@ -2384,8 +2367,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 		sl = cfqd->cfq_slice_idle;
 
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
-	cfq_blkiocg_update_set_idle_time_stats(cfqg_to_blkg(cfqq->cfqg),
-					       &blkio_policy_cfq);
+	cfqg_stats_set_start_idle_time(cfqq->cfqg);
 	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
 			group_idle ? 1 : 0);
 }
@@ -2408,9 +2390,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
 	cfqq->nr_sectors += blk_rq_sectors(rq);
-	cfq_blkiocg_update_dispatch_stats(cfqg_to_blkg(cfqq->cfqg),
-					  &blkio_policy_cfq, blk_rq_bytes(rq),
-					  rq_data_dir(rq), rq_is_sync(rq));
+	cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
 }
 
 /*
@@ -3513,9 +3493,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 				cfq_clear_cfqq_wait_request(cfqq);
 				__blk_run_queue(cfqd->queue);
 			} else {
-				cfq_blkiocg_update_idle_time_stats(
-						cfqg_to_blkg(cfqq->cfqg),
-						&blkio_policy_cfq);
+				cfqg_stats_update_idle_time(cfqq->cfqg);
 				cfq_mark_cfqq_must_dispatch(cfqq);
 			}
 		}
@@ -3542,10 +3520,8 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
 	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
 	list_add_tail(&rq->queuelist, &cfqq->fifo);
 	cfq_add_rq_rb(rq);
-	cfq_blkiocg_update_io_add_stats(cfqg_to_blkg(RQ_CFQG(rq)),
-					&blkio_policy_cfq,
-					cfqg_to_blkg(cfqd->serving_group),
-					rq_data_dir(rq), rq_is_sync(rq));
+	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
+				 rq->cmd_flags);
 	cfq_rq_enqueued(cfqd, cfqq, rq);
 }
 
@@ -3641,10 +3617,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	cfqd->rq_in_driver--;
 	cfqq->dispatched--;
 	(RQ_CFQG(rq))->dispatched--;
-	cfq_blkiocg_update_completion_stats(cfqg_to_blkg(cfqq->cfqg),
-			&blkio_policy_cfq, rq_start_time_ns(rq),
-			rq_io_start_time_ns(rq), rq_data_dir(rq),
-			rq_is_sync(rq));
+	cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
+				     rq_io_start_time_ns(rq), rq->cmd_flags);
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
@@ -4184,6 +4158,7 @@ static struct elevator_type iosched_cfq = {
 static struct blkio_policy_type blkio_policy_cfq = {
 	.ops = {
 		.blkio_init_group_fn =		cfq_init_blkio_group,
+		.blkio_reset_group_stats_fn =	cfqg_stats_reset,
 	},
 	.plid = BLKIO_POLICY_PROP,
 	.pdata_size = sizeof(struct cfq_group),
-- 
cgit v1.1


From 8a3d26151f24e2a2ffa550890144c3d54d2edb15 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:44 -0700
Subject: blkcg: move blkio_group_stats_cpu and friends to blk-throttle.c

blkio_group_stats_cpu is used only by blk-throtl and has no reason to
be defined in blkcg core.

* Move blkio_group_stats_cpu to blk-throttle.c and rename it to
  tg_stats_cpu.

* blkg_policy_data->stats_cpu is replaced with throtl_grp->stats_cpu.
  The prfill functions are updated accordingly.

* All related macros / functions are renamed so that they have tg_
  prefix and the unnecessary @pol arguments are dropped.

* Per-cpu stats allocation code is also moved from blk-cgroup.c to
  blk-throttle.c and gets simplified to only deal with
  BLKIO_POLICY_THROTL.  The per-cpu stats are freed by the exit
  method throtl_exit_blkio_group().

* throtl_reset_group_stats() is implemented as the
  blkio_reset_group_stats_fn method so that tg->stats_cpu can be
  reset.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c   |  98 +--------------------------------------
 block/blk-cgroup.h   |  13 ------
 block/blk-throttle.c | 128 +++++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 114 insertions(+), 125 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index cfdda44..16f6ee6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -30,13 +30,6 @@ static LIST_HEAD(blkio_list);
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
 
-/* List of groups pending per cpu stats allocation */
-static DEFINE_SPINLOCK(alloc_list_lock);
-static LIST_HEAD(alloc_list);
-
-static void blkio_stat_alloc_fn(struct work_struct *);
-static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);
-
 struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
@@ -63,60 +56,6 @@ struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(bio_blkio_cgroup);
 
-/*
- * Worker for allocating per cpu stat for blk groups. This is scheduled on
- * the system_nrt_wq once there are some groups on the alloc_list waiting
- * for allocation.
- */
-static void blkio_stat_alloc_fn(struct work_struct *work)
-{
-	static void *pcpu_stats[BLKIO_NR_POLICIES];
-	struct delayed_work *dwork = to_delayed_work(work);
-	struct blkio_group *blkg;
-	int i;
-	bool empty = false;
-
-alloc_stats:
-	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
-		if (pcpu_stats[i] != NULL)
-			continue;
-
-		pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);
-
-		/* Allocation failed. Try again after some time. */
-		if (pcpu_stats[i] == NULL) {
-			queue_delayed_work(system_nrt_wq, dwork,
-						msecs_to_jiffies(10));
-			return;
-		}
-	}
-
-	spin_lock_irq(&blkio_list_lock);
-	spin_lock(&alloc_list_lock);
-
-	/* cgroup got deleted or queue exited. */
-	if (!list_empty(&alloc_list)) {
-		blkg = list_first_entry(&alloc_list, struct blkio_group,
-						alloc_node);
-		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
-			struct blkg_policy_data *pd = blkg->pd[i];
-
-			if (blkio_policy[i] && pd && !pd->stats_cpu)
-				swap(pd->stats_cpu, pcpu_stats[i]);
-		}
-
-		list_del_init(&blkg->alloc_node);
-	}
-
-	empty = list_empty(&alloc_list);
-
-	spin_unlock(&alloc_list_lock);
-	spin_unlock_irq(&blkio_list_lock);
-
-	if (!empty)
-		goto alloc_stats;
-}
-
 /**
  * blkg_free - free a blkg
  * @blkg: blkg to free
@@ -140,7 +79,6 @@ static void blkg_free(struct blkio_group *blkg)
 		if (pol && pol->ops.blkio_exit_group_fn)
 			pol->ops.blkio_exit_group_fn(blkg);
 
-		free_percpu(pd->stats_cpu);
 		kfree(pd);
 	}
 
@@ -167,7 +105,6 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 
 	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
-	INIT_LIST_HEAD(&blkg->alloc_node);
 	blkg->blkcg = blkcg;
 	blkg->refcnt = 1;
 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
@@ -245,12 +182,6 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 	list_add(&blkg->q_node, &q->blkg_list);
 	spin_unlock(&blkcg->lock);
-
-	spin_lock(&alloc_list_lock);
-	list_add(&blkg->alloc_node, &alloc_list);
-	/* Queue per cpu stat allocation from worker thread. */
-	queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
-	spin_unlock(&alloc_list_lock);
 out:
 	return blkg;
 }
@@ -284,10 +215,6 @@ static void blkg_destroy(struct blkio_group *blkg)
 	list_del_init(&blkg->q_node);
 	hlist_del_init_rcu(&blkg->blkcg_node);
 
-	spin_lock(&alloc_list_lock);
-	list_del_init(&blkg->alloc_node);
-	spin_unlock(&alloc_list_lock);
-
 	/*
 	 * Put the reference taken at the time of creation so that when all
 	 * queues are gone, group can be destroyed.
@@ -319,9 +246,6 @@ void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
 	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
 	WARN_ON_ONCE(!pd);
 
-	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
-	WARN_ON_ONCE(!pd->stats_cpu);
-
 	blkg->pd[plid] = pd;
 	pd->blkg = blkg;
 	pol->ops.blkio_init_group_fn(blkg);
@@ -381,23 +305,6 @@ void __blkg_release(struct blkio_group *blkg)
 }
 EXPORT_SYMBOL_GPL(__blkg_release);
 
-static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
-{
-	struct blkg_policy_data *pd = blkg->pd[plid];
-	int cpu;
-
-	if (pd->stats_cpu == NULL)
-		return;
-
-	for_each_possible_cpu(cpu) {
-		struct blkio_group_stats_cpu *sc =
-			per_cpu_ptr(pd->stats_cpu, cpu);
-
-		blkg_rwstat_reset(&sc->service_bytes);
-		blkg_rwstat_reset(&sc->serviced);
-	}
-}
-
 static int
 blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 {
@@ -416,12 +323,9 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
 		struct blkio_policy_type *pol;
 
-		list_for_each_entry(pol, &blkio_list, list) {
-			blkio_reset_stats_cpu(blkg, pol->plid);
-
+		list_for_each_entry(pol, &blkio_list, list)
 			if (pol->ops.blkio_reset_group_stats_fn)
 				pol->ops.blkio_reset_group_stats_fn(blkg);
-		}
 	}
 
 	spin_unlock_irq(&blkcg->lock);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 791570394..e368dd00 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -64,14 +64,6 @@ struct blkg_rwstat {
 	uint64_t			cnt[BLKG_RWSTAT_NR];
 };
 
-/* Per cpu blkio group stats */
-struct blkio_group_stats_cpu {
-	/* total bytes transferred */
-	struct blkg_rwstat		service_bytes;
-	/* total IOs serviced, post merge */
-	struct blkg_rwstat		serviced;
-};
-
 struct blkio_group_conf {
 	unsigned int weight;
 	u64 iops[2];
@@ -86,9 +78,6 @@ struct blkg_policy_data {
 	/* Configuration */
 	struct blkio_group_conf conf;
 
-	/* Per cpu stats pointer */
-	struct blkio_group_stats_cpu __percpu *stats_cpu;
-
 	/* pol->pdata_size bytes of private data used by policy impl */
 	char pdata[] __aligned(__alignof__(unsigned long long));
 };
@@ -106,8 +95,6 @@ struct blkio_group {
 
 	struct blkg_policy_data *pd[BLKIO_NR_POLICIES];
 
-	/* List of blkg waiting for per cpu stats memory to be allocated */
-	struct list_head alloc_node;
 	struct rcu_head rcu_head;
 };
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index cb259bc..27f7960 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -40,6 +40,14 @@ struct throtl_rb_root {
 
 #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
 
+/* Per-cpu group stats */
+struct tg_stats_cpu {
+	/* total bytes transferred */
+	struct blkg_rwstat		service_bytes;
+	/* total IOs serviced, post merge */
+	struct blkg_rwstat		serviced;
+};
+
 struct throtl_grp {
 	/* active throtl group service_tree member */
 	struct rb_node rb_node;
@@ -76,6 +84,12 @@ struct throtl_grp {
 
 	/* Some throttle limits got updated for the group */
 	int limits_changed;
+
+	/* Per cpu stats pointer */
+	struct tg_stats_cpu __percpu *stats_cpu;
+
+	/* List of tgs waiting for per cpu stats memory to be allocated */
+	struct list_head stats_alloc_node;
 };
 
 struct throtl_data
@@ -100,6 +114,13 @@ struct throtl_data
 	int limits_changed;
 };
 
+/* list and work item to allocate percpu group stats */
+static DEFINE_SPINLOCK(tg_stats_alloc_lock);
+static LIST_HEAD(tg_stats_alloc_list);
+
+static void tg_stats_alloc_fn(struct work_struct *);
+static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
+
 static inline struct throtl_grp *blkg_to_tg(struct blkio_group *blkg)
 {
 	return blkg_to_pdata(blkg, &blkio_policy_throtl);
@@ -142,6 +163,44 @@ static inline unsigned int total_nr_queued(struct throtl_data *td)
 	return td->nr_queued[0] + td->nr_queued[1];
 }
 
+/*
+ * Worker for allocating per cpu stat for tgs. This is scheduled on the
+ * system_nrt_wq once there are some groups on the alloc_list waiting for
+ * allocation.
+ */
+static void tg_stats_alloc_fn(struct work_struct *work)
+{
+	static struct tg_stats_cpu *stats_cpu;	/* this fn is non-reentrant */
+	struct delayed_work *dwork = to_delayed_work(work);
+	bool empty = false;
+
+alloc_stats:
+	if (!stats_cpu) {
+		stats_cpu = alloc_percpu(struct tg_stats_cpu);
+		if (!stats_cpu) {
+			/* allocation failed, try again after some time */
+			queue_delayed_work(system_nrt_wq, dwork,
+					   msecs_to_jiffies(10));
+			return;
+		}
+	}
+
+	spin_lock_irq(&tg_stats_alloc_lock);
+
+	if (!list_empty(&tg_stats_alloc_list)) {
+		struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
+							 struct throtl_grp,
+							 stats_alloc_node);
+		swap(tg->stats_cpu, stats_cpu);
+		list_del_init(&tg->stats_alloc_node);
+	}
+
+	empty = list_empty(&tg_stats_alloc_list);
+	spin_unlock_irq(&tg_stats_alloc_lock);
+	if (!empty)
+		goto alloc_stats;
+}
+
 static void throtl_init_blkio_group(struct blkio_group *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -155,6 +214,43 @@ static void throtl_init_blkio_group(struct blkio_group *blkg)
 	tg->bps[WRITE] = -1;
 	tg->iops[READ] = -1;
 	tg->iops[WRITE] = -1;
+
+	/*
+	 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
+	 * but percpu allocator can't be called from IO path.  Queue tg on
+	 * tg_stats_alloc_list and allocate from work item.
+	 */
+	spin_lock(&tg_stats_alloc_lock);
+	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
+	queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0);
+	spin_unlock(&tg_stats_alloc_lock);
+}
+
+static void throtl_exit_blkio_group(struct blkio_group *blkg)
+{
+	struct throtl_grp *tg = blkg_to_tg(blkg);
+
+	spin_lock(&tg_stats_alloc_lock);
+	list_del_init(&tg->stats_alloc_node);
+	spin_unlock(&tg_stats_alloc_lock);
+
+	free_percpu(tg->stats_cpu);
+}
+
+static void throtl_reset_group_stats(struct blkio_group *blkg)
+{
+	struct throtl_grp *tg = blkg_to_tg(blkg);
+	int cpu;
+
+	if (tg->stats_cpu == NULL)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
+
+		blkg_rwstat_reset(&sc->service_bytes);
+		blkg_rwstat_reset(&sc->serviced);
+	}
 }
 
 static struct
@@ -565,12 +661,12 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 static void throtl_update_dispatch_stats(struct blkio_group *blkg, u64 bytes,
 					 int rw)
 {
-	struct blkg_policy_data *pd = blkg->pd[BLKIO_POLICY_THROTL];
-	struct blkio_group_stats_cpu *stats_cpu;
+	struct throtl_grp *tg = blkg_to_tg(blkg);
+	struct tg_stats_cpu *stats_cpu;
 	unsigned long flags;
 
 	/* If per cpu stats are not allocated yet, don't do any accounting. */
-	if (pd->stats_cpu == NULL)
+	if (tg->stats_cpu == NULL)
 		return;
 
 	/*
@@ -580,7 +676,7 @@ static void throtl_update_dispatch_stats(struct blkio_group *blkg, u64 bytes,
 	 */
 	local_irq_save(flags);
 
-	stats_cpu = this_cpu_ptr(pd->stats_cpu);
+	stats_cpu = this_cpu_ptr(tg->stats_cpu);
 
 	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
 	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
@@ -842,15 +938,15 @@ static void throtl_update_blkio_group_common(struct throtl_data *td,
 	throtl_schedule_delayed_work(td, 0);
 }
 
-static u64 blkg_prfill_cpu_rwstat(struct seq_file *sf,
-				  struct blkg_policy_data *pd, int off)
+static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
+				struct blkg_policy_data *pd, int off)
 {
+	struct throtl_grp *tg = (void *)pd->pdata;
 	struct blkg_rwstat rwstat = { }, tmp;
 	int i, cpu;
 
 	for_each_possible_cpu(cpu) {
-		struct blkio_group_stats_cpu *sc =
-			per_cpu_ptr(pd->stats_cpu, cpu);
+		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
 
 		tmp = blkg_rwstat_read((void *)sc + off);
 		for (i = 0; i < BLKG_RWSTAT_NR; i++)
@@ -861,12 +957,12 @@ static u64 blkg_prfill_cpu_rwstat(struct seq_file *sf,
 }
 
 /* print per-cpu blkg_rwstat specified by BLKCG_STAT_PRIV() */
-static int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
-				  struct seq_file *sf)
+static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
+			       struct seq_file *sf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
-	blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_rwstat,
+	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat,
 			  BLKCG_STAT_POL(cft->private),
 			  BLKCG_STAT_OFF(cft->private), true);
 	return 0;
@@ -1012,14 +1108,14 @@ static struct cftype throtl_files[] = {
 	{
 		.name = "throttle.io_service_bytes",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
-				offsetof(struct blkio_group_stats_cpu, service_bytes)),
-		.read_seq_string = blkcg_print_cpu_rwstat,
+				offsetof(struct tg_stats_cpu, service_bytes)),
+		.read_seq_string = tg_print_cpu_rwstat,
 	},
 	{
 		.name = "throttle.io_serviced",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
-				offsetof(struct blkio_group_stats_cpu, serviced)),
-		.read_seq_string = blkcg_print_cpu_rwstat,
+				offsetof(struct tg_stats_cpu, serviced)),
+		.read_seq_string = tg_print_cpu_rwstat,
 	},
 	{ }	/* terminate */
 };
@@ -1034,6 +1130,8 @@ static void throtl_shutdown_wq(struct request_queue *q)
 static struct blkio_policy_type blkio_policy_throtl = {
 	.ops = {
 		.blkio_init_group_fn = throtl_init_blkio_group,
+		.blkio_exit_group_fn = throtl_exit_blkio_group,
+		.blkio_reset_group_stats_fn = throtl_reset_group_stats,
 	},
 	.plid = BLKIO_POLICY_THROTL,
 	.pdata_size = sizeof(struct throtl_grp),
-- 
cgit v1.1


From 3381cb8d2ef1523dbaeec99161d766c25f1e52d6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:44 -0700
Subject: blkcg: move blkio_group_conf->weight to cfq

blkio_group_conf->weight is owned by cfq and has no reason to be
defined in blkcg core.  Replace it with cfq_group->dev_weight and let
conf setting functions directly set it.  If dev_weight is zero, the
cfqg doesn't have device specific weight configured.

Also, rename BLKIO_WEIGHT_* constants to CFQ_WEIGHT_* and rename
blkio_cgroup->weight to blkio_cgroup->cfq_weight.  We eventually want
per-policy storage in blkio_cgroup but just mark the ownership of the
field for now.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c  |  4 +--
 block/blk-cgroup.h  | 14 +++++-----
 block/cfq-iosched.c | 77 ++++++++++++++++++++++++-----------------------------
 3 files changed, 45 insertions(+), 50 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 16f6ee6..c0e239b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -30,7 +30,7 @@ static LIST_HEAD(blkio_list);
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
 
-struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
+struct blkio_cgroup blkio_root_cgroup = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
 static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];
@@ -611,7 +611,7 @@ static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
 	if (!blkcg)
 		return ERR_PTR(-ENOMEM);
 
-	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
+	blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
 	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
 done:
 	spin_lock_init(&blkcg->lock);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index e368dd00..386db29 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -29,6 +29,11 @@ enum blkio_policy_id {
 
 #ifdef CONFIG_BLK_CGROUP
 
+/* CFQ specific, out here for blkcg->cfq_weight */
+#define CFQ_WEIGHT_MIN		10
+#define CFQ_WEIGHT_MAX		1000
+#define CFQ_WEIGHT_DEFAULT	500
+
 /* cft->private [un]packing for stat printing */
 #define BLKCG_STAT_PRIV(pol, off)	(((unsigned)(pol) << 16) | (off))
 #define BLKCG_STAT_POL(prv)		((unsigned)(prv) >> 16)
@@ -46,12 +51,14 @@ enum blkg_rwstat_type {
 
 struct blkio_cgroup {
 	struct cgroup_subsys_state css;
-	unsigned int weight;
 	spinlock_t lock;
 	struct hlist_head blkg_list;
 
 	/* for policies to test whether associated blkcg has changed */
 	uint64_t id;
+
+	/* TODO: per-policy storage in blkio_cgroup */
+	unsigned int cfq_weight;	/* belongs to cfq */
 };
 
 struct blkg_stat {
@@ -65,7 +72,6 @@ struct blkg_rwstat {
 };
 
 struct blkio_group_conf {
-	unsigned int weight;
 	u64 iops[2];
 	u64 bps[2];
 };
@@ -355,10 +361,6 @@ static inline void blkg_put(struct blkio_group *blkg) { }
 
 #endif
 
-#define BLKIO_WEIGHT_MIN	10
-#define BLKIO_WEIGHT_MAX	1000
-#define BLKIO_WEIGHT_DEFAULT	500
-
 #ifdef CONFIG_BLK_CGROUP
 extern struct blkio_cgroup blkio_root_cgroup;
 extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index a1f37df..adab10d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -224,7 +224,7 @@ struct cfq_group {
 	u64 vdisktime;
 	unsigned int weight;
 	unsigned int new_weight;
-	bool needs_update;
+	unsigned int dev_weight;
 
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
@@ -838,7 +838,7 @@ static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
 {
 	u64 d = delta << CFQ_SERVICE_SHIFT;
 
-	d = d * BLKIO_WEIGHT_DEFAULT;
+	d = d * CFQ_WEIGHT_DEFAULT;
 	do_div(d, cfqg->weight);
 	return d;
 }
@@ -1165,9 +1165,9 @@ static void
 cfq_update_group_weight(struct cfq_group *cfqg)
 {
 	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
-	if (cfqg->needs_update) {
+	if (cfqg->new_weight) {
 		cfqg->weight = cfqg->new_weight;
-		cfqg->needs_update = false;
+		cfqg->new_weight = 0;
 	}
 }
 
@@ -1325,21 +1325,12 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void cfq_update_blkio_group_weight(struct blkio_group *blkg,
-					  unsigned int weight)
-{
-	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
-
-	cfqg->new_weight = weight;
-	cfqg->needs_update = true;
-}
-
 static void cfq_init_blkio_group(struct blkio_group *blkg)
 {
 	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 
 	cfq_init_cfqg_base(cfqg);
-	cfqg->weight = blkg->blkcg->weight;
+	cfqg->weight = blkg->blkcg->cfq_weight;
 }
 
 /*
@@ -1377,36 +1368,38 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 	cfqg_get(cfqg);
 }
 
-static u64 blkg_prfill_weight_device(struct seq_file *sf,
+static u64 cfqg_prfill_weight_device(struct seq_file *sf,
 				     struct blkg_policy_data *pd, int off)
 {
-	if (!pd->conf.weight)
+	struct cfq_group *cfqg = (void *)pd->pdata;
+
+	if (!cfqg->dev_weight)
 		return 0;
-	return __blkg_prfill_u64(sf, pd, pd->conf.weight);
+	return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
 }
 
-static int blkcg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
-				     struct seq_file *sf)
+static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
+				    struct seq_file *sf)
 {
 	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
-			  blkg_prfill_weight_device, BLKIO_POLICY_PROP, 0,
+			  cfqg_prfill_weight_device, BLKIO_POLICY_PROP, 0,
 			  false);
 	return 0;
 }
 
-static int blkcg_print_weight(struct cgroup *cgrp, struct cftype *cft,
-			      struct seq_file *sf)
+static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
+			    struct seq_file *sf)
 {
-	seq_printf(sf, "%u\n", cgroup_to_blkio_cgroup(cgrp)->weight);
+	seq_printf(sf, "%u\n", cgroup_to_blkio_cgroup(cgrp)->cfq_weight);
 	return 0;
 }
 
-static int blkcg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
-				   const char *buf)
+static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
+				  const char *buf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
-	struct blkg_policy_data *pd;
 	struct blkg_conf_ctx ctx;
+	struct cfq_group *cfqg;
 	int ret;
 
 	ret = blkg_conf_prep(blkcg, buf, &ctx);
@@ -1414,11 +1407,11 @@ static int blkcg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
 		return ret;
 
 	ret = -EINVAL;
-	pd = ctx.blkg->pd[BLKIO_POLICY_PROP];
-	if (pd && (!ctx.v || (ctx.v >= BLKIO_WEIGHT_MIN &&
-			      ctx.v <= BLKIO_WEIGHT_MAX))) {
-		pd->conf.weight = ctx.v;
-		cfq_update_blkio_group_weight(ctx.blkg, ctx.v ?: blkcg->weight);
+	cfqg = blkg_to_cfqg(ctx.blkg);
+	if (cfqg && (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN &&
+				ctx.v <= CFQ_WEIGHT_MAX))) {
+		cfqg->dev_weight = ctx.v;
+		cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;
 		ret = 0;
 	}
 
@@ -1426,23 +1419,23 @@ static int blkcg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
 	return ret;
 }
 
-static int blkcg_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 	struct blkio_group *blkg;
 	struct hlist_node *n;
 
-	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
+	if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
 		return -EINVAL;
 
 	spin_lock_irq(&blkcg->lock);
-	blkcg->weight = (unsigned int)val;
+	blkcg->cfq_weight = (unsigned int)val;
 
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		struct blkg_policy_data *pd = blkg->pd[BLKIO_POLICY_PROP];
+		struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 
-		if (pd && !pd->conf.weight)
-			cfq_update_blkio_group_weight(blkg, blkcg->weight);
+		if (cfqg && !cfqg->dev_weight)
+			cfqg->new_weight = blkcg->cfq_weight;
 	}
 
 	spin_unlock_irq(&blkcg->lock);
@@ -1480,14 +1473,14 @@ static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
 static struct cftype cfq_blkcg_files[] = {
 	{
 		.name = "weight_device",
-		.read_seq_string = blkcg_print_weight_device,
-		.write_string = blkcg_set_weight_device,
+		.read_seq_string = cfqg_print_weight_device,
+		.write_string = cfqg_set_weight_device,
 		.max_write_len = 256,
 	},
 	{
 		.name = "weight",
-		.read_seq_string = blkcg_print_weight,
-		.write_u64 = blkcg_set_weight,
+		.read_seq_string = cfq_print_weight,
+		.write_u64 = cfq_set_weight,
 	},
 	{
 		.name = "time",
@@ -3983,7 +3976,7 @@ static int cfq_init_queue(struct request_queue *q)
 		return -ENOMEM;
 	}
 
-	cfqd->root_group->weight = 2*BLKIO_WEIGHT_DEFAULT;
+	cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
 
 	/*
 	 * Not strictly needed (since RB_ROOT just clears the node and we
-- 
cgit v1.1


From af133ceb261033eb43c03d161a991c3b772e8c56 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:44 -0700
Subject: blkcg: move blkio_group_conf->iops and ->bps to blk-throttle

blkio_group_conf->iops and ->bps are owned by blk-throttle and have no
reason to be defined in blkcg core.  Drop them and let conf setting
functions directly manipulate throtl_grp->bps[] and ->iops[].

This makes blkio_group_conf empty.  Drop it.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.h   |   8 ---
 block/blk-throttle.c | 153 +++++++++++++++++++--------------------------------
 2 files changed, 58 insertions(+), 103 deletions(-)

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 386db29..a77ab1a 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -71,19 +71,11 @@ struct blkg_rwstat {
 	uint64_t			cnt[BLKG_RWSTAT_NR];
 };
 
-struct blkio_group_conf {
-	u64 iops[2];
-	u64 bps[2];
-};
-
 /* per-blkg per-policy data */
 struct blkg_policy_data {
 	/* the blkg this per-policy data belongs to */
 	struct blkio_group *blkg;
 
-	/* Configuration */
-	struct blkio_group_conf conf;
-
 	/* pol->pdata_size bytes of private data used by policy impl */
 	char pdata[] __aligned(__alignof__(unsigned long long));
 };
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 27f7960..004964b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -924,20 +924,6 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 	}
 }
 
-/*
- * Can not take queue lock in update functions as queue lock under
- * blkcg_lock is not allowed. Under other paths we take blkcg_lock under
- * queue_lock.
- */
-static void throtl_update_blkio_group_common(struct throtl_data *td,
-				struct throtl_grp *tg)
-{
-	xchg(&tg->limits_changed, true);
-	xchg(&td->limits_changed, true);
-	/* Schedule a work now to process the limit change */
-	throtl_schedule_delayed_work(td, 0);
-}
-
 static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
 				struct blkg_policy_data *pd, int off)
 {
@@ -968,68 +954,48 @@ static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
-static u64 blkg_prfill_conf_u64(struct seq_file *sf,
-				struct blkg_policy_data *pd, int off)
+static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
+			      int off)
 {
-	u64 v = *(u64 *)((void *)&pd->conf + off);
+	u64 v = *(u64 *)((void *)pd->pdata + off);
 
-	if (!v)
+	if (v == -1)
 		return 0;
 	return __blkg_prfill_u64(sf, pd, v);
 }
 
-static int blkcg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
-				struct seq_file *sf)
+static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
+			       int off)
 {
-	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
-			  blkg_prfill_conf_u64, BLKIO_POLICY_THROTL,
-			  cft->private, false);
-	return 0;
-}
+	unsigned int v = *(unsigned int *)((void *)pd->pdata + off);
 
-static void throtl_update_blkio_group_read_bps(struct blkio_group *blkg,
-					       u64 read_bps)
-{
-	struct throtl_grp *tg = blkg_to_tg(blkg);
-
-	tg->bps[READ] = read_bps;
-	throtl_update_blkio_group_common(blkg->q->td, tg);
-}
-
-static void throtl_update_blkio_group_write_bps(struct blkio_group *blkg,
-						u64 write_bps)
-{
-	struct throtl_grp *tg = blkg_to_tg(blkg);
-
-	tg->bps[WRITE] = write_bps;
-	throtl_update_blkio_group_common(blkg->q->td, tg);
+	if (v == -1)
+		return 0;
+	return __blkg_prfill_u64(sf, pd, v);
 }
 
-static void throtl_update_blkio_group_read_iops(struct blkio_group *blkg,
-						u64 read_iops)
+static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
+			     struct seq_file *sf)
 {
-	struct throtl_grp *tg = blkg_to_tg(blkg);
-
-	tg->iops[READ] = read_iops;
-	throtl_update_blkio_group_common(blkg->q->td, tg);
+	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp), tg_prfill_conf_u64,
+			  BLKIO_POLICY_THROTL, cft->private, false);
+	return 0;
 }
 
-static void throtl_update_blkio_group_write_iops(struct blkio_group *blkg,
-						 u64 write_iops)
+static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
+			      struct seq_file *sf)
 {
-	struct throtl_grp *tg = blkg_to_tg(blkg);
-
-	tg->iops[WRITE] = write_iops;
-	throtl_update_blkio_group_common(blkg->q->td, tg);
+	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp), tg_prfill_conf_uint,
+			  BLKIO_POLICY_THROTL, cft->private, false);
+	return 0;
 }
 
-static int blkcg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
-			      const char *buf,
-			      void (*update)(struct blkio_group *, u64))
+static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
+		       bool is_u64)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
-	struct blkg_policy_data *pd;
 	struct blkg_conf_ctx ctx;
+	struct throtl_grp *tg;
 	int ret;
 
 	ret = blkg_conf_prep(blkcg, buf, &ctx);
@@ -1037,10 +1003,23 @@ static int blkcg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
 		return ret;
 
 	ret = -EINVAL;
-	pd = ctx.blkg->pd[BLKIO_POLICY_THROTL];
-	if (pd) {
-		*(u64 *)((void *)&pd->conf + cft->private) = ctx.v;
-		update(ctx.blkg, ctx.v ?: -1);
+	tg = blkg_to_tg(ctx.blkg);
+	if (tg) {
+		struct throtl_data *td = ctx.blkg->q->td;
+
+		if (!ctx.v)
+			ctx.v = -1;
+
+		if (is_u64)
+			*(u64 *)((void *)tg + cft->private) = ctx.v;
+		else
+			*(unsigned int *)((void *)tg + cft->private) = ctx.v;
+
+		/* XXX: we don't need the following deferred processing */
+		xchg(&tg->limits_changed, true);
+		xchg(&td->limits_changed, true);
+		throtl_schedule_delayed_work(td, 0);
+
 		ret = 0;
 	}
 
@@ -1048,61 +1027,45 @@ static int blkcg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
 	return ret;
 }
 
-static int blkcg_set_conf_bps_r(struct cgroup *cgrp, struct cftype *cft,
-				const char *buf)
-{
-	return blkcg_set_conf_u64(cgrp, cft, buf,
-				  throtl_update_blkio_group_read_bps);
-}
-
-static int blkcg_set_conf_bps_w(struct cgroup *cgrp, struct cftype *cft,
-				const char *buf)
-{
-	return blkcg_set_conf_u64(cgrp, cft, buf,
-				  throtl_update_blkio_group_write_bps);
-}
-
-static int blkcg_set_conf_iops_r(struct cgroup *cgrp, struct cftype *cft,
-				 const char *buf)
+static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
+			   const char *buf)
 {
-	return blkcg_set_conf_u64(cgrp, cft, buf,
-				  throtl_update_blkio_group_read_iops);
+	return tg_set_conf(cgrp, cft, buf, true);
 }
 
-static int blkcg_set_conf_iops_w(struct cgroup *cgrp, struct cftype *cft,
-				 const char *buf)
+static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
+			    const char *buf)
 {
-	return blkcg_set_conf_u64(cgrp, cft, buf,
-				  throtl_update_blkio_group_write_iops);
+	return tg_set_conf(cgrp, cft, buf, false);
 }
 
 static struct cftype throtl_files[] = {
 	{
 		.name = "throttle.read_bps_device",
-		.private = offsetof(struct blkio_group_conf, bps[READ]),
-		.read_seq_string = blkcg_print_conf_u64,
-		.write_string = blkcg_set_conf_bps_r,
+		.private = offsetof(struct throtl_grp, bps[READ]),
+		.read_seq_string = tg_print_conf_u64,
+		.write_string = tg_set_conf_u64,
 		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.write_bps_device",
-		.private = offsetof(struct blkio_group_conf, bps[WRITE]),
-		.read_seq_string = blkcg_print_conf_u64,
-		.write_string = blkcg_set_conf_bps_w,
+		.private = offsetof(struct throtl_grp, bps[WRITE]),
+		.read_seq_string = tg_print_conf_u64,
+		.write_string = tg_set_conf_u64,
 		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.read_iops_device",
-		.private = offsetof(struct blkio_group_conf, iops[READ]),
-		.read_seq_string = blkcg_print_conf_u64,
-		.write_string = blkcg_set_conf_iops_r,
+		.private = offsetof(struct throtl_grp, iops[READ]),
+		.read_seq_string = tg_print_conf_uint,
+		.write_string = tg_set_conf_uint,
 		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.write_iops_device",
-		.private = offsetof(struct blkio_group_conf, iops[WRITE]),
-		.read_seq_string = blkcg_print_conf_u64,
-		.write_string = blkcg_set_conf_iops_w,
+		.private = offsetof(struct throtl_grp, iops[WRITE]),
+		.read_seq_string = tg_print_conf_uint,
+		.write_string = tg_set_conf_uint,
 		.max_write_len = 256,
 	},
 	{
-- 
cgit v1.1


From d366e7ec41882791c970dfb7c67b737be8c3a174 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:44 -0700
Subject: blkcg: pass around pd->pdata instead of pd itself in prfill functions

Now that all conf and stat fields are moved into policy specific
blkg_policy_data->pdata areas, there's no reason to use
blkg_policy_data itself in prfill functions.  Pass around @pd->pdata
instead of @pd.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c   | 33 +++++++++++++++------------------
 block/blk-cgroup.h   |  6 +++---
 block/blk-throttle.c | 21 +++++++++------------
 block/cfq-iosched.c  | 14 ++++++--------
 4 files changed, 33 insertions(+), 41 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c0e239b..7de19d7 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -359,7 +359,7 @@ static const char *blkg_dev_name(struct blkio_group *blkg)
  * cftype->read_seq_string method.
  */
 void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
-		       u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int),
+		       u64 (*prfill)(struct seq_file *, void *, int),
 		       int pol, int data, bool show_total)
 {
 	struct blkio_group *blkg;
@@ -369,7 +369,7 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
 	spin_lock_irq(&blkcg->lock);
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
 		if (blkg->pd[pol])
-			total += prfill(sf, blkg->pd[pol], data);
+			total += prfill(sf, blkg->pd[pol]->pdata, data);
 	spin_unlock_irq(&blkcg->lock);
 
 	if (show_total)
@@ -380,14 +380,14 @@ EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
 /**
  * __blkg_prfill_u64 - prfill helper for a single u64 value
  * @sf: seq_file to print to
- * @pd: policy data of interest
+ * @pdata: policy private data of interest
  * @v: value to print
  *
- * Print @v to @sf for the device assocaited with @pd.
+ * Print @v to @sf for the device assocaited with @pdata.
  */
-u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
+u64 __blkg_prfill_u64(struct seq_file *sf, void *pdata, u64 v)
 {
-	const char *dname = blkg_dev_name(pd->blkg);
+	const char *dname = blkg_dev_name(pdata_to_blkg(pdata));
 
 	if (!dname)
 		return 0;
@@ -400,12 +400,12 @@ EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
 /**
  * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
  * @sf: seq_file to print to
- * @pd: policy data of interest
+ * @pdata: policy private data of interest
  * @rwstat: rwstat to print
  *
- * Print @rwstat to @sf for the device assocaited with @pd.
+ * Print @rwstat to @sf for the device assocaited with @pdata.
  */
-u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+u64 __blkg_prfill_rwstat(struct seq_file *sf, void *pdata,
 			 const struct blkg_rwstat *rwstat)
 {
 	static const char *rwstr[] = {
@@ -414,7 +414,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 		[BLKG_RWSTAT_SYNC]	= "Sync",
 		[BLKG_RWSTAT_ASYNC]	= "Async",
 	};
-	const char *dname = blkg_dev_name(pd->blkg);
+	const char *dname = blkg_dev_name(pdata_to_blkg(pdata));
 	u64 v;
 	int i;
 
@@ -430,19 +430,16 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 	return v;
 }
 
-static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
-			    int off)
+static u64 blkg_prfill_stat(struct seq_file *sf, void *pdata, int off)
 {
-	return __blkg_prfill_u64(sf, pd,
-				 blkg_stat_read((void *)pd->pdata + off));
+	return __blkg_prfill_u64(sf, pdata, blkg_stat_read(pdata + off));
 }
 
-static u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
-			      int off)
+static u64 blkg_prfill_rwstat(struct seq_file *sf, void *pdata, int off)
 {
-	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->pdata + off);
+	struct blkg_rwstat rwstat = blkg_rwstat_read(pdata + off);
 
-	return __blkg_prfill_rwstat(sf, pd, &rwstat);
+	return __blkg_prfill_rwstat(sf, pdata, &rwstat);
 }
 
 /* print blkg_stat specified by BLKCG_STAT_PRIV() */
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index a77ab1a..c930895 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -126,10 +126,10 @@ extern void update_root_blkg_pd(struct request_queue *q,
 				enum blkio_policy_id plid);
 
 void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
-		       u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int),
+		       u64 (*prfill)(struct seq_file *, void *, int),
 		       int pol, int data, bool show_total);
-u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
-u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+u64 __blkg_prfill_u64(struct seq_file *sf, void *pdata, u64 v);
+u64 __blkg_prfill_rwstat(struct seq_file *sf, void *pdata,
 			 const struct blkg_rwstat *rwstat);
 int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
 		     struct seq_file *sf);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 004964b..bd6dbfe 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -924,10 +924,9 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 	}
 }
 
-static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
-				struct blkg_policy_data *pd, int off)
+static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, void *pdata, int off)
 {
-	struct throtl_grp *tg = (void *)pd->pdata;
+	struct throtl_grp *tg = pdata;
 	struct blkg_rwstat rwstat = { }, tmp;
 	int i, cpu;
 
@@ -939,7 +938,7 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
 			rwstat.cnt[i] += tmp.cnt[i];
 	}
 
-	return __blkg_prfill_rwstat(sf, pd, &rwstat);
+	return __blkg_prfill_rwstat(sf, pdata, &rwstat);
 }
 
 /* print per-cpu blkg_rwstat specified by BLKCG_STAT_PRIV() */
@@ -954,24 +953,22 @@ static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
-static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
-			      int off)
+static u64 tg_prfill_conf_u64(struct seq_file *sf, void *pdata, int off)
 {
-	u64 v = *(u64 *)((void *)pd->pdata + off);
+	u64 v = *(u64 *)(pdata + off);
 
 	if (v == -1)
 		return 0;
-	return __blkg_prfill_u64(sf, pd, v);
+	return __blkg_prfill_u64(sf, pdata, v);
 }
 
-static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
-			       int off)
+static u64 tg_prfill_conf_uint(struct seq_file *sf, void *pdata, int off)
 {
-	unsigned int v = *(unsigned int *)((void *)pd->pdata + off);
+	unsigned int v = *(unsigned int *)(pdata + off);
 
 	if (v == -1)
 		return 0;
-	return __blkg_prfill_u64(sf, pd, v);
+	return __blkg_prfill_u64(sf, pdata, v);
 }
 
 static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index adab10d..fd505f7 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1368,14 +1368,13 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 	cfqg_get(cfqg);
 }
 
-static u64 cfqg_prfill_weight_device(struct seq_file *sf,
-				     struct blkg_policy_data *pd, int off)
+static u64 cfqg_prfill_weight_device(struct seq_file *sf, void *pdata, int off)
 {
-	struct cfq_group *cfqg = (void *)pd->pdata;
+	struct cfq_group *cfqg = pdata;
 
 	if (!cfqg->dev_weight)
 		return 0;
-	return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
+	return __blkg_prfill_u64(sf, pdata, cfqg->dev_weight);
 }
 
 static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
@@ -1443,10 +1442,9 @@ static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
 }
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
-				      struct blkg_policy_data *pd, int off)
+static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, void *pdata, int off)
 {
-	struct cfq_group *cfqg = (void *)pd->pdata;
+	struct cfq_group *cfqg = pdata;
 	u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
 	u64 v = 0;
 
@@ -1454,7 +1452,7 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
 		v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
 		do_div(v, samples);
 	}
-	__blkg_prfill_u64(sf, pd, v);
+	__blkg_prfill_u64(sf, pdata, v);
 	return 0;
 }
 
-- 
cgit v1.1


From 5bc4afb1ec6aa562fac4d9aba34d957ee42f5813 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 14:38:45 -0700
Subject: blkcg: drop BLKCG_STAT_{PRIV|POL|OFF} macros

Now that all stat handling code lives in policy implementations,
there's no need to encode policy ID in cft->private.

* Export blkg_prfill_[rw]stat() from blkcg, remove
  blkcg_print_[rw]stat(), and implement cfqg_print_[rw]stat() which
  use hard-coded BLKIO_POLICY_PROP.

* Use cft->private for offset of the target field directly and drop
  BLKCG_STAT_{PRIV|POL|OFF}().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c   | 48 +++++++++++++----------------
 block/blk-cgroup.h   | 11 ++-----
 block/blk-throttle.c | 12 +++-----
 block/cfq-iosched.c  | 85 ++++++++++++++++++++++++++++------------------------
 4 files changed, 72 insertions(+), 84 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 7de19d7..9449c38 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -430,43 +430,35 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, void *pdata,
 	return v;
 }
 
-static u64 blkg_prfill_stat(struct seq_file *sf, void *pdata, int off)
+/**
+ * blkg_prfill_stat - prfill callback for blkg_stat
+ * @sf: seq_file to print to
+ * @pdata: policy private data of interest
+ * @off: offset to the blkg_stat in @pdata
+ *
+ * prfill callback for printing a blkg_stat.
+ */
+u64 blkg_prfill_stat(struct seq_file *sf, void *pdata, int off)
 {
 	return __blkg_prfill_u64(sf, pdata, blkg_stat_read(pdata + off));
 }
+EXPORT_SYMBOL_GPL(blkg_prfill_stat);
 
-static u64 blkg_prfill_rwstat(struct seq_file *sf, void *pdata, int off)
+/**
+ * blkg_prfill_rwstat - prfill callback for blkg_rwstat
+ * @sf: seq_file to print to
+ * @pdata: policy private data of interest
+ * @off: offset to the blkg_rwstat in @pdata
+ *
+ * prfill callback for printing a blkg_rwstat.
+ */
+u64 blkg_prfill_rwstat(struct seq_file *sf, void *pdata, int off)
 {
 	struct blkg_rwstat rwstat = blkg_rwstat_read(pdata + off);
 
 	return __blkg_prfill_rwstat(sf, pdata, &rwstat);
 }
-
-/* print blkg_stat specified by BLKCG_STAT_PRIV() */
-int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
-		     struct seq_file *sf)
-{
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
-
-	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat,
-			  BLKCG_STAT_POL(cft->private),
-			  BLKCG_STAT_OFF(cft->private), false);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(blkcg_print_stat);
-
-/* print blkg_rwstat specified by BLKCG_STAT_PRIV() */
-int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
-		       struct seq_file *sf)
-{
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
-
-	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat,
-			  BLKCG_STAT_POL(cft->private),
-			  BLKCG_STAT_OFF(cft->private), true);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(blkcg_print_rwstat);
+EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 
 /**
  * blkg_conf_prep - parse and prepare for per-blkg config update
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index c930895..ca0ff7c 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -34,11 +34,6 @@ enum blkio_policy_id {
 #define CFQ_WEIGHT_MAX		1000
 #define CFQ_WEIGHT_DEFAULT	500
 
-/* cft->private [un]packing for stat printing */
-#define BLKCG_STAT_PRIV(pol, off)	(((unsigned)(pol) << 16) | (off))
-#define BLKCG_STAT_POL(prv)		((unsigned)(prv) >> 16)
-#define BLKCG_STAT_OFF(prv)		((unsigned)(prv) & 0xffff)
-
 enum blkg_rwstat_type {
 	BLKG_RWSTAT_READ,
 	BLKG_RWSTAT_WRITE,
@@ -131,10 +126,8 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
 u64 __blkg_prfill_u64(struct seq_file *sf, void *pdata, u64 v);
 u64 __blkg_prfill_rwstat(struct seq_file *sf, void *pdata,
 			 const struct blkg_rwstat *rwstat);
-int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
-		     struct seq_file *sf);
-int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
-		       struct seq_file *sf);
+u64 blkg_prfill_stat(struct seq_file *sf, void *pdata, int off);
+u64 blkg_prfill_rwstat(struct seq_file *sf, void *pdata, int off);
 
 struct blkg_conf_ctx {
 	struct gendisk		*disk;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index bd6dbfe..6024014 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -941,15 +941,13 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, void *pdata, int off)
 	return __blkg_prfill_rwstat(sf, pdata, &rwstat);
 }
 
-/* print per-cpu blkg_rwstat specified by BLKCG_STAT_PRIV() */
 static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
 			       struct seq_file *sf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
-	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat,
-			  BLKCG_STAT_POL(cft->private),
-			  BLKCG_STAT_OFF(cft->private), true);
+	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, BLKIO_POLICY_THROTL,
+			  cft->private, true);
 	return 0;
 }
 
@@ -1067,14 +1065,12 @@ static struct cftype throtl_files[] = {
 	},
 	{
 		.name = "throttle.io_service_bytes",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
-				offsetof(struct tg_stats_cpu, service_bytes)),
+		.private = offsetof(struct tg_stats_cpu, service_bytes),
 		.read_seq_string = tg_print_cpu_rwstat,
 	},
 	{
 		.name = "throttle.io_serviced",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
-				offsetof(struct tg_stats_cpu, serviced)),
+		.private = offsetof(struct tg_stats_cpu, serviced),
 		.read_seq_string = tg_print_cpu_rwstat,
 	},
 	{ }	/* terminate */
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index fd505f7..cff8b5b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1441,6 +1441,26 @@ static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
 	return 0;
 }
 
+static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
+			   struct seq_file *sf)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, BLKIO_POLICY_PROP,
+			  cft->private, false);
+	return 0;
+}
+
+static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
+			     struct seq_file *sf)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, BLKIO_POLICY_PROP,
+			  cft->private, true);
+	return 0;
+}
+
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, void *pdata, int off)
 {
@@ -1482,51 +1502,43 @@ static struct cftype cfq_blkcg_files[] = {
 	},
 	{
 		.name = "time",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct cfq_group, stats.time)),
-		.read_seq_string = blkcg_print_stat,
+		.private = offsetof(struct cfq_group, stats.time),
+		.read_seq_string = cfqg_print_stat,
 	},
 	{
 		.name = "sectors",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct cfq_group, stats.sectors)),
-		.read_seq_string = blkcg_print_stat,
+		.private = offsetof(struct cfq_group, stats.sectors),
+		.read_seq_string = cfqg_print_stat,
 	},
 	{
 		.name = "io_service_bytes",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct cfq_group, stats.service_bytes)),
-		.read_seq_string = blkcg_print_rwstat,
+		.private = offsetof(struct cfq_group, stats.service_bytes),
+		.read_seq_string = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_serviced",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct cfq_group, stats.serviced)),
-		.read_seq_string = blkcg_print_rwstat,
+		.private = offsetof(struct cfq_group, stats.serviced),
+		.read_seq_string = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_service_time",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct cfq_group, stats.service_time)),
-		.read_seq_string = blkcg_print_rwstat,
+		.private = offsetof(struct cfq_group, stats.service_time),
+		.read_seq_string = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_wait_time",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct cfq_group, stats.wait_time)),
-		.read_seq_string = blkcg_print_rwstat,
+		.private = offsetof(struct cfq_group, stats.wait_time),
+		.read_seq_string = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_merged",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct cfq_group, stats.merged)),
-		.read_seq_string = blkcg_print_rwstat,
+		.private = offsetof(struct cfq_group, stats.merged),
+		.read_seq_string = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_queued",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct cfq_group, stats.queued)),
-		.read_seq_string = blkcg_print_rwstat,
+		.private = offsetof(struct cfq_group, stats.queued),
+		.read_seq_string = cfqg_print_rwstat,
 	},
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	{
@@ -1535,33 +1547,28 @@ static struct cftype cfq_blkcg_files[] = {
 	},
 	{
 		.name = "group_wait_time",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct cfq_group, stats.group_wait_time)),
-		.read_seq_string = blkcg_print_stat,
+		.private = offsetof(struct cfq_group, stats.group_wait_time),
+		.read_seq_string = cfqg_print_stat,
 	},
 	{
 		.name = "idle_time",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct cfq_group, stats.idle_time)),
-		.read_seq_string = blkcg_print_stat,
+		.private = offsetof(struct cfq_group, stats.idle_time),
+		.read_seq_string = cfqg_print_stat,
 	},
 	{
 		.name = "empty_time",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct cfq_group, stats.empty_time)),
-		.read_seq_string = blkcg_print_stat,
+		.private = offsetof(struct cfq_group, stats.empty_time),
+		.read_seq_string = cfqg_print_stat,
 	},
 	{
 		.name = "dequeue",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct cfq_group, stats.dequeue)),
-		.read_seq_string = blkcg_print_stat,
+		.private = offsetof(struct cfq_group, stats.dequeue),
+		.read_seq_string = cfqg_print_stat,
 	},
 	{
 		.name = "unaccounted_time",
-		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
-				offsetof(struct cfq_group, stats.unaccounted_time)),
-		.read_seq_string = blkcg_print_stat,
+		.private = offsetof(struct cfq_group, stats.unaccounted_time),
+		.read_seq_string = cfqg_print_stat,
 	},
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
 	{ }	/* terminate */
-- 
cgit v1.1


From f48ec1d7885281a9c6cd7779d61f321d1b1fd741 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Apr 2012 13:11:25 -0700
Subject: cfq: fix build breakage & warnings

* CFQ_WEIGHT_* defined inside CONFIG_BLK_CGROUP causes cfq-iosched.c
  compile failure when the config is disabled.  Move it outside the
  ifdef block.

* Dummy cfqg_stats_*() definitions were lacking inline modifiers,
  causing unused function warnings if !CONFIG_CFQ_GROUP_IOSCHED.  Add
  them.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.h  |  4 ++--
 block/cfq-iosched.c | 17 ++++++++---------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ca0ff7c..64392ac 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -27,13 +27,13 @@ enum blkio_policy_id {
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX		UINT_MAX
 
-#ifdef CONFIG_BLK_CGROUP
-
 /* CFQ specific, out here for blkcg->cfq_weight */
 #define CFQ_WEIGHT_MIN		10
 #define CFQ_WEIGHT_MAX		1000
 #define CFQ_WEIGHT_DEFAULT	500
 
+#ifdef CONFIG_BLK_CGROUP
+
 enum blkg_rwstat_type {
 	BLKG_RWSTAT_READ,
 	BLKG_RWSTAT_WRITE,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index cff8b5b..7a8c3e0 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -17,7 +17,7 @@
 #include "blk.h"
 #include "blk-cgroup.h"
 
-static struct blkio_policy_type blkio_policy_cfq;
+static struct blkio_policy_type blkio_policy_cfq __maybe_unused;
 
 /*
  * tunables
@@ -541,14 +541,13 @@ static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
 
 #else	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 
-static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
-						 struct cfq_group *curr_cfqg) { }
-static void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
-static void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
-static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
-static void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
-static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
-static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
+static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
+static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
+static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
+static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
+static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
+static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
+static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
 
 #endif	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 
-- 
cgit v1.1


From bc0d6501a844392ab6ad419d7ca5af4693b6afac Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Apr 2012 13:11:26 -0700
Subject: blkcg: kill blkio_list and replace blkio_list_lock with a mutex

With blkio_policy[], blkio_list is redundant and interferes with
per-queue policy activation.  Remove it.  Also, replace
blkio_list_lock with a mutex blkcg_pol_mutex and let it protect the
whole [un]registration.

This is to prepare for per-queue policy activation and doesn't cause
any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 32 +++++++++++++++++---------------
 block/blk-cgroup.h |  1 -
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 9449c38..af665fe 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -24,9 +24,7 @@
 
 #define MAX_KEY_LEN 100
 
-static DEFINE_SPINLOCK(blkio_list_lock);
-static LIST_HEAD(blkio_list);
-
+static DEFINE_MUTEX(blkcg_pol_mutex);
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
 
@@ -311,8 +309,9 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
 	struct blkio_group *blkg;
 	struct hlist_node *n;
+	int i;
 
-	spin_lock(&blkio_list_lock);
+	mutex_lock(&blkcg_pol_mutex);
 	spin_lock_irq(&blkcg->lock);
 
 	/*
@@ -321,15 +320,16 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	 * anyway.  If you get hit by a race, retry.
 	 */
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		struct blkio_policy_type *pol;
+		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
+			struct blkio_policy_type *pol = blkio_policy[i];
 
-		list_for_each_entry(pol, &blkio_list, list)
-			if (pol->ops.blkio_reset_group_stats_fn)
+			if (pol && pol->ops.blkio_reset_group_stats_fn)
 				pol->ops.blkio_reset_group_stats_fn(blkg);
+		}
 	}
 
 	spin_unlock_irq(&blkcg->lock);
-	spin_unlock(&blkio_list_lock);
+	mutex_unlock(&blkcg_pol_mutex);
 	return 0;
 }
 
@@ -732,20 +732,21 @@ void blkio_policy_register(struct blkio_policy_type *blkiop)
 {
 	struct request_queue *q;
 
+	mutex_lock(&blkcg_pol_mutex);
+
 	blkcg_bypass_start();
-	spin_lock(&blkio_list_lock);
 
 	BUG_ON(blkio_policy[blkiop->plid]);
 	blkio_policy[blkiop->plid] = blkiop;
-	list_add_tail(&blkiop->list, &blkio_list);
-
-	spin_unlock(&blkio_list_lock);
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		update_root_blkg_pd(q, blkiop->plid);
+
 	blkcg_bypass_end();
 
 	if (blkiop->cftypes)
 		WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
+
+	mutex_unlock(&blkcg_pol_mutex);
 }
 EXPORT_SYMBOL_GPL(blkio_policy_register);
 
@@ -753,19 +754,20 @@ void blkio_policy_unregister(struct blkio_policy_type *blkiop)
 {
 	struct request_queue *q;
 
+	mutex_lock(&blkcg_pol_mutex);
+
 	if (blkiop->cftypes)
 		cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);
 
 	blkcg_bypass_start();
-	spin_lock(&blkio_list_lock);
 
 	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
 	blkio_policy[blkiop->plid] = NULL;
-	list_del_init(&blkiop->list);
 
-	spin_unlock(&blkio_list_lock);
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		update_root_blkg_pd(q, blkiop->plid);
 	blkcg_bypass_end();
+
+	mutex_unlock(&blkcg_pol_mutex);
 }
 EXPORT_SYMBOL_GPL(blkio_policy_unregister);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 64392ac..c772581 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -102,7 +102,6 @@ struct blkio_policy_ops {
 };
 
 struct blkio_policy_type {
-	struct list_head list;
 	struct blkio_policy_ops ops;
 	enum blkio_policy_id plid;
 	size_t pdata_size;		/* policy specific private data size */
-- 
cgit v1.1


From ec399347d39fb2337ebace928cf4a2855bd0ec37 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Apr 2012 13:11:27 -0700
Subject: blkcg: use @pol instead of @plid in update_root_blkg_pd() and
 blkcg_print_blkgs()

The two functions were taking "enum blkio_policy_id plid".  Make them
take "const struct blkio_policy_type *pol" instead.

This is to prepare for per-queue policy activation and doesn't cause
any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 21 +++++++++++----------
 block/blk-cgroup.h   |  7 ++++---
 block/blk-throttle.c |  6 +++---
 block/cfq-iosched.c  | 10 +++++-----
 4 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index af665fe..b123152 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -226,17 +226,17 @@ static void blkg_destroy(struct blkio_group *blkg)
  * aren't shot down.  This broken and racy implementation is temporary.
  * Eventually, blkg shoot down will be replaced by proper in-place update.
  */
-void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
+void update_root_blkg_pd(struct request_queue *q,
+			 const struct blkio_policy_type *pol)
 {
-	struct blkio_policy_type *pol = blkio_policy[plid];
 	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
 	struct blkg_policy_data *pd;
 
 	if (!blkg)
 		return;
 
-	kfree(blkg->pd[plid]);
-	blkg->pd[plid] = NULL;
+	kfree(blkg->pd[pol->plid]);
+	blkg->pd[pol->plid] = NULL;
 
 	if (!pol)
 		return;
@@ -244,7 +244,7 @@ void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
 	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
 	WARN_ON_ONCE(!pd);
 
-	blkg->pd[plid] = pd;
+	blkg->pd[pol->plid] = pd;
 	pd->blkg = blkg;
 	pol->ops.blkio_init_group_fn(blkg);
 }
@@ -360,7 +360,8 @@ static const char *blkg_dev_name(struct blkio_group *blkg)
  */
 void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
 		       u64 (*prfill)(struct seq_file *, void *, int),
-		       int pol, int data, bool show_total)
+		       const struct blkio_policy_type *pol, int data,
+		       bool show_total)
 {
 	struct blkio_group *blkg;
 	struct hlist_node *n;
@@ -368,8 +369,8 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
 
 	spin_lock_irq(&blkcg->lock);
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
-		if (blkg->pd[pol])
-			total += prfill(sf, blkg->pd[pol]->pdata, data);
+		if (blkg->pd[pol->plid])
+			total += prfill(sf, blkg->pd[pol->plid]->pdata, data);
 	spin_unlock_irq(&blkcg->lock);
 
 	if (show_total)
@@ -739,7 +740,7 @@ void blkio_policy_register(struct blkio_policy_type *blkiop)
 	BUG_ON(blkio_policy[blkiop->plid]);
 	blkio_policy[blkiop->plid] = blkiop;
 	list_for_each_entry(q, &all_q_list, all_q_node)
-		update_root_blkg_pd(q, blkiop->plid);
+		update_root_blkg_pd(q, blkiop);
 
 	blkcg_bypass_end();
 
@@ -765,7 +766,7 @@ void blkio_policy_unregister(struct blkio_policy_type *blkiop)
 	blkio_policy[blkiop->plid] = NULL;
 
 	list_for_each_entry(q, &all_q_list, all_q_node)
-		update_root_blkg_pd(q, blkiop->plid);
+		update_root_blkg_pd(q, blkiop);
 	blkcg_bypass_end();
 
 	mutex_unlock(&blkcg_pol_mutex);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index c772581..2694973 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -117,11 +117,12 @@ extern void blkio_policy_register(struct blkio_policy_type *);
 extern void blkio_policy_unregister(struct blkio_policy_type *);
 extern void blkg_destroy_all(struct request_queue *q, bool destroy_root);
 extern void update_root_blkg_pd(struct request_queue *q,
-				enum blkio_policy_id plid);
+				const struct blkio_policy_type *pol);
 
 void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
 		       u64 (*prfill)(struct seq_file *, void *, int),
-		       int pol, int data, bool show_total);
+		       const struct blkio_policy_type *pol, int data,
+		       bool show_total);
 u64 __blkg_prfill_u64(struct seq_file *sf, void *pdata, u64 v);
 u64 __blkg_prfill_rwstat(struct seq_file *sf, void *pdata,
 			 const struct blkg_rwstat *rwstat);
@@ -333,7 +334,7 @@ static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
 static inline void blkg_destroy_all(struct request_queue *q,
 				    bool destory_root) { }
 static inline void update_root_blkg_pd(struct request_queue *q,
-				       enum blkio_policy_id plid) { }
+				       const struct blkio_policy_type *pol) { }
 
 static inline void *blkg_to_pdata(struct blkio_group *blkg,
 				struct blkio_policy_type *pol) { return NULL; }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 6024014..07c17c2 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -946,7 +946,7 @@ static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
-	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, BLKIO_POLICY_THROTL,
+	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkio_policy_throtl,
 			  cft->private, true);
 	return 0;
 }
@@ -973,7 +973,7 @@ static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
 			     struct seq_file *sf)
 {
 	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp), tg_prfill_conf_u64,
-			  BLKIO_POLICY_THROTL, cft->private, false);
+			  &blkio_policy_throtl, cft->private, false);
 	return 0;
 }
 
@@ -981,7 +981,7 @@ static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
 			      struct seq_file *sf)
 {
 	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp), tg_prfill_conf_uint,
-			  BLKIO_POLICY_THROTL, cft->private, false);
+			  &blkio_policy_throtl, cft->private, false);
 	return 0;
 }
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 7a8c3e0..d02f0ae 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1380,7 +1380,7 @@ static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
 				    struct seq_file *sf)
 {
 	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
-			  cfqg_prfill_weight_device, BLKIO_POLICY_PROP, 0,
+			  cfqg_prfill_weight_device, &blkio_policy_cfq, 0,
 			  false);
 	return 0;
 }
@@ -1445,7 +1445,7 @@ static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
-	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, BLKIO_POLICY_PROP,
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkio_policy_cfq,
 			  cft->private, false);
 	return 0;
 }
@@ -1455,7 +1455,7 @@ static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
-	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, BLKIO_POLICY_PROP,
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkio_policy_cfq,
 			  cft->private, true);
 	return 0;
 }
@@ -1482,7 +1482,7 @@ static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
 	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
-			  BLKIO_POLICY_PROP, 0, false);
+			  &blkio_policy_cfq, 0, false);
 	return 0;
 }
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
@@ -3938,7 +3938,7 @@ static void cfq_exit_queue(struct elevator_queue *e)
 #ifndef CONFIG_CFQ_GROUP_IOSCHED
 	kfree(cfqd->root_group);
 #endif
-	update_root_blkg_pd(q, BLKIO_POLICY_PROP);
+	update_root_blkg_pd(q, &blkio_policy_cfq);
 	kfree(cfqd);
 }
 
-- 
cgit v1.1


From 8bd435b30ecacb69bbb8b2d3e251f770b807c5b2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Apr 2012 13:11:28 -0700
Subject: blkcg: remove static policy ID enums

Remove BLKIO_POLICY_* enums and let blkio_policy_register() allocate
@pol->plid dynamically on registration.  The maximum number of blkcg
policies which can be registered at the same time is defined by
BLKCG_MAX_POLS constant added to include/linux/blkdev.h.

Note that blkio_policy_register() now may fail.  Policy init functions
are updated accordingly and unnecessary ifdefs are removed from cfq_init().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c     | 59 +++++++++++++++++++++++++++++++++++++-------------
 block/blk-cgroup.h     | 15 ++++---------
 block/blk-throttle.c   |  4 +---
 block/cfq-iosched.c    | 25 +++++++++++----------
 include/linux/blkdev.h |  7 +++++-
 5 files changed, 69 insertions(+), 41 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b123152..2d4d7d6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -31,7 +31,7 @@ static LIST_HEAD(all_q_list);
 struct blkio_cgroup blkio_root_cgroup = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
-static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];
+static struct blkio_policy_type *blkio_policy[BLKCG_MAX_POLS];
 
 struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 {
@@ -67,7 +67,7 @@ static void blkg_free(struct blkio_group *blkg)
 	if (!blkg)
 		return;
 
-	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkio_policy_type *pol = blkio_policy[i];
 		struct blkg_policy_data *pd = blkg->pd[i];
 
@@ -107,7 +107,7 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 	blkg->refcnt = 1;
 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
 
-	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkio_policy_type *pol = blkio_policy[i];
 		struct blkg_policy_data *pd;
 
@@ -127,7 +127,7 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 	}
 
 	/* invoke per-policy init */
-	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkio_policy_type *pol = blkio_policy[i];
 
 		if (pol)
@@ -320,7 +320,7 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	 * anyway.  If you get hit by a race, retry.
 	 */
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
+		for (i = 0; i < BLKCG_MAX_POLS; i++) {
 			struct blkio_policy_type *pol = blkio_policy[i];
 
 			if (pol && pol->ops.blkio_reset_group_stats_fn)
@@ -729,46 +729,75 @@ struct cgroup_subsys blkio_subsys = {
 };
 EXPORT_SYMBOL_GPL(blkio_subsys);
 
-void blkio_policy_register(struct blkio_policy_type *blkiop)
+/**
+ * blkio_policy_register - register a blkcg policy
+ * @blkiop: blkcg policy to register
+ *
+ * Register @blkiop with blkcg core.  Might sleep and @blkiop may be
+ * modified on successful registration.  Returns 0 on success and -errno on
+ * failure.
+ */
+int blkio_policy_register(struct blkio_policy_type *blkiop)
 {
 	struct request_queue *q;
+	int i, ret;
 
 	mutex_lock(&blkcg_pol_mutex);
 
-	blkcg_bypass_start();
+	/* find an empty slot */
+	ret = -ENOSPC;
+	for (i = 0; i < BLKCG_MAX_POLS; i++)
+		if (!blkio_policy[i])
+			break;
+	if (i >= BLKCG_MAX_POLS)
+		goto out_unlock;
 
-	BUG_ON(blkio_policy[blkiop->plid]);
-	blkio_policy[blkiop->plid] = blkiop;
+	/* register and update blkgs */
+	blkiop->plid = i;
+	blkio_policy[i] = blkiop;
+
+	blkcg_bypass_start();
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		update_root_blkg_pd(q, blkiop);
-
 	blkcg_bypass_end();
 
+	/* everything is in place, add intf files for the new policy */
 	if (blkiop->cftypes)
 		WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
-
+	ret = 0;
+out_unlock:
 	mutex_unlock(&blkcg_pol_mutex);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(blkio_policy_register);
 
+/**
+ * blkiop_policy_unregister - unregister a blkcg policy
+ * @blkiop: blkcg policy to unregister
+ *
+ * Undo blkio_policy_register(@blkiop).  Might sleep.
+ */
 void blkio_policy_unregister(struct blkio_policy_type *blkiop)
 {
 	struct request_queue *q;
 
 	mutex_lock(&blkcg_pol_mutex);
 
+	if (WARN_ON(blkio_policy[blkiop->plid] != blkiop))
+		goto out_unlock;
+
+	/* kill the intf files first */
 	if (blkiop->cftypes)
 		cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);
 
-	blkcg_bypass_start();
-
-	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
+	/* unregister and update blkgs */
 	blkio_policy[blkiop->plid] = NULL;
 
+	blkcg_bypass_start();
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		update_root_blkg_pd(q, blkiop);
 	blkcg_bypass_end();
-
+out_unlock:
 	mutex_unlock(&blkcg_pol_mutex);
 }
 EXPORT_SYMBOL_GPL(blkio_policy_unregister);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 2694973..be80d6e 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -17,13 +17,6 @@
 #include <linux/u64_stats_sync.h>
 #include <linux/seq_file.h>
 
-enum blkio_policy_id {
-	BLKIO_POLICY_PROP = 0,		/* Proportional Bandwidth division */
-	BLKIO_POLICY_THROTL,		/* Throttling */
-
-	BLKIO_NR_POLICIES,
-};
-
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX		UINT_MAX
 
@@ -86,7 +79,7 @@ struct blkio_group {
 	/* reference count */
 	int refcnt;
 
-	struct blkg_policy_data *pd[BLKIO_NR_POLICIES];
+	struct blkg_policy_data *pd[BLKCG_MAX_POLS];
 
 	struct rcu_head rcu_head;
 };
@@ -103,7 +96,7 @@ struct blkio_policy_ops {
 
 struct blkio_policy_type {
 	struct blkio_policy_ops ops;
-	enum blkio_policy_id plid;
+	int plid;
 	size_t pdata_size;		/* policy specific private data size */
 	struct cftype *cftypes;		/* cgroup files for the policy */
 };
@@ -113,7 +106,7 @@ extern void blkcg_drain_queue(struct request_queue *q);
 extern void blkcg_exit_queue(struct request_queue *q);
 
 /* Blkio controller policy registration */
-extern void blkio_policy_register(struct blkio_policy_type *);
+extern int blkio_policy_register(struct blkio_policy_type *);
 extern void blkio_policy_unregister(struct blkio_policy_type *);
 extern void blkg_destroy_all(struct request_queue *q, bool destroy_root);
 extern void update_root_blkg_pd(struct request_queue *q,
@@ -329,7 +322,7 @@ struct blkio_policy_type {
 static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
 static inline void blkcg_drain_queue(struct request_queue *q) { }
 static inline void blkcg_exit_queue(struct request_queue *q) { }
-static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
+static inline int blkio_policy_register(struct blkio_policy_type *blkiop) { return 0; }
 static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
 static inline void blkg_destroy_all(struct request_queue *q,
 				    bool destory_root) { }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 07c17c2..0dc4645a 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1089,7 +1089,6 @@ static struct blkio_policy_type blkio_policy_throtl = {
 		.blkio_exit_group_fn = throtl_exit_blkio_group,
 		.blkio_reset_group_stats_fn = throtl_reset_group_stats,
 	},
-	.plid = BLKIO_POLICY_THROTL,
 	.pdata_size = sizeof(struct throtl_grp),
 	.cftypes = throtl_files,
 };
@@ -1271,8 +1270,7 @@ static int __init throtl_init(void)
 	if (!kthrotld_workqueue)
 		panic("Failed to create kthrotld\n");
 
-	blkio_policy_register(&blkio_policy_throtl);
-	return 0;
+	return blkio_policy_register(&blkio_policy_throtl);
 }
 
 module_init(throtl_init);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d02f0ae..08db2fc 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4157,7 +4157,6 @@ static struct blkio_policy_type blkio_policy_cfq = {
 		.blkio_init_group_fn =		cfq_init_blkio_group,
 		.blkio_reset_group_stats_fn =	cfqg_stats_reset,
 	},
-	.plid = BLKIO_POLICY_PROP,
 	.pdata_size = sizeof(struct cfq_group),
 	.cftypes = cfq_blkcg_files,
 };
@@ -4181,27 +4180,31 @@ static int __init cfq_init(void)
 #else
 		cfq_group_idle = 0;
 #endif
+
+	ret = blkio_policy_register(&blkio_policy_cfq);
+	if (ret)
+		return ret;
+
 	cfq_pool = KMEM_CACHE(cfq_queue, 0);
 	if (!cfq_pool)
-		return -ENOMEM;
+		goto err_pol_unreg;
 
 	ret = elv_register(&iosched_cfq);
-	if (ret) {
-		kmem_cache_destroy(cfq_pool);
-		return ret;
-	}
+	if (ret)
+		goto err_free_pool;
 
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	blkio_policy_register(&blkio_policy_cfq);
-#endif
 	return 0;
+
+err_free_pool:
+	kmem_cache_destroy(cfq_pool);
+err_pol_unreg:
+	blkio_policy_unregister(&blkio_policy_cfq);
+	return ret;
 }
 
 static void __exit cfq_exit(void)
 {
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
 	blkio_policy_unregister(&blkio_policy_cfq);
-#endif
 	elv_unregister(&iosched_cfq);
 	kmem_cache_destroy(cfq_pool);
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 33f1b29..d2c69f8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -35,6 +35,12 @@ struct bsg_job;
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
 
+/*
+ * Maximum number of blkcg policies allowed to be registered concurrently.
+ * Defined here to simplify include dependency.
+ */
+#define BLKCG_MAX_POLS		2
+
 struct request;
 typedef void (rq_end_io_fn)(struct request *, int);
 
@@ -363,7 +369,6 @@ struct request_queue {
 
 	struct list_head	icq_list;
 #ifdef CONFIG_BLK_CGROUP
-	/* XXX: array size hardcoded to avoid include dependency (temporary) */
 	struct list_head	blkg_list;
 #endif
 
-- 
cgit v1.1


From da8b066262e12d1d0a3b1e6d3486e500169bf730 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Apr 2012 13:11:29 -0700
Subject: blkcg: make blkg_conf_prep() take @pol and return with queue lock
 held

Add @pol to blkg_conf_prep() and let it return with queue lock held
(to be released by blkg_conf_finish()).  Note that @pol isn't used
yet.

This is to prepare for per-queue policy activation and doesn't cause
any visible difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 17 ++++++++++-------
 block/blk-cgroup.h   |  3 ++-
 block/blk-throttle.c |  2 +-
 block/cfq-iosched.c  |  2 +-
 4 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2d4d7d6..f6581a0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -464,17 +464,19 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 /**
  * blkg_conf_prep - parse and prepare for per-blkg config update
  * @blkcg: target block cgroup
+ * @pol: target policy
  * @input: input string
  * @ctx: blkg_conf_ctx to be filled
  *
  * Parse per-blkg config update from @input and initialize @ctx with the
  * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
- * value.  This function returns with RCU read locked and must be paired
- * with blkg_conf_finish().
+ * value.  This function returns with RCU read lock and queue lock held and
+ * must be paired with blkg_conf_finish().
  */
-int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
+int blkg_conf_prep(struct blkio_cgroup *blkcg,
+		   const struct blkio_policy_type *pol, const char *input,
 		   struct blkg_conf_ctx *ctx)
-	__acquires(rcu)
+	__acquires(rcu) __acquires(disk->queue->queue_lock)
 {
 	struct gendisk *disk;
 	struct blkio_group *blkg;
@@ -490,14 +492,14 @@ int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
 		return -EINVAL;
 
 	rcu_read_lock();
-
 	spin_lock_irq(disk->queue->queue_lock);
+
 	blkg = blkg_lookup_create(blkcg, disk->queue, false);
-	spin_unlock_irq(disk->queue->queue_lock);
 
 	if (IS_ERR(blkg)) {
 		ret = PTR_ERR(blkg);
 		rcu_read_unlock();
+		spin_unlock_irq(disk->queue->queue_lock);
 		put_disk(disk);
 		/*
 		 * If queue was bypassing, we should retry.  Do so after a
@@ -527,8 +529,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
  * with blkg_conf_prep().
  */
 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
-	__releases(rcu)
+	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
 {
+	spin_unlock_irq(ctx->disk->queue->queue_lock);
 	rcu_read_unlock();
 	put_disk(ctx->disk);
 }
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index be80d6e..df1c7b2 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -128,7 +128,8 @@ struct blkg_conf_ctx {
 	u64			v;
 };
 
-int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
+int blkg_conf_prep(struct blkio_cgroup *blkcg,
+		   const struct blkio_policy_type *pol, const char *input,
 		   struct blkg_conf_ctx *ctx);
 void blkg_conf_finish(struct blkg_conf_ctx *ctx);
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 0dc4645a..6f1bfdf 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -993,7 +993,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	struct throtl_grp *tg;
 	int ret;
 
-	ret = blkg_conf_prep(blkcg, buf, &ctx);
+	ret = blkg_conf_prep(blkcg, &blkio_policy_throtl, buf, &ctx);
 	if (ret)
 		return ret;
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 08db2fc..de95f9a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1400,7 +1400,7 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
 	struct cfq_group *cfqg;
 	int ret;
 
-	ret = blkg_conf_prep(blkcg, buf, &ctx);
+	ret = blkg_conf_prep(blkcg, &blkio_policy_cfq, buf, &ctx);
 	if (ret)
 		return ret;
 
-- 
cgit v1.1


From 80fd99792b0b9f162abdf3da12fb10eb9eb5f321 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Apr 2012 14:50:53 -0700
Subject: blkcg: make sure blkg_lookup() returns %NULL if @q is bypassing

Currently, blkg_lookup() doesn't check @q bypass state.  This patch
updates blk_queue_bypass_start() to do synchronize_rcu() before
returning and updates blkg_lookup() to check blk_queue_bypass() and
return %NULL if bypassing.  This ensures blkg_lookup() returns %NULL
if @q is bypassing.

This is to guarantee that nobody is accessing policy data while @q is
bypassing, which is necessary to allow replacing blkio_cgroup->pd[] in
place on policy [de]activation.

v2: Added more comments explaining bypass guarantees as suggested by
    Vivek.

v3: Added more comments explaining why there's no synchronize_rcu() in
    blk_cleanup_queue() as suggested by Vivek.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 50 +++++++++++++++++++++++++++++++++-----------------
 block/blk-core.c   | 15 +++++++++++++--
 2 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index f6581a0..d6e4555 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -137,6 +137,38 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 	return blkg;
 }
 
+static struct blkio_group *__blkg_lookup(struct blkio_cgroup *blkcg,
+					 struct request_queue *q)
+{
+	struct blkio_group *blkg;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
+		if (blkg->q == q)
+			return blkg;
+	return NULL;
+}
+
+/**
+ * blkg_lookup - lookup blkg for the specified blkcg - q pair
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for the @blkcg - @q pair.  This function should be called
+ * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
+ * - see blk_queue_bypass_start() for details.
+ */
+struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
+				struct request_queue *q)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (unlikely(blk_queue_bypass(q)))
+		return NULL;
+	return __blkg_lookup(blkcg, q);
+}
+EXPORT_SYMBOL_GPL(blkg_lookup);
+
 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 				       struct request_queue *q,
 				       bool for_root)
@@ -150,13 +182,11 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 	/*
 	 * This could be the first entry point of blkcg implementation and
 	 * we shouldn't allow anything to go through for a bypassing queue.
-	 * The following can be removed if blkg lookup is guaranteed to
-	 * fail on a bypassing queue.
 	 */
 	if (unlikely(blk_queue_bypass(q)) && !for_root)
 		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
 
-	blkg = blkg_lookup(blkcg, q);
+	blkg = __blkg_lookup(blkcg, q);
 	if (blkg)
 		return blkg;
 
@@ -185,20 +215,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
-/* called under rcu_read_lock(). */
-struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
-				struct request_queue *q)
-{
-	struct blkio_group *blkg;
-	struct hlist_node *n;
-
-	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
-		if (blkg->q == q)
-			return blkg;
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(blkg_lookup);
-
 static void blkg_destroy(struct blkio_group *blkg)
 {
 	struct request_queue *q = blkg->q;
diff --git a/block/blk-core.c b/block/blk-core.c
index 991c1d6..f2db628 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -416,7 +416,8 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
  * In bypass mode, only the dispatch FIFO queue of @q is used.  This
  * function makes @q enter bypass mode and drains all requests which were
  * throttled or issued before.  On return, it's guaranteed that no request
- * is being throttled or has ELVPRIV set.
+ * is being throttled or has ELVPRIV set and blk_queue_bypass() %true
+ * inside queue or RCU read lock.
  */
 void blk_queue_bypass_start(struct request_queue *q)
 {
@@ -426,6 +427,8 @@ void blk_queue_bypass_start(struct request_queue *q)
 	spin_unlock_irq(q->queue_lock);
 
 	blk_drain_queue(q, false);
+	/* ensure blk_queue_bypass() is %true inside RCU read lock */
+	synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
 
@@ -462,7 +465,15 @@ void blk_cleanup_queue(struct request_queue *q)
 
 	spin_lock_irq(lock);
 
-	/* dead queue is permanently in bypass mode till released */
+	/*
+	 * Dead queue is permanently in bypass mode till released.  Note
+	 * that, unlike blk_queue_bypass_start(), we aren't performing
+	 * synchronize_rcu() after entering bypass mode to avoid the delay
+	 * as some drivers create and destroy a lot of queues while
+	 * probing.  This is still safe because blk_release_queue() will be
+	 * called only after the queue refcnt drops to zero and nothing,
+	 * RCU or not, would be traversing the queue by then.
+	 */
 	q->bypass_depth++;
 	queue_flag_set(QUEUE_FLAG_BYPASS, q);
 
-- 
cgit v1.1


From b82d4b197c782ced82a8b7b76664125d2d3c156c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Apr 2012 13:11:31 -0700
Subject: blkcg: make request_queue bypassing on allocation

With the previous change to guarantee bypass visibility for RCU read
lock regions, entering bypass mode involves non-trivial overhead and
future changes are scheduled to make use of bypass mode during init
path.  Combined, it may end up adding noticeable delay during boot.

This patch makes request_queue start its life in bypass mode, which is
ended on queue init completion at the end of
blk_init_allocated_queue(), and updates blk_queue_bypass_start() such
that draining and RCU synchronization are performed only when the
queue actually enters bypass mode.

This avoids unnecessarily switching in and out of bypass mode during
init, avoiding the overhead and any nasty surprises which may stem from
leaving bypass mode on half-initialized queues.

The boot time overhead was pointed out by Vivek.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index f2db628..3b02ba3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -421,14 +421,18 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
  */
 void blk_queue_bypass_start(struct request_queue *q)
 {
+	bool drain;
+
 	spin_lock_irq(q->queue_lock);
-	q->bypass_depth++;
+	drain = !q->bypass_depth++;
 	queue_flag_set(QUEUE_FLAG_BYPASS, q);
 	spin_unlock_irq(q->queue_lock);
 
-	blk_drain_queue(q, false);
-	/* ensure blk_queue_bypass() is %true inside RCU read lock */
-	synchronize_rcu();
+	if (drain) {
+		blk_drain_queue(q, false);
+		/* ensure blk_queue_bypass() is %true inside RCU read lock */
+		synchronize_rcu();
+	}
 }
 EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
 
@@ -577,6 +581,15 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	 */
 	q->queue_lock = &q->__queue_lock;
 
+	/*
+	 * A queue starts its life with bypass turned on to avoid
+	 * unnecessary bypass on/off overhead and nasty surprises during
+	 * init.  The initial bypass will be finished at the end of
+	 * blk_init_allocated_queue().
+	 */
+	q->bypass_depth = 1;
+	__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+
 	if (blkcg_init_queue(q))
 		goto fail_id;
 
@@ -672,15 +685,15 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 
 	q->sg_reserved_size = INT_MAX;
 
-	/*
-	 * all done
-	 */
-	if (!elevator_init(q, NULL)) {
-		blk_queue_congestion_threshold(q);
-		return q;
-	}
+	/* init elevator */
+	if (elevator_init(q, NULL))
+		return NULL;
 
-	return NULL;
+	blk_queue_congestion_threshold(q);
+
+	/* all done, end the initial bypass */
+	blk_queue_bypass_end(q);
+	return q;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
 
-- 
cgit v1.1


From 03d8e11142a893ad322285d3c8a08e88b570cda1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Apr 2012 13:11:32 -0700
Subject: blkcg: add request_queue->root_blkg

With per-queue policy activation, root blkg creation will be moved to
blkcg core.  Add q->root_blkg in preparation.  For blk-throtl, this
replaces throtl_data->root_tg; however, cfq needs to keep
cfqd->root_group for !CONFIG_CFQ_GROUP_IOSCHED.

This is to prepare for per-queue policy activation and doesn't cause
any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-throttle.c   | 16 ++++++++++------
 block/cfq-iosched.c    |  4 +++-
 include/linux/blkdev.h |  2 ++
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 6f1bfdf..8c520fa 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -97,7 +97,6 @@ struct throtl_data
 	/* service tree for active throtl groups */
 	struct throtl_rb_root tg_service_tree;
 
-	struct throtl_grp *root_tg;
 	struct request_queue *queue;
 
 	/* Total Number of queued bios on READ and WRITE lists */
@@ -131,6 +130,11 @@ static inline struct blkio_group *tg_to_blkg(struct throtl_grp *tg)
 	return pdata_to_blkg(tg);
 }
 
+static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
+{
+	return blkg_to_tg(td->queue->root_blkg);
+}
+
 enum tg_state_flags {
 	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
 };
@@ -261,7 +265,7 @@ throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 	 * Avoid lookup in this case
 	 */
 	if (blkcg == &blkio_root_cgroup)
-		return td->root_tg;
+		return td_root_tg(td);
 
 	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
 }
@@ -277,7 +281,7 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
 	 * Avoid lookup in this case
 	 */
 	if (blkcg == &blkio_root_cgroup) {
-		tg = td->root_tg;
+		tg = td_root_tg(td);
 	} else {
 		struct blkio_group *blkg;
 
@@ -287,7 +291,7 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
 		if (!IS_ERR(blkg))
 			tg = blkg_to_tg(blkg);
 		else if (!blk_queue_dead(q))
-			tg = td->root_tg;
+			tg = td_root_tg(td);
 	}
 
 	return tg;
@@ -1245,12 +1249,12 @@ int blk_throtl_init(struct request_queue *q)
 
 	blkg = blkg_lookup_create(&blkio_root_cgroup, q, true);
 	if (!IS_ERR(blkg))
-		td->root_tg = blkg_to_tg(blkg);
+		q->root_blkg = blkg;
 
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
 
-	if (!td->root_tg) {
+	if (!q->root_blkg) {
 		kfree(td);
 		return -ENOMEM;
 	}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index de95f9a..86440e0 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3964,8 +3964,10 @@ static int cfq_init_queue(struct request_queue *q)
 	spin_lock_irq(q->queue_lock);
 
 	blkg = blkg_lookup_create(&blkio_root_cgroup, q, true);
-	if (!IS_ERR(blkg))
+	if (!IS_ERR(blkg)) {
+		q->root_blkg = blkg;
 		cfqd->root_group = blkg_to_cfqg(blkg);
+	}
 
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d2c69f8..b01c377 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -31,6 +31,7 @@ struct blk_trace;
 struct request;
 struct sg_io_hdr;
 struct bsg_job;
+struct blkio_group;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -369,6 +370,7 @@ struct request_queue {
 
 	struct list_head	icq_list;
 #ifdef CONFIG_BLK_CGROUP
+	struct blkio_group	*root_blkg;
 	struct list_head	blkg_list;
 #endif
 
-- 
cgit v1.1


From a2b1693bac45ea3fe3ba612fd22c45f17449f610 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Apr 2012 13:11:33 -0700
Subject: blkcg: implement per-queue policy activation

All blkcg policies were assumed to be enabled on all request_queues.
Due to various implementation obstacles, during the recent blkcg core
updates, this was temporarily implemented as shooting down all !root
blkgs on elevator switch and policy [de]registration combined with
half-broken in-place root blkg updates.  In addition to being buggy
and racy, this meant losing all blkcg configurations across those
events.

Now that blkcg is cleaned up enough, this patch replaces the temporary
implementation with proper per-queue policy activation.  Each blkcg
policy should call the new blkcg_[de]activate_policy() to enable and
disable the policy on a specific queue.  blkcg_activate_policy()
allocates and installs policy data for the policy for all existing
blkgs.  blkcg_deactivate_policy() does the reverse.  If a policy is
not enabled for a given queue, blkg printing / config functions skip
the respective blkg for the queue.

blkcg_activate_policy() also takes care of root blkg creation, and
cfq_init_queue() and blk_throtl_init() are updated accordingly.

This makes blkcg_bypass_{start|end}() and update_root_blkg_pd()
unnecessary.  Dropped.

v2: cfq_init_queue() was returning uninitialized @ret on root_group
    alloc failure if !CONFIG_CFQ_GROUP_IOSCHED.  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c     | 228 +++++++++++++++++++++++++++++++++----------------
 block/blk-cgroup.h     |  15 +++-
 block/blk-throttle.c   |  52 +++++------
 block/cfq-iosched.c    |  37 ++++----
 block/elevator.c       |   2 -
 include/linux/blkdev.h |   1 +
 6 files changed, 201 insertions(+), 134 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d6e4555..d6d59ad 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -54,6 +54,17 @@ struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(bio_blkio_cgroup);
 
+static bool blkcg_policy_enabled(struct request_queue *q,
+				 const struct blkio_policy_type *pol)
+{
+	return pol && test_bit(pol->plid, q->blkcg_pols);
+}
+
+static size_t blkg_pd_size(const struct blkio_policy_type *pol)
+{
+	return sizeof(struct blkg_policy_data) + pol->pdata_size;
+}
+
 /**
  * blkg_free - free a blkg
  * @blkg: blkg to free
@@ -111,12 +122,11 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 		struct blkio_policy_type *pol = blkio_policy[i];
 		struct blkg_policy_data *pd;
 
-		if (!pol)
+		if (!blkcg_policy_enabled(q, pol))
 			continue;
 
 		/* alloc per-policy data and attach it to blkg */
-		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
-				  q->node);
+		pd = kzalloc_node(blkg_pd_size(pol), GFP_ATOMIC, q->node);
 		if (!pd) {
 			blkg_free(blkg);
 			return NULL;
@@ -130,7 +140,7 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkio_policy_type *pol = blkio_policy[i];
 
-		if (pol)
+		if (blkcg_policy_enabled(blkg->q, pol))
 			pol->ops.blkio_init_group_fn(blkg);
 	}
 
@@ -236,36 +246,6 @@ static void blkg_destroy(struct blkio_group *blkg)
 	blkg_put(blkg);
 }
 
-/*
- * XXX: This updates blkg policy data in-place for root blkg, which is
- * necessary across elevator switch and policy registration as root blkgs
- * aren't shot down.  This broken and racy implementation is temporary.
- * Eventually, blkg shoot down will be replaced by proper in-place update.
- */
-void update_root_blkg_pd(struct request_queue *q,
-			 const struct blkio_policy_type *pol)
-{
-	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
-	struct blkg_policy_data *pd;
-
-	if (!blkg)
-		return;
-
-	kfree(blkg->pd[pol->plid]);
-	blkg->pd[pol->plid] = NULL;
-
-	if (!pol)
-		return;
-
-	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
-	WARN_ON_ONCE(!pd);
-
-	blkg->pd[pol->plid] = pd;
-	pd->blkg = blkg;
-	pol->ops.blkio_init_group_fn(blkg);
-}
-EXPORT_SYMBOL_GPL(update_root_blkg_pd);
-
 /**
  * blkg_destroy_all - destroy all blkgs associated with a request_queue
  * @q: request_queue of interest
@@ -339,7 +319,8 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 		for (i = 0; i < BLKCG_MAX_POLS; i++) {
 			struct blkio_policy_type *pol = blkio_policy[i];
 
-			if (pol && pol->ops.blkio_reset_group_stats_fn)
+			if (blkcg_policy_enabled(blkg->q, pol) &&
+			    pol->ops.blkio_reset_group_stats_fn)
 				pol->ops.blkio_reset_group_stats_fn(blkg);
 		}
 	}
@@ -385,7 +366,7 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
 
 	spin_lock_irq(&blkcg->lock);
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
-		if (blkg->pd[pol->plid])
+		if (blkcg_policy_enabled(blkg->q, pol))
 			total += prfill(sf, blkg->pd[pol->plid]->pdata, data);
 	spin_unlock_irq(&blkcg->lock);
 
@@ -510,7 +491,10 @@ int blkg_conf_prep(struct blkio_cgroup *blkcg,
 	rcu_read_lock();
 	spin_lock_irq(disk->queue->queue_lock);
 
-	blkg = blkg_lookup_create(blkcg, disk->queue, false);
+	if (blkcg_policy_enabled(disk->queue, pol))
+		blkg = blkg_lookup_create(blkcg, disk->queue, false);
+	else
+		blkg = ERR_PTR(-EINVAL);
 
 	if (IS_ERR(blkg)) {
 		ret = PTR_ERR(blkg);
@@ -712,30 +696,6 @@ static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 	return ret;
 }
 
-static void blkcg_bypass_start(void)
-	__acquires(&all_q_mutex)
-{
-	struct request_queue *q;
-
-	mutex_lock(&all_q_mutex);
-
-	list_for_each_entry(q, &all_q_list, all_q_node) {
-		blk_queue_bypass_start(q);
-		blkg_destroy_all(q, false);
-	}
-}
-
-static void blkcg_bypass_end(void)
-	__releases(&all_q_mutex)
-{
-	struct request_queue *q;
-
-	list_for_each_entry(q, &all_q_list, all_q_node)
-		blk_queue_bypass_end(q);
-
-	mutex_unlock(&all_q_mutex);
-}
-
 struct cgroup_subsys blkio_subsys = {
 	.name = "blkio",
 	.create = blkiocg_create,
@@ -749,6 +709,139 @@ struct cgroup_subsys blkio_subsys = {
 EXPORT_SYMBOL_GPL(blkio_subsys);
 
 /**
+ * blkcg_activate_policy - activate a blkcg policy on a request_queue
+ * @q: request_queue of interest
+ * @pol: blkcg policy to activate
+ *
+ * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
+ * bypass mode to populate its blkgs with policy_data for @pol.
+ *
+ * Activation happens with @q bypassed, so nobody would be accessing blkgs
+ * from IO path.  Update of each blkg is protected by both queue and blkcg
+ * locks so that holding either lock and testing blkcg_policy_enabled() is
+ * always enough for dereferencing policy data.
+ *
+ * The caller is responsible for synchronizing [de]activations and policy
+ * [un]registerations.  Returns 0 on success, -errno on failure.
+ */
+int blkcg_activate_policy(struct request_queue *q,
+			  const struct blkio_policy_type *pol)
+{
+	LIST_HEAD(pds);
+	struct blkio_group *blkg;
+	struct blkg_policy_data *pd, *n;
+	int cnt = 0, ret;
+
+	if (blkcg_policy_enabled(q, pol))
+		return 0;
+
+	blk_queue_bypass_start(q);
+
+	/* make sure the root blkg exists and count the existing blkgs */
+	spin_lock_irq(q->queue_lock);
+
+	rcu_read_lock();
+	blkg = blkg_lookup_create(&blkio_root_cgroup, q, true);
+	rcu_read_unlock();
+
+	if (IS_ERR(blkg)) {
+		ret = PTR_ERR(blkg);
+		goto out_unlock;
+	}
+	q->root_blkg = blkg;
+
+	list_for_each_entry(blkg, &q->blkg_list, q_node)
+		cnt++;
+
+	spin_unlock_irq(q->queue_lock);
+
+	/* allocate policy_data for all existing blkgs */
+	while (cnt--) {
+		pd = kzalloc_node(blkg_pd_size(pol), GFP_KERNEL, q->node);
+		if (!pd) {
+			ret = -ENOMEM;
+			goto out_free;
+		}
+		list_add_tail(&pd->alloc_node, &pds);
+	}
+
+	/*
+	 * Install the allocated pds.  With @q bypassing, no new blkg
+	 * should have been created while the queue lock was dropped.
+	 */
+	spin_lock_irq(q->queue_lock);
+
+	list_for_each_entry(blkg, &q->blkg_list, q_node) {
+		if (WARN_ON(list_empty(&pds))) {
+			/* umm... this shouldn't happen, just abort */
+			ret = -ENOMEM;
+			goto out_unlock;
+		}
+		pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
+		list_del_init(&pd->alloc_node);
+
+		/* grab blkcg lock too while installing @pd on @blkg */
+		spin_lock(&blkg->blkcg->lock);
+
+		blkg->pd[pol->plid] = pd;
+		pd->blkg = blkg;
+		pol->ops.blkio_init_group_fn(blkg);
+
+		spin_unlock(&blkg->blkcg->lock);
+	}
+
+	__set_bit(pol->plid, q->blkcg_pols);
+	ret = 0;
+out_unlock:
+	spin_unlock_irq(q->queue_lock);
+out_free:
+	blk_queue_bypass_end(q);
+	list_for_each_entry_safe(pd, n, &pds, alloc_node)
+		kfree(pd);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkcg_activate_policy);
+
+/**
+ * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
+ * @q: request_queue of interest
+ * @pol: blkcg policy to deactivate
+ *
+ * Deactivate @pol on @q.  Follows the same synchronization rules as
+ * blkcg_activate_policy().
+ */
+void blkcg_deactivate_policy(struct request_queue *q,
+			     const struct blkio_policy_type *pol)
+{
+	struct blkio_group *blkg;
+
+	if (!blkcg_policy_enabled(q, pol))
+		return;
+
+	blk_queue_bypass_start(q);
+	spin_lock_irq(q->queue_lock);
+
+	__clear_bit(pol->plid, q->blkcg_pols);
+
+	list_for_each_entry(blkg, &q->blkg_list, q_node) {
+		/* grab blkcg lock too while removing @pd from @blkg */
+		spin_lock(&blkg->blkcg->lock);
+
+		if (pol->ops.blkio_exit_group_fn)
+			pol->ops.blkio_exit_group_fn(blkg);
+
+		kfree(blkg->pd[pol->plid]);
+		blkg->pd[pol->plid] = NULL;
+
+		spin_unlock(&blkg->blkcg->lock);
+	}
+
+	spin_unlock_irq(q->queue_lock);
+	blk_queue_bypass_end(q);
+}
+EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
+
+/**
  * blkio_policy_register - register a blkcg policy
  * @blkiop: blkcg policy to register
  *
@@ -758,7 +851,6 @@ EXPORT_SYMBOL_GPL(blkio_subsys);
  */
 int blkio_policy_register(struct blkio_policy_type *blkiop)
 {
-	struct request_queue *q;
 	int i, ret;
 
 	mutex_lock(&blkcg_pol_mutex);
@@ -775,11 +867,6 @@ int blkio_policy_register(struct blkio_policy_type *blkiop)
 	blkiop->plid = i;
 	blkio_policy[i] = blkiop;
 
-	blkcg_bypass_start();
-	list_for_each_entry(q, &all_q_list, all_q_node)
-		update_root_blkg_pd(q, blkiop);
-	blkcg_bypass_end();
-
 	/* everything is in place, add intf files for the new policy */
 	if (blkiop->cftypes)
 		WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
@@ -798,8 +885,6 @@ EXPORT_SYMBOL_GPL(blkio_policy_register);
  */
 void blkio_policy_unregister(struct blkio_policy_type *blkiop)
 {
-	struct request_queue *q;
-
 	mutex_lock(&blkcg_pol_mutex);
 
 	if (WARN_ON(blkio_policy[blkiop->plid] != blkiop))
@@ -811,11 +896,6 @@ void blkio_policy_unregister(struct blkio_policy_type *blkiop)
 
 	/* unregister and update blkgs */
 	blkio_policy[blkiop->plid] = NULL;
-
-	blkcg_bypass_start();
-	list_for_each_entry(q, &all_q_list, all_q_node)
-		update_root_blkg_pd(q, blkiop);
-	blkcg_bypass_end();
 out_unlock:
 	mutex_unlock(&blkcg_pol_mutex);
 }
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index df1c7b2..66253a7 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -64,6 +64,9 @@ struct blkg_policy_data {
 	/* the blkg this per-policy data belongs to */
 	struct blkio_group *blkg;
 
+	/* used during policy activation */
+	struct list_head alloc_node;
+
 	/* pol->pdata_size bytes of private data used by policy impl */
 	char pdata[] __aligned(__alignof__(unsigned long long));
 };
@@ -108,9 +111,11 @@ extern void blkcg_exit_queue(struct request_queue *q);
 /* Blkio controller policy registration */
 extern int blkio_policy_register(struct blkio_policy_type *);
 extern void blkio_policy_unregister(struct blkio_policy_type *);
+extern int blkcg_activate_policy(struct request_queue *q,
+				 const struct blkio_policy_type *pol);
+extern void blkcg_deactivate_policy(struct request_queue *q,
+				    const struct blkio_policy_type *pol);
 extern void blkg_destroy_all(struct request_queue *q, bool destroy_root);
-extern void update_root_blkg_pd(struct request_queue *q,
-				const struct blkio_policy_type *pol);
 
 void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
 		       u64 (*prfill)(struct seq_file *, void *, int),
@@ -325,10 +330,12 @@ static inline void blkcg_drain_queue(struct request_queue *q) { }
 static inline void blkcg_exit_queue(struct request_queue *q) { }
 static inline int blkio_policy_register(struct blkio_policy_type *blkiop) { return 0; }
 static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
+static inline int blkcg_activate_policy(struct request_queue *q,
+					const struct blkio_policy_type *pol) { return 0; }
+static inline void blkcg_deactivate_policy(struct request_queue *q,
+					   const struct blkio_policy_type *pol) { }
 static inline void blkg_destroy_all(struct request_queue *q,
 				    bool destory_root) { }
-static inline void update_root_blkg_pd(struct request_queue *q,
-				       const struct blkio_policy_type *pol) { }
 
 static inline void *blkg_to_pdata(struct blkio_group *blkg,
 				struct blkio_policy_type *pol) { return NULL; }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8c520fa..2fc964e 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -995,35 +995,31 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 	struct blkg_conf_ctx ctx;
 	struct throtl_grp *tg;
+	struct throtl_data *td;
 	int ret;
 
 	ret = blkg_conf_prep(blkcg, &blkio_policy_throtl, buf, &ctx);
 	if (ret)
 		return ret;
 
-	ret = -EINVAL;
 	tg = blkg_to_tg(ctx.blkg);
-	if (tg) {
-		struct throtl_data *td = ctx.blkg->q->td;
-
-		if (!ctx.v)
-			ctx.v = -1;
+	td = ctx.blkg->q->td;
 
-		if (is_u64)
-			*(u64 *)((void *)tg + cft->private) = ctx.v;
-		else
-			*(unsigned int *)((void *)tg + cft->private) = ctx.v;
+	if (!ctx.v)
+		ctx.v = -1;
 
-		/* XXX: we don't need the following deferred processing */
-		xchg(&tg->limits_changed, true);
-		xchg(&td->limits_changed, true);
-		throtl_schedule_delayed_work(td, 0);
+	if (is_u64)
+		*(u64 *)((void *)tg + cft->private) = ctx.v;
+	else
+		*(unsigned int *)((void *)tg + cft->private) = ctx.v;
 
-		ret = 0;
-	}
+	/* XXX: we don't need the following deferred processing */
+	xchg(&tg->limits_changed, true);
+	xchg(&td->limits_changed, true);
+	throtl_schedule_delayed_work(td, 0);
 
 	blkg_conf_finish(&ctx);
-	return ret;
+	return 0;
 }
 
 static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
@@ -1230,7 +1226,7 @@ void blk_throtl_drain(struct request_queue *q)
 int blk_throtl_init(struct request_queue *q)
 {
 	struct throtl_data *td;
-	struct blkio_group *blkg;
+	int ret;
 
 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
 	if (!td)
@@ -1243,28 +1239,18 @@ int blk_throtl_init(struct request_queue *q)
 	q->td = td;
 	td->queue = q;
 
-	/* alloc and init root group. */
-	rcu_read_lock();
-	spin_lock_irq(q->queue_lock);
-
-	blkg = blkg_lookup_create(&blkio_root_cgroup, q, true);
-	if (!IS_ERR(blkg))
-		q->root_blkg = blkg;
-
-	spin_unlock_irq(q->queue_lock);
-	rcu_read_unlock();
-
-	if (!q->root_blkg) {
+	/* activate policy */
+	ret = blkcg_activate_policy(q, &blkio_policy_throtl);
+	if (ret)
 		kfree(td);
-		return -ENOMEM;
-	}
-	return 0;
+	return ret;
 }
 
 void blk_throtl_exit(struct request_queue *q)
 {
 	BUG_ON(!q->td);
 	throtl_shutdown_wq(q);
+	blkcg_deactivate_policy(q, &blkio_policy_throtl);
 	kfree(q->td);
 }
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 86440e0..0203652 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1406,8 +1406,7 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
 
 	ret = -EINVAL;
 	cfqg = blkg_to_cfqg(ctx.blkg);
-	if (cfqg && (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN &&
-				ctx.v <= CFQ_WEIGHT_MAX))) {
+	if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
 		cfqg->dev_weight = ctx.v;
 		cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;
 		ret = 0;
@@ -3938,7 +3937,7 @@ static void cfq_exit_queue(struct elevator_queue *e)
 #ifndef CONFIG_CFQ_GROUP_IOSCHED
 	kfree(cfqd->root_group);
 #endif
-	update_root_blkg_pd(q, &blkio_policy_cfq);
+	blkcg_deactivate_policy(q, &blkio_policy_cfq);
 	kfree(cfqd);
 }
 
@@ -3946,7 +3945,7 @@ static int cfq_init_queue(struct request_queue *q)
 {
 	struct cfq_data *cfqd;
 	struct blkio_group *blkg __maybe_unused;
-	int i;
+	int i, ret;
 
 	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
 	if (!cfqd)
@@ -3960,28 +3959,20 @@ static int cfq_init_queue(struct request_queue *q)
 
 	/* Init root group and prefer root group over other groups by default */
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-	rcu_read_lock();
-	spin_lock_irq(q->queue_lock);
-
-	blkg = blkg_lookup_create(&blkio_root_cgroup, q, true);
-	if (!IS_ERR(blkg)) {
-		q->root_blkg = blkg;
-		cfqd->root_group = blkg_to_cfqg(blkg);
-	}
+	ret = blkcg_activate_policy(q, &blkio_policy_cfq);
+	if (ret)
+		goto out_free;
 
-	spin_unlock_irq(q->queue_lock);
-	rcu_read_unlock();
+	cfqd->root_group = blkg_to_cfqg(q->root_blkg);
 #else
+	ret = -ENOMEM;
 	cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
 					GFP_KERNEL, cfqd->queue->node);
-	if (cfqd->root_group)
-		cfq_init_cfqg_base(cfqd->root_group);
-#endif
-	if (!cfqd->root_group) {
-		kfree(cfqd);
-		return -ENOMEM;
-	}
+	if (!cfqd->root_group)
+		goto out_free;
 
+	cfq_init_cfqg_base(cfqd->root_group);
+#endif
 	cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
 
 	/*
@@ -4031,6 +4022,10 @@ static int cfq_init_queue(struct request_queue *q)
 	 */
 	cfqd->last_delayed_sync = jiffies - HZ;
 	return 0;
+
+out_free:
+	kfree(cfqd);
+	return ret;
 }
 
 /*
diff --git a/block/elevator.c b/block/elevator.c
index be3ab6d..6a55d41 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -896,8 +896,6 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	ioc_clear_queue(q);
 	spin_unlock_irq(q->queue_lock);
 
-	blkg_destroy_all(q, false);
-
 	/* allocate, init and register new elevator */
 	err = -ENOMEM;
 	q->elevator = elevator_alloc(q, new_e);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b01c377..68720ab 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -370,6 +370,7 @@ struct request_queue {
 
 	struct list_head	icq_list;
 #ifdef CONFIG_BLK_CGROUP
+	DECLARE_BITMAP		(blkcg_pols, BLKCG_MAX_POLS);
 	struct blkio_group	*root_blkg;
 	struct list_head	blkg_list;
 #endif
-- 
cgit v1.1


From 3c96cb32d318f323c1bf972a4c66821f8499e34d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Apr 2012 13:11:34 -0700
Subject: blkcg: drop stuff unused after per-queue policy activation update

* all_q_list is unused.  Drop all_q_{mutex|list}.

* @for_root of blkg_lookup_create() is always %false when called from
  outside blk-cgroup.c proper.  Factor out __blkg_lookup_create() so
  that it doesn't check whether @q is bypassing and use the
  underscored version for the @for_root callsite.

* blkg_destroy_all() is used only from blkcg proper and @destroy_root
  is always %true.  Make it static and drop @destroy_root.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 61 +++++++++++++++++-----------------------------------
 block/blk-cgroup.h   |  6 +-----
 block/blk-throttle.c |  2 +-
 block/cfq-iosched.c  |  2 +-
 4 files changed, 23 insertions(+), 48 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d6d59ad..10f0d2fc 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -25,8 +25,6 @@
 #define MAX_KEY_LEN 100
 
 static DEFINE_MUTEX(blkcg_pol_mutex);
-static DEFINE_MUTEX(all_q_mutex);
-static LIST_HEAD(all_q_list);
 
 struct blkio_cgroup blkio_root_cgroup = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
@@ -179,9 +177,8 @@ struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 }
 EXPORT_SYMBOL_GPL(blkg_lookup);
 
-struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
-				       struct request_queue *q,
-				       bool for_root)
+static struct blkio_group *__blkg_lookup_create(struct blkio_cgroup *blkcg,
+						struct request_queue *q)
 	__releases(q->queue_lock) __acquires(q->queue_lock)
 {
 	struct blkio_group *blkg;
@@ -189,13 +186,6 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(q->queue_lock);
 
-	/*
-	 * This could be the first entry point of blkcg implementation and
-	 * we shouldn't allow anything to go through for a bypassing queue.
-	 */
-	if (unlikely(blk_queue_bypass(q)) && !for_root)
-		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
-
 	blkg = __blkg_lookup(blkcg, q);
 	if (blkg)
 		return blkg;
@@ -223,6 +213,18 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 out:
 	return blkg;
 }
+
+struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
+				       struct request_queue *q)
+{
+	/*
+	 * This could be the first entry point of blkcg implementation and
+	 * we shouldn't allow anything to go through for a bypassing queue.
+	 */
+	if (unlikely(blk_queue_bypass(q)))
+		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
+	return __blkg_lookup_create(blkcg, q);
+}
 EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
 static void blkg_destroy(struct blkio_group *blkg)
@@ -249,12 +251,10 @@ static void blkg_destroy(struct blkio_group *blkg)
 /**
  * blkg_destroy_all - destroy all blkgs associated with a request_queue
  * @q: request_queue of interest
- * @destroy_root: whether to destroy root blkg or not
  *
- * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
- * destroyed; otherwise, root blkg is left alone.
+ * Destroy all blkgs associated with @q.
  */
-void blkg_destroy_all(struct request_queue *q, bool destroy_root)
+static void blkg_destroy_all(struct request_queue *q)
 {
 	struct blkio_group *blkg, *n;
 
@@ -263,10 +263,6 @@ void blkg_destroy_all(struct request_queue *q, bool destroy_root)
 	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
 		struct blkio_cgroup *blkcg = blkg->blkcg;
 
-		/* skip root? */
-		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
-			continue;
-
 		spin_lock(&blkcg->lock);
 		blkg_destroy(blkg);
 		spin_unlock(&blkcg->lock);
@@ -274,7 +270,6 @@ void blkg_destroy_all(struct request_queue *q, bool destroy_root)
 
 	spin_unlock_irq(q->queue_lock);
 }
-EXPORT_SYMBOL_GPL(blkg_destroy_all);
 
 static void blkg_rcu_free(struct rcu_head *rcu_head)
 {
@@ -492,7 +487,7 @@ int blkg_conf_prep(struct blkio_cgroup *blkcg,
 	spin_lock_irq(disk->queue->queue_lock);
 
 	if (blkcg_policy_enabled(disk->queue, pol))
-		blkg = blkg_lookup_create(blkcg, disk->queue, false);
+		blkg = blkg_lookup_create(blkcg, disk->queue);
 	else
 		blkg = ERR_PTR(-EINVAL);
 
@@ -625,20 +620,9 @@ done:
  */
 int blkcg_init_queue(struct request_queue *q)
 {
-	int ret;
-
 	might_sleep();
 
-	ret = blk_throtl_init(q);
-	if (ret)
-		return ret;
-
-	mutex_lock(&all_q_mutex);
-	INIT_LIST_HEAD(&q->all_q_node);
-	list_add_tail(&q->all_q_node, &all_q_list);
-	mutex_unlock(&all_q_mutex);
-
-	return 0;
+	return blk_throtl_init(q);
 }
 
 /**
@@ -662,12 +646,7 @@ void blkcg_drain_queue(struct request_queue *q)
  */
 void blkcg_exit_queue(struct request_queue *q)
 {
-	mutex_lock(&all_q_mutex);
-	list_del_init(&q->all_q_node);
-	mutex_unlock(&all_q_mutex);
-
-	blkg_destroy_all(q, true);
-
+	blkg_destroy_all(q);
 	blk_throtl_exit(q);
 }
 
@@ -741,7 +720,7 @@ int blkcg_activate_policy(struct request_queue *q,
 	spin_lock_irq(q->queue_lock);
 
 	rcu_read_lock();
-	blkg = blkg_lookup_create(&blkio_root_cgroup, q, true);
+	blkg = __blkg_lookup_create(&blkio_root_cgroup, q);
 	rcu_read_unlock();
 
 	if (IS_ERR(blkg)) {
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 66253a7..222063d 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -115,7 +115,6 @@ extern int blkcg_activate_policy(struct request_queue *q,
 				 const struct blkio_policy_type *pol);
 extern void blkcg_deactivate_policy(struct request_queue *q,
 				    const struct blkio_policy_type *pol);
-extern void blkg_destroy_all(struct request_queue *q, bool destroy_root);
 
 void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
 		       u64 (*prfill)(struct seq_file *, void *, int),
@@ -334,8 +333,6 @@ static inline int blkcg_activate_policy(struct request_queue *q,
 					const struct blkio_policy_type *pol) { return 0; }
 static inline void blkcg_deactivate_policy(struct request_queue *q,
 					   const struct blkio_policy_type *pol) { }
-static inline void blkg_destroy_all(struct request_queue *q,
-				    bool destory_root) { }
 
 static inline void *blkg_to_pdata(struct blkio_group *blkg,
 				struct blkio_policy_type *pol) { return NULL; }
@@ -354,8 +351,7 @@ extern struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio);
 extern struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 				       struct request_queue *q);
 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
-				       struct request_queue *q,
-				       bool for_root);
+				       struct request_queue *q);
 #else
 struct cgroup;
 static inline struct blkio_cgroup *
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 2fc964e..e2aaf27 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -285,7 +285,7 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
 	} else {
 		struct blkio_group *blkg;
 
-		blkg = blkg_lookup_create(blkcg, q, false);
+		blkg = blkg_lookup_create(blkcg, q);
 
 		/* if %NULL and @q is alive, fall back to root_tg */
 		if (!IS_ERR(blkg))
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 0203652..eb07eb6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1348,7 +1348,7 @@ static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
 	} else {
 		struct blkio_group *blkg;
 
-		blkg = blkg_lookup_create(blkcg, q, false);
+		blkg = blkg_lookup_create(blkcg, q);
 		if (!IS_ERR(blkg))
 			cfqg = blkg_to_cfqg(blkg);
 	}
-- 
cgit v1.1


From 6d18b008daf46bcd82b8ae250aae0785f9714096 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Apr 2012 13:11:35 -0700
Subject: blkcg: shoot down blkgs if all policies are deactivated

There's no reason to keep blkgs around if no policy is activated for
the queue.  This patch moves queue locking out of blkg_destroy_all()
and calls it from blkcg_deactivate_policy() on deactivation of the
last policy on the queue.

This change was suggested by Vivek.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 10f0d2fc..b1807d4 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -258,7 +258,7 @@ static void blkg_destroy_all(struct request_queue *q)
 {
 	struct blkio_group *blkg, *n;
 
-	spin_lock_irq(q->queue_lock);
+	lockdep_assert_held(q->queue_lock);
 
 	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
 		struct blkio_cgroup *blkcg = blkg->blkcg;
@@ -267,8 +267,6 @@ static void blkg_destroy_all(struct request_queue *q)
 		blkg_destroy(blkg);
 		spin_unlock(&blkcg->lock);
 	}
-
-	spin_unlock_irq(q->queue_lock);
 }
 
 static void blkg_rcu_free(struct rcu_head *rcu_head)
@@ -646,7 +644,10 @@ void blkcg_drain_queue(struct request_queue *q)
  */
 void blkcg_exit_queue(struct request_queue *q)
 {
+	spin_lock_irq(q->queue_lock);
 	blkg_destroy_all(q);
+	spin_unlock_irq(q->queue_lock);
+
 	blk_throtl_exit(q);
 }
 
@@ -802,6 +803,10 @@ void blkcg_deactivate_policy(struct request_queue *q,
 
 	__clear_bit(pol->plid, q->blkcg_pols);
 
+	/* if no policy is left, no need for blkgs - shoot them down */
+	if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
+		blkg_destroy_all(q);
+
 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
 		/* grab blkcg lock too while removing @pd from @blkg */
 		spin_lock(&blkg->blkcg->lock);
-- 
cgit v1.1


From c94bed89995e638e43a6663177358b9d20617361 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 16 Apr 2012 13:57:22 -0700
Subject: blkcg: blkg_rwstat_read() was missing inline

blkg_rwstat_read() in blk-cgroup.h was missing the inline modifier,
causing a compile warning depending on configuration.  Add it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 222063d..ef6550a 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -279,7 +279,7 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
  * This function can be called without synchronization and takes care of
  * u64 atomicity.
  */
-static struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
+static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
 {
 	unsigned int start;
 	struct blkg_rwstat tmp;
-- 
cgit v1.1


From 54e7ed12bad1e3aa2a28558fab6850240465f973 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 16 Apr 2012 13:57:23 -0700
Subject: blkcg: remove blkio_group->path[]

blkio_group->path[] stores the path of the associated cgroup and is
used only for debug messages.  Just format the path from
blkg->blkcg->css.cgroup when printing debug messages.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   |  1 -
 block/blk-cgroup.h   | 21 +++++++++++++++++----
 block/blk-throttle.c |  9 ++++++---
 block/cfq-iosched.c  | 21 ++++++++++++++-------
 4 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b1807d4..6333702 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -114,7 +114,6 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 	INIT_LIST_HEAD(&blkg->q_node);
 	blkg->blkcg = blkcg;
 	blkg->refcnt = 1;
-	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkio_policy_type *pol = blkio_policy[i];
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ef6550a..c524267 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -77,8 +77,6 @@ struct blkio_group {
 	struct list_head q_node;
 	struct hlist_node blkcg_node;
 	struct blkio_cgroup *blkcg;
-	/* Store cgroup path */
-	char path[128];
 	/* reference count */
 	int refcnt;
 
@@ -167,9 +165,24 @@ static inline struct blkio_group *pdata_to_blkg(void *pdata)
 	return NULL;
 }
 
-static inline char *blkg_path(struct blkio_group *blkg)
+/**
+ * blkg_path - format cgroup path of blkg
+ * @blkg: blkg of interest
+ * @buf: target buffer
+ * @buflen: target buffer length
+ *
+ * Format the path of the cgroup of @blkg into @buf.
+ */
+static inline int blkg_path(struct blkio_group *blkg, char *buf, int buflen)
 {
-	return blkg->path;
+	int ret;
+
+	rcu_read_lock();
+	ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
+	rcu_read_unlock();
+	if (ret)
+		strncpy(buf, "<unavailable>", buflen);
+	return ret;
 }
 
 /**
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e2aaf27..e9b7a47 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -155,9 +155,12 @@ static inline int throtl_tg_##name(const struct throtl_grp *tg)		\
 
 THROTL_TG_FNS(on_rr);
 
-#define throtl_log_tg(td, tg, fmt, args...)				\
-	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
-			  blkg_path(tg_to_blkg(tg)), ##args);		\
+#define throtl_log_tg(td, tg, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf));		\
+	blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
+} while (0)
 
 #define throtl_log(td, fmt, args...)	\
 	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index eb07eb6..901286b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -573,14 +573,21 @@ static inline void cfqg_put(struct cfq_group *cfqg)
 	return blkg_put(cfqg_to_blkg(cfqg));
 }
 
-#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
+#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf));	\
 	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
-			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
-			blkg_path(cfqg_to_blkg((cfqq)->cfqg)), ##args)
-
-#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)				\
-	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
-			blkg_path(cfqg_to_blkg((cfqg))), ##args)	\
+			  cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
+			  __pbuf, ##args);				\
+} while (0)
+
+#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf));		\
+	blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args);	\
+} while (0)
 
 static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
 					    struct cfq_group *curr_cfqg, int rw)
-- 
cgit v1.1


From 36558c8a30e121f97b5852ae33e28081af21bdbf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 16 Apr 2012 13:57:24 -0700
Subject: blkcg: style cleanups for blk-cgroup.h

* Update indentation on struct field declarations.

* Uniformly don't use "extern" on function declarations.

* Merge the two #ifdef CONFIG_BLK_CGROUP blocks.

All changes in this patch are cosmetic.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.h | 108 ++++++++++++++++++++++++++---------------------------
 1 file changed, 52 insertions(+), 56 deletions(-)

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index c524267..b347aa0 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -38,15 +38,15 @@ enum blkg_rwstat_type {
 };
 
 struct blkio_cgroup {
-	struct cgroup_subsys_state css;
-	spinlock_t lock;
-	struct hlist_head blkg_list;
+	struct cgroup_subsys_state	css;
+	spinlock_t			lock;
+	struct hlist_head		blkg_list;
 
 	/* for policies to test whether associated blkcg has changed */
-	uint64_t id;
+	uint64_t			id;
 
 	/* TODO: per-policy storage in blkio_cgroup */
-	unsigned int cfq_weight;	/* belongs to cfq */
+	unsigned int			cfq_weight;	/* belongs to cfq */
 };
 
 struct blkg_stat {
@@ -62,27 +62,27 @@ struct blkg_rwstat {
 /* per-blkg per-policy data */
 struct blkg_policy_data {
 	/* the blkg this per-policy data belongs to */
-	struct blkio_group *blkg;
+	struct blkio_group		*blkg;
 
 	/* used during policy activation */
-	struct list_head alloc_node;
+	struct list_head		alloc_node;
 
 	/* pol->pdata_size bytes of private data used by policy impl */
-	char pdata[] __aligned(__alignof__(unsigned long long));
+	char				pdata[] __aligned(__alignof__(unsigned long long));
 };
 
 struct blkio_group {
 	/* Pointer to the associated request_queue */
-	struct request_queue *q;
-	struct list_head q_node;
-	struct hlist_node blkcg_node;
-	struct blkio_cgroup *blkcg;
+	struct request_queue		*q;
+	struct list_head		q_node;
+	struct hlist_node		blkcg_node;
+	struct blkio_cgroup		*blkcg;
 	/* reference count */
-	int refcnt;
+	int				refcnt;
 
-	struct blkg_policy_data *pd[BLKCG_MAX_POLS];
+	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
-	struct rcu_head rcu_head;
+	struct rcu_head			rcu_head;
 };
 
 typedef void (blkio_init_group_fn)(struct blkio_group *blkg);
@@ -90,29 +90,39 @@ typedef void (blkio_exit_group_fn)(struct blkio_group *blkg);
 typedef void (blkio_reset_group_stats_fn)(struct blkio_group *blkg);
 
 struct blkio_policy_ops {
-	blkio_init_group_fn *blkio_init_group_fn;
-	blkio_exit_group_fn *blkio_exit_group_fn;
-	blkio_reset_group_stats_fn *blkio_reset_group_stats_fn;
+	blkio_init_group_fn		*blkio_init_group_fn;
+	blkio_exit_group_fn		*blkio_exit_group_fn;
+	blkio_reset_group_stats_fn	*blkio_reset_group_stats_fn;
 };
 
 struct blkio_policy_type {
-	struct blkio_policy_ops ops;
-	int plid;
-	size_t pdata_size;		/* policy specific private data size */
-	struct cftype *cftypes;		/* cgroup files for the policy */
+	struct blkio_policy_ops		ops;
+	int				plid;
+	/* policy specific private data size */
+	size_t				pdata_size;
+	/* cgroup files for the policy */
+	struct cftype			*cftypes;
 };
 
-extern int blkcg_init_queue(struct request_queue *q);
-extern void blkcg_drain_queue(struct request_queue *q);
-extern void blkcg_exit_queue(struct request_queue *q);
+extern struct blkio_cgroup blkio_root_cgroup;
+
+struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
+struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio);
+struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
+				struct request_queue *q);
+struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
+				       struct request_queue *q);
+int blkcg_init_queue(struct request_queue *q);
+void blkcg_drain_queue(struct request_queue *q);
+void blkcg_exit_queue(struct request_queue *q);
 
 /* Blkio controller policy registration */
-extern int blkio_policy_register(struct blkio_policy_type *);
-extern void blkio_policy_unregister(struct blkio_policy_type *);
-extern int blkcg_activate_policy(struct request_queue *q,
-				 const struct blkio_policy_type *pol);
-extern void blkcg_deactivate_policy(struct request_queue *q,
-				    const struct blkio_policy_type *pol);
+int blkio_policy_register(struct blkio_policy_type *);
+void blkio_policy_unregister(struct blkio_policy_type *);
+int blkcg_activate_policy(struct request_queue *q,
+			  const struct blkio_policy_type *pol);
+void blkcg_deactivate_policy(struct request_queue *q,
+			     const struct blkio_policy_type *pol);
 
 void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
 		       u64 (*prfill)(struct seq_file *, void *, int),
@@ -125,9 +135,9 @@ u64 blkg_prfill_stat(struct seq_file *sf, void *pdata, int off);
 u64 blkg_prfill_rwstat(struct seq_file *sf, void *pdata, int off);
 
 struct blkg_conf_ctx {
-	struct gendisk		*disk;
-	struct blkio_group	*blkg;
-	u64			v;
+	struct gendisk			*disk;
+	struct blkio_group		*blkg;
+	u64				v;
 };
 
 int blkg_conf_prep(struct blkio_cgroup *blkcg,
@@ -329,7 +339,9 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 	memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
 }
 
-#else
+#else	/* CONFIG_BLK_CGROUP */
+
+struct cgroup;
 
 struct blkio_group {
 };
@@ -337,6 +349,9 @@ struct blkio_group {
 struct blkio_policy_type {
 };
 
+static inline struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
+static inline struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio) { return NULL; }
+static inline struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg, void *key) { return NULL; }
 static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
 static inline void blkcg_drain_queue(struct request_queue *q) { }
 static inline void blkcg_exit_queue(struct request_queue *q) { }
@@ -355,24 +370,5 @@ static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
 static inline void blkg_get(struct blkio_group *blkg) { }
 static inline void blkg_put(struct blkio_group *blkg) { }
 
-#endif
-
-#ifdef CONFIG_BLK_CGROUP
-extern struct blkio_cgroup blkio_root_cgroup;
-extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
-extern struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio);
-extern struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
-				       struct request_queue *q);
-struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
-				       struct request_queue *q);
-#else
-struct cgroup;
-static inline struct blkio_cgroup *
-cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
-static inline struct blkio_cgroup *
-bio_blkio_cgroup(struct bio *bio) { return NULL; }
-
-static inline struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
-					      void *key) { return NULL; }
-#endif
-#endif /* _BLK_CGROUP_H */
+#endif	/* CONFIG_BLK_CGROUP */
+#endif	/* _BLK_CGROUP_H */
-- 
cgit v1.1


From 3c798398e393e5f9502dbab2b51e6c25e2e8f2ac Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 16 Apr 2012 13:57:25 -0700
Subject: blkcg: mass rename of blkcg API

During the recent blkcg cleanup, most of the blkcg API has changed to
such an extent that mass renaming wouldn't cause any noticeable pain.
Take the chance and clean up the naming.

* Rename blkio_cgroup to blkcg.

* Drop blkio / blkiocg prefixes and consistently use blkcg.

* Rename blkio_group to blkcg_gq, which is consistent with io_cq but
  keep the blkg prefix / variable name.

* Rename policy method type and field names to signify they're dealing
  with policy data.

* Rename blkio_policy_type to blkcg_policy.

This patch doesn't cause any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c     | 202 ++++++++++++++++++++++++-------------------------
 block/blk-cgroup.h     | 109 +++++++++++++-------------
 block/blk-throttle.c   |  72 +++++++++---------
 block/cfq-iosched.c    |  78 +++++++++----------
 include/linux/blkdev.h |   4 +-
 5 files changed, 230 insertions(+), 235 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 6333702..9975703 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -26,39 +26,39 @@
 
 static DEFINE_MUTEX(blkcg_pol_mutex);
 
-struct blkio_cgroup blkio_root_cgroup = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
-EXPORT_SYMBOL_GPL(blkio_root_cgroup);
+struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
+EXPORT_SYMBOL_GPL(blkcg_root);
 
-static struct blkio_policy_type *blkio_policy[BLKCG_MAX_POLS];
+static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
-struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
+struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
 {
 	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
-			    struct blkio_cgroup, css);
+			    struct blkcg, css);
 }
-EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
+EXPORT_SYMBOL_GPL(cgroup_to_blkcg);
 
-static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
+static struct blkcg *task_blkcg(struct task_struct *tsk)
 {
 	return container_of(task_subsys_state(tsk, blkio_subsys_id),
-			    struct blkio_cgroup, css);
+			    struct blkcg, css);
 }
 
-struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
+struct blkcg *bio_blkcg(struct bio *bio)
 {
 	if (bio && bio->bi_css)
-		return container_of(bio->bi_css, struct blkio_cgroup, css);
-	return task_blkio_cgroup(current);
+		return container_of(bio->bi_css, struct blkcg, css);
+	return task_blkcg(current);
 }
-EXPORT_SYMBOL_GPL(bio_blkio_cgroup);
+EXPORT_SYMBOL_GPL(bio_blkcg);
 
 static bool blkcg_policy_enabled(struct request_queue *q,
-				 const struct blkio_policy_type *pol)
+				 const struct blkcg_policy *pol)
 {
 	return pol && test_bit(pol->plid, q->blkcg_pols);
 }
 
-static size_t blkg_pd_size(const struct blkio_policy_type *pol)
+static size_t blkg_pd_size(const struct blkcg_policy *pol)
 {
 	return sizeof(struct blkg_policy_data) + pol->pdata_size;
 }
@@ -69,7 +69,7 @@ static size_t blkg_pd_size(const struct blkio_policy_type *pol)
  *
  * Free @blkg which may be partially allocated.
  */
-static void blkg_free(struct blkio_group *blkg)
+static void blkg_free(struct blkcg_gq *blkg)
 {
 	int i;
 
@@ -77,14 +77,14 @@ static void blkg_free(struct blkio_group *blkg)
 		return;
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
-		struct blkio_policy_type *pol = blkio_policy[i];
+		struct blkcg_policy *pol = blkcg_policy[i];
 		struct blkg_policy_data *pd = blkg->pd[i];
 
 		if (!pd)
 			continue;
 
-		if (pol && pol->ops.blkio_exit_group_fn)
-			pol->ops.blkio_exit_group_fn(blkg);
+		if (pol && pol->ops.pd_exit_fn)
+			pol->ops.pd_exit_fn(blkg);
 
 		kfree(pd);
 	}
@@ -99,10 +99,9 @@ static void blkg_free(struct blkio_group *blkg)
  *
  * Allocate a new blkg assocating @blkcg and @q.
  */
-static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
-				      struct request_queue *q)
+static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
 {
-	struct blkio_group *blkg;
+	struct blkcg_gq *blkg;
 	int i;
 
 	/* alloc and init base part */
@@ -116,7 +115,7 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 	blkg->refcnt = 1;
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
-		struct blkio_policy_type *pol = blkio_policy[i];
+		struct blkcg_policy *pol = blkcg_policy[i];
 		struct blkg_policy_data *pd;
 
 		if (!blkcg_policy_enabled(q, pol))
@@ -135,19 +134,19 @@ static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 
 	/* invoke per-policy init */
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
-		struct blkio_policy_type *pol = blkio_policy[i];
+		struct blkcg_policy *pol = blkcg_policy[i];
 
 		if (blkcg_policy_enabled(blkg->q, pol))
-			pol->ops.blkio_init_group_fn(blkg);
+			pol->ops.pd_init_fn(blkg);
 	}
 
 	return blkg;
 }
 
-static struct blkio_group *__blkg_lookup(struct blkio_cgroup *blkcg,
-					 struct request_queue *q)
+static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+				      struct request_queue *q)
 {
-	struct blkio_group *blkg;
+	struct blkcg_gq *blkg;
 	struct hlist_node *n;
 
 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
@@ -165,8 +164,7 @@ static struct blkio_group *__blkg_lookup(struct blkio_cgroup *blkcg,
  * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
  * - see blk_queue_bypass_start() for details.
  */
-struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
-				struct request_queue *q)
+struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
@@ -176,11 +174,11 @@ struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 }
 EXPORT_SYMBOL_GPL(blkg_lookup);
 
-static struct blkio_group *__blkg_lookup_create(struct blkio_cgroup *blkcg,
-						struct request_queue *q)
+static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
+					     struct request_queue *q)
 	__releases(q->queue_lock) __acquires(q->queue_lock)
 {
-	struct blkio_group *blkg;
+	struct blkcg_gq *blkg;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(q->queue_lock);
@@ -213,8 +211,8 @@ out:
 	return blkg;
 }
 
-struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
-				       struct request_queue *q)
+struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+				    struct request_queue *q)
 {
 	/*
 	 * This could be the first entry point of blkcg implementation and
@@ -226,10 +224,10 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 }
 EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
-static void blkg_destroy(struct blkio_group *blkg)
+static void blkg_destroy(struct blkcg_gq *blkg)
 {
 	struct request_queue *q = blkg->q;
-	struct blkio_cgroup *blkcg = blkg->blkcg;
+	struct blkcg *blkcg = blkg->blkcg;
 
 	lockdep_assert_held(q->queue_lock);
 	lockdep_assert_held(&blkcg->lock);
@@ -255,12 +253,12 @@ static void blkg_destroy(struct blkio_group *blkg)
  */
 static void blkg_destroy_all(struct request_queue *q)
 {
-	struct blkio_group *blkg, *n;
+	struct blkcg_gq *blkg, *n;
 
 	lockdep_assert_held(q->queue_lock);
 
 	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
-		struct blkio_cgroup *blkcg = blkg->blkcg;
+		struct blkcg *blkcg = blkg->blkcg;
 
 		spin_lock(&blkcg->lock);
 		blkg_destroy(blkg);
@@ -270,10 +268,10 @@ static void blkg_destroy_all(struct request_queue *q)
 
 static void blkg_rcu_free(struct rcu_head *rcu_head)
 {
-	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
+	blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
 }
 
-void __blkg_release(struct blkio_group *blkg)
+void __blkg_release(struct blkcg_gq *blkg)
 {
 	/* release the extra blkcg reference this blkg has been holding */
 	css_put(&blkg->blkcg->css);
@@ -291,11 +289,11 @@ void __blkg_release(struct blkio_group *blkg)
 }
 EXPORT_SYMBOL_GPL(__blkg_release);
 
-static int
-blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
+static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
+			     u64 val)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
-	struct blkio_group *blkg;
+	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
+	struct blkcg_gq *blkg;
 	struct hlist_node *n;
 	int i;
 
@@ -309,11 +307,11 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	 */
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
 		for (i = 0; i < BLKCG_MAX_POLS; i++) {
-			struct blkio_policy_type *pol = blkio_policy[i];
+			struct blkcg_policy *pol = blkcg_policy[i];
 
 			if (blkcg_policy_enabled(blkg->q, pol) &&
-			    pol->ops.blkio_reset_group_stats_fn)
-				pol->ops.blkio_reset_group_stats_fn(blkg);
+			    pol->ops.pd_reset_stats_fn)
+				pol->ops.pd_reset_stats_fn(blkg);
 		}
 	}
 
@@ -322,7 +320,7 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	return 0;
 }
 
-static const char *blkg_dev_name(struct blkio_group *blkg)
+static const char *blkg_dev_name(struct blkcg_gq *blkg)
 {
 	/* some drivers (floppy) instantiate a queue w/o disk registered */
 	if (blkg->q->backing_dev_info.dev)
@@ -347,12 +345,12 @@ static const char *blkg_dev_name(struct blkio_group *blkg)
  * This is to be used to construct print functions for
  * cftype->read_seq_string method.
  */
-void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
+void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 		       u64 (*prfill)(struct seq_file *, void *, int),
-		       const struct blkio_policy_type *pol, int data,
+		       const struct blkcg_policy *pol, int data,
 		       bool show_total)
 {
-	struct blkio_group *blkg;
+	struct blkcg_gq *blkg;
 	struct hlist_node *n;
 	u64 total = 0;
 
@@ -462,13 +460,12 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
  * value.  This function returns with RCU read lock and queue lock held and
  * must be paired with blkg_conf_finish().
  */
-int blkg_conf_prep(struct blkio_cgroup *blkcg,
-		   const struct blkio_policy_type *pol, const char *input,
-		   struct blkg_conf_ctx *ctx)
+int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
+		   const char *input, struct blkg_conf_ctx *ctx)
 	__acquires(rcu) __acquires(disk->queue->queue_lock)
 {
 	struct gendisk *disk;
-	struct blkio_group *blkg;
+	struct blkcg_gq *blkg;
 	unsigned int major, minor;
 	unsigned long long v;
 	int part, ret;
@@ -529,16 +526,16 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
-struct cftype blkio_files[] = {
+struct cftype blkcg_files[] = {
 	{
 		.name = "reset_stats",
-		.write_u64 = blkiocg_reset_stats,
+		.write_u64 = blkcg_reset_stats,
 	},
 	{ }	/* terminate */
 };
 
 /**
- * blkiocg_pre_destroy - cgroup pre_destroy callback
+ * blkcg_pre_destroy - cgroup pre_destroy callback
  * @cgroup: cgroup of interest
  *
  * This function is called when @cgroup is about to go away and responsible
@@ -548,15 +545,15 @@ struct cftype blkio_files[] = {
  *
  * This is the blkcg counterpart of ioc_release_fn().
  */
-static int blkiocg_pre_destroy(struct cgroup *cgroup)
+static int blkcg_pre_destroy(struct cgroup *cgroup)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
 
 	spin_lock_irq(&blkcg->lock);
 
 	while (!hlist_empty(&blkcg->blkg_list)) {
-		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
-						struct blkio_group, blkcg_node);
+		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
+						struct blkcg_gq, blkcg_node);
 		struct request_queue *q = blkg->q;
 
 		if (spin_trylock(q->queue_lock)) {
@@ -573,22 +570,22 @@ static int blkiocg_pre_destroy(struct cgroup *cgroup)
 	return 0;
 }
 
-static void blkiocg_destroy(struct cgroup *cgroup)
+static void blkcg_destroy(struct cgroup *cgroup)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
 
-	if (blkcg != &blkio_root_cgroup)
+	if (blkcg != &blkcg_root)
 		kfree(blkcg);
 }
 
-static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
+static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup)
 {
 	static atomic64_t id_seq = ATOMIC64_INIT(0);
-	struct blkio_cgroup *blkcg;
+	struct blkcg *blkcg;
 	struct cgroup *parent = cgroup->parent;
 
 	if (!parent) {
-		blkcg = &blkio_root_cgroup;
+		blkcg = &blkcg_root;
 		goto done;
 	}
 
@@ -656,7 +653,7 @@ void blkcg_exit_queue(struct request_queue *q)
  * of the main cic data structures.  For now we allow a task to change
  * its cgroup only if it's the only owner of its ioc.
  */
-static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 	struct io_context *ioc;
@@ -677,12 +674,12 @@ static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 
 struct cgroup_subsys blkio_subsys = {
 	.name = "blkio",
-	.create = blkiocg_create,
-	.can_attach = blkiocg_can_attach,
-	.pre_destroy = blkiocg_pre_destroy,
-	.destroy = blkiocg_destroy,
+	.create = blkcg_create,
+	.can_attach = blkcg_can_attach,
+	.pre_destroy = blkcg_pre_destroy,
+	.destroy = blkcg_destroy,
 	.subsys_id = blkio_subsys_id,
-	.base_cftypes = blkio_files,
+	.base_cftypes = blkcg_files,
 	.module = THIS_MODULE,
 };
 EXPORT_SYMBOL_GPL(blkio_subsys);
@@ -704,10 +701,10 @@ EXPORT_SYMBOL_GPL(blkio_subsys);
  * [un]registerations.  Returns 0 on success, -errno on failure.
  */
 int blkcg_activate_policy(struct request_queue *q,
-			  const struct blkio_policy_type *pol)
+			  const struct blkcg_policy *pol)
 {
 	LIST_HEAD(pds);
-	struct blkio_group *blkg;
+	struct blkcg_gq *blkg;
 	struct blkg_policy_data *pd, *n;
 	int cnt = 0, ret;
 
@@ -720,7 +717,7 @@ int blkcg_activate_policy(struct request_queue *q,
 	spin_lock_irq(q->queue_lock);
 
 	rcu_read_lock();
-	blkg = __blkg_lookup_create(&blkio_root_cgroup, q);
+	blkg = __blkg_lookup_create(&blkcg_root, q);
 	rcu_read_unlock();
 
 	if (IS_ERR(blkg)) {
@@ -764,7 +761,7 @@ int blkcg_activate_policy(struct request_queue *q,
 
 		blkg->pd[pol->plid] = pd;
 		pd->blkg = blkg;
-		pol->ops.blkio_init_group_fn(blkg);
+		pol->ops.pd_init_fn(blkg);
 
 		spin_unlock(&blkg->blkcg->lock);
 	}
@@ -790,9 +787,9 @@ EXPORT_SYMBOL_GPL(blkcg_activate_policy);
  * blkcg_activate_policy().
  */
 void blkcg_deactivate_policy(struct request_queue *q,
-			     const struct blkio_policy_type *pol)
+			     const struct blkcg_policy *pol)
 {
-	struct blkio_group *blkg;
+	struct blkcg_gq *blkg;
 
 	if (!blkcg_policy_enabled(q, pol))
 		return;
@@ -810,8 +807,8 @@ void blkcg_deactivate_policy(struct request_queue *q,
 		/* grab blkcg lock too while removing @pd from @blkg */
 		spin_lock(&blkg->blkcg->lock);
 
-		if (pol->ops.blkio_exit_group_fn)
-			pol->ops.blkio_exit_group_fn(blkg);
+		if (pol->ops.pd_exit_fn)
+			pol->ops.pd_exit_fn(blkg);
 
 		kfree(blkg->pd[pol->plid]);
 		blkg->pd[pol->plid] = NULL;
@@ -825,14 +822,13 @@ void blkcg_deactivate_policy(struct request_queue *q,
 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
 
 /**
- * blkio_policy_register - register a blkcg policy
- * @blkiop: blkcg policy to register
+ * blkcg_policy_register - register a blkcg policy
+ * @pol: blkcg policy to register
  *
- * Register @blkiop with blkcg core.  Might sleep and @blkiop may be
- * modified on successful registration.  Returns 0 on success and -errno on
- * failure.
+ * Register @pol with blkcg core.  Might sleep and @pol may be modified on
+ * successful registration.  Returns 0 on success and -errno on failure.
  */
-int blkio_policy_register(struct blkio_policy_type *blkiop)
+int blkcg_policy_register(struct blkcg_policy *pol)
 {
 	int i, ret;
 
@@ -841,45 +837,45 @@ int blkio_policy_register(struct blkio_policy_type *blkiop)
 	/* find an empty slot */
 	ret = -ENOSPC;
 	for (i = 0; i < BLKCG_MAX_POLS; i++)
-		if (!blkio_policy[i])
+		if (!blkcg_policy[i])
 			break;
 	if (i >= BLKCG_MAX_POLS)
 		goto out_unlock;
 
 	/* register and update blkgs */
-	blkiop->plid = i;
-	blkio_policy[i] = blkiop;
+	pol->plid = i;
+	blkcg_policy[i] = pol;
 
 	/* everything is in place, add intf files for the new policy */
-	if (blkiop->cftypes)
-		WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
+	if (pol->cftypes)
+		WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
 	ret = 0;
 out_unlock:
 	mutex_unlock(&blkcg_pol_mutex);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(blkio_policy_register);
+EXPORT_SYMBOL_GPL(blkcg_policy_register);
 
 /**
- * blkiop_policy_unregister - unregister a blkcg policy
- * @blkiop: blkcg policy to unregister
+ * blkcg_policy_unregister - unregister a blkcg policy
+ * @pol: blkcg policy to unregister
  *
- * Undo blkio_policy_register(@blkiop).  Might sleep.
+ * Undo blkcg_policy_register(@pol).  Might sleep.
  */
-void blkio_policy_unregister(struct blkio_policy_type *blkiop)
+void blkcg_policy_unregister(struct blkcg_policy *pol)
 {
 	mutex_lock(&blkcg_pol_mutex);
 
-	if (WARN_ON(blkio_policy[blkiop->plid] != blkiop))
+	if (WARN_ON(blkcg_policy[pol->plid] != pol))
 		goto out_unlock;
 
 	/* kill the intf files first */
-	if (blkiop->cftypes)
-		cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);
+	if (pol->cftypes)
+		cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);
 
 	/* unregister and update blkgs */
-	blkio_policy[blkiop->plid] = NULL;
+	blkcg_policy[pol->plid] = NULL;
 out_unlock:
 	mutex_unlock(&blkcg_pol_mutex);
 }
-EXPORT_SYMBOL_GPL(blkio_policy_unregister);
+EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index b347aa0..a443b84 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -37,7 +37,7 @@ enum blkg_rwstat_type {
 	BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
 };
 
-struct blkio_cgroup {
+struct blkcg {
 	struct cgroup_subsys_state	css;
 	spinlock_t			lock;
 	struct hlist_head		blkg_list;
@@ -45,7 +45,7 @@ struct blkio_cgroup {
 	/* for policies to test whether associated blkcg has changed */
 	uint64_t			id;
 
-	/* TODO: per-policy storage in blkio_cgroup */
+	/* TODO: per-policy storage in blkcg */
 	unsigned int			cfq_weight;	/* belongs to cfq */
 };
 
@@ -62,7 +62,7 @@ struct blkg_rwstat {
 /* per-blkg per-policy data */
 struct blkg_policy_data {
 	/* the blkg this per-policy data belongs to */
-	struct blkio_group		*blkg;
+	struct blkcg_gq			*blkg;
 
 	/* used during policy activation */
 	struct list_head		alloc_node;
@@ -71,12 +71,13 @@ struct blkg_policy_data {
 	char				pdata[] __aligned(__alignof__(unsigned long long));
 };
 
-struct blkio_group {
+/* association between a blk cgroup and a request queue */
+struct blkcg_gq {
 	/* Pointer to the associated request_queue */
 	struct request_queue		*q;
 	struct list_head		q_node;
 	struct hlist_node		blkcg_node;
-	struct blkio_cgroup		*blkcg;
+	struct blkcg			*blkcg;
 	/* reference count */
 	int				refcnt;
 
@@ -85,18 +86,18 @@ struct blkio_group {
 	struct rcu_head			rcu_head;
 };
 
-typedef void (blkio_init_group_fn)(struct blkio_group *blkg);
-typedef void (blkio_exit_group_fn)(struct blkio_group *blkg);
-typedef void (blkio_reset_group_stats_fn)(struct blkio_group *blkg);
+typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
 
-struct blkio_policy_ops {
-	blkio_init_group_fn		*blkio_init_group_fn;
-	blkio_exit_group_fn		*blkio_exit_group_fn;
-	blkio_reset_group_stats_fn	*blkio_reset_group_stats_fn;
+struct blkcg_policy_ops {
+	blkcg_pol_init_pd_fn		*pd_init_fn;
+	blkcg_pol_exit_pd_fn		*pd_exit_fn;
+	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
 };
 
-struct blkio_policy_type {
-	struct blkio_policy_ops		ops;
+struct blkcg_policy {
+	struct blkcg_policy_ops		ops;
 	int				plid;
 	/* policy specific private data size */
 	size_t				pdata_size;
@@ -104,29 +105,28 @@ struct blkio_policy_type {
 	struct cftype			*cftypes;
 };
 
-extern struct blkio_cgroup blkio_root_cgroup;
+extern struct blkcg blkcg_root;
 
-struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
-struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio);
-struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
-				struct request_queue *q);
-struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
-				       struct request_queue *q);
+struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup);
+struct blkcg *bio_blkcg(struct bio *bio);
+struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
+struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+				    struct request_queue *q);
 int blkcg_init_queue(struct request_queue *q);
 void blkcg_drain_queue(struct request_queue *q);
 void blkcg_exit_queue(struct request_queue *q);
 
 /* Blkio controller policy registration */
-int blkio_policy_register(struct blkio_policy_type *);
-void blkio_policy_unregister(struct blkio_policy_type *);
+int blkcg_policy_register(struct blkcg_policy *pol);
+void blkcg_policy_unregister(struct blkcg_policy *pol);
 int blkcg_activate_policy(struct request_queue *q,
-			  const struct blkio_policy_type *pol);
+			  const struct blkcg_policy *pol);
 void blkcg_deactivate_policy(struct request_queue *q,
-			     const struct blkio_policy_type *pol);
+			     const struct blkcg_policy *pol);
 
-void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
+void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 		       u64 (*prfill)(struct seq_file *, void *, int),
-		       const struct blkio_policy_type *pol, int data,
+		       const struct blkcg_policy *pol, int data,
 		       bool show_total);
 u64 __blkg_prfill_u64(struct seq_file *sf, void *pdata, u64 v);
 u64 __blkg_prfill_rwstat(struct seq_file *sf, void *pdata,
@@ -136,13 +136,12 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, void *pdata, int off);
 
 struct blkg_conf_ctx {
 	struct gendisk			*disk;
-	struct blkio_group		*blkg;
+	struct blkcg_gq			*blkg;
 	u64				v;
 };
 
-int blkg_conf_prep(struct blkio_cgroup *blkcg,
-		   const struct blkio_policy_type *pol, const char *input,
-		   struct blkg_conf_ctx *ctx);
+int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
+		   const char *input, struct blkg_conf_ctx *ctx);
 void blkg_conf_finish(struct blkg_conf_ctx *ctx);
 
 
@@ -153,8 +152,8 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx);
  *
  * Return pointer to private data associated with the @blkg-@pol pair.
  */
-static inline void *blkg_to_pdata(struct blkio_group *blkg,
-			      struct blkio_policy_type *pol)
+static inline void *blkg_to_pdata(struct blkcg_gq *blkg,
+				  struct blkcg_policy *pol)
 {
 	return blkg ? blkg->pd[pol->plid]->pdata : NULL;
 }
@@ -165,7 +164,7 @@ static inline void *blkg_to_pdata(struct blkio_group *blkg,
  *
  * @pdata is policy private data.  Determine the blkg it's associated with.
  */
-static inline struct blkio_group *pdata_to_blkg(void *pdata)
+static inline struct blkcg_gq *pdata_to_blkg(void *pdata)
 {
 	if (pdata) {
 		struct blkg_policy_data *pd =
@@ -183,7 +182,7 @@ static inline struct blkio_group *pdata_to_blkg(void *pdata)
  *
  * Format the path of the cgroup of @blkg into @buf.
  */
-static inline int blkg_path(struct blkio_group *blkg, char *buf, int buflen)
+static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
 {
 	int ret;
 
@@ -201,14 +200,14 @@ static inline int blkg_path(struct blkio_group *blkg, char *buf, int buflen)
  *
  * The caller should be holding queue_lock and an existing reference.
  */
-static inline void blkg_get(struct blkio_group *blkg)
+static inline void blkg_get(struct blkcg_gq *blkg)
 {
 	lockdep_assert_held(blkg->q->queue_lock);
 	WARN_ON_ONCE(!blkg->refcnt);
 	blkg->refcnt++;
 }
 
-void __blkg_release(struct blkio_group *blkg);
+void __blkg_release(struct blkcg_gq *blkg);
 
 /**
  * blkg_put - put a blkg reference
@@ -216,7 +215,7 @@ void __blkg_release(struct blkio_group *blkg);
  *
  * The caller should be holding queue_lock.
  */
-static inline void blkg_put(struct blkio_group *blkg)
+static inline void blkg_put(struct blkcg_gq *blkg)
 {
 	lockdep_assert_held(blkg->q->queue_lock);
 	WARN_ON_ONCE(blkg->refcnt <= 0);
@@ -343,32 +342,32 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 
 struct cgroup;
 
-struct blkio_group {
+struct blkcg_gq {
 };
 
-struct blkio_policy_type {
+struct blkcg_policy {
 };
 
-static inline struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
-static inline struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio) { return NULL; }
-static inline struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg, void *key) { return NULL; }
+static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
+static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
 static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
 static inline void blkcg_drain_queue(struct request_queue *q) { }
 static inline void blkcg_exit_queue(struct request_queue *q) { }
-static inline int blkio_policy_register(struct blkio_policy_type *blkiop) { return 0; }
-static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
+static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
 static inline int blkcg_activate_policy(struct request_queue *q,
-					const struct blkio_policy_type *pol) { return 0; }
+					const struct blkcg_policy *pol) { return 0; }
 static inline void blkcg_deactivate_policy(struct request_queue *q,
-					   const struct blkio_policy_type *pol) { }
-
-static inline void *blkg_to_pdata(struct blkio_group *blkg,
-				struct blkio_policy_type *pol) { return NULL; }
-static inline struct blkio_group *pdata_to_blkg(void *pdata,
-				struct blkio_policy_type *pol) { return NULL; }
-static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
-static inline void blkg_get(struct blkio_group *blkg) { }
-static inline void blkg_put(struct blkio_group *blkg) { }
+					   const struct blkcg_policy *pol) { }
+
+static inline void *blkg_to_pdata(struct blkcg_gq *blkg,
+				  struct blkcg_policy *pol) { return NULL; }
+static inline struct blkcg_gq *pdata_to_blkg(void *pdata,
+				  struct blkcg_policy *pol) { return NULL; }
+static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
+static inline void blkg_get(struct blkcg_gq *blkg) { }
+static inline void blkg_put(struct blkcg_gq *blkg) { }
 
 #endif	/* CONFIG_BLK_CGROUP */
 #endif	/* _BLK_CGROUP_H */
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e9b7a47..00c7eff 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -21,7 +21,7 @@ static int throtl_quantum = 32;
 /* Throttling is performed over 100ms slice and after that slice is renewed */
 static unsigned long throtl_slice = HZ/10;	/* 100 ms */
 
-static struct blkio_policy_type blkio_policy_throtl;
+static struct blkcg_policy blkcg_policy_throtl;
 
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
@@ -120,12 +120,12 @@ static LIST_HEAD(tg_stats_alloc_list);
 static void tg_stats_alloc_fn(struct work_struct *);
 static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
 
-static inline struct throtl_grp *blkg_to_tg(struct blkio_group *blkg)
+static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
 {
-	return blkg_to_pdata(blkg, &blkio_policy_throtl);
+	return blkg_to_pdata(blkg, &blkcg_policy_throtl);
 }
 
-static inline struct blkio_group *tg_to_blkg(struct throtl_grp *tg)
+static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
 {
 	return pdata_to_blkg(tg);
 }
@@ -208,7 +208,7 @@ alloc_stats:
 		goto alloc_stats;
 }
 
-static void throtl_init_blkio_group(struct blkio_group *blkg)
+static void throtl_pd_init(struct blkcg_gq *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 
@@ -233,7 +233,7 @@ static void throtl_init_blkio_group(struct blkio_group *blkg)
 	spin_unlock(&tg_stats_alloc_lock);
 }
 
-static void throtl_exit_blkio_group(struct blkio_group *blkg)
+static void throtl_pd_exit(struct blkcg_gq *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 
@@ -244,7 +244,7 @@ static void throtl_exit_blkio_group(struct blkio_group *blkg)
 	free_percpu(tg->stats_cpu);
 }
 
-static void throtl_reset_group_stats(struct blkio_group *blkg)
+static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 	int cpu;
@@ -260,33 +260,33 @@ static void throtl_reset_group_stats(struct blkio_group *blkg)
 	}
 }
 
-static struct
-throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
+static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
+					   struct blkcg *blkcg)
 {
 	/*
-	 * This is the common case when there are no blkio cgroups.
-	 * Avoid lookup in this case
+	 * This is the common case when there are no blkcgs.  Avoid lookup
+	 * in this case
 	 */
-	if (blkcg == &blkio_root_cgroup)
+	if (blkcg == &blkcg_root)
 		return td_root_tg(td);
 
 	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
 }
 
 static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
-						  struct blkio_cgroup *blkcg)
+						  struct blkcg *blkcg)
 {
 	struct request_queue *q = td->queue;
 	struct throtl_grp *tg = NULL;
 
 	/*
-	 * This is the common case when there are no blkio cgroups.
-	 * Avoid lookup in this case
+	 * This is the common case when there are no blkcgs.  Avoid lookup
+	 * in this case
 	 */
-	if (blkcg == &blkio_root_cgroup) {
+	if (blkcg == &blkcg_root) {
 		tg = td_root_tg(td);
 	} else {
-		struct blkio_group *blkg;
+		struct blkcg_gq *blkg;
 
 		blkg = blkg_lookup_create(blkcg, q);
 
@@ -665,7 +665,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 	return 0;
 }
 
-static void throtl_update_dispatch_stats(struct blkio_group *blkg, u64 bytes,
+static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
 					 int rw)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -822,7 +822,7 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
 static void throtl_process_limit_change(struct throtl_data *td)
 {
 	struct request_queue *q = td->queue;
-	struct blkio_group *blkg, *n;
+	struct blkcg_gq *blkg, *n;
 
 	if (!td->limits_changed)
 		return;
@@ -951,9 +951,9 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, void *pdata, int off)
 static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
 			       struct seq_file *sf)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
 
-	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkio_policy_throtl,
+	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
 			  cft->private, true);
 	return 0;
 }
@@ -979,29 +979,29 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, void *pdata, int off)
 static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
 			     struct seq_file *sf)
 {
-	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp), tg_prfill_conf_u64,
-			  &blkio_policy_throtl, cft->private, false);
+	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64,
+			  &blkcg_policy_throtl, cft->private, false);
 	return 0;
 }
 
 static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
 			      struct seq_file *sf)
 {
-	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp), tg_prfill_conf_uint,
-			  &blkio_policy_throtl, cft->private, false);
+	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint,
+			  &blkcg_policy_throtl, cft->private, false);
 	return 0;
 }
 
 static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 		       bool is_u64)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
 	struct blkg_conf_ctx ctx;
 	struct throtl_grp *tg;
 	struct throtl_data *td;
 	int ret;
 
-	ret = blkg_conf_prep(blkcg, &blkio_policy_throtl, buf, &ctx);
+	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
 	if (ret)
 		return ret;
 
@@ -1086,11 +1086,11 @@ static void throtl_shutdown_wq(struct request_queue *q)
 	cancel_delayed_work_sync(&td->throtl_work);
 }
 
-static struct blkio_policy_type blkio_policy_throtl = {
+static struct blkcg_policy blkcg_policy_throtl = {
 	.ops = {
-		.blkio_init_group_fn = throtl_init_blkio_group,
-		.blkio_exit_group_fn = throtl_exit_blkio_group,
-		.blkio_reset_group_stats_fn = throtl_reset_group_stats,
+		.pd_init_fn		= throtl_pd_init,
+		.pd_exit_fn		= throtl_pd_exit,
+		.pd_reset_stats_fn	= throtl_pd_reset_stats,
 	},
 	.pdata_size = sizeof(struct throtl_grp),
 	.cftypes = throtl_files,
@@ -1101,7 +1101,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	struct throtl_data *td = q->td;
 	struct throtl_grp *tg;
 	bool rw = bio_data_dir(bio), update_disptime = true;
-	struct blkio_cgroup *blkcg;
+	struct blkcg *blkcg;
 	bool throttled = false;
 
 	if (bio->bi_rw & REQ_THROTTLED) {
@@ -1118,7 +1118,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	 * just update the dispatch stats in lockless manner and return.
 	 */
 	rcu_read_lock();
-	blkcg = bio_blkio_cgroup(bio);
+	blkcg = bio_blkcg(bio);
 	tg = throtl_lookup_tg(td, blkcg);
 	if (tg) {
 		if (tg_no_rule_group(tg, rw)) {
@@ -1243,7 +1243,7 @@ int blk_throtl_init(struct request_queue *q)
 	td->queue = q;
 
 	/* activate policy */
-	ret = blkcg_activate_policy(q, &blkio_policy_throtl);
+	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
 	if (ret)
 		kfree(td);
 	return ret;
@@ -1253,7 +1253,7 @@ void blk_throtl_exit(struct request_queue *q)
 {
 	BUG_ON(!q->td);
 	throtl_shutdown_wq(q);
-	blkcg_deactivate_policy(q, &blkio_policy_throtl);
+	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
 	kfree(q->td);
 }
 
@@ -1263,7 +1263,7 @@ static int __init throtl_init(void)
 	if (!kthrotld_workqueue)
 		panic("Failed to create kthrotld\n");
 
-	return blkio_policy_register(&blkio_policy_throtl);
+	return blkcg_policy_register(&blkcg_policy_throtl);
 }
 
 module_init(throtl_init);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 901286b..7922182 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -17,7 +17,7 @@
 #include "blk.h"
 #include "blk-cgroup.h"
 
-static struct blkio_policy_type blkio_policy_cfq __maybe_unused;
+static struct blkcg_policy blkcg_policy_cfq __maybe_unused;
 
 /*
  * tunables
@@ -202,7 +202,7 @@ struct cfqg_stats {
 	struct blkg_stat		dequeue;
 	/* total time spent waiting for it to be assigned a timeslice. */
 	struct blkg_stat		group_wait_time;
-	/* time spent idling for this blkio_group */
+	/* time spent idling for this blkcg_gq */
 	struct blkg_stat		idle_time;
 	/* total time with empty current active q with other requests queued */
 	struct blkg_stat		empty_time;
@@ -553,12 +553,12 @@ static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 
-static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg)
+static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
 {
-	return blkg_to_pdata(blkg, &blkio_policy_cfq);
+	return blkg_to_pdata(blkg, &blkcg_policy_cfq);
 }
 
-static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg)
+static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
 {
 	return pdata_to_blkg(cfqg);
 }
@@ -637,7 +637,7 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
 				io_start_time - start_time);
 }
 
-static void cfqg_stats_reset(struct blkio_group *blkg)
+static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
 {
 	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 	struct cfqg_stats *stats = &cfqg->stats;
@@ -662,8 +662,8 @@ static void cfqg_stats_reset(struct blkio_group *blkg)
 
 #else	/* CONFIG_CFQ_GROUP_IOSCHED */
 
-static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg) { return NULL; }
-static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg) { return NULL; }
+static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) { return NULL; }
+static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) { return NULL; }
 static inline void cfqg_get(struct cfq_group *cfqg) { }
 static inline void cfqg_put(struct cfq_group *cfqg) { }
 
@@ -1331,7 +1331,7 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void cfq_init_blkio_group(struct blkio_group *blkg)
+static void cfq_pd_init(struct blkcg_gq *blkg)
 {
 	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 
@@ -1344,16 +1344,16 @@ static void cfq_init_blkio_group(struct blkio_group *blkg)
  * be held.
  */
 static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
-						struct blkio_cgroup *blkcg)
+						struct blkcg *blkcg)
 {
 	struct request_queue *q = cfqd->queue;
 	struct cfq_group *cfqg = NULL;
 
-	/* avoid lookup for the common case where there's no blkio cgroup */
-	if (blkcg == &blkio_root_cgroup) {
+	/* avoid lookup for the common case where there's no blkcg */
+	if (blkcg == &blkcg_root) {
 		cfqg = cfqd->root_group;
 	} else {
-		struct blkio_group *blkg;
+		struct blkcg_gq *blkg;
 
 		blkg = blkg_lookup_create(blkcg, q);
 		if (!IS_ERR(blkg))
@@ -1386,8 +1386,8 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf, void *pdata, int off)
 static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
 				    struct seq_file *sf)
 {
-	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
-			  cfqg_prfill_weight_device, &blkio_policy_cfq, 0,
+	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
+			  cfqg_prfill_weight_device, &blkcg_policy_cfq, 0,
 			  false);
 	return 0;
 }
@@ -1395,19 +1395,19 @@ static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
 static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
 			    struct seq_file *sf)
 {
-	seq_printf(sf, "%u\n", cgroup_to_blkio_cgroup(cgrp)->cfq_weight);
+	seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight);
 	return 0;
 }
 
 static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
 				  const char *buf)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
 	struct blkg_conf_ctx ctx;
 	struct cfq_group *cfqg;
 	int ret;
 
-	ret = blkg_conf_prep(blkcg, &blkio_policy_cfq, buf, &ctx);
+	ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
 	if (ret)
 		return ret;
 
@@ -1425,8 +1425,8 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
 
 static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
-	struct blkio_group *blkg;
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg_gq *blkg;
 	struct hlist_node *n;
 
 	if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
@@ -1449,9 +1449,9 @@ static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
 static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
 			   struct seq_file *sf)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
 
-	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkio_policy_cfq,
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
 			  cft->private, false);
 	return 0;
 }
@@ -1459,9 +1459,9 @@ static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
 static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
 			     struct seq_file *sf)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
 
-	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkio_policy_cfq,
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
 			  cft->private, true);
 	return 0;
 }
@@ -1485,10 +1485,10 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, void *pdata, int off)
 static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
 				     struct seq_file *sf)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
 
 	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
-			  &blkio_policy_cfq, 0, false);
+			  &blkcg_policy_cfq, 0, false);
 	return 0;
 }
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
@@ -1580,7 +1580,7 @@ static struct cftype cfq_blkcg_files[] = {
 };
 #else /* GROUP_IOSCHED */
 static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
-						struct blkio_cgroup *blkcg)
+						struct blkcg *blkcg)
 {
 	return cfqd->root_group;
 }
@@ -3135,7 +3135,7 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 	uint64_t id;
 
 	rcu_read_lock();
-	id = bio_blkio_cgroup(bio)->id;
+	id = bio_blkcg(bio)->id;
 	rcu_read_unlock();
 
 	/*
@@ -3166,14 +3166,14 @@ static struct cfq_queue *
 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
 		     struct bio *bio, gfp_t gfp_mask)
 {
-	struct blkio_cgroup *blkcg;
+	struct blkcg *blkcg;
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
 	struct cfq_group *cfqg;
 
 retry:
 	rcu_read_lock();
 
-	blkcg = bio_blkio_cgroup(bio);
+	blkcg = bio_blkcg(bio);
 	cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
 	cfqq = cic_to_cfqq(cic, is_sync);
 
@@ -3944,14 +3944,14 @@ static void cfq_exit_queue(struct elevator_queue *e)
 #ifndef CONFIG_CFQ_GROUP_IOSCHED
 	kfree(cfqd->root_group);
 #endif
-	blkcg_deactivate_policy(q, &blkio_policy_cfq);
+	blkcg_deactivate_policy(q, &blkcg_policy_cfq);
 	kfree(cfqd);
 }
 
 static int cfq_init_queue(struct request_queue *q)
 {
 	struct cfq_data *cfqd;
-	struct blkio_group *blkg __maybe_unused;
+	struct blkcg_gq *blkg __maybe_unused;
 	int i, ret;
 
 	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
@@ -3966,7 +3966,7 @@ static int cfq_init_queue(struct request_queue *q)
 
 	/* Init root group and prefer root group over other groups by default */
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-	ret = blkcg_activate_policy(q, &blkio_policy_cfq);
+	ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
 	if (ret)
 		goto out_free;
 
@@ -4156,10 +4156,10 @@ static struct elevator_type iosched_cfq = {
 };
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static struct blkio_policy_type blkio_policy_cfq = {
+static struct blkcg_policy blkcg_policy_cfq = {
 	.ops = {
-		.blkio_init_group_fn =		cfq_init_blkio_group,
-		.blkio_reset_group_stats_fn =	cfqg_stats_reset,
+		.pd_init_fn		= cfq_pd_init,
+		.pd_reset_stats_fn	= cfq_pd_reset_stats,
 	},
 	.pdata_size = sizeof(struct cfq_group),
 	.cftypes = cfq_blkcg_files,
@@ -4185,7 +4185,7 @@ static int __init cfq_init(void)
 		cfq_group_idle = 0;
 #endif
 
-	ret = blkio_policy_register(&blkio_policy_cfq);
+	ret = blkcg_policy_register(&blkcg_policy_cfq);
 	if (ret)
 		return ret;
 
@@ -4202,13 +4202,13 @@ static int __init cfq_init(void)
 err_free_pool:
 	kmem_cache_destroy(cfq_pool);
 err_pol_unreg:
-	blkio_policy_unregister(&blkio_policy_cfq);
+	blkcg_policy_unregister(&blkcg_policy_cfq);
 	return ret;
 }
 
 static void __exit cfq_exit(void)
 {
-	blkio_policy_unregister(&blkio_policy_cfq);
+	blkcg_policy_unregister(&blkcg_policy_cfq);
 	elv_unregister(&iosched_cfq);
 	kmem_cache_destroy(cfq_pool);
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 68720ab..af33fb1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -31,7 +31,7 @@ struct blk_trace;
 struct request;
 struct sg_io_hdr;
 struct bsg_job;
-struct blkio_group;
+struct blkcg_gq;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -371,7 +371,7 @@ struct request_queue {
 	struct list_head	icq_list;
 #ifdef CONFIG_BLK_CGROUP
 	DECLARE_BITMAP		(blkcg_pols, BLKCG_MAX_POLS);
-	struct blkio_group	*root_blkg;
+	struct blkcg_gq		*root_blkg;
 	struct list_head	blkg_list;
 #endif
 
-- 
cgit v1.1


From f95a04afa80c0f4ddd645ef6a84ed118b5d1ad46 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 16 Apr 2012 13:57:26 -0700
Subject: blkcg: embed struct blkg_policy_data in policy specific data

Currently blkg_policy_data carries policy specific data as a char flex
array instead of being embedded in policy specific data.  This was
forced by oddities around blkg allocation which are all gone now.

This patch makes blkg_policy_data embedded in policy specific data -
throtl_grp and cfq_group so that it's more conventional and consistent
with how io_cq is handled.

* blkcg_policy->pdata_size is renamed to ->pd_size.

* Functions which used to take void *pdata now take struct
  blkg_policy_data *pd.

* blkg_to_pdata/pdata_to_blkg() updated to blkg_to_pd/pd_to_blkg().

* Dummy struct blkg_policy_data definition added.  Dummy
  pdata_to_blkg() definition was unused and inconsistent with the
  non-dummy version - correct dummy pd_to_blkg() added.

* throtl and cfq updated accordingly.

* As dummy blkg_to_pd/pd_to_blkg() are provided,
  blkg_to_cfqg/cfqg_to_blkg() don't need to be ifdef'd.  Moved outside
  ifdef block.

This patch doesn't introduce any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 52 ++++++++++++++++++++++-----------------------
 block/blk-cgroup.h   | 60 +++++++++++++++++++++++++++++-----------------------
 block/blk-throttle.c | 37 +++++++++++++++++++++-----------
 block/cfq-iosched.c  | 46 +++++++++++++++++++++++-----------------
 4 files changed, 112 insertions(+), 83 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 9975703..3d49552 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -58,11 +58,6 @@ static bool blkcg_policy_enabled(struct request_queue *q,
 	return pol && test_bit(pol->plid, q->blkcg_pols);
 }
 
-static size_t blkg_pd_size(const struct blkcg_policy *pol)
-{
-	return sizeof(struct blkg_policy_data) + pol->pdata_size;
-}
-
 /**
  * blkg_free - free a blkg
  * @blkg: blkg to free
@@ -122,7 +117,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
 			continue;
 
 		/* alloc per-policy data and attach it to blkg */
-		pd = kzalloc_node(blkg_pd_size(pol), GFP_ATOMIC, q->node);
+		pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node);
 		if (!pd) {
 			blkg_free(blkg);
 			return NULL;
@@ -346,7 +341,8 @@ static const char *blkg_dev_name(struct blkcg_gq *blkg)
  * cftype->read_seq_string method.
  */
 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
-		       u64 (*prfill)(struct seq_file *, void *, int),
+		       u64 (*prfill)(struct seq_file *,
+				     struct blkg_policy_data *, int),
 		       const struct blkcg_policy *pol, int data,
 		       bool show_total)
 {
@@ -357,7 +353,7 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 	spin_lock_irq(&blkcg->lock);
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
 		if (blkcg_policy_enabled(blkg->q, pol))
-			total += prfill(sf, blkg->pd[pol->plid]->pdata, data);
+			total += prfill(sf, blkg->pd[pol->plid], data);
 	spin_unlock_irq(&blkcg->lock);
 
 	if (show_total)
@@ -368,14 +364,14 @@ EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
 /**
  * __blkg_prfill_u64 - prfill helper for a single u64 value
  * @sf: seq_file to print to
- * @pdata: policy private data of interest
+ * @pd: policy private data of interest
  * @v: value to print
  *
- * Print @v to @sf for the device assocaited with @pdata.
+ * Print @v to @sf for the device assocaited with @pd.
  */
-u64 __blkg_prfill_u64(struct seq_file *sf, void *pdata, u64 v)
+u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
 {
-	const char *dname = blkg_dev_name(pdata_to_blkg(pdata));
+	const char *dname = blkg_dev_name(pd->blkg);
 
 	if (!dname)
 		return 0;
@@ -388,12 +384,12 @@ EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
 /**
  * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
  * @sf: seq_file to print to
- * @pdata: policy private data of interest
+ * @pd: policy private data of interest
  * @rwstat: rwstat to print
  *
- * Print @rwstat to @sf for the device assocaited with @pdata.
+ * Print @rwstat to @sf for the device assocaited with @pd.
  */
-u64 __blkg_prfill_rwstat(struct seq_file *sf, void *pdata,
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 			 const struct blkg_rwstat *rwstat)
 {
 	static const char *rwstr[] = {
@@ -402,7 +398,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, void *pdata,
 		[BLKG_RWSTAT_SYNC]	= "Sync",
 		[BLKG_RWSTAT_ASYNC]	= "Async",
 	};
-	const char *dname = blkg_dev_name(pdata_to_blkg(pdata));
+	const char *dname = blkg_dev_name(pd->blkg);
 	u64 v;
 	int i;
 
@@ -421,30 +417,31 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, void *pdata,
 /**
  * blkg_prfill_stat - prfill callback for blkg_stat
  * @sf: seq_file to print to
- * @pdata: policy private data of interest
- * @off: offset to the blkg_stat in @pdata
+ * @pd: policy private data of interest
+ * @off: offset to the blkg_stat in @pd
  *
  * prfill callback for printing a blkg_stat.
  */
-u64 blkg_prfill_stat(struct seq_file *sf, void *pdata, int off)
+u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
 {
-	return __blkg_prfill_u64(sf, pdata, blkg_stat_read(pdata + off));
+	return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
 }
 EXPORT_SYMBOL_GPL(blkg_prfill_stat);
 
 /**
  * blkg_prfill_rwstat - prfill callback for blkg_rwstat
  * @sf: seq_file to print to
- * @pdata: policy private data of interest
- * @off: offset to the blkg_rwstat in @pdata
+ * @pd: policy private data of interest
+ * @off: offset to the blkg_rwstat in @pd
  *
  * prfill callback for printing a blkg_rwstat.
  */
-u64 blkg_prfill_rwstat(struct seq_file *sf, void *pdata, int off)
+u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+		       int off)
 {
-	struct blkg_rwstat rwstat = blkg_rwstat_read(pdata + off);
+	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
 
-	return __blkg_prfill_rwstat(sf, pdata, &rwstat);
+	return __blkg_prfill_rwstat(sf, pd, &rwstat);
 }
 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 
@@ -733,7 +730,7 @@ int blkcg_activate_policy(struct request_queue *q,
 
 	/* allocate policy_data for all existing blkgs */
 	while (cnt--) {
-		pd = kzalloc_node(blkg_pd_size(pol), GFP_KERNEL, q->node);
+		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
 		if (!pd) {
 			ret = -ENOMEM;
 			goto out_free;
@@ -832,6 +829,9 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 {
 	int i, ret;
 
+	if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
+		return -EINVAL;
+
 	mutex_lock(&blkcg_pol_mutex);
 
 	/* find an empty slot */
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index a443b84..18b021e 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -59,16 +59,25 @@ struct blkg_rwstat {
 	uint64_t			cnt[BLKG_RWSTAT_NR];
 };
 
-/* per-blkg per-policy data */
+/*
+ * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
+ * request_queue (q).  This is used by blkcg policies which need to track
+ * information per blkcg - q pair.
+ *
+ * There can be multiple active blkcg policies and each has its private
+ * data on each blkg, the size of which is determined by
+ * blkcg_policy->pd_size.  blkcg core allocates and frees such areas
+ * together with blkg and invokes pd_init/exit_fn() methods.
+ *
+ * Such private data must embed struct blkg_policy_data (pd) at the
+ * beginning and pd_size can't be smaller than pd.
+ */
 struct blkg_policy_data {
 	/* the blkg this per-policy data belongs to */
 	struct blkcg_gq			*blkg;
 
 	/* used during policy activation */
 	struct list_head		alloc_node;
-
-	/* pol->pdata_size bytes of private data used by policy impl */
-	char				pdata[] __aligned(__alignof__(unsigned long long));
 };
 
 /* association between a blk cgroup and a request queue */
@@ -100,7 +109,7 @@ struct blkcg_policy {
 	struct blkcg_policy_ops		ops;
 	int				plid;
 	/* policy specific private data size */
-	size_t				pdata_size;
+	size_t				pd_size;
 	/* cgroup files for the policy */
 	struct cftype			*cftypes;
 };
@@ -125,14 +134,16 @@ void blkcg_deactivate_policy(struct request_queue *q,
 			     const struct blkcg_policy *pol);
 
 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
-		       u64 (*prfill)(struct seq_file *, void *, int),
+		       u64 (*prfill)(struct seq_file *,
+				     struct blkg_policy_data *, int),
 		       const struct blkcg_policy *pol, int data,
 		       bool show_total);
-u64 __blkg_prfill_u64(struct seq_file *sf, void *pdata, u64 v);
-u64 __blkg_prfill_rwstat(struct seq_file *sf, void *pdata,
+u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 			 const struct blkg_rwstat *rwstat);
-u64 blkg_prfill_stat(struct seq_file *sf, void *pdata, int off);
-u64 blkg_prfill_rwstat(struct seq_file *sf, void *pdata, int off);
+u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
+u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+		       int off);
 
 struct blkg_conf_ctx {
 	struct gendisk			*disk;
@@ -152,26 +163,21 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx);
  *
  * Return pointer to private data associated with the @blkg-@pol pair.
  */
-static inline void *blkg_to_pdata(struct blkcg_gq *blkg,
-				  struct blkcg_policy *pol)
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+						  struct blkcg_policy *pol)
 {
-	return blkg ? blkg->pd[pol->plid]->pdata : NULL;
+	return blkg ? blkg->pd[pol->plid] : NULL;
 }
 
 /**
  * pdata_to_blkg - get blkg associated with policy private data
- * @pdata: policy private data of interest
+ * @pd: policy private data of interest
  *
- * @pdata is policy private data.  Determine the blkg it's associated with.
+ * @pd is policy private data.  Determine the blkg it's associated with.
  */
-static inline struct blkcg_gq *pdata_to_blkg(void *pdata)
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
 {
-	if (pdata) {
-		struct blkg_policy_data *pd =
-			container_of(pdata, struct blkg_policy_data, pdata);
-		return pd->blkg;
-	}
-	return NULL;
+	return pd ? pd->blkg : NULL;
 }
 
 /**
@@ -342,6 +348,9 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 
 struct cgroup;
 
+struct blkg_policy_data {
+};
+
 struct blkcg_gq {
 };
 
@@ -361,10 +370,9 @@ static inline int blkcg_activate_policy(struct request_queue *q,
 static inline void blkcg_deactivate_policy(struct request_queue *q,
 					   const struct blkcg_policy *pol) { }
 
-static inline void *blkg_to_pdata(struct blkcg_gq *blkg,
-				  struct blkcg_policy *pol) { return NULL; }
-static inline struct blkcg_gq *pdata_to_blkg(void *pdata,
-				  struct blkcg_policy *pol) { return NULL; }
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+						  struct blkcg_policy *pol) { return NULL; }
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
 static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 00c7eff..6a0a17a 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -49,6 +49,9 @@ struct tg_stats_cpu {
 };
 
 struct throtl_grp {
+	/* must be the first member */
+	struct blkg_policy_data pd;
+
 	/* active throtl group service_tree member */
 	struct rb_node rb_node;
 
@@ -120,14 +123,19 @@ static LIST_HEAD(tg_stats_alloc_list);
 static void tg_stats_alloc_fn(struct work_struct *);
 static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
 
+static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
+{
+	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
+}
+
 static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
 {
-	return blkg_to_pdata(blkg, &blkcg_policy_throtl);
+	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
 }
 
 static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
 {
-	return pdata_to_blkg(tg);
+	return pd_to_blkg(&tg->pd);
 }
 
 static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
@@ -931,9 +939,10 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 	}
 }
 
-static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, void *pdata, int off)
+static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
+				struct blkg_policy_data *pd, int off)
 {
-	struct throtl_grp *tg = pdata;
+	struct throtl_grp *tg = pd_to_tg(pd);
 	struct blkg_rwstat rwstat = { }, tmp;
 	int i, cpu;
 
@@ -945,7 +954,7 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, void *pdata, int off)
 			rwstat.cnt[i] += tmp.cnt[i];
 	}
 
-	return __blkg_prfill_rwstat(sf, pdata, &rwstat);
+	return __blkg_prfill_rwstat(sf, pd, &rwstat);
 }
 
 static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
@@ -958,22 +967,26 @@ static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
-static u64 tg_prfill_conf_u64(struct seq_file *sf, void *pdata, int off)
+static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
+			      int off)
 {
-	u64 v = *(u64 *)(pdata + off);
+	struct throtl_grp *tg = pd_to_tg(pd);
+	u64 v = *(u64 *)((void *)tg + off);
 
 	if (v == -1)
 		return 0;
-	return __blkg_prfill_u64(sf, pdata, v);
+	return __blkg_prfill_u64(sf, pd, v);
 }
 
-static u64 tg_prfill_conf_uint(struct seq_file *sf, void *pdata, int off)
+static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
+			       int off)
 {
-	unsigned int v = *(unsigned int *)(pdata + off);
+	struct throtl_grp *tg = pd_to_tg(pd);
+	unsigned int v = *(unsigned int *)((void *)tg + off);
 
 	if (v == -1)
 		return 0;
-	return __blkg_prfill_u64(sf, pdata, v);
+	return __blkg_prfill_u64(sf, pd, v);
 }
 
 static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
@@ -1092,7 +1105,7 @@ static struct blkcg_policy blkcg_policy_throtl = {
 		.pd_exit_fn		= throtl_pd_exit,
 		.pd_reset_stats_fn	= throtl_pd_reset_stats,
 	},
-	.pdata_size = sizeof(struct throtl_grp),
+	.pd_size = sizeof(struct throtl_grp),
 	.cftypes = throtl_files,
 };
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 7922182..7865cc3 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -217,6 +217,9 @@ struct cfqg_stats {
 
 /* This is per cgroup per device grouping structure */
 struct cfq_group {
+	/* must be the first member */
+	struct blkg_policy_data pd;
+
 	/* group service_tree member */
 	struct rb_node rb_node;
 
@@ -409,6 +412,21 @@ CFQ_CFQQ_FNS(deep);
 CFQ_CFQQ_FNS(wait_busy);
 #undef CFQ_CFQQ_FNS
 
+static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
+{
+	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
+}
+
+static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
+{
+	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
+}
+
+static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
+{
+	return pd_to_blkg(&cfqg->pd);
+}
+
 #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 
 /* cfqg stats flags */
@@ -553,16 +571,6 @@ static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 
-static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
-{
-	return blkg_to_pdata(blkg, &blkcg_policy_cfq);
-}
-
-static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
-{
-	return pdata_to_blkg(cfqg);
-}
-
 static inline void cfqg_get(struct cfq_group *cfqg)
 {
 	return blkg_get(cfqg_to_blkg(cfqg));
@@ -662,8 +670,6 @@ static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
 
 #else	/* CONFIG_CFQ_GROUP_IOSCHED */
 
-static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) { return NULL; }
-static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) { return NULL; }
 static inline void cfqg_get(struct cfq_group *cfqg) { }
 static inline void cfqg_put(struct cfq_group *cfqg) { }
 
@@ -1374,13 +1380,14 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 	cfqg_get(cfqg);
 }
 
-static u64 cfqg_prfill_weight_device(struct seq_file *sf, void *pdata, int off)
+static u64 cfqg_prfill_weight_device(struct seq_file *sf,
+				     struct blkg_policy_data *pd, int off)
 {
-	struct cfq_group *cfqg = pdata;
+	struct cfq_group *cfqg = pd_to_cfqg(pd);
 
 	if (!cfqg->dev_weight)
 		return 0;
-	return __blkg_prfill_u64(sf, pdata, cfqg->dev_weight);
+	return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
 }
 
 static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
@@ -1467,9 +1474,10 @@ static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
 }
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, void *pdata, int off)
+static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
 {
-	struct cfq_group *cfqg = pdata;
+	struct cfq_group *cfqg = pd_to_cfqg(pd);
 	u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
 	u64 v = 0;
 
@@ -1477,7 +1485,7 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, void *pdata, int off)
 		v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
 		do_div(v, samples);
 	}
-	__blkg_prfill_u64(sf, pdata, v);
+	__blkg_prfill_u64(sf, pd, v);
 	return 0;
 }
 
@@ -4161,7 +4169,7 @@ static struct blkcg_policy blkcg_policy_cfq = {
 		.pd_init_fn		= cfq_pd_init,
 		.pd_reset_stats_fn	= cfq_pd_reset_stats,
 	},
-	.pdata_size = sizeof(struct cfq_group),
+	.pd_size = sizeof(struct cfq_group),
 	.cftypes = cfq_blkcg_files,
 };
 #endif
-- 
cgit v1.1


From f9fcc2d3919b8eb575b3cee9274feefafb641bca Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 16 Apr 2012 13:57:27 -0700
Subject: blkcg: collapse blkcg_policy_ops into blkcg_policy

There's no reason to keep blkcg_policy_ops separate.  Collapse it into
blkcg_policy.

This patch doesn't introduce any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 16 ++++++++--------
 block/blk-cgroup.h   | 12 +++++-------
 block/blk-throttle.c | 13 ++++++-------
 block/cfq-iosched.c  | 11 +++++------
 4 files changed, 24 insertions(+), 28 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 3d49552..8228385 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -78,8 +78,8 @@ static void blkg_free(struct blkcg_gq *blkg)
 		if (!pd)
 			continue;
 
-		if (pol && pol->ops.pd_exit_fn)
-			pol->ops.pd_exit_fn(blkg);
+		if (pol && pol->pd_exit_fn)
+			pol->pd_exit_fn(blkg);
 
 		kfree(pd);
 	}
@@ -132,7 +132,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
 		struct blkcg_policy *pol = blkcg_policy[i];
 
 		if (blkcg_policy_enabled(blkg->q, pol))
-			pol->ops.pd_init_fn(blkg);
+			pol->pd_init_fn(blkg);
 	}
 
 	return blkg;
@@ -305,8 +305,8 @@ static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
 			struct blkcg_policy *pol = blkcg_policy[i];
 
 			if (blkcg_policy_enabled(blkg->q, pol) &&
-			    pol->ops.pd_reset_stats_fn)
-				pol->ops.pd_reset_stats_fn(blkg);
+			    pol->pd_reset_stats_fn)
+				pol->pd_reset_stats_fn(blkg);
 		}
 	}
 
@@ -758,7 +758,7 @@ int blkcg_activate_policy(struct request_queue *q,
 
 		blkg->pd[pol->plid] = pd;
 		pd->blkg = blkg;
-		pol->ops.pd_init_fn(blkg);
+		pol->pd_init_fn(blkg);
 
 		spin_unlock(&blkg->blkcg->lock);
 	}
@@ -804,8 +804,8 @@ void blkcg_deactivate_policy(struct request_queue *q,
 		/* grab blkcg lock too while removing @pd from @blkg */
 		spin_lock(&blkg->blkcg->lock);
 
-		if (pol->ops.pd_exit_fn)
-			pol->ops.pd_exit_fn(blkg);
+		if (pol->pd_exit_fn)
+			pol->pd_exit_fn(blkg);
 
 		kfree(blkg->pd[pol->plid]);
 		blkg->pd[pol->plid] = NULL;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 18b021e..44cb908 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -99,19 +99,17 @@ typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
 
-struct blkcg_policy_ops {
-	blkcg_pol_init_pd_fn		*pd_init_fn;
-	blkcg_pol_exit_pd_fn		*pd_exit_fn;
-	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
-};
-
 struct blkcg_policy {
-	struct blkcg_policy_ops		ops;
 	int				plid;
 	/* policy specific private data size */
 	size_t				pd_size;
 	/* cgroup files for the policy */
 	struct cftype			*cftypes;
+
+	/* operations */
+	blkcg_pol_init_pd_fn		*pd_init_fn;
+	blkcg_pol_exit_pd_fn		*pd_exit_fn;
+	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
 };
 
 extern struct blkcg blkcg_root;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 6a0a17a..46310ec 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1100,13 +1100,12 @@ static void throtl_shutdown_wq(struct request_queue *q)
 }
 
 static struct blkcg_policy blkcg_policy_throtl = {
-	.ops = {
-		.pd_init_fn		= throtl_pd_init,
-		.pd_exit_fn		= throtl_pd_exit,
-		.pd_reset_stats_fn	= throtl_pd_reset_stats,
-	},
-	.pd_size = sizeof(struct throtl_grp),
-	.cftypes = throtl_files,
+	.pd_size		= sizeof(struct throtl_grp),
+	.cftypes		= throtl_files,
+
+	.pd_init_fn		= throtl_pd_init,
+	.pd_exit_fn		= throtl_pd_exit,
+	.pd_reset_stats_fn	= throtl_pd_reset_stats,
 };
 
 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 7865cc3..832b2ac 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4165,12 +4165,11 @@ static struct elevator_type iosched_cfq = {
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static struct blkcg_policy blkcg_policy_cfq = {
-	.ops = {
-		.pd_init_fn		= cfq_pd_init,
-		.pd_reset_stats_fn	= cfq_pd_reset_stats,
-	},
-	.pd_size = sizeof(struct cfq_group),
-	.cftypes = cfq_blkcg_files,
+	.pd_size		= sizeof(struct cfq_group),
+	.cftypes		= cfq_blkcg_files,
+
+	.pd_init_fn		= cfq_pd_init,
+	.pd_reset_stats_fn	= cfq_pd_reset_stats,
 };
 #endif
 
-- 
cgit v1.1


From 29e2b09ab5fa790514d47838f3c05497130908b3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 19 Apr 2012 16:29:21 -0700
Subject: block: collapse blk_alloc_request() into get_request()

Allocation failure handling in get_request() is about to be updated.
To ease the update, collapse blk_alloc_request() into get_request().

This patch doesn't introduce any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 46 +++++++++++++++++-----------------------------
 1 file changed, 17 insertions(+), 29 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3b02ba3..f6f68b0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -719,33 +719,6 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq)
 	mempool_free(rq, q->rq.rq_pool);
 }
 
-static struct request *
-blk_alloc_request(struct request_queue *q, struct bio *bio, struct io_cq *icq,
-		  unsigned int flags, gfp_t gfp_mask)
-{
-	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
-
-	if (!rq)
-		return NULL;
-
-	blk_rq_init(q, rq);
-
-	rq->cmd_flags = flags | REQ_ALLOCED;
-
-	if (flags & REQ_ELVPRIV) {
-		rq->elv.icq = icq;
-		if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
-			mempool_free(rq, q->rq.rq_pool);
-			return NULL;
-		}
-		/* @rq->elv.icq holds on to io_context until @rq is freed */
-		if (icq)
-			get_io_context(icq->ioc);
-	}
-
-	return rq;
-}
-
 /*
  * ioc_batching returns true if the ioc is a valid batching request and
  * should be given priority access to a request.
@@ -968,10 +941,25 @@ retry:
 			goto fail_alloc;
 	}
 
-	rq = blk_alloc_request(q, bio, icq, rw_flags, gfp_mask);
-	if (unlikely(!rq))
+	/* allocate and init request */
+	rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
+	if (!rq)
 		goto fail_alloc;
 
+	blk_rq_init(q, rq);
+	rq->cmd_flags = rw_flags | REQ_ALLOCED;
+
+	if (rw_flags & REQ_ELVPRIV) {
+		rq->elv.icq = icq;
+		if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
+			mempool_free(rq, q->rq.rq_pool);
+			goto fail_alloc;
+		}
+		/* @rq->elv.icq holds on to io_context until @rq is freed */
+		if (icq)
+			get_io_context(icq->ioc);
+	}
+
 	/*
 	 * ioc may be NULL here, and ioc_batching will be false. That's
 	 * OK, if the queue is under the request limit then requests need
-- 
cgit v1.1


From aaf7c680682f1999ef2e574f743c45d1674a8b8a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 19 Apr 2012 16:29:22 -0700
Subject: block: fix elvpriv allocation failure handling

Request allocation is mempool backed to guarantee forward progress
under memory pressure; unfortunately, this property got broken while
adding elvpriv data.  Failures during elvpriv allocation, including
ioc and icq creation failures, currently make get_request() fail as a
whole.  There's no forward progress guarantee for these allocations -
they may fail indefinitely under memory pressure stalling IO and
deadlocking the system.

This patch updates get_request() such that elvpriv allocation failure
doesn't make the whole function fail.  If elvpriv allocation fails,
the allocation is degraded into !ELVPRIV.  This will force the request
to ELEVATOR_INSERT_BACK disturbing scheduling but elvpriv alloc
failures should be rare (nothing is per-request) and anything is
better than deadlocking.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 53 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index f6f68b0..6cf13df 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -29,6 +29,7 @@
 #include <linux/fault-inject.h>
 #include <linux/list_sort.h>
 #include <linux/delay.h>
+#include <linux/ratelimit.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -930,17 +931,6 @@ retry:
 		rw_flags |= REQ_IO_STAT;
 	spin_unlock_irq(q->queue_lock);
 
-	/* create icq if missing */
-	if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) {
-		create_io_context(gfp_mask, q->node);
-		ioc = rq_ioc(bio);
-		if (!ioc)
-			goto fail_alloc;
-		icq = ioc_create_icq(ioc, q, gfp_mask);
-		if (!icq)
-			goto fail_alloc;
-	}
-
 	/* allocate and init request */
 	rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
 	if (!rq)
@@ -949,17 +939,28 @@ retry:
 	blk_rq_init(q, rq);
 	rq->cmd_flags = rw_flags | REQ_ALLOCED;
 
+	/* init elvpriv */
 	if (rw_flags & REQ_ELVPRIV) {
-		rq->elv.icq = icq;
-		if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
-			mempool_free(rq, q->rq.rq_pool);
-			goto fail_alloc;
+		if (unlikely(et->icq_cache && !icq)) {
+			create_io_context(gfp_mask, q->node);
+			ioc = rq_ioc(bio);
+			if (!ioc)
+				goto fail_elvpriv;
+
+			icq = ioc_create_icq(ioc, q, gfp_mask);
+			if (!icq)
+				goto fail_elvpriv;
 		}
-		/* @rq->elv.icq holds on to io_context until @rq is freed */
+
+		rq->elv.icq = icq;
+		if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
+			goto fail_elvpriv;
+
+		/* @rq->elv.icq holds io_context until @rq is freed */
 		if (icq)
 			get_io_context(icq->ioc);
 	}
-
+out:
 	/*
 	 * ioc may be NULL here, and ioc_batching will be false. That's
 	 * OK, if the queue is under the request limit then requests need
@@ -972,6 +973,24 @@ retry:
 	trace_block_getrq(q, bio, rw_flags & 1);
 	return rq;
 
+fail_elvpriv:
+	/*
+	 * elvpriv init failed.  ioc, icq and elvpriv aren't mempool backed
+	 * and may fail indefinitely under memory pressure and thus
+	 * shouldn't stall IO.  Treat this request as !elvpriv.  This will
+	 * disturb iosched and blkcg but weird is bettern than dead.
+	 */
+	printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n",
+			   dev_name(q->backing_dev_info.dev));
+
+	rq->cmd_flags &= ~REQ_ELVPRIV;
+	rq->elv.icq = NULL;
+
+	spin_lock_irq(q->queue_lock);
+	rl->elvpriv--;
+	spin_unlock_irq(q->queue_lock);
+	goto out;
+
 fail_alloc:
 	/*
 	 * Allocation failed presumably due to memory. Undo anything we
-- 
cgit v1.1


From 496fb7806d616185a46865a4f3a20ef19dc6c7e3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 19 Apr 2012 16:29:23 -0700
Subject: blkcg: fix blkcg->css ref leak in __blkg_lookup_create()

__blkg_lookup_create() leaked blkcg->css ref if blkg allocation
failed.  Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8228385..30a7a9c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -174,6 +174,7 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
 	__releases(q->queue_lock) __acquires(q->queue_lock)
 {
 	struct blkcg_gq *blkg;
+	int ret;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(q->queue_lock);
@@ -186,24 +187,22 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
 	if (!css_tryget(&blkcg->css))
 		return ERR_PTR(-EINVAL);
 
-	/*
-	 * Allocate and initialize.
-	 */
+	/* allocate */
+	ret = -ENOMEM;
 	blkg = blkg_alloc(blkcg, q);
-
-	/* did alloc fail? */
-	if (unlikely(!blkg)) {
-		blkg = ERR_PTR(-ENOMEM);
-		goto out;
-	}
+	if (unlikely(!blkg))
+		goto err_put;
 
 	/* insert */
 	spin_lock(&blkcg->lock);
 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 	list_add(&blkg->q_node, &q->blkg_list);
 	spin_unlock(&blkcg->lock);
-out:
 	return blkg;
+
+err_put:
+	css_put(&blkcg->css);
+	return ERR_PTR(ret);
 }
 
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
-- 
cgit v1.1


From a637120e49021d197e9578cba545bbaa459cbb51 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 19 Apr 2012 16:29:24 -0700
Subject: blkcg: use radix tree to index blkgs from blkcg

blkg lookup is currently performed by traversing the linked list
anchored at blkcg->blkg_list.  This is very unscalable and with
blk-throttle enabled and enough request queues on the system, this can
get very ugly quickly (blk-throttle performs a lookup on every bio
submission).

This patch makes blkcg use a radix tree to index blkgs, combined with a
simple last-looked-up hint.  This is mostly identical to how icqs are
indexed from ioc.

Note that because __blkg_lookup() may be invoked without holding queue
lock, hint is only updated from __blkg_lookup_create().  Due to cfq's
cfqq caching, this makes hint updates overly lazy.  This will be
improved with scheduled blkcg aware request allocation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 52 ++++++++++++++++++++++++++++++++++++++++++++--------
 block/blk-cgroup.h |  6 ++++++
 2 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 30a7a9c..02cf633 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -142,11 +142,21 @@ static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
 				      struct request_queue *q)
 {
 	struct blkcg_gq *blkg;
-	struct hlist_node *n;
 
-	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
-		if (blkg->q == q)
-			return blkg;
+	blkg = rcu_dereference(blkcg->blkg_hint);
+	if (blkg && blkg->q == q)
+		return blkg;
+
+	/*
+	 * Hint didn't match.  Look up from the radix tree.  Note that we
+	 * may not be holding queue_lock and thus are not sure whether
+	 * @blkg from blkg_tree has already been removed or not, so we
+	 * can't update hint to the lookup result.  Leave it to the caller.
+	 */
+	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
+	if (blkg && blkg->q == q)
+		return blkg;
+
 	return NULL;
 }
 
@@ -179,9 +189,12 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(q->queue_lock);
 
+	/* lookup and update hint on success, see __blkg_lookup() for details */
 	blkg = __blkg_lookup(blkcg, q);
-	if (blkg)
+	if (blkg) {
+		rcu_assign_pointer(blkcg->blkg_hint, blkg);
 		return blkg;
+	}
 
 	/* blkg holds a reference to blkcg */
 	if (!css_tryget(&blkcg->css))
@@ -194,12 +207,24 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
 		goto err_put;
 
 	/* insert */
+	ret = radix_tree_preload(GFP_ATOMIC);
+	if (ret)
+		goto err_free;
+
 	spin_lock(&blkcg->lock);
-	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
-	list_add(&blkg->q_node, &q->blkg_list);
+	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
+	if (likely(!ret)) {
+		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
+		list_add(&blkg->q_node, &q->blkg_list);
+	}
 	spin_unlock(&blkcg->lock);
-	return blkg;
 
+	radix_tree_preload_end();
+
+	if (!ret)
+		return blkg;
+err_free:
+	blkg_free(blkg);
 err_put:
 	css_put(&blkcg->css);
 	return ERR_PTR(ret);
@@ -229,10 +254,20 @@ static void blkg_destroy(struct blkcg_gq *blkg)
 	/* Something wrong if we are trying to remove same group twice */
 	WARN_ON_ONCE(list_empty(&blkg->q_node));
 	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
+
+	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
 	list_del_init(&blkg->q_node);
 	hlist_del_init_rcu(&blkg->blkcg_node);
 
 	/*
+	 * Both setting lookup hint to and clearing it from @blkg are done
+	 * under queue_lock.  If it's not pointing to @blkg now, it never
+	 * will.  Hint assignment itself can race safely.
+	 */
+	if (rcu_dereference_raw(blkcg->blkg_hint) == blkg)
+		rcu_assign_pointer(blkcg->blkg_hint, NULL);
+
+	/*
 	 * Put the reference taken at the time of creation so that when all
 	 * queues are gone, group can be destroyed.
 	 */
@@ -593,6 +628,7 @@ static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup)
 	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
 done:
 	spin_lock_init(&blkcg->lock);
+	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 
 	return &blkcg->css;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 44cb908..8ac457c 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -16,6 +16,7 @@
 #include <linux/cgroup.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/seq_file.h>
+#include <linux/radix-tree.h>
 
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX		UINT_MAX
@@ -37,9 +38,14 @@ enum blkg_rwstat_type {
 	BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
 };
 
+struct blkcg_gq;
+
 struct blkcg {
 	struct cgroup_subsys_state	css;
 	spinlock_t			lock;
+
+	struct radix_tree_root		blkg_tree;
+	struct blkcg_gq			*blkg_hint;
 	struct hlist_head		blkg_list;
 
 	/* for policies to test whether associated blkcg has changed */
-- 
cgit v1.1


From bd1a68b59c8e3bce45fb76632c64e1e063c3962d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 4 Apr 2012 11:08:51 +0200
Subject: vmsplice: relax alignment requirements for SPLICE_F_GIFT

It seems there is no fundamental reason to limit vmsplice()
SPLICE_F_GIFT to page aligned chunks.

All helpers are prepared to cope with offsets within a page.

This limitation makes the vmsplice() API very impractical in
zero-copy land.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Miller <davem@davemloft.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Hugh Dickins <hughd@google.com>
Cc: Changli Gao <xiaosuo@gmail.com>
Cc: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/splice.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index 5f883de..5417aa3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1385,7 +1385,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
  */
 static int get_iovec_page_array(const struct iovec __user *iov,
 				unsigned int nr_vecs, struct page **pages,
-				struct partial_page *partial, int aligned,
+				struct partial_page *partial, bool aligned,
 				unsigned int pipe_buffers)
 {
 	int buffers = 0, error = 0;
@@ -1623,7 +1623,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 		return -ENOMEM;
 
 	spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
-					    spd.partial, flags & SPLICE_F_GIFT,
+					    spd.partial, false,
 					    pipe->buffers);
 	if (spd.nr_pages <= 0)
 		ret = spd.nr_pages;
-- 
cgit v1.1


From ff26eaadf4d914e397872b99885d45756104e9ae Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 23 May 2012 12:16:21 +0200
Subject: blkcg: tg_stats_alloc_lock is an irq lock

tg_stats_alloc_lock nests inside queue lock and should always be held
with irq disabled.  throtl_pd_{init|exit}() were using non-irqsafe
spinlock ops, which triggered an inverse lock ordering (via irq)
warning when RCU freeing of a blkg invoked throtl_pd_exit() w/o
disabling IRQ.

Update both functions to use irq safe operations.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
LKML-Reference: <1335339396.16988.80.camel@lappy>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-throttle.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 14dedec..5b06595 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -219,6 +219,7 @@ alloc_stats:
 static void throtl_pd_init(struct blkcg_gq *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
+	unsigned long flags;
 
 	RB_CLEAR_NODE(&tg->rb_node);
 	bio_list_init(&tg->bio_lists[0]);
@@ -235,19 +236,20 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
 	 * but percpu allocator can't be called from IO path.  Queue tg on
 	 * tg_stats_alloc_list and allocate from work item.
 	 */
-	spin_lock(&tg_stats_alloc_lock);
+	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
 	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
 	queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0);
-	spin_unlock(&tg_stats_alloc_lock);
+	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
 }
 
 static void throtl_pd_exit(struct blkcg_gq *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
+	unsigned long flags;
 
-	spin_lock(&tg_stats_alloc_lock);
+	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
 	list_del_init(&tg->stats_alloc_node);
-	spin_unlock(&tg_stats_alloc_lock);
+	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
 
 	free_percpu(tg->stats_cpu);
 }
-- 
cgit v1.1