From c15227de132f1295f3db6b7df9079956b1020fd8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 30 Sep 2009 13:52:12 +0200
Subject: block: use normal I/O path for discard requests

prepare_discard_fn() was being called in a place where memory allocation
was effectively impossible.  This makes it inappropriate for all but
the most trivial translations of Linux's DISCARD operation to the block
command set.  Additionally adding a payload there makes the ownership
of the bio backing unclear as it's now allocated by the device driver
and not the submitter as usual.

It is replaced with QUEUE_FLAG_DISCARD which is used to indicate whether
the queue supports discard operations or not.  blkdev_issue_discard now
allocates a one-page, sector-length payload which is the right thing
for the common ATA and SCSI implementations.

The mtd implementation of prepare_discard_fn() is replaced with simply
checking for the request being a discard.

Largely based on a previous patch from Matthew Wilcox <matthew@wil.cx>
which did the prepare_discard_fn but not the different payload allocation
yet.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c  | 35 ++++++++++++++++++++++++++++++-----
 block/blk-core.c     |  3 +--
 block/blk-settings.c | 17 -----------------
 3 files changed, 31 insertions(+), 24 deletions(-)

(limited to 'block')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 6593ab3..21f5025 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -350,6 +350,7 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
 
 	if (bio->bi_private)
 		complete(bio->bi_private);
+	__free_page(bio_page(bio));
 
 	bio_put(bio);
 }
@@ -372,26 +373,44 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	struct request_queue *q = bdev_get_queue(bdev);
 	int type = flags & DISCARD_FL_BARRIER ?
 		DISCARD_BARRIER : DISCARD_NOBARRIER;
+	struct bio *bio;
+	struct page *page;
 	int ret = 0;
 
 	if (!q)
 		return -ENXIO;
 
-	if (!q->prepare_discard_fn)
+	if (!blk_queue_discard(q))
 		return -EOPNOTSUPP;
 
 	while (nr_sects && !ret) {
-		struct bio *bio = bio_alloc(gfp_mask, 0);
-		if (!bio)
-			return -ENOMEM;
+		unsigned int sector_size = q->limits.logical_block_size;
 
+		bio = bio_alloc(gfp_mask, 1);
+		if (!bio)
+			goto out;
+		bio->bi_sector = sector;
 		bio->bi_end_io = blkdev_discard_end_io;
 		bio->bi_bdev = bdev;
 		if (flags & DISCARD_FL_WAIT)
 			bio->bi_private = &wait;
 
-		bio->bi_sector = sector;
+		/*
+		 * Add a zeroed one-sector payload as that's what
+		 * our current implementations need.  If we'll ever need
+		 * more the interface will need revisiting.
+		 */
+		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!page)
+			goto out_free_bio;
+		if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
+			goto out_free_page;
 
+		/*
+		 * And override the bio size - the way discard works we
+		 * touch many more blocks on disk than the actual payload
+		 * length.
+		 */
 		if (nr_sects > queue_max_hw_sectors(q)) {
 			bio->bi_size = queue_max_hw_sectors(q) << 9;
 			nr_sects -= queue_max_hw_sectors(q);
@@ -414,5 +433,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		bio_put(bio);
 	}
 	return ret;
+out_free_page:
+	__free_page(page);
+out_free_bio:
+	bio_put(bio);
+out:
+	return -ENOMEM;
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
diff --git a/block/blk-core.c b/block/blk-core.c
index 8135228..80a020d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1124,7 +1124,6 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 		req->cmd_flags |= REQ_DISCARD;
 		if (bio_rw_flagged(bio, BIO_RW_BARRIER))
 			req->cmd_flags |= REQ_SOFTBARRIER;
-		req->q->prepare_discard_fn(req->q, req);
 	} else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)))
 		req->cmd_flags |= REQ_HARDBARRIER;
 
@@ -1470,7 +1469,7 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 
 		if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
-		    !q->prepare_discard_fn) {
+		    !blk_queue_discard(q)) {
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index eaf122f..d29498e 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -34,23 +34,6 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
 EXPORT_SYMBOL(blk_queue_prep_rq);
 
 /**
- * blk_queue_set_discard - set a discard_sectors function for queue
- * @q:		queue
- * @dfn:	prepare_discard function
- *
- * It's possible for a queue to register a discard callback which is used
- * to transform a discard request into the appropriate type for the
- * hardware. If none is registered, then discard requests are failed
- * with %EOPNOTSUPP.
- *
- */
-void blk_queue_set_discard(struct request_queue *q, prepare_discard_fn *dfn)
-{
-	q->prepare_discard_fn = dfn;
-}
-EXPORT_SYMBOL(blk_queue_set_discard);
-
-/**
  * blk_queue_merge_bvec - set a merge_bvec function for queue
  * @q:		queue
  * @mbfn:	merge_bvec_fn
-- 
cgit v1.1


From 67efc9258010da35b27b3854d0880c7e193004ed Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 30 Sep 2009 13:54:20 +0200
Subject: block: allow large discard requests

Currently we set the bio size to the byte equivalent of the blocks to
be trimmed when submitting the initial DISCARD ioctl.  That means it
is subject to the max_hw_sectors limitation of the HBA which is
much lower than the size of a DISCARD request we can support.
Add a separate max_discard_sectors tunable to limit the size for discard
requests.

We limit the max discard request size in bytes to 32bit as that is the
limit for bio->bi_size.  This could be much larger if we had a way to pass
that information through the block layer.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c  | 10 ++++++----
 block/blk-core.c     |  3 ++-
 block/blk-settings.c | 13 +++++++++++++
 3 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'block')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 21f5025..8873b9b 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -385,6 +385,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 
 	while (nr_sects && !ret) {
 		unsigned int sector_size = q->limits.logical_block_size;
+		unsigned int max_discard_sectors =
+			min(q->limits.max_discard_sectors, UINT_MAX >> 9);
 
 		bio = bio_alloc(gfp_mask, 1);
 		if (!bio)
@@ -411,10 +413,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		 * touch many more blocks on disk than the actual payload
 		 * length.
 		 */
-		if (nr_sects > queue_max_hw_sectors(q)) {
-			bio->bi_size = queue_max_hw_sectors(q) << 9;
-			nr_sects -= queue_max_hw_sectors(q);
-			sector += queue_max_hw_sectors(q);
+		if (nr_sects > max_discard_sectors) {
+			bio->bi_size = max_discard_sectors << 9;
+			nr_sects -= max_discard_sectors;
+			sector += max_discard_sectors;
 		} else {
 			bio->bi_size = nr_sects << 9;
 			nr_sects = 0;
diff --git a/block/blk-core.c b/block/blk-core.c
index 80a020d..34504f3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1436,7 +1436,8 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 		}
 
-		if (unlikely(nr_sectors > queue_max_hw_sectors(q))) {
+		if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) &&
+			     nr_sectors > queue_max_hw_sectors(q))) {
 			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
 			       bdevname(bio->bi_bdev, b),
 			       bio_sectors(bio),
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d29498e..e0695bc 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -96,6 +96,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->max_segment_size = MAX_SEGMENT_SIZE;
 	lim->max_sectors = BLK_DEF_MAX_SECTORS;
 	lim->max_hw_sectors = INT_MAX;
+	lim->max_discard_sectors = SAFE_MAX_SECTORS;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -239,6 +240,18 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors)
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
 
 /**
+ * blk_queue_max_discard_sectors - set max sectors for a single discard
+ * @q:  the request queue for the device
+ * @max_discard: maximum number of sectors to discard
+ **/
+void blk_queue_max_discard_sectors(struct request_queue *q,
+		unsigned int max_discard_sectors)
+{
+	q->limits.max_discard_sectors = max_discard_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_discard_sectors);
+
+/**
  * blk_queue_max_phys_segments - set max phys segments for a request for this queue
  * @q:  the request queue for the device
  * @max_segments:  max number of segments
-- 
cgit v1.1


From b0da3f0dada78832c9da03ad2152ae76bd9a2496 Mon Sep 17 00:00:00 2001
From: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Date: Thu, 1 Oct 2009 21:16:13 +0200
Subject: Add a tracepoint for block request remapping

Since 2.6.31 now has request-based device-mapper, it's useful to have
a tracepoint for request-remapping as well as bio-remapping.
This patch adds a tracepoint for request-remapping, trace_block_rq_remap().

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Alasdair G Kergon <agk@redhat.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 34504f3..ddaaea4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,6 +34,7 @@
 #include "blk.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 
 static int __make_request(struct request_queue *q, struct bio *bio);
-- 
cgit v1.1


From 1d2235152dc745c6d94bedb550fea84cffdbf768 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 2 Oct 2009 19:27:04 +0200
Subject: cfq-iosched: add a knob for desktop interactiveness

This is basically identical to what Vivek Goyal posted, but combined
into one and labelled 'desktop' instead of 'fairness'. The goal
is to continue to improve on the latency side of things as it relates
to interactiveness, keeping the questionable bits under this sysfs
tunable so it would be easy for throughput-only people to turn off.

Apart from adding the interactive sysfs knob, it also adds the
behavioural change of allowing slice idling even if the hardware
does tagged command queuing.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 1ca813b..8917f2b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -173,6 +173,7 @@ struct cfq_data {
 	unsigned int cfq_slice[2];
 	unsigned int cfq_slice_async_rq;
 	unsigned int cfq_slice_idle;
+	unsigned int cfq_desktop;
 
 	struct list_head cic_list;
 
@@ -1951,7 +1952,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
 
 	if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
-	    (cfqd->hw_tag && CIC_SEEKY(cic)))
+	    (!cfqd->cfq_desktop && cfqd->hw_tag && CIC_SEEKY(cic)))
 		enable_idle = 0;
 	else if (sample_valid(cic->ttime_samples)) {
 		if (cic->ttime_mean > cfqd->cfq_slice_idle)
@@ -2480,6 +2481,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqd->cfq_slice[1] = cfq_slice_sync;
 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
 	cfqd->cfq_slice_idle = cfq_slice_idle;
+	cfqd->cfq_desktop = 1;
 	cfqd->hw_tag = 1;
 
 	return cfqd;
@@ -2549,6 +2551,7 @@ SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
+SHOW_FUNCTION(cfq_desktop_show, cfqd->cfq_desktop, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
@@ -2580,6 +2583,7 @@ STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
 		UINT_MAX, 0);
+STORE_FUNCTION(cfq_desktop_store, &cfqd->cfq_desktop, 0, 1, 0);
 #undef STORE_FUNCTION
 
 #define CFQ_ATTR(name) \
@@ -2595,6 +2599,7 @@ static struct elv_fs_entry cfq_attrs[] = {
 	CFQ_ATTR(slice_async),
 	CFQ_ATTR(slice_async_rq),
 	CFQ_ATTR(slice_idle),
+	CFQ_ATTR(desktop),
 	__ATTR_NULL
 };
 
-- 
cgit v1.1


From 365722bb917b08b7323b5a4a0a3386cc7d00397d Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Sat, 3 Oct 2009 15:21:27 +0200
Subject: cfq-iosched: delay async IO dispatch, if sync IO was just done

o Do not allow more than max_dispatch requests from an async queue, if some
  sync request has finished recently. This is in the hope that sync activity
  is still going on in the system and we might receive a sync request soon.
  Most likely from a sync queue which finished a request and we did not enable
  idling on it.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 8917f2b..70b48ea 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -181,6 +181,8 @@ struct cfq_data {
 	 * Fallback dummy cfqq for extreme OOM conditions
 	 */
 	struct cfq_queue oom_cfqq;
+
+	unsigned long last_end_sync_rq;
 };
 
 enum cfqq_state_flags {
@@ -1314,6 +1316,8 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 	 * Does this cfqq already have too much IO in flight?
 	 */
 	if (cfqq->dispatched >= max_dispatch) {
+		unsigned long load_at = cfqd->last_end_sync_rq + cfq_slice_sync;
+
 		/*
 		 * idle queue must always only have a single IO in flight
 		 */
@@ -1327,6 +1331,14 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 			return 0;
 
 		/*
+		 * If a sync request has completed recently, don't overload
+		 * the dispatch queue yet with async requests.
+		 */
+		if (cfqd->cfq_desktop && !cfq_cfqq_sync(cfqq)
+		    && time_before(jiffies, load_at))
+			return 0;
+
+		/*
 		 * we are the only queue, allow up to 4 times of 'quantum'
 		 */
 		if (cfqq->dispatched >= 4 * max_dispatch)
@@ -2158,8 +2170,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	if (cfq_cfqq_sync(cfqq))
 		cfqd->sync_flight--;
 
-	if (sync)
+	if (sync) {
 		RQ_CIC(rq)->last_end_request = now;
+		cfqd->last_end_sync_rq = now;
+	}
 
 	/*
 	 * If this is the active queue, check if it needs to be expired,
@@ -2483,7 +2497,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqd->cfq_slice_idle = cfq_slice_idle;
 	cfqd->cfq_desktop = 1;
 	cfqd->hw_tag = 1;
-
+	cfqd->last_end_sync_rq = jiffies;
 	return cfqd;
 }
 
-- 
cgit v1.1


From 8e2967555571659d2c8a70dd120710110ed7bba4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Sat, 3 Oct 2009 16:26:03 +0200
Subject: cfq-iosched: implement slower async initiate and queue ramp up

This slowly ramps up the async queue depth based on the time
passed since the sync IO, and doesn't allow async at all until
a sync slice period has passed.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c    |  8 ++++++++
 block/cfq-iosched.c | 56 ++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 44 insertions(+), 20 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index ddaaea4..a8c7fbe 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2492,6 +2492,14 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
 }
 EXPORT_SYMBOL(kblockd_schedule_work);
 
+int kblockd_schedule_delayed_work(struct request_queue *q,
+				  struct delayed_work *work,
+				  unsigned long delay)
+{
+	return queue_delayed_work(kblockd_workqueue, work, delay);
+}
+EXPORT_SYMBOL(kblockd_schedule_delayed_work);
+
 int __init blk_dev_init(void)
 {
 	BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 70b48ea..fce8a74 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -150,7 +150,7 @@ struct cfq_data {
 	 * idle window management
 	 */
 	struct timer_list idle_slice_timer;
-	struct work_struct unplug_work;
+	struct delayed_work unplug_work;
 
 	struct cfq_queue *active_queue;
 	struct cfq_io_context *active_cic;
@@ -268,11 +268,13 @@ static inline int cfq_bio_sync(struct bio *bio)
  * scheduler run of queue, if there are requests pending and no one in the
  * driver that will restart queueing
  */
-static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
+static inline void cfq_schedule_dispatch(struct cfq_data *cfqd,
+					 unsigned long delay)
 {
 	if (cfqd->busy_queues) {
 		cfq_log(cfqd, "schedule dispatch");
-		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
+		kblockd_schedule_delayed_work(cfqd->queue, &cfqd->unplug_work,
+						delay);
 	}
 }
 
@@ -1316,8 +1318,6 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 	 * Does this cfqq already have too much IO in flight?
 	 */
 	if (cfqq->dispatched >= max_dispatch) {
-		unsigned long load_at = cfqd->last_end_sync_rq + cfq_slice_sync;
-
 		/*
 		 * idle queue must always only have a single IO in flight
 		 */
@@ -1331,20 +1331,36 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 			return 0;
 
 		/*
-		 * If a sync request has completed recently, don't overload
-		 * the dispatch queue yet with async requests.
+		 * Sole queue user, allow bigger slice
 		 */
-		if (cfqd->cfq_desktop && !cfq_cfqq_sync(cfqq)
-		    && time_before(jiffies, load_at))
-			return 0;
+		max_dispatch *= 4;
+	}
+
+	/*
+	 * Async queues must wait a bit before being allowed dispatch.
+	 * We also ramp up the dispatch depth gradually for async IO,
+	 * based on the last sync IO we serviced
+	 */
+	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_desktop) {
+		unsigned long last_sync = jiffies - cfqd->last_end_sync_rq;
+		unsigned int depth;
 
 		/*
-		 * we are the only queue, allow up to 4 times of 'quantum'
+		 * must wait a bit longer
 		 */
-		if (cfqq->dispatched >= 4 * max_dispatch)
+		if (last_sync < cfq_slice_sync) {
+			cfq_schedule_dispatch(cfqd, cfq_slice_sync - last_sync);
 			return 0;
+		}
+
+		depth = last_sync / cfq_slice_sync;
+		if (depth < max_dispatch)
+			max_dispatch = depth;
 	}
 
+	if (cfqq->dispatched >= max_dispatch)
+		return 0;
+
 	/*
 	 * Dispatch a request from this cfqq
 	 */
@@ -1389,7 +1405,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 
 	if (unlikely(cfqd->active_queue == cfqq)) {
 		__cfq_slice_expired(cfqd, cfqq, 0);
-		cfq_schedule_dispatch(cfqd);
+		cfq_schedule_dispatch(cfqd, 0);
 	}
 
 	kmem_cache_free(cfq_pool, cfqq);
@@ -1484,7 +1500,7 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	if (unlikely(cfqq == cfqd->active_queue)) {
 		__cfq_slice_expired(cfqd, cfqq, 0);
-		cfq_schedule_dispatch(cfqd);
+		cfq_schedule_dispatch(cfqd, 0);
 	}
 
 	cfq_put_queue(cfqq);
@@ -2201,7 +2217,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	}
 
 	if (!rq_in_driver(cfqd))
-		cfq_schedule_dispatch(cfqd);
+		cfq_schedule_dispatch(cfqd, 0);
 }
 
 /*
@@ -2331,7 +2347,7 @@ queue_fail:
 	if (cic)
 		put_io_context(cic->ioc);
 
-	cfq_schedule_dispatch(cfqd);
+	cfq_schedule_dispatch(cfqd, 0);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 	cfq_log(cfqd, "set_request fail");
 	return 1;
@@ -2340,7 +2356,7 @@ queue_fail:
 static void cfq_kick_queue(struct work_struct *work)
 {
 	struct cfq_data *cfqd =
-		container_of(work, struct cfq_data, unplug_work);
+		container_of(work, struct cfq_data, unplug_work.work);
 	struct request_queue *q = cfqd->queue;
 
 	spin_lock_irq(q->queue_lock);
@@ -2394,7 +2410,7 @@ static void cfq_idle_slice_timer(unsigned long data)
 expire:
 	cfq_slice_expired(cfqd, timed_out);
 out_kick:
-	cfq_schedule_dispatch(cfqd);
+	cfq_schedule_dispatch(cfqd, 0);
 out_cont:
 	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
 }
@@ -2402,7 +2418,7 @@ out_cont:
 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
 {
 	del_timer_sync(&cfqd->idle_slice_timer);
-	cancel_work_sync(&cfqd->unplug_work);
+	cancel_delayed_work_sync(&cfqd->unplug_work);
 }
 
 static void cfq_put_async_queues(struct cfq_data *cfqd)
@@ -2484,7 +2500,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
 	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
 
-	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
+	INIT_DELAYED_WORK(&cfqd->unplug_work, cfq_kick_queue);
 
 	cfqd->cfq_quantum = cfq_quantum;
 	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
-- 
cgit v1.1


From 963b72fc6664be12ea52f35a6addea14ec373433 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Sat, 3 Oct 2009 19:42:18 +0200
Subject: cfq-iosched: rename 'desktop' sysfs entry to 'low_latency'

Don't think that's necessarily a perfect description of what this
option fiddles with, but it's probably better than 'desktop'.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index fce8a74..3010e2e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -173,7 +173,7 @@ struct cfq_data {
 	unsigned int cfq_slice[2];
 	unsigned int cfq_slice_async_rq;
 	unsigned int cfq_slice_idle;
-	unsigned int cfq_desktop;
+	unsigned int cfq_latency;
 
 	struct list_head cic_list;
 
@@ -1341,7 +1341,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 	 * We also ramp up the dispatch depth gradually for async IO,
 	 * based on the last sync IO we serviced
 	 */
-	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_desktop) {
+	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
 		unsigned long last_sync = jiffies - cfqd->last_end_sync_rq;
 		unsigned int depth;
 
@@ -1980,7 +1980,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
 
 	if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
-	    (!cfqd->cfq_desktop && cfqd->hw_tag && CIC_SEEKY(cic)))
+	    (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic)))
 		enable_idle = 0;
 	else if (sample_valid(cic->ttime_samples)) {
 		if (cic->ttime_mean > cfqd->cfq_slice_idle)
@@ -2511,7 +2511,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqd->cfq_slice[1] = cfq_slice_sync;
 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
 	cfqd->cfq_slice_idle = cfq_slice_idle;
-	cfqd->cfq_desktop = 1;
+	cfqd->cfq_latency = 1;
 	cfqd->hw_tag = 1;
 	cfqd->last_end_sync_rq = jiffies;
 	return cfqd;
@@ -2581,7 +2581,7 @@ SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
-SHOW_FUNCTION(cfq_desktop_show, cfqd->cfq_desktop, 0);
+SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
@@ -2613,7 +2613,7 @@ STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
 		UINT_MAX, 0);
-STORE_FUNCTION(cfq_desktop_store, &cfqd->cfq_desktop, 0, 1, 0);
+STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
 #undef STORE_FUNCTION
 
 #define CFQ_ATTR(name) \
@@ -2629,7 +2629,7 @@ static struct elv_fs_entry cfq_attrs[] = {
 	CFQ_ATTR(slice_async),
 	CFQ_ATTR(slice_async_rq),
 	CFQ_ATTR(slice_idle),
-	CFQ_ATTR(desktop),
+	CFQ_ATTR(low_latency),
 	__ATTR_NULL
 };
 
-- 
cgit v1.1


From 61f0c1dcaaac71faabac6ef7c839b29f20204bea Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Sat, 3 Oct 2009 19:46:03 +0200
Subject: cfq-iosched: use assigned slice sync value, not default

We should use the sysfs modified slice sync value, in case it differs
from the default.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 3010e2e..ebab60c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1348,12 +1348,13 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 		/*
 		 * must wait a bit longer
 		 */
-		if (last_sync < cfq_slice_sync) {
-			cfq_schedule_dispatch(cfqd, cfq_slice_sync - last_sync);
+		if (last_sync < cfqd->cfq_slice[1]) {
+			cfq_schedule_dispatch(cfqd,
+						cfqd->cfq_slice[1] - last_sync);
 			return 0;
 		}
 
-		depth = last_sync / cfq_slice_sync;
+		depth = last_sync / cfqd->cfq_slice[1];
 		if (depth < max_dispatch)
 			max_dispatch = depth;
 	}
-- 
cgit v1.1


From ac481c20ef8f6c6f2be75d581863f40c43874ef7 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Sat, 3 Oct 2009 20:52:01 +0200
Subject: block: Topology ioctls

Not all users of the topology information want to use libblkid.  Provide
the topology information through bdev ioctls.

Also clarify sector size comments for existing BLK ioctls.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/compat_ioctl.c | 13 +++++++++++++
 block/ioctl.c        | 17 +++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 7865a34..9bd086c 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -21,6 +21,11 @@ static int compat_put_int(unsigned long arg, int val)
 	return put_user(val, (compat_int_t __user *)compat_ptr(arg));
 }
 
+static int compat_put_uint(unsigned long arg, unsigned int val)
+{
+	return put_user(val, (compat_uint_t __user *)compat_ptr(arg));
+}
+
 static int compat_put_long(unsigned long arg, long val)
 {
 	return put_user(val, (compat_long_t __user *)compat_ptr(arg));
@@ -734,6 +739,14 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	switch (cmd) {
 	case HDIO_GETGEO:
 		return compat_hdio_getgeo(disk, bdev, compat_ptr(arg));
+	case BLKPBSZGET:
+		return compat_put_uint(arg, bdev_physical_block_size(bdev));
+	case BLKIOMIN:
+		return compat_put_uint(arg, bdev_io_min(bdev));
+	case BLKIOOPT:
+		return compat_put_uint(arg, bdev_io_opt(bdev));
+	case BLKALIGNOFF:
+		return compat_put_int(arg, bdev_alignment_offset(bdev));
 	case BLKFLSBUF:
 	case BLKROSET:
 	case BLKDISCARD:
diff --git a/block/ioctl.c b/block/ioctl.c
index d3e6b58..1f4d1de 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -138,6 +138,11 @@ static int put_int(unsigned long arg, int val)
 	return put_user(val, (int __user *)arg);
 }
 
+static int put_uint(unsigned long arg, unsigned int val)
+{
+	return put_user(val, (unsigned int __user *)arg);
+}
+
 static int put_long(unsigned long arg, long val)
 {
 	return put_user(val, (long __user *)arg);
@@ -263,10 +268,18 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
 	case BLKROGET:
 		return put_int(arg, bdev_read_only(bdev) != 0);
-	case BLKBSZGET: /* get the logical block size (cf. BLKSSZGET) */
+	case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
 		return put_int(arg, block_size(bdev));
-	case BLKSSZGET: /* get block device hardware sector size */
+	case BLKSSZGET: /* get block device logical block size */
 		return put_int(arg, bdev_logical_block_size(bdev));
+	case BLKPBSZGET: /* get block device physical block size */
+		return put_uint(arg, bdev_physical_block_size(bdev));
+	case BLKIOMIN:
+		return put_uint(arg, bdev_io_min(bdev));
+	case BLKIOOPT:
+		return put_uint(arg, bdev_io_opt(bdev));
+	case BLKALIGNOFF:
+		return put_int(arg, bdev_alignment_offset(bdev));
 	case BLKSECTGET:
 		return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
 	case BLKRASET:
-- 
cgit v1.1


From e00c54c36ac2024c3a8a37432e2e2698ff849594 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Sun, 4 Oct 2009 20:36:19 +0200
Subject: cfq-iosched: don't delay async queue if it hasn't dispatched at all

We cannot delay for the first dispatch of the async queue if it
hasn't dispatched at all, since that could present a local user
DoS attack vector using an app that just did slow timed sync reads
while filling memory.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ebab60c..9c4b679 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1345,16 +1345,9 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 		unsigned long last_sync = jiffies - cfqd->last_end_sync_rq;
 		unsigned int depth;
 
-		/*
-		 * must wait a bit longer
-		 */
-		if (last_sync < cfqd->cfq_slice[1]) {
-			cfq_schedule_dispatch(cfqd,
-						cfqd->cfq_slice[1] - last_sync);
-			return 0;
-		}
-
 		depth = last_sync / cfqd->cfq_slice[1];
+		if (!depth && !cfqq->dispatched)
+			depth = 1;
 		if (depth < max_dispatch)
 			max_dispatch = depth;
 	}
-- 
cgit v1.1


From 0f78ab9899e9d6acb09d5465def618704255963b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Sun, 4 Oct 2009 21:04:38 +0200
Subject: Revert "Seperate read and write statistics of in_flight requests"

This reverts commit a9327cac440be4d8333bba975cbbf76045096275.

Corrado Zoccolo <czoccolo@gmail.com> reports:

"with 2.6.32-rc1 I started getting the following strange output from
"iostat -kx 2":
Linux 2.6.31bisect (et2) 	04/10/2009 	_i686_	(2 CPU)

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
          10,70    0,00    3,16   15,75    0,00   70,38

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s
avgrq-sz avgqu-sz   await  svctm  %util
sda              18,22     0,00    0,67    0,01    14,77     0,02
43,94     0,01   10,53 39043915,03 2629219,87
sdb              60,89     9,68   50,79    3,04  1724,43    50,52
65,95     0,70   13,06 488437,47 2629219,87

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           2,72    0,00    0,74    0,00    0,00   96,53

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s
avgrq-sz avgqu-sz   await  svctm  %util
sda               0,00     0,00    0,00    0,00     0,00     0,00
0,00     0,00    0,00   0,00 100,00
sdb               0,00     0,00    0,00    0,00     0,00     0,00
0,00     0,00    0,00   0,00 100,00

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           6,68    0,00    0,99    0,00    0,00   92,33

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s
avgrq-sz avgqu-sz   await  svctm  %util
sda               0,00     0,00    0,00    0,00     0,00     0,00
0,00     0,00    0,00   0,00 100,00
sdb               0,00     0,00    0,00    0,00     0,00     0,00
0,00     0,00    0,00   0,00 100,00

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           4,40    0,00    0,73    1,47    0,00   93,40

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s
avgrq-sz avgqu-sz   await  svctm  %util
sda               0,00     0,00    0,00    0,00     0,00     0,00
0,00     0,00    0,00   0,00 100,00
sdb               0,00     4,00    0,00    3,00     0,00    28,00
18,67     0,06   19,50 333,33 100,00

Global values for service time and utilization are garbage. For
interval values, utilization is always 100%, and service time is
higher than normal.

I bisected it down to:
[a9327cac440be4d8333bba975cbbf76045096275] Seperate read and write
statistics of in_flight requests
and verified that reverting just that commit indeed solves the issue
on 2.6.32-rc1."

So until this is debugged, revert the bad commit.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c  | 6 +++---
 block/blk-merge.c | 2 +-
 block/genhd.c     | 4 +---
 3 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index a8c7fbe..81f3431 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -70,7 +70,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
 		part_stat_inc(cpu, part, merges[rw]);
 	else {
 		part_round_stats(cpu, part);
-		part_inc_in_flight(part, rw);
+		part_inc_in_flight(part);
 	}
 
 	part_stat_unlock();
@@ -1032,7 +1032,7 @@ static void part_round_stats_single(int cpu, struct hd_struct *part,
 
 	if (part->in_flight) {
 		__part_stat_add(cpu, part, time_in_queue,
-				part_in_flight(part) * (now - part->stamp));
+				part->in_flight * (now - part->stamp));
 		__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
 	}
 	part->stamp = now;
@@ -1739,7 +1739,7 @@ static void blk_account_io_done(struct request *req)
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
 		part_round_stats(cpu, part);
-		part_dec_in_flight(part, rw);
+		part_dec_in_flight(part);
 
 		part_stat_unlock();
 	}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 99cb5cf..b0de857 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
 		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 
 		part_round_stats(cpu, part);
-		part_dec_in_flight(part, rq_data_dir(req));
+		part_dec_in_flight(part);
 
 		part_stat_unlock();
 	}
diff --git a/block/genhd.c b/block/genhd.c
index 517e433..5a0861d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -869,7 +869,6 @@ static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
-static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -889,7 +888,6 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_alignment_offset.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
-	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
@@ -1055,7 +1053,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   part_stat_read(hd, merges[1]),
 			   (unsigned long long)part_stat_read(hd, sectors[1]),
 			   jiffies_to_msecs(part_stat_read(hd, ticks[1])),
-			   part_in_flight(hd),
+			   hd->in_flight,
 			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
 			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
 			);
-- 
cgit v1.1