From 484fc254b88257a2d8b3759aa062e8e8b35e0988 Mon Sep 17 00:00:00 2001
From: Wang Sheng-Hui <shhuiw@gmail.com>
Date: Thu, 8 Sep 2011 12:32:14 +0200
Subject: elevator: use ELV_NAME_MAX instead of magic number 16 for
 chosen_elevator

We have ELV_NAME_MAX defined to 16, and hence we should use it
instead of the magic nubmer 16 for elevator's name string.

Signed-off-by: Wang Sheng-Hui <shhuiw@gmail.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/elevator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/elevator.c b/block/elevator.c
index a3b64bc..cb332cb 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -182,7 +182,7 @@ static void elevator_attach(struct request_queue *q, struct elevator_queue *eq,
 	eq->elevator_data = data;
 }
 
-static char chosen_elevator[16];
+static char chosen_elevator[ELV_NAME_MAX];
 
 static int __init elevator_setup(char *str)
 {
-- 
cgit v1.1


From 166e1f901b01872e8b70733a3f2e2c6980389cf8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 12 Sep 2011 12:08:27 +0200
Subject: block: export __make_request

Avoid the hacks need for request based device mappers currently by simply
exporting the symbol instead of trying to get it through the back door.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c       |  5 ++---
 drivers/md/dm.c        | 13 +------------
 include/linux/blkdev.h |  2 ++
 3 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index b627558..56ef387 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -38,8 +38,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 
-static int __make_request(struct request_queue *q, struct bio *bio);
-
 /*
  * For the allocated request tables
  */
@@ -1213,7 +1211,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	blk_rq_bio_prep(req->q, req, bio);
 }
 
-static int __make_request(struct request_queue *q, struct bio *bio)
+int __make_request(struct request_queue *q, struct bio *bio)
 {
 	const bool sync = !!(bio->bi_rw & REQ_SYNC);
 	struct blk_plug *plug;
@@ -1317,6 +1315,7 @@ out_unlock:
 out:
 	return 0;
 }
+EXPORT_SYMBOL_GPL(__make_request);	/* for device mapper only */
 
 /*
  * If bio->bi_dev is a partition, remap the location
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 52b39f3..d8d7b8d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -180,9 +180,6 @@ struct mapped_device {
 	/* forced geometry settings */
 	struct hd_geometry geometry;
 
-	/* For saving the address of __make_request for request based dm */
-	make_request_fn *saved_make_request_fn;
-
 	/* sysfs handle */
 	struct kobject kobj;
 
@@ -1420,13 +1417,6 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
 	return 0;
 }
 
-static int dm_make_request(struct request_queue *q, struct bio *bio)
-{
-	struct mapped_device *md = q->queuedata;
-
-	return md->saved_make_request_fn(q, bio); /* call __make_request() */
-}
-
 static int dm_request_based(struct mapped_device *md)
 {
 	return blk_queue_stackable(md->queue);
@@ -1437,7 +1427,7 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 	struct mapped_device *md = q->queuedata;
 
 	if (dm_request_based(md))
-		return dm_make_request(q, bio);
+		return __make_request(q, bio);
 
 	return _dm_request(q, bio);
 }
@@ -2172,7 +2162,6 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 		return 0;
 
 	md->queue = q;
-	md->saved_make_request_fn = md->queue->make_request_fn;
 	dm_init_md_queue(md);
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0e67c45..e9c3d9b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -675,6 +675,8 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 			 struct scsi_ioctl_command __user *);
 
+extern int __make_request(struct request_queue *q, struct bio *bio);
+
 /*
  * A queue has just exitted congestion.  Note this in the global counter of
  * congested queues, and wake up anyone who was waiting for requests to be
-- 
cgit v1.1


From c20e8de27fef9f59869c81c288ad6cf28200e00c Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Mon, 12 Sep 2011 12:03:37 +0200
Subject: block: rename __make_request() to blk_queue_bio()

Now that it's exported, lets put it in a more sane namespace.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c       | 6 +++---
 drivers/md/dm.c        | 2 +-
 include/linux/blkdev.h | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 56ef387..ab673f0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -540,7 +540,7 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
 	/*
 	 * This also sets hw/phys segments, boundary and size
 	 */
-	blk_queue_make_request(q, __make_request);
+	blk_queue_make_request(q, blk_queue_bio);
 
 	q->sg_reserved_size = INT_MAX;
 
@@ -1211,7 +1211,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	blk_rq_bio_prep(req->q, req, bio);
 }
 
-int __make_request(struct request_queue *q, struct bio *bio)
+int blk_queue_bio(struct request_queue *q, struct bio *bio)
 {
 	const bool sync = !!(bio->bi_rw & REQ_SYNC);
 	struct blk_plug *plug;
@@ -1315,7 +1315,7 @@ out_unlock:
 out:
 	return 0;
 }
-EXPORT_SYMBOL_GPL(__make_request);	/* for device mapper only */
+EXPORT_SYMBOL_GPL(blk_queue_bio);	/* for device mapper only */
 
 /*
  * If bio->bi_dev is a partition, remap the location
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d8d7b8d..78b2086 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1427,7 +1427,7 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 	struct mapped_device *md = q->queuedata;
 
 	if (dm_request_based(md))
-		return __make_request(q, bio);
+		return blk_queue_bio(q, bio);
 
 	return _dm_request(q, bio);
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e9c3d9b..085f954 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -675,7 +675,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 			 struct scsi_ioctl_command __user *);
 
-extern int __make_request(struct request_queue *q, struct bio *bio);
+extern int blk_queue_bio(struct request_queue *q, struct bio *bio);
 
 /*
  * A queue has just exitted congestion.  Note this in the global counter of
-- 
cgit v1.1


From 5a7bbad27a410350e64a2d7f5ec18fc73836c14f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 12 Sep 2011 12:12:01 +0200
Subject: block: remove support for bio remapping from ->make_request

There is very little benefit in allowing to let a ->make_request
instance update the bios device and sector and loop around it in
__generic_make_request when we can archive the same through calling
generic_make_request from the driver and letting the loop in
generic_make_request handle it.

Note that various drivers got the return value from ->make_request and
returned non-zero values for errors.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 arch/m68k/emu/nfblock.c         |   3 +-
 arch/powerpc/sysdev/axonram.c   |   8 +--
 block/blk-core.c                | 153 ++++++++++++++++------------------------
 drivers/block/aoe/aoeblk.c      |  14 ++--
 drivers/block/brd.c             |   4 +-
 drivers/block/drbd/drbd_int.h   |   2 +-
 drivers/block/drbd/drbd_req.c   |   8 +--
 drivers/block/loop.c            |   5 +-
 drivers/block/pktcdvd.c         |  11 ++-
 drivers/block/ps3vram.c         |   6 +-
 drivers/block/umem.c            |   4 +-
 drivers/md/dm.c                 |  14 ++--
 drivers/md/faulty.c             |  14 ++--
 drivers/md/linear.c             |  17 ++---
 drivers/md/md.c                 |  12 ++--
 drivers/md/md.h                 |   2 +-
 drivers/md/multipath.c          |   8 +--
 drivers/md/raid0.c              |  22 +++---
 drivers/md/raid1.c              |   8 +--
 drivers/md/raid10.c             |  19 +++--
 drivers/md/raid5.c              |   8 +--
 drivers/s390/block/dcssblk.c    |   7 +-
 drivers/s390/block/xpram.c      |   5 +-
 drivers/staging/zram/zram_drv.c |   8 +--
 include/linux/blkdev.h          |   4 +-
 25 files changed, 151 insertions(+), 215 deletions(-)

diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
index 48e50f8..e301133 100644
--- a/arch/m68k/emu/nfblock.c
+++ b/arch/m68k/emu/nfblock.c
@@ -59,7 +59,7 @@ struct nfhd_device {
 	struct gendisk *disk;
 };
 
-static int nfhd_make_request(struct request_queue *queue, struct bio *bio)
+static void nfhd_make_request(struct request_queue *queue, struct bio *bio)
 {
 	struct nfhd_device *dev = queue->queuedata;
 	struct bio_vec *bvec;
@@ -76,7 +76,6 @@ static int nfhd_make_request(struct request_queue *queue, struct bio *bio)
 		sec += len;
 	}
 	bio_endio(bio, 0);
-	return 0;
 }
 
 static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index 265f0f0..ba42719 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -104,7 +104,7 @@ axon_ram_irq_handler(int irq, void *dev)
  * axon_ram_make_request - make_request() method for block device
  * @queue, @bio: see blk_queue_make_request()
  */
-static int
+static void
 axon_ram_make_request(struct request_queue *queue, struct bio *bio)
 {
 	struct axon_ram_bank *bank = bio->bi_bdev->bd_disk->private_data;
@@ -113,7 +113,6 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
 	struct bio_vec *vec;
 	unsigned int transfered;
 	unsigned short idx;
-	int rc = 0;
 
 	phys_mem = bank->io_addr + (bio->bi_sector << AXON_RAM_SECTOR_SHIFT);
 	phys_end = bank->io_addr + bank->size;
@@ -121,8 +120,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
 	bio_for_each_segment(vec, bio, idx) {
 		if (unlikely(phys_mem + vec->bv_len > phys_end)) {
 			bio_io_error(bio);
-			rc = -ERANGE;
-			break;
+			return;
 		}
 
 		user_mem = page_address(vec->bv_page) + vec->bv_offset;
@@ -135,8 +133,6 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
 		transfered += vec->bv_len;
 	}
 	bio_endio(bio, 0);
-
-	return rc;
 }
 
 /**
diff --git a/block/blk-core.c b/block/blk-core.c
index ab673f0..f58e019 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1211,7 +1211,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	blk_rq_bio_prep(req->q, req, bio);
 }
 
-int blk_queue_bio(struct request_queue *q, struct bio *bio)
+void blk_queue_bio(struct request_queue *q, struct bio *bio)
 {
 	const bool sync = !!(bio->bi_rw & REQ_SYNC);
 	struct blk_plug *plug;
@@ -1236,7 +1236,7 @@ int blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 * any locks.
 	 */
 	if (attempt_plug_merge(current, q, bio))
-		goto out;
+		return;
 
 	spin_lock_irq(q->queue_lock);
 
@@ -1312,8 +1312,6 @@ get_rq:
 out_unlock:
 		spin_unlock_irq(q->queue_lock);
 	}
-out:
-	return 0;
 }
 EXPORT_SYMBOL_GPL(blk_queue_bio);	/* for device mapper only */
 
@@ -1441,112 +1439,85 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
 static inline void __generic_make_request(struct bio *bio)
 {
 	struct request_queue *q;
-	sector_t old_sector;
-	int ret, nr_sectors = bio_sectors(bio);
-	dev_t old_dev;
+	int nr_sectors = bio_sectors(bio);
 	int err = -EIO;
+	char b[BDEVNAME_SIZE];
+	struct hd_struct *part;
 
 	might_sleep();
 
 	if (bio_check_eod(bio, nr_sectors))
 		goto end_io;
 
-	/*
-	 * Resolve the mapping until finished. (drivers are
-	 * still free to implement/resolve their own stacking
-	 * by explicitly returning 0)
-	 *
-	 * NOTE: we don't repeat the blk_size check for each new device.
-	 * Stacking drivers are expected to know what they are doing.
-	 */
-	old_sector = -1;
-	old_dev = 0;
-	do {
-		char b[BDEVNAME_SIZE];
-		struct hd_struct *part;
-
-		q = bdev_get_queue(bio->bi_bdev);
-		if (unlikely(!q)) {
-			printk(KERN_ERR
-			       "generic_make_request: Trying to access "
-				"nonexistent block-device %s (%Lu)\n",
-				bdevname(bio->bi_bdev, b),
-				(long long) bio->bi_sector);
-			goto end_io;
-		}
-
-		if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
-			     nr_sectors > queue_max_hw_sectors(q))) {
-			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
-			       bdevname(bio->bi_bdev, b),
-			       bio_sectors(bio),
-			       queue_max_hw_sectors(q));
-			goto end_io;
-		}
-
-		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
-			goto end_io;
-
-		part = bio->bi_bdev->bd_part;
-		if (should_fail_request(part, bio->bi_size) ||
-		    should_fail_request(&part_to_disk(part)->part0,
-					bio->bi_size))
-			goto end_io;
+	q = bdev_get_queue(bio->bi_bdev);
+	if (unlikely(!q)) {
+		printk(KERN_ERR
+		       "generic_make_request: Trying to access "
+			"nonexistent block-device %s (%Lu)\n",
+			bdevname(bio->bi_bdev, b),
+			(long long) bio->bi_sector);
+		goto end_io;
+	}
 
-		/*
-		 * If this device has partitions, remap block n
-		 * of partition p to block n+start(p) of the disk.
-		 */
-		blk_partition_remap(bio);
+	if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
+		     nr_sectors > queue_max_hw_sectors(q))) {
+		printk(KERN_ERR "bio too big device %s (%u > %u)\n",
+		       bdevname(bio->bi_bdev, b),
+		       bio_sectors(bio),
+		       queue_max_hw_sectors(q));
+		goto end_io;
+	}
 
-		if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
-			goto end_io;
+	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+		goto end_io;
 
-		if (old_sector != -1)
-			trace_block_bio_remap(q, bio, old_dev, old_sector);
+	part = bio->bi_bdev->bd_part;
+	if (should_fail_request(part, bio->bi_size) ||
+	    should_fail_request(&part_to_disk(part)->part0,
+				bio->bi_size))
+		goto end_io;
 
-		old_sector = bio->bi_sector;
-		old_dev = bio->bi_bdev->bd_dev;
+	/*
+	 * If this device has partitions, remap block n
+	 * of partition p to block n+start(p) of the disk.
+	 */
+	blk_partition_remap(bio);
 
-		if (bio_check_eod(bio, nr_sectors))
-			goto end_io;
+	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
+		goto end_io;
 
-		/*
-		 * Filter flush bio's early so that make_request based
-		 * drivers without flush support don't have to worry
-		 * about them.
-		 */
-		if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
-			bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
-			if (!nr_sectors) {
-				err = 0;
-				goto end_io;
-			}
-		}
+	if (bio_check_eod(bio, nr_sectors))
+		goto end_io;
 
-		if ((bio->bi_rw & REQ_DISCARD) &&
-		    (!blk_queue_discard(q) ||
-		     ((bio->bi_rw & REQ_SECURE) &&
-		      !blk_queue_secdiscard(q)))) {
-			err = -EOPNOTSUPP;
+	/*
+	 * Filter flush bio's early so that make_request based
+	 * drivers without flush support don't have to worry
+	 * about them.
+	 */
+	if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+		bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+		if (!nr_sectors) {
+			err = 0;
 			goto end_io;
 		}
+	}
 
-		if (blk_throtl_bio(q, &bio))
-			goto end_io;
-
-		/*
-		 * If bio = NULL, bio has been throttled and will be submitted
-		 * later.
-		 */
-		if (!bio)
-			break;
-
-		trace_block_bio_queue(q, bio);
+	if ((bio->bi_rw & REQ_DISCARD) &&
+	    (!blk_queue_discard(q) ||
+	     ((bio->bi_rw & REQ_SECURE) &&
+	      !blk_queue_secdiscard(q)))) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}
 
-		ret = q->make_request_fn(q, bio);
-	} while (ret);
+	if (blk_throtl_bio(q, &bio))
+		goto end_io;
 
+	/* if bio = NULL, bio has been throttled and will be submitted later. */
+	if (!bio)
+		return;
+	trace_block_bio_queue(q, bio);
+	q->make_request_fn(q, bio);
 	return;
 
 end_io:
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 528f631..167ba0a 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -159,7 +159,7 @@ aoeblk_release(struct gendisk *disk, fmode_t mode)
 	return 0;
 }
 
-static int
+static void
 aoeblk_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct sk_buff_head queue;
@@ -172,25 +172,25 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
 	if (bio == NULL) {
 		printk(KERN_ERR "aoe: bio is NULL\n");
 		BUG();
-		return 0;
+		return;
 	}
 	d = bio->bi_bdev->bd_disk->private_data;
 	if (d == NULL) {
 		printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n");
 		BUG();
 		bio_endio(bio, -ENXIO);
-		return 0;
+		return;
 	} else if (bio->bi_io_vec == NULL) {
 		printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
 		BUG();
 		bio_endio(bio, -ENXIO);
-		return 0;
+		return;
 	}
 	buf = mempool_alloc(d->bufpool, GFP_NOIO);
 	if (buf == NULL) {
 		printk(KERN_INFO "aoe: buf allocation failure\n");
 		bio_endio(bio, -ENOMEM);
-		return 0;
+		return;
 	}
 	memset(buf, 0, sizeof(*buf));
 	INIT_LIST_HEAD(&buf->bufs);
@@ -211,7 +211,7 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
 		spin_unlock_irqrestore(&d->lock, flags);
 		mempool_free(buf, d->bufpool);
 		bio_endio(bio, -ENXIO);
-		return 0;
+		return;
 	}
 
 	list_add_tail(&buf->bufs, &d->bufq);
@@ -222,8 +222,6 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
 
 	spin_unlock_irqrestore(&d->lock, flags);
 	aoenet_xmit(&queue);
-
-	return 0;
 }
 
 static int
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index dba1c32..d22119d 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -323,7 +323,7 @@ out:
 	return err;
 }
 
-static int brd_make_request(struct request_queue *q, struct bio *bio)
+static void brd_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct block_device *bdev = bio->bi_bdev;
 	struct brd_device *brd = bdev->bd_disk->private_data;
@@ -359,8 +359,6 @@ static int brd_make_request(struct request_queue *q, struct bio *bio)
 
 out:
 	bio_endio(bio, err);
-
-	return 0;
 }
 
 #ifdef CONFIG_BLK_DEV_XIP
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index ef2ceed..36eee39 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1507,7 +1507,7 @@ extern void drbd_free_mdev(struct drbd_conf *mdev);
 extern int proc_details;
 
 /* drbd_req */
-extern int drbd_make_request(struct request_queue *q, struct bio *bio);
+extern void drbd_make_request(struct request_queue *q, struct bio *bio);
 extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
 extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
 extern int is_valid_ar_handle(struct drbd_request *, sector_t);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3424d67..4a0f314 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1073,7 +1073,7 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
 	return 0;
 }
 
-int drbd_make_request(struct request_queue *q, struct bio *bio)
+void drbd_make_request(struct request_queue *q, struct bio *bio)
 {
 	unsigned int s_enr, e_enr;
 	struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
@@ -1081,7 +1081,7 @@ int drbd_make_request(struct request_queue *q, struct bio *bio)
 
 	if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
 		bio_endio(bio, -EPERM);
-		return 0;
+		return;
 	}
 
 	start_time = jiffies;
@@ -1100,7 +1100,8 @@ int drbd_make_request(struct request_queue *q, struct bio *bio)
 
 	if (likely(s_enr == e_enr)) {
 		inc_ap_bio(mdev, 1);
-		return drbd_make_request_common(mdev, bio, start_time);
+		drbd_make_request_common(mdev, bio, start_time);
+		return;
 	}
 
 	/* can this bio be split generically?
@@ -1148,7 +1149,6 @@ int drbd_make_request(struct request_queue *q, struct bio *bio)
 
 		bio_pair_release(bp);
 	}
-	return 0;
 }
 
 /* This is called by bio_add_page().  With this function we reduce
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 76c8da7..8360239 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -514,7 +514,7 @@ static struct bio *loop_get_bio(struct loop_device *lo)
 	return bio_list_pop(&lo->lo_bio_list);
 }
 
-static int loop_make_request(struct request_queue *q, struct bio *old_bio)
+static void loop_make_request(struct request_queue *q, struct bio *old_bio)
 {
 	struct loop_device *lo = q->queuedata;
 	int rw = bio_rw(old_bio);
@@ -532,12 +532,11 @@ static int loop_make_request(struct request_queue *q, struct bio *old_bio)
 	loop_add_bio(lo, old_bio);
 	wake_up(&lo->lo_event);
 	spin_unlock_irq(&lo->lo_lock);
-	return 0;
+	return;
 
 out:
 	spin_unlock_irq(&lo->lo_lock);
 	bio_io_error(old_bio);
-	return 0;
 }
 
 struct switch_request {
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index e133f09..a63b0a2 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2444,7 +2444,7 @@ static void pkt_end_io_read_cloned(struct bio *bio, int err)
 	pkt_bio_finished(pd);
 }
 
-static int pkt_make_request(struct request_queue *q, struct bio *bio)
+static void pkt_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct pktcdvd_device *pd;
 	char b[BDEVNAME_SIZE];
@@ -2473,7 +2473,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
 		cloned_bio->bi_end_io = pkt_end_io_read_cloned;
 		pd->stats.secs_r += bio->bi_size >> 9;
 		pkt_queue_bio(pd, cloned_bio);
-		return 0;
+		return;
 	}
 
 	if (!test_bit(PACKET_WRITABLE, &pd->flags)) {
@@ -2509,7 +2509,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
 			pkt_make_request(q, &bp->bio1);
 			pkt_make_request(q, &bp->bio2);
 			bio_pair_release(bp);
-			return 0;
+			return;
 		}
 	}
 
@@ -2533,7 +2533,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
 				}
 				spin_unlock(&pkt->lock);
 				spin_unlock(&pd->cdrw.active_list_lock);
-				return 0;
+				return;
 			} else {
 				blocked_bio = 1;
 			}
@@ -2584,10 +2584,9 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
 		 */
 		wake_up(&pd->wqueue);
 	}
-	return 0;
+	return;
 end_io:
 	bio_io_error(bio);
-	return 0;
 }
 
 
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index b3bdb8a..7fad7af 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -596,7 +596,7 @@ out:
 	return next;
 }
 
-static int ps3vram_make_request(struct request_queue *q, struct bio *bio)
+static void ps3vram_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct ps3_system_bus_device *dev = q->queuedata;
 	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
@@ -610,13 +610,11 @@ static int ps3vram_make_request(struct request_queue *q, struct bio *bio)
 	spin_unlock_irq(&priv->lock);
 
 	if (busy)
-		return 0;
+		return;
 
 	do {
 		bio = ps3vram_do_bio(dev, bio);
 	} while (bio);
-
-	return 0;
 }
 
 static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev)
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 031ca72..aa27120 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -513,7 +513,7 @@ static void process_page(unsigned long data)
 	}
 }
 
-static int mm_make_request(struct request_queue *q, struct bio *bio)
+static void mm_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct cardinfo *card = q->queuedata;
 	pr_debug("mm_make_request %llu %u\n",
@@ -525,7 +525,7 @@ static int mm_make_request(struct request_queue *q, struct bio *bio)
 	card->biotail = &bio->bi_next;
 	spin_unlock_irq(&card->lock);
 
-	return 0;
+	return;
 }
 
 static irqreturn_t mm_interrupt(int irq, void *__card)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 78b2086..7b986e7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1388,7 +1388,7 @@ out:
  * The request function that just remaps the bio built up by
  * dm_merge_bvec.
  */
-static int _dm_request(struct request_queue *q, struct bio *bio)
+static void _dm_request(struct request_queue *q, struct bio *bio)
 {
 	int rw = bio_data_dir(bio);
 	struct mapped_device *md = q->queuedata;
@@ -1409,12 +1409,12 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
 			queue_io(md, bio);
 		else
 			bio_io_error(bio);
-		return 0;
+		return;
 	}
 
 	__split_and_process_bio(md, bio);
 	up_read(&md->io_lock);
-	return 0;
+	return;
 }
 
 static int dm_request_based(struct mapped_device *md)
@@ -1422,14 +1422,14 @@ static int dm_request_based(struct mapped_device *md)
 	return blk_queue_stackable(md->queue);
 }
 
-static int dm_request(struct request_queue *q, struct bio *bio)
+static void dm_request(struct request_queue *q, struct bio *bio)
 {
 	struct mapped_device *md = q->queuedata;
 
 	if (dm_request_based(md))
-		return blk_queue_bio(q, bio);
-
-	return _dm_request(q, bio);
+		blk_queue_bio(q, bio);
+	else
+		_dm_request(q, bio);
 }
 
 void dm_dispatch_request(struct request *rq)
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 23078da..5ef304d 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -169,7 +169,7 @@ static void add_sector(conf_t *conf, sector_t start, int mode)
 		conf->nfaults = n+1;
 }
 
-static int make_request(mddev_t *mddev, struct bio *bio)
+static void make_request(mddev_t *mddev, struct bio *bio)
 {
 	conf_t *conf = mddev->private;
 	int failit = 0;
@@ -181,7 +181,7 @@ static int make_request(mddev_t *mddev, struct bio *bio)
 			 * just fail immediately
 			 */
 			bio_endio(bio, -EIO);
-			return 0;
+			return;
 		}
 
 		if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9),
@@ -211,15 +211,15 @@ static int make_request(mddev_t *mddev, struct bio *bio)
 	}
 	if (failit) {
 		struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev);
+
 		b->bi_bdev = conf->rdev->bdev;
 		b->bi_private = bio;
 		b->bi_end_io = faulty_fail;
-		generic_make_request(b);
-		return 0;
-	} else {
+		bio = b;
+	} else
 		bio->bi_bdev = conf->rdev->bdev;
-		return 1;
-	}
+
+	generic_make_request(bio);
 }
 
 static void status(struct seq_file *seq, mddev_t *mddev)
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 6cd2c31..c6ee491 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -264,14 +264,14 @@ static int linear_stop (mddev_t *mddev)
 	return 0;
 }
 
-static int linear_make_request (mddev_t *mddev, struct bio *bio)
+static void linear_make_request (mddev_t *mddev, struct bio *bio)
 {
 	dev_info_t *tmp_dev;
 	sector_t start_sector;
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
-		return 0;
+		return;
 	}
 
 	rcu_read_lock();
@@ -293,7 +293,7 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
 		       (unsigned long long)start_sector);
 		rcu_read_unlock();
 		bio_io_error(bio);
-		return 0;
+		return;
 	}
 	if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
 		     tmp_dev->end_sector)) {
@@ -307,20 +307,17 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
 
 		bp = bio_split(bio, end_sector - bio->bi_sector);
 
-		if (linear_make_request(mddev, &bp->bio1))
-			generic_make_request(&bp->bio1);
-		if (linear_make_request(mddev, &bp->bio2))
-			generic_make_request(&bp->bio2);
+		linear_make_request(mddev, &bp->bio1);
+		linear_make_request(mddev, &bp->bio2);
 		bio_pair_release(bp);
-		return 0;
+		return;
 	}
 		    
 	bio->bi_bdev = tmp_dev->rdev->bdev;
 	bio->bi_sector = bio->bi_sector - start_sector
 		+ tmp_dev->rdev->data_offset;
 	rcu_read_unlock();
-
-	return 1;
+	generic_make_request(bio);
 }
 
 static void linear_status (struct seq_file *seq, mddev_t *mddev)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8e221a2..5c21785 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -330,18 +330,17 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
  * call has finished, the bio has been linked into some internal structure
  * and so is visible to ->quiesce(), so we don't need the refcount any more.
  */
-static int md_make_request(struct request_queue *q, struct bio *bio)
+static void md_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int rw = bio_data_dir(bio);
 	mddev_t *mddev = q->queuedata;
-	int rv;
 	int cpu;
 	unsigned int sectors;
 
 	if (mddev == NULL || mddev->pers == NULL
 	    || !mddev->ready) {
 		bio_io_error(bio);
-		return 0;
+		return;
 	}
 	smp_rmb(); /* Ensure implications of  'active' are visible */
 	rcu_read_lock();
@@ -366,7 +365,7 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 	 * go away inside make_request
 	 */
 	sectors = bio_sectors(bio);
-	rv = mddev->pers->make_request(mddev, bio);
+	mddev->pers->make_request(mddev, bio);
 
 	cpu = part_stat_lock();
 	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
@@ -375,8 +374,6 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 
 	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
 		wake_up(&mddev->sb_wait);
-
-	return rv;
 }
 
 /* mddev_suspend makes sure no new requests are submitted
@@ -475,8 +472,7 @@ static void md_submit_flush_data(struct work_struct *ws)
 		bio_endio(bio, 0);
 	else {
 		bio->bi_rw &= ~REQ_FLUSH;
-		if (mddev->pers->make_request(mddev, bio))
-			generic_make_request(bio);
+		mddev->pers->make_request(mddev, bio);
 	}
 
 	mddev->flush_bio = NULL;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1e586bb..bd47847 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -424,7 +424,7 @@ struct mdk_personality
 	int level;
 	struct list_head list;
 	struct module *owner;
-	int (*make_request)(mddev_t *mddev, struct bio *bio);
+	void (*make_request)(mddev_t *mddev, struct bio *bio);
 	int (*run)(mddev_t *mddev);
 	int (*stop)(mddev_t *mddev);
 	void (*status)(struct seq_file *seq, mddev_t *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 3535c23..407cb56 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -106,7 +106,7 @@ static void multipath_end_request(struct bio *bio, int error)
 	rdev_dec_pending(rdev, conf->mddev);
 }
 
-static int multipath_make_request(mddev_t *mddev, struct bio * bio)
+static void multipath_make_request(mddev_t *mddev, struct bio * bio)
 {
 	multipath_conf_t *conf = mddev->private;
 	struct multipath_bh * mp_bh;
@@ -114,7 +114,7 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
-		return 0;
+		return;
 	}
 
 	mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
@@ -126,7 +126,7 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 	if (mp_bh->path < 0) {
 		bio_endio(bio, -EIO);
 		mempool_free(mp_bh, conf->pool);
-		return 0;
+		return;
 	}
 	multipath = conf->multipaths + mp_bh->path;
 
@@ -137,7 +137,7 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 	mp_bh->bio.bi_end_io = multipath_end_request;
 	mp_bh->bio.bi_private = mp_bh;
 	generic_make_request(&mp_bh->bio);
-	return 0;
+	return;
 }
 
 static void multipath_status (struct seq_file *seq, mddev_t *mddev)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e86bf36..4066615 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -466,7 +466,7 @@ static inline int is_io_in_chunk_boundary(mddev_t *mddev,
 	}
 }
 
-static int raid0_make_request(mddev_t *mddev, struct bio *bio)
+static void raid0_make_request(mddev_t *mddev, struct bio *bio)
 {
 	unsigned int chunk_sects;
 	sector_t sector_offset;
@@ -475,7 +475,7 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
-		return 0;
+		return;
 	}
 
 	chunk_sects = mddev->chunk_sectors;
@@ -495,13 +495,10 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
 		else
 			bp = bio_split(bio, chunk_sects -
 				       sector_div(sector, chunk_sects));
-		if (raid0_make_request(mddev, &bp->bio1))
-			generic_make_request(&bp->bio1);
-		if (raid0_make_request(mddev, &bp->bio2))
-			generic_make_request(&bp->bio2);
-
+		raid0_make_request(mddev, &bp->bio1);
+		raid0_make_request(mddev, &bp->bio2);
 		bio_pair_release(bp);
-		return 0;
+		return;
 	}
 
 	sector_offset = bio->bi_sector;
@@ -511,10 +508,9 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
 	bio->bi_bdev = tmp_dev->bdev;
 	bio->bi_sector = sector_offset + zone->dev_start +
 		tmp_dev->data_offset;
-	/*
-	 * Let the main block layer submit the IO and resolve recursion:
-	 */
-	return 1;
+
+	generic_make_request(bio);
+	return;
 
 bad_map:
 	printk("md/raid0:%s: make_request bug: can't convert block across chunks"
@@ -523,7 +519,7 @@ bad_map:
 	       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
 
 	bio_io_error(bio);
-	return 0;
+	return;
 }
 
 static void raid0_status(struct seq_file *seq, mddev_t *mddev)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 32323f0..97f2a5f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -785,7 +785,7 @@ do_sync_io:
 	PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 }
 
-static int make_request(mddev_t *mddev, struct bio * bio)
+static void make_request(mddev_t *mddev, struct bio * bio)
 {
 	conf_t *conf = mddev->private;
 	mirror_info_t *mirror;
@@ -870,7 +870,7 @@ read_again:
 		if (rdisk < 0) {
 			/* couldn't find anywhere to read from */
 			raid_end_bio_io(r1_bio);
-			return 0;
+			return;
 		}
 		mirror = conf->mirrors + rdisk;
 
@@ -928,7 +928,7 @@ read_again:
 			goto read_again;
 		} else
 			generic_make_request(read_bio);
-		return 0;
+		return;
 	}
 
 	/*
@@ -1119,8 +1119,6 @@ read_again:
 
 	if (do_sync || !bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
-
-	return 0;
 }
 
 static void status(struct seq_file *seq, mddev_t *mddev)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8b29cd4..04b625e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -825,7 +825,7 @@ static void unfreeze_array(conf_t *conf)
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static int make_request(mddev_t *mddev, struct bio * bio)
+static void make_request(mddev_t *mddev, struct bio * bio)
 {
 	conf_t *conf = mddev->private;
 	mirror_info_t *mirror;
@@ -844,7 +844,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
-		return 0;
+		return;
 	}
 
 	/* If this request crosses a chunk boundary, we need to
@@ -876,10 +876,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		conf->nr_waiting++;
 		spin_unlock_irq(&conf->resync_lock);
 
-		if (make_request(mddev, &bp->bio1))
-			generic_make_request(&bp->bio1);
-		if (make_request(mddev, &bp->bio2))
-			generic_make_request(&bp->bio2);
+		make_request(mddev, &bp->bio1);
+		make_request(mddev, &bp->bio2);
 
 		spin_lock_irq(&conf->resync_lock);
 		conf->nr_waiting--;
@@ -887,14 +885,14 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		spin_unlock_irq(&conf->resync_lock);
 
 		bio_pair_release(bp);
-		return 0;
+		return;
 	bad_map:
 		printk("md/raid10:%s: make_request bug: can't convert block across chunks"
 		       " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
 		       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
 
 		bio_io_error(bio);
-		return 0;
+		return;
 	}
 
 	md_write_start(mddev, bio);
@@ -937,7 +935,7 @@ read_again:
 		slot = r10_bio->read_slot;
 		if (disk < 0) {
 			raid_end_bio_io(r10_bio);
-			return 0;
+			return;
 		}
 		mirror = conf->mirrors + disk;
 
@@ -985,7 +983,7 @@ read_again:
 			goto read_again;
 		} else
 			generic_make_request(read_bio);
-		return 0;
+		return;
 	}
 
 	/*
@@ -1157,7 +1155,6 @@ retry_write:
 
 	if (do_sync || !mddev->bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
-	return 0;
 }
 
 static void status(struct seq_file *seq, mddev_t *mddev)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dbae459..96b7f6a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3695,7 +3695,7 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
 	return sh;
 }
 
-static int make_request(mddev_t *mddev, struct bio * bi)
+static void make_request(mddev_t *mddev, struct bio * bi)
 {
 	raid5_conf_t *conf = mddev->private;
 	int dd_idx;
@@ -3708,7 +3708,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 
 	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bi);
-		return 0;
+		return;
 	}
 
 	md_write_start(mddev, bi);
@@ -3716,7 +3716,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 	if (rw == READ &&
 	     mddev->reshape_position == MaxSector &&
 	     chunk_aligned_read(mddev,bi))
-		return 0;
+		return;
 
 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
@@ -3851,8 +3851,6 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 
 		bio_endio(bi, 0);
 	}
-
-	return 0;
 }
 
 static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 9b43ae9..a5a55da2 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -27,7 +27,7 @@
 
 static int dcssblk_open(struct block_device *bdev, fmode_t mode);
 static int dcssblk_release(struct gendisk *disk, fmode_t mode);
-static int dcssblk_make_request(struct request_queue *q, struct bio *bio);
+static void dcssblk_make_request(struct request_queue *q, struct bio *bio);
 static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
 				 void **kaddr, unsigned long *pfn);
 
@@ -814,7 +814,7 @@ out:
 	return rc;
 }
 
-static int
+static void
 dcssblk_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct dcssblk_dev_info *dev_info;
@@ -871,10 +871,9 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio)
 		bytes_done += bvec->bv_len;
 	}
 	bio_endio(bio, 0);
-	return 0;
+	return;
 fail:
 	bio_io_error(bio);
-	return 0;
 }
 
 static int
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index 1f6a4d8..98f3e4a 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -181,7 +181,7 @@ static unsigned long xpram_highest_page_index(void)
 /*
  * Block device make request function.
  */
-static int xpram_make_request(struct request_queue *q, struct bio *bio)
+static void xpram_make_request(struct request_queue *q, struct bio *bio)
 {
 	xpram_device_t *xdev = bio->bi_bdev->bd_disk->private_data;
 	struct bio_vec *bvec;
@@ -221,10 +221,9 @@ static int xpram_make_request(struct request_queue *q, struct bio *bio)
 	}
 	set_bit(BIO_UPTODATE, &bio->bi_flags);
 	bio_endio(bio, 0);
-	return 0;
+	return;
 fail:
 	bio_io_error(bio);
-	return 0;
 }
 
 static int xpram_getgeo(struct block_device *bdev, struct hd_geometry *geo)
diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c
index d70ec1a..02589ca 100644
--- a/drivers/staging/zram/zram_drv.c
+++ b/drivers/staging/zram/zram_drv.c
@@ -556,24 +556,22 @@ static inline int valid_io_request(struct zram *zram, struct bio *bio)
 /*
  * Handler function for all zram I/O requests.
  */
-static int zram_make_request(struct request_queue *queue, struct bio *bio)
+static void zram_make_request(struct request_queue *queue, struct bio *bio)
 {
 	struct zram *zram = queue->queuedata;
 
 	if (!valid_io_request(zram, bio)) {
 		zram_stat64_inc(zram, &zram->stats.invalid_io);
 		bio_io_error(bio);
-		return 0;
+		return;
 	}
 
 	if (unlikely(!zram->init_done) && zram_init_device(zram)) {
 		bio_io_error(bio);
-		return 0;
+		return;
 	}
 
 	__zram_make_request(zram, bio, bio_data_dir(bio));
-
-	return 0;
 }
 
 void zram_reset_device(struct zram *zram)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 085f954..c712efd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -193,7 +193,7 @@ struct request_pm_state
 #include <linux/elevator.h>
 
 typedef void (request_fn_proc) (struct request_queue *q);
-typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
+typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
 typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
 
@@ -675,7 +675,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 			 struct scsi_ioctl_command __user *);
 
-extern int blk_queue_bio(struct request_queue *q, struct bio *bio);
+extern void blk_queue_bio(struct request_queue *q, struct bio *bio);
 
 /*
  * A queue has just exitted congestion.  Note this in the global counter of
-- 
cgit v1.1


From 27a84d54c02591e815d291ae0ee4bfb9cfd21065 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 15 Sep 2011 14:01:40 +0200
Subject: block: refactor generic_make_request

Move all the checks performed on a bio into a new helper, and call it as
soon as bio is submitted even if it is a re-submission from ->make_request.

We explicitly mark the new helper as beeing non-inlined as the stack
usage for printing the block device name in the failure case is quite
high and this a patch where we have to be extremely conservative about
stack usage.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 95 +++++++++++++++++++++++++++++---------------------------
 1 file changed, 49 insertions(+), 46 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index f58e019..684d7eb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1412,31 +1412,8 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
 	return 0;
 }
 
-/**
- * generic_make_request - hand a buffer to its device driver for I/O
- * @bio:  The bio describing the location in memory and on the device.
- *
- * generic_make_request() is used to make I/O requests of block
- * devices. It is passed a &struct bio, which describes the I/O that needs
- * to be done.
- *
- * generic_make_request() does not return any status.  The
- * success/failure status of the request, along with notification of
- * completion, is delivered asynchronously through the bio->bi_end_io
- * function described (one day) else where.
- *
- * The caller of generic_make_request must make sure that bi_io_vec
- * are set to describe the memory buffer, and that bi_dev and bi_sector are
- * set to describe the device address, and the
- * bi_end_io and optionally bi_private are set to describe how
- * completion notification should be signaled.
- *
- * generic_make_request and the drivers it calls may use bi_next if this
- * bio happens to be merged with someone else, and may change bi_dev and
- * bi_sector for remaps as it sees fit.  So the values of these fields
- * should NOT be depended on after the call to generic_make_request.
- */
-static inline void __generic_make_request(struct bio *bio)
+static noinline_for_stack bool
+generic_make_request_checks(struct bio *bio)
 {
 	struct request_queue *q;
 	int nr_sectors = bio_sectors(bio);
@@ -1515,35 +1492,62 @@ static inline void __generic_make_request(struct bio *bio)
 
 	/* if bio = NULL, bio has been throttled and will be submitted later. */
 	if (!bio)
-		return;
+		return false;
+
 	trace_block_bio_queue(q, bio);
-	q->make_request_fn(q, bio);
-	return;
+	return true;
 
 end_io:
 	bio_endio(bio, err);
+	return false;
 }
 
-/*
- * We only want one ->make_request_fn to be active at a time,
- * else stack usage with stacked devices could be a problem.
- * So use current->bio_list to keep a list of requests
- * submited by a make_request_fn function.
- * current->bio_list is also used as a flag to say if
- * generic_make_request is currently active in this task or not.
- * If it is NULL, then no make_request is active.  If it is non-NULL,
- * then a make_request is active, and new requests should be added
- * at the tail
+/**
+ * generic_make_request - hand a buffer to its device driver for I/O
+ * @bio:  The bio describing the location in memory and on the device.
+ *
+ * generic_make_request() is used to make I/O requests of block
+ * devices. It is passed a &struct bio, which describes the I/O that needs
+ * to be done.
+ *
+ * generic_make_request() does not return any status.  The
+ * success/failure status of the request, along with notification of
+ * completion, is delivered asynchronously through the bio->bi_end_io
+ * function described (one day) else where.
+ *
+ * The caller of generic_make_request must make sure that bi_io_vec
+ * are set to describe the memory buffer, and that bi_dev and bi_sector are
+ * set to describe the device address, and the
+ * bi_end_io and optionally bi_private are set to describe how
+ * completion notification should be signaled.
+ *
+ * generic_make_request and the drivers it calls may use bi_next if this
+ * bio happens to be merged with someone else, and may resubmit the bio to
+ * a lower device by calling into generic_make_request recursively, which
+ * means the bio should NOT be touched after the call to ->make_request_fn.
  */
 void generic_make_request(struct bio *bio)
 {
 	struct bio_list bio_list_on_stack;
 
+	if (!generic_make_request_checks(bio))
+		return;
+
+	/*
+	 * We only want one ->make_request_fn to be active at a time, else
+	 * stack usage with stacked devices could be a problem.  So use
+	 * current->bio_list to keep a list of requests submited by a
+	 * make_request_fn function.  current->bio_list is also used as a
+	 * flag to say if generic_make_request is currently active in this
+	 * task or not.  If it is NULL, then no make_request is active.  If
+	 * it is non-NULL, then a make_request is active, and new requests
+	 * should be added at the tail
+	 */
 	if (current->bio_list) {
-		/* make_request is active */
 		bio_list_add(current->bio_list, bio);
 		return;
 	}
+
 	/* following loop may be a bit non-obvious, and so deserves some
 	 * explanation.
 	 * Before entering the loop, bio->bi_next is NULL (as all callers
@@ -1551,22 +1555,21 @@ void generic_make_request(struct bio *bio)
 	 * We pretend that we have just taken it off a longer list, so
 	 * we assign bio_list to a pointer to the bio_list_on_stack,
 	 * thus initialising the bio_list of new bios to be
-	 * added.  __generic_make_request may indeed add some more bios
+	 * added.  ->make_request() may indeed add some more bios
 	 * through a recursive call to generic_make_request.  If it
 	 * did, we find a non-NULL value in bio_list and re-enter the loop
 	 * from the top.  In this case we really did just take the bio
 	 * of the top of the list (no pretending) and so remove it from
-	 * bio_list, and call into __generic_make_request again.
-	 *
-	 * The loop was structured like this to make only one call to
-	 * __generic_make_request (which is important as it is large and
-	 * inlined) and to keep the structure simple.
+	 * bio_list, and call into ->make_request() again.
 	 */
 	BUG_ON(bio->bi_next);
 	bio_list_init(&bio_list_on_stack);
 	current->bio_list = &bio_list_on_stack;
 	do {
-		__generic_make_request(bio);
+		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+		q->make_request_fn(q, bio);
+
 		bio = bio_list_pop(current->bio_list);
 	} while (bio);
 	current->bio_list = NULL; /* deactivate */
-- 
cgit v1.1


From 75df713627f28f88b901b329c8857747545fd4ab Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Wed, 21 Sep 2011 10:00:16 +0200
Subject: block: document blk-plug

Thus spake Andrew Morton:

"And I have the usual maintainability whine.  If someone comes up to
vmscan.c and sees it calling blk_start_plug(), how are they supposed to
work out why that call is there?  They go look at the blk_start_plug()
definition and it is undocumented.  I think we can do better than this?"

Adapted from the LWN article - http://lwn.net/Articles/438256/ by Jens
Axboe and from an earlier attempt by Shaohua Li to document blk-plug.

[akpm@linux-foundation.org: grammatical and spelling tweaks]
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 14 ++++++++++++++
 include/linux/blkdev.h | 24 +++++++++++++++---------
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 684d7eb..97e9e54 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2595,6 +2595,20 @@ EXPORT_SYMBOL(kblockd_schedule_delayed_work);
 
 #define PLUG_MAGIC	0x91827364
 
+/**
+ * blk_start_plug - initialize blk_plug and track it inside the task_struct
+ * @plug:	The &struct blk_plug that needs to be initialized
+ *
+ * Description:
+ *   Tracking blk_plug inside the task_struct will help with auto-flushing the
+ *   pending I/O should the task end up blocking between blk_start_plug() and
+ *   blk_finish_plug(). This is important from a performance perspective, but
+ *   also ensures that we don't deadlock. For instance, if the task is blocking
+ *   for a memory allocation, memory reclaim could end up wanting to free a
+ *   page belonging to that request that is currently residing in our private
+ *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
+ *   this kind of deadlock.
+ */
 void blk_start_plug(struct blk_plug *plug)
 {
 	struct task_struct *tsk = current;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c712efd..1978655 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -860,17 +860,23 @@ struct request_queue *blk_alloc_queue_node(gfp_t, int);
 extern void blk_put_queue(struct request_queue *);
 
 /*
- * Note: Code in between changing the blk_plug list/cb_list or element of such
- * lists is preemptable, but such code can't do sleep (or be very careful),
- * otherwise data is corrupted. For details, please check schedule() where
- * blk_schedule_flush_plug() is called.
+ * blk_plug permits building a queue of related requests by holding the I/O
+ * fragments for a short period. This allows merging of sequential requests
+ * into single larger request. As the requests are moved from a per-task list to
+ * the device's request_queue in a batch, this results in improved scalability
+ * as the lock contention for request_queue lock is reduced.
+ *
+ * It is ok not to disable preemption when adding the request to the plug list
+ * or when attempting a merge, because blk_schedule_flush_list() will only flush
+ * the plug list when the task sleeps by itself. For details, please see
+ * schedule() where blk_schedule_flush_plug() is called.
  */
 struct blk_plug {
-	unsigned long magic;
-	struct list_head list;
-	struct list_head cb_list;
-	unsigned int should_sort;
-	unsigned int count;
+	unsigned long magic; /* detect uninitialized use-cases */
+	struct list_head list; /* requests */
+	struct list_head cb_list; /* md requires an unplug callback */
+	unsigned int should_sort; /* list to be sorted before flushing? */
+	unsigned int count; /* number of queued requests */
 };
 #define BLK_MAX_REQUEST_COUNT 16
 
-- 
cgit v1.1


From 499337bb6511e665a236a6a947f819d98ea340c6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 21 Sep 2011 10:01:22 +0200
Subject: block/blk-sysfs.c: fix kerneldoc references

The kerneldoc for blk_release_queue() is referring to blk_cleanup_queue().

Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-sysfs.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 0ee17b5..adc923e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -455,11 +455,11 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
 }
 
 /**
- * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed
- * @kobj:    the kobj belonging of the request queue to be released
+ * blk_release_queue: - release a &struct request_queue when it is no longer needed
+ * @kobj:    the kobj belonging to the request queue to be released
  *
  * Description:
- *     blk_cleanup_queue is the pair to blk_init_queue() or
+ *     blk_release_queue is the pair to blk_init_queue() or
  *     blk_queue_make_request().  It should be called when a request queue is
  *     being released; typically when a block device is being de-registered.
  *     Currently, its primary task it to free all the &struct request
-- 
cgit v1.1


From 456be1484ffc72a24bdb4200b5847c4fa90139d9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 17 Oct 2011 12:57:20 +0200
Subject: loop: remove the incorrect write_begin/write_end shortcut

Currently the loop device tries to call directly into write_begin/write_end
instead of going through ->write if it can.  This is a fairly nasty shortcut
as write_begin and write_end are only callbacks for the generic write code
and expect to be called with filesystem specific locks held.

This code currently causes various issues for clustered filesystems as it
doesn't take the required cluster locks, and it also causes issues for XFS
as it doesn't properly lock against the swapext ioctl as called by the
defragmentation tools.  This in case causes data corruption if
defragmentation hits a busy loop device in the wrong time window, as
reported by RH QA.

The reason why we have this shortcut is that it saves a data copy when
doing a transformation on the loop device, which is the technical term
for using cryptoloop (or an XOR transformation).  Given that cryptoloop
has been deprecated in favour of dm-crypt my opinion is that we should
simply drop this shortcut instead of finding complicated ways to to
introduce a formal interface for this shortcut.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 135 +++++++++------------------------------------------
 include/linux/loop.h |   1 -
 2 files changed, 23 insertions(+), 113 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 4720c7a..46cdd69 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -203,74 +203,6 @@ lo_do_transfer(struct loop_device *lo, int cmd,
 }
 
 /**
- * do_lo_send_aops - helper for writing data to a loop device
- *
- * This is the fast version for backing filesystems which implement the address
- * space operations write_begin and write_end.
- */
-static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
-		loff_t pos, struct page *unused)
-{
-	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
-	struct address_space *mapping = file->f_mapping;
-	pgoff_t index;
-	unsigned offset, bv_offs;
-	int len, ret;
-
-	mutex_lock(&mapping->host->i_mutex);
-	index = pos >> PAGE_CACHE_SHIFT;
-	offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1);
-	bv_offs = bvec->bv_offset;
-	len = bvec->bv_len;
-	while (len > 0) {
-		sector_t IV;
-		unsigned size, copied;
-		int transfer_result;
-		struct page *page;
-		void *fsdata;
-
-		IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
-		size = PAGE_CACHE_SIZE - offset;
-		if (size > len)
-			size = len;
-
-		ret = pagecache_write_begin(file, mapping, pos, size, 0,
-							&page, &fsdata);
-		if (ret)
-			goto fail;
-
-		file_update_time(file);
-
-		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
-				bvec->bv_page, bv_offs, size, IV);
-		copied = size;
-		if (unlikely(transfer_result))
-			copied = 0;
-
-		ret = pagecache_write_end(file, mapping, pos, size, copied,
-							page, fsdata);
-		if (ret < 0 || ret != copied)
-			goto fail;
-
-		if (unlikely(transfer_result))
-			goto fail;
-
-		bv_offs += copied;
-		len -= copied;
-		offset = 0;
-		index++;
-		pos += copied;
-	}
-	ret = 0;
-out:
-	mutex_unlock(&mapping->host->i_mutex);
-	return ret;
-fail:
-	ret = -1;
-	goto out;
-}
-
-/**
  * __do_lo_send_write - helper for writing data to a loop device
  *
  * This helper just factors out common code between do_lo_send_direct_write()
@@ -297,10 +229,8 @@ static int __do_lo_send_write(struct file *file,
 /**
  * do_lo_send_direct_write - helper for writing data to a loop device
  *
- * This is the fast, non-transforming version for backing filesystems which do
- * not implement the address space operations write_begin and write_end.
- * It uses the write file operation which should be present on all writeable
- * filesystems.
+ * This is the fast, non-transforming version that does not need double
+ * buffering.
  */
 static int do_lo_send_direct_write(struct loop_device *lo,
 		struct bio_vec *bvec, loff_t pos, struct page *page)
@@ -316,15 +246,9 @@ static int do_lo_send_direct_write(struct loop_device *lo,
 /**
  * do_lo_send_write - helper for writing data to a loop device
  *
- * This is the slow, transforming version for filesystems which do not
- * implement the address space operations write_begin and write_end.  It
- * uses the write file operation which should be present on all writeable
- * filesystems.
- *
- * Using fops->write is slower than using aops->{prepare,commit}_write in the
- * transforming case because we need to double buffer the data as we cannot do
- * the transformations in place as we do not have direct access to the
- * destination pages of the backing file.
+ * This is the slow, transforming version that needs to double buffer the
+ * data as it cannot do the transformations in place without having direct
+ * access to the destination pages of the backing file.
  */
 static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
 		loff_t pos, struct page *page)
@@ -350,17 +274,16 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
 	struct page *page = NULL;
 	int i, ret = 0;
 
-	do_lo_send = do_lo_send_aops;
-	if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) {
+	if (lo->transfer != transfer_none) {
+		page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
+		if (unlikely(!page))
+			goto fail;
+		kmap(page);
+		do_lo_send = do_lo_send_write;
+	} else {
 		do_lo_send = do_lo_send_direct_write;
-		if (lo->transfer != transfer_none) {
-			page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
-			if (unlikely(!page))
-				goto fail;
-			kmap(page);
-			do_lo_send = do_lo_send_write;
-		}
 	}
+
 	bio_for_each_segment(bvec, bio, i) {
 		ret = do_lo_send(lo, bvec, pos, page);
 		if (ret < 0)
@@ -849,35 +772,23 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	mapping = file->f_mapping;
 	inode = mapping->host;
 
-	if (!(file->f_mode & FMODE_WRITE))
-		lo_flags |= LO_FLAGS_READ_ONLY;
-
 	error = -EINVAL;
-	if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-		const struct address_space_operations *aops = mapping->a_ops;
-
-		if (aops->write_begin)
-			lo_flags |= LO_FLAGS_USE_AOPS;
-		if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
-			lo_flags |= LO_FLAGS_READ_ONLY;
+	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+		goto out_putf;
 
-		lo_blocksize = S_ISBLK(inode->i_mode) ?
-			inode->i_bdev->bd_block_size : PAGE_SIZE;
+	if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) ||
+	    !file->f_op->write)
+		lo_flags |= LO_FLAGS_READ_ONLY;
 
-		error = 0;
-	} else {
-		goto out_putf;
-	}
+	lo_blocksize = S_ISBLK(inode->i_mode) ?
+		inode->i_bdev->bd_block_size : PAGE_SIZE;
 
+	error = -EFBIG;
 	size = get_loop_size(lo, file);
-
-	if ((loff_t)(sector_t)size != size) {
-		error = -EFBIG;
+	if ((loff_t)(sector_t)size != size)
 		goto out_putf;
-	}
 
-	if (!(mode & FMODE_WRITE))
-		lo_flags |= LO_FLAGS_READ_ONLY;
+	error = 0;
 
 	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
 
diff --git a/include/linux/loop.h b/include/linux/loop.h
index 683d698..a068806 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -73,7 +73,6 @@ struct loop_device {
  */
 enum {
 	LO_FLAGS_READ_ONLY	= 1,
-	LO_FLAGS_USE_AOPS	= 2,
 	LO_FLAGS_AUTOCLEAR	= 4,
 };
 
-- 
cgit v1.1


From 523e1d399ce0e23bec562abe2b2f8d297af81161 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:31:07 +0200
Subject: block: make gendisk hold a reference to its queue

The following command sequence triggers an oops.

# mount /dev/sdb1 /mnt
# echo 1 > /sys/class/scsi_device/0\:0\:1\:0/device/delete
# umount /mnt

 general protection fault: 0000 [#1] PREEMPT SMP
 CPU 2
 Modules linked in:

 Pid: 791, comm: umount Not tainted 3.1.0-rc3-work+ #8 Bochs Bochs
 RIP: 0010:[<ffffffff810d0879>]  [<ffffffff810d0879>] __lock_acquire+0x389/0x1d60
...
 Call Trace:
  [<ffffffff810d2845>] lock_acquire+0x95/0x140
  [<ffffffff81aed87b>] _raw_spin_lock+0x3b/0x50
  [<ffffffff811573bc>] bdi_lock_two+0x5c/0x70
  [<ffffffff811c2f6c>] bdev_inode_switch_bdi+0x4c/0xf0
  [<ffffffff811c3fcb>] __blkdev_put+0x11b/0x1d0
  [<ffffffff811c4010>] __blkdev_put+0x160/0x1d0
  [<ffffffff811c40df>] blkdev_put+0x5f/0x190
  [<ffffffff8118f18d>] kill_block_super+0x4d/0x80
  [<ffffffff8118f4a5>] deactivate_locked_super+0x45/0x70
  [<ffffffff8119003a>] deactivate_super+0x4a/0x70
  [<ffffffff811ac4ad>] mntput_no_expire+0xed/0x130
  [<ffffffff811acf2e>] sys_umount+0x7e/0x3a0
  [<ffffffff81aeeeab>] system_call_fastpath+0x16/0x1b

This is because bdev holds on to disk but disk doesn't pin the
associated queue.  If a SCSI device is removed while the device is
still open, the sdev puts the base reference to the queue on release.
When the bdev is finally released, the associated queue is already
gone along with the bdi and bdev_inode_switch_bdi() ends up
dereferencing already freed bdi.

Even if it were not for this bug, disk not holding onto the associated
queue is very unusual and error-prone.

Fix it by making add_disk() take an extra reference to its queue and
put it on disk_release() and ensuring that disk and its fops owner are
put in that order after all accesses to the disk and queue are
complete.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c  |  8 ++++++++
 fs/block_dev.c | 13 ++++++++-----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index e2f6790..d261b73 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -611,6 +611,12 @@ void add_disk(struct gendisk *disk)
 	register_disk(disk);
 	blk_register_queue(disk);
 
+	/*
+	 * Take an extra ref on queue which will be put on disk_release()
+	 * so that it sticks around as long as @disk is there.
+	 */
+	WARN_ON_ONCE(blk_get_queue(disk->queue));
+
 	retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
 				   "bdi");
 	WARN_ON(retval);
@@ -1095,6 +1101,8 @@ static void disk_release(struct device *dev)
 	disk_replace_part_tbl(disk, NULL);
 	free_part_stats(&disk->part0);
 	free_part_info(&disk->part0);
+	if (disk->queue)
+		blk_put_queue(disk->queue);
 	kfree(disk);
 }
 struct class block_class = {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 95f786e..1c44b8d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	struct gendisk *disk;
+	struct module *owner;
 	int ret;
 	int partno;
 	int perm = 0;
@@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk = get_gendisk(bdev->bd_dev, &partno);
 	if (!disk)
 		goto out;
+	owner = disk->fops->owner;
 
 	disk_block_events(disk);
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
@@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 					bdev->bd_disk = NULL;
 					mutex_unlock(&bdev->bd_mutex);
 					disk_unblock_events(disk);
-					module_put(disk->fops->owner);
 					put_disk(disk);
+					module_put(owner);
 					goto restart;
 				}
 			}
@@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_unlock_bdev;
 		}
 		/* only one opener holds refs to the module and disk */
-		module_put(disk->fops->owner);
 		put_disk(disk);
+		module_put(owner);
 	}
 	bdev->bd_openers++;
 	if (for_part)
@@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 	disk_unblock_events(disk);
-	module_put(disk->fops->owner);
 	put_disk(disk);
+	module_put(owner);
  out:
 	bdput(bdev);
 
@@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 	if (!bdev->bd_openers) {
 		struct module *owner = disk->fops->owner;
 
-		put_disk(disk);
-		module_put(owner);
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
 		if (bdev != bdev->bd_contains)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
+
+		put_disk(disk);
+		module_put(owner);
 	}
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
-- 
cgit v1.1


From ece84241b93c4693bfe0a8ec9a043a16d216d0cd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:31:15 +0200
Subject: block: fix genhd refcounting in blkio_policy_parse_and_set()

blkio_policy_parse_and_set() calls blkio_check_dev_num() to check
whether the given dev_t is valid.  blkio_check_dev_num() uses
get_gendisk() for verification but never puts the returned genhd
leaking the reference.

This patch collapses blkio_check_dev_num() into its caller and updates
it such that the genhd is put before returning.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 56 ++++++++++++++++++++++--------------------------------
 1 file changed, 23 insertions(+), 33 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b596e54..d61ec56 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -768,25 +768,14 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
 	return disk_total;
 }
 
-static int blkio_check_dev_num(dev_t dev)
-{
-	int part = 0;
-	struct gendisk *disk;
-
-	disk = get_gendisk(dev, &part);
-	if (!disk || part)
-		return -ENODEV;
-
-	return 0;
-}
-
 static int blkio_policy_parse_and_set(char *buf,
 	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
 {
+	struct gendisk *disk = NULL;
 	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
-	int ret;
 	unsigned long major, minor;
-	int i = 0;
+	int i = 0, ret = -EINVAL;
+	int part;
 	dev_t dev;
 	u64 temp;
 
@@ -804,37 +793,36 @@ static int blkio_policy_parse_and_set(char *buf,
 	}
 
 	if (i != 2)
-		return -EINVAL;
+		goto out;
 
 	p = strsep(&s[0], ":");
 	if (p != NULL)
 		major_s = p;
 	else
-		return -EINVAL;
+		goto out;
 
 	minor_s = s[0];
 	if (!minor_s)
-		return -EINVAL;
+		goto out;
 
-	ret = strict_strtoul(major_s, 10, &major);
-	if (ret)
-		return -EINVAL;
+	if (strict_strtoul(major_s, 10, &major))
+		goto out;
 
-	ret = strict_strtoul(minor_s, 10, &minor);
-	if (ret)
-		return -EINVAL;
+	if (strict_strtoul(minor_s, 10, &minor))
+		goto out;
 
 	dev = MKDEV(major, minor);
 
-	ret = strict_strtoull(s[1], 10, &temp);
-	if (ret)
-		return -EINVAL;
+	if (strict_strtoull(s[1], 10, &temp))
+		goto out;
 
 	/* For rule removal, do not check for device presence. */
 	if (temp) {
-		ret = blkio_check_dev_num(dev);
-		if (ret)
-			return ret;
+		disk = get_gendisk(dev, &part);
+		if (!disk || part) {
+			ret = -ENODEV;
+			goto out;
+		}
 	}
 
 	newpn->dev = dev;
@@ -843,7 +831,7 @@ static int blkio_policy_parse_and_set(char *buf,
 	case BLKIO_POLICY_PROP:
 		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
 		     temp > BLKIO_WEIGHT_MAX)
-			return -EINVAL;
+			goto out;
 
 		newpn->plid = plid;
 		newpn->fileid = fileid;
@@ -860,7 +848,7 @@ static int blkio_policy_parse_and_set(char *buf,
 		case BLKIO_THROTL_read_iops_device:
 		case BLKIO_THROTL_write_iops_device:
 			if (temp > THROTL_IOPS_MAX)
-				return -EINVAL;
+				goto out;
 
 			newpn->plid = plid;
 			newpn->fileid = fileid;
@@ -871,8 +859,10 @@ static int blkio_policy_parse_and_set(char *buf,
 	default:
 		BUG();
 	}
-
-	return 0;
+	ret = 0;
+out:
+	put_disk(disk);
+	return ret;
 }
 
 unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
-- 
cgit v1.1


From bc9fcbf9cb8ec76d340da16fbf48a9a316e14c52 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:31:18 +0200
Subject: block: move blk_throtl prototypes to block/blk.h

blk_throtl interface is block internal and there's no reason to have
them in linux/blkdev.h.  Move them to block/blk.h.

This patch doesn't introduce any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-throttle.c   |  1 +
 block/blk.h            | 15 ++++++++++++++-
 include/linux/blkdev.h | 14 --------------
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a19f58c..f3f495e 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -10,6 +10,7 @@
 #include <linux/bio.h>
 #include <linux/blktrace_api.h>
 #include "blk-cgroup.h"
+#include "blk.h"
 
 /* Max dispatch from a group in 1 round */
 static int throtl_grp_quantum = 8;
diff --git a/block/blk.h b/block/blk.h
index 20b900a..da247ba 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -188,4 +188,17 @@ static inline int blk_do_io_stat(struct request *rq)
 	        (rq->cmd_flags & REQ_DISCARD));
 }
 
-#endif
+#ifdef CONFIG_BLK_DEV_THROTTLING
+extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
+extern int blk_throtl_init(struct request_queue *q);
+extern void blk_throtl_exit(struct request_queue *q);
+#else /* CONFIG_BLK_DEV_THROTTLING */
+static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
+{
+	return 0;
+}
+static inline int blk_throtl_init(struct request_queue *q) { return 0; }
+static inline void blk_throtl_exit(struct request_queue *q) { }
+#endif /* CONFIG_BLK_DEV_THROTTLING */
+
+#endif /* BLK_INTERNAL_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0b68044..5267cd2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1197,20 +1197,6 @@ static inline uint64_t rq_io_start_time_ns(struct request *req)
 }
 #endif
 
-#ifdef CONFIG_BLK_DEV_THROTTLING
-extern int blk_throtl_init(struct request_queue *q);
-extern void blk_throtl_exit(struct request_queue *q);
-extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
-#else /* CONFIG_BLK_DEV_THROTTLING */
-static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
-{
-	return 0;
-}
-
-static inline int blk_throtl_init(struct request_queue *q) { return 0; }
-static inline int blk_throtl_exit(struct request_queue *q) { return 0; }
-#endif /* CONFIG_BLK_DEV_THROTTLING */
-
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
-- 
cgit v1.1


From 75eb6c372d41d6d140b893873f6687d78c987a44 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:31:22 +0200
Subject: block: pass around REQ_* flags instead of broken down booleans during
 request alloc/free

blk_alloc_request() and freed_request() take different combinations of
REQ_* @flags, @priv and @is_sync when @flags is superset of the latter
two.  Make them take @flags only.  This cleans up the code a bit and
will ease updating allocation related REQ_* flags.

This patch doesn't introduce any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 79e41a7..a3d2fdc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -574,7 +574,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq)
 }
 
 static struct request *
-blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask)
+blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
 {
 	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
 
@@ -585,12 +585,10 @@ blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask)
 
 	rq->cmd_flags = flags | REQ_ALLOCED;
 
-	if (priv) {
-		if (unlikely(elv_set_request(q, rq, gfp_mask))) {
-			mempool_free(rq, q->rq.rq_pool);
-			return NULL;
-		}
-		rq->cmd_flags |= REQ_ELVPRIV;
+	if ((flags & REQ_ELVPRIV) &&
+	    unlikely(elv_set_request(q, rq, gfp_mask))) {
+		mempool_free(rq, q->rq.rq_pool);
+		return NULL;
 	}
 
 	return rq;
@@ -649,12 +647,13 @@ static void __freed_request(struct request_queue *q, int sync)
  * A request has just been released.  Account for it, update the full and
  * congestion status, wake up any waiters.   Called under q->queue_lock.
  */
-static void freed_request(struct request_queue *q, int sync, int priv)
+static void freed_request(struct request_queue *q, unsigned int flags)
 {
 	struct request_list *rl = &q->rq;
+	int sync = rw_is_sync(flags);
 
 	rl->count[sync]--;
-	if (priv)
+	if (flags & REQ_ELVPRIV)
 		rl->elvpriv--;
 
 	__freed_request(q, sync);
@@ -694,7 +693,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	struct request_list *rl = &q->rq;
 	struct io_context *ioc = NULL;
 	const bool is_sync = rw_is_sync(rw_flags) != 0;
-	int may_queue, priv = 0;
+	int may_queue;
 
 	may_queue = elv_may_queue(q, rw_flags);
 	if (may_queue == ELV_MQUEUE_NO)
@@ -738,17 +737,17 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	rl->count[is_sync]++;
 	rl->starved[is_sync] = 0;
 
-	if (blk_rq_should_init_elevator(bio)) {
-		priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-		if (priv)
-			rl->elvpriv++;
+	if (blk_rq_should_init_elevator(bio) &&
+	    !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
+		rw_flags |= REQ_ELVPRIV;
+		rl->elvpriv++;
 	}
 
 	if (blk_queue_io_stat(q))
 		rw_flags |= REQ_IO_STAT;
 	spin_unlock_irq(q->queue_lock);
 
-	rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
+	rq = blk_alloc_request(q, rw_flags, gfp_mask);
 	if (unlikely(!rq)) {
 		/*
 		 * Allocation failed presumably due to memory. Undo anything
@@ -758,7 +757,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 		 * wait queue, but this is pretty rare.
 		 */
 		spin_lock_irq(q->queue_lock);
-		freed_request(q, is_sync, priv);
+		freed_request(q, rw_flags);
 
 		/*
 		 * in the very unlikely event that allocation failed and no
@@ -1050,14 +1049,13 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 	 * it didn't come out of our reserved rq pools
 	 */
 	if (req->cmd_flags & REQ_ALLOCED) {
-		int is_sync = rq_is_sync(req) != 0;
-		int priv = req->cmd_flags & REQ_ELVPRIV;
+		unsigned int flags = req->cmd_flags;
 
 		BUG_ON(!list_empty(&req->queuelist));
 		BUG_ON(!hlist_unhashed(&req->hash));
 
 		blk_free_request(q, req);
-		freed_request(q, is_sync, priv);
+		freed_request(q, flags);
 	}
 }
 EXPORT_SYMBOL_GPL(__blk_put_request);
-- 
cgit v1.1


From 315fceee81155ef2aeed9316ca72aeea9347db5c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:31:25 +0200
Subject: block: drop unnecessary blk_get/put_queue() in scsi_cmd_ioctl() and
 blk_get_tg()

blk_get/put_queue() in scsi_cmd_ioctl() and throtl_get_tg() are
completely bogus.  The caller must have a reference to the queue on
entry and taking an extra reference doesn't change anything.

For scsi_cmd_ioctl(), the only effect is that it ends up checking
QUEUE_FLAG_DEAD on entry; however, this is bogus as queue can die
right after blk_get_queue().  Dead queue should be and is handled in
request issue path (it's somewhat broken now but that's a separate
problem and doesn't affect this one much).

throtl_get_tg() incorrectly assumes that q is rcu freed.  Also, it
doesn't check return value of blk_get_queue().  If the queue is
already dead, it ends up doing an extra put.

Drop them.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-throttle.c | 8 +-------
 block/scsi_ioctl.c   | 3 +--
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f3f495e..ecba5fc 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -324,12 +324,8 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	/*
 	 * Need to allocate a group. Allocation of group also needs allocation
 	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
-	 * we need to drop rcu lock and queue_lock before we call alloc
-	 *
-	 * Take the request queue reference to make sure queue does not
-	 * go away once we return from allocation.
+	 * we need to drop rcu lock and queue_lock before we call alloc.
 	 */
-	blk_get_queue(q);
 	rcu_read_unlock();
 	spin_unlock_irq(q->queue_lock);
 
@@ -339,13 +335,11 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	 * dead
 	 */
 	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
-		blk_put_queue(q);
 		if (tg)
 			kfree(tg);
 
 		return ERR_PTR(-ENODEV);
 	}
-	blk_put_queue(q);
 
 	/* Group allocated and queue is still alive. take the lock */
 	spin_lock_irq(q->queue_lock);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 4f4230b..fbdf0d8 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -565,7 +565,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
 {
 	int err;
 
-	if (!q || blk_get_queue(q))
+	if (!q)
 		return -ENXIO;
 
 	switch (cmd) {
@@ -686,7 +686,6 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
 			err = -ENOTTY;
 	}
 
-	blk_put_queue(q);
 	return err;
 }
 EXPORT_SYMBOL(scsi_cmd_ioctl);
-- 
cgit v1.1


From e3c78ca524d230bc145e902625e88c392a58ddf3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:32:38 +0200
Subject: block: reorganize queue draining

Reorganize queue draining related code in preparation of queue exit
changes.

* Factor out actual draining from elv_quiesce_start() to
  blk_drain_queue().

* Make elv_quiesce_start/end() responsible for their own locking.

* Replace open-coded ELVSWITCH clearing in elevator_switch() with
  elv_quiesce_end().

This patch doesn't cause any visible functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 28 ++++++++++++++++++++++++++++
 block/blk.h      |  1 +
 block/elevator.c | 37 +++++++++++--------------------------
 3 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a3d2fdc..149149d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -28,6 +28,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/fault-inject.h>
 #include <linux/list_sort.h>
+#include <linux/delay.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -345,6 +346,33 @@ void blk_put_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_put_queue);
 
+/**
+ * blk_drain_queue - drain requests from request_queue
+ * @q: queue to drain
+ *
+ * Drain ELV_PRIV requests from @q.  The caller is responsible for ensuring
+ * that no new requests which need to be drained are queued.
+ */
+void blk_drain_queue(struct request_queue *q)
+{
+	while (true) {
+		int nr_rqs;
+
+		spin_lock_irq(q->queue_lock);
+
+		elv_drain_elevator(q);
+
+		__blk_run_queue(q);
+		nr_rqs = q->rq.elvpriv;
+
+		spin_unlock_irq(q->queue_lock);
+
+		if (!nr_rqs)
+			break;
+		msleep(10);
+	}
+}
+
 /*
  * Note: If a driver supplied the queue lock, it is disconnected
  * by this function. The actual state of the lock doesn't matter
diff --git a/block/blk.h b/block/blk.h
index da247ba..2b66dc21 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -15,6 +15,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
 int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 		      struct bio *bio);
+void blk_drain_queue(struct request_queue *q);
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
 bool __blk_end_bidi_request(struct request *rq, int error,
diff --git a/block/elevator.c b/block/elevator.c
index cb332cb..74a277f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -31,7 +31,6 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/compiler.h>
-#include <linux/delay.h>
 #include <linux/blktrace_api.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>
@@ -606,43 +605,35 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
 void elv_drain_elevator(struct request_queue *q)
 {
 	static int printed;
+
+	lockdep_assert_held(q->queue_lock);
+
 	while (q->elevator->ops->elevator_dispatch_fn(q, 1))
 		;
-	if (q->nr_sorted == 0)
-		return;
-	if (printed++ < 10) {
+	if (q->nr_sorted && printed++ < 10) {
 		printk(KERN_ERR "%s: forced dispatching is broken "
 		       "(nr_sorted=%u), please report this\n",
 		       q->elevator->elevator_type->elevator_name, q->nr_sorted);
 	}
 }
 
-/*
- * Call with queue lock held, interrupts disabled
- */
 void elv_quiesce_start(struct request_queue *q)
 {
 	if (!q->elevator)
 		return;
 
+	spin_lock_irq(q->queue_lock);
 	queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
+	spin_unlock_irq(q->queue_lock);
 
-	/*
-	 * make sure we don't have any requests in flight
-	 */
-	elv_drain_elevator(q);
-	while (q->rq.elvpriv) {
-		__blk_run_queue(q);
-		spin_unlock_irq(q->queue_lock);
-		msleep(10);
-		spin_lock_irq(q->queue_lock);
-		elv_drain_elevator(q);
-	}
+	blk_drain_queue(q);
 }
 
 void elv_quiesce_end(struct request_queue *q)
 {
+	spin_lock_irq(q->queue_lock);
 	queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
+	spin_unlock_irq(q->queue_lock);
 }
 
 void __elv_add_request(struct request_queue *q, struct request *rq, int where)
@@ -972,7 +963,6 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	/*
 	 * Turn on BYPASS and drain all requests w/ elevator private data
 	 */
-	spin_lock_irq(q->queue_lock);
 	elv_quiesce_start(q);
 
 	/*
@@ -983,8 +973,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	/*
 	 * attach and start new elevator
 	 */
+	spin_lock_irq(q->queue_lock);
 	elevator_attach(q, e, data);
-
 	spin_unlock_irq(q->queue_lock);
 
 	if (old_elevator->registered) {
@@ -999,9 +989,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	 * finally exit old elevator and turn off BYPASS.
 	 */
 	elevator_exit(old_elevator);
-	spin_lock_irq(q->queue_lock);
 	elv_quiesce_end(q);
-	spin_unlock_irq(q->queue_lock);
 
 	blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
 
@@ -1015,10 +1003,7 @@ fail_register:
 	elevator_exit(e);
 	q->elevator = old_elevator;
 	elv_register_queue(q);
-
-	spin_lock_irq(q->queue_lock);
-	queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
-	spin_unlock_irq(q->queue_lock);
+	elv_quiesce_end(q);
 
 	return err;
 }
-- 
cgit v1.1


From bc16a4f933bc5ed50826b20561e4c3515061998b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:33:01 +0200
Subject: block: reorganize throtl_get_tg() and blk_throtl_bio()

blk_throtl_bio() and throtl_get_tg() have rather unusual interface.

* throtl_get_tg() returns pointer to a valid tg or ERR_PTR(-ENODEV),
  and drops queue_lock in the latter case.  Different locking context
  depending on return value is error-prone and DEAD state is scheduled
  to be protected by queue_lock anyway.  Move DEAD check inside
  queue_lock and return valid tg or NULL.

* blk_throtl_bio() indicates return status both with its return value
  and in/out param **@bio.  The former is used to indicate whether
  queue is found to be dead during throtl processing.  The latter
  whether the bio is throttled.

  There's no point in returning DEAD check result from
  blk_throtl_bio().  The queue can die after blk_throtl_bio() is
  finished but before make_request_fn() grabs queue lock.

  Make it take *@bio instead and return boolean result indicating
  whether the request is throttled or not.

This patch doesn't cause any visible functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c     |  8 ++------
 block/blk-throttle.c | 49 +++++++++++++++++--------------------------------
 block/blk.h          |  6 +++---
 3 files changed, 22 insertions(+), 41 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 149149d..6c491f2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1515,12 +1515,8 @@ generic_make_request_checks(struct bio *bio)
 		goto end_io;
 	}
 
-	if (blk_throtl_bio(q, &bio))
-		goto end_io;
-
-	/* if bio = NULL, bio has been throttled and will be submitted later. */
-	if (!bio)
-		return false;
+	if (blk_throtl_bio(q, bio))
+		return false;	/* throttled, will be resubmitted later */
 
 	trace_block_bio_queue(q, bio);
 	return true;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index ecba5fc..900a0c9 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -303,10 +303,6 @@ throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 	return tg;
 }
 
-/*
- * This function returns with queue lock unlocked in case of error, like
- * request queue is no more
- */
 static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 {
 	struct throtl_grp *tg = NULL, *__tg = NULL;
@@ -330,20 +326,16 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	spin_unlock_irq(q->queue_lock);
 
 	tg = throtl_alloc_tg(td);
-	/*
-	 * We might have slept in group allocation. Make sure queue is not
-	 * dead
-	 */
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
-		if (tg)
-			kfree(tg);
-
-		return ERR_PTR(-ENODEV);
-	}
 
 	/* Group allocated and queue is still alive. take the lock */
 	spin_lock_irq(q->queue_lock);
 
+	/* Make sure @q is still alive */
+	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+		kfree(tg);
+		return NULL;
+	}
+
 	/*
 	 * Initialize the new group. After sleeping, read the blkcg again.
 	 */
@@ -1118,17 +1110,17 @@ static struct blkio_policy_type blkio_policy_throtl = {
 	.plid = BLKIO_POLICY_THROTL,
 };
 
-int blk_throtl_bio(struct request_queue *q, struct bio **biop)
+bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
 	struct throtl_data *td = q->td;
 	struct throtl_grp *tg;
-	struct bio *bio = *biop;
 	bool rw = bio_data_dir(bio), update_disptime = true;
 	struct blkio_cgroup *blkcg;
+	bool throttled = false;
 
 	if (bio->bi_rw & REQ_THROTTLED) {
 		bio->bi_rw &= ~REQ_THROTTLED;
-		return 0;
+		goto out;
 	}
 
 	/*
@@ -1147,7 +1139,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 			blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
 					rw, rw_is_sync(bio->bi_rw));
 			rcu_read_unlock();
-			return 0;
+			goto out;
 		}
 	}
 	rcu_read_unlock();
@@ -1156,18 +1148,10 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 	 * Either group has not been allocated yet or it is not an unlimited
 	 * IO group
 	 */
-
 	spin_lock_irq(q->queue_lock);
 	tg = throtl_get_tg(td);
-
-	if (IS_ERR(tg)) {
-		if (PTR_ERR(tg)	== -ENODEV) {
-			/*
-			 * Queue is gone. No queue lock held here.
-			 */
-			return -ENODEV;
-		}
-	}
+	if (unlikely(!tg))
+		goto out_unlock;
 
 	if (tg->nr_queued[rw]) {
 		/*
@@ -1195,7 +1179,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 		 * So keep on trimming slice even if bio is not queued.
 		 */
 		throtl_trim_slice(td, tg, rw);
-		goto out;
+		goto out_unlock;
 	}
 
 queue_bio:
@@ -1207,16 +1191,17 @@ queue_bio:
 			tg->nr_queued[READ], tg->nr_queued[WRITE]);
 
 	throtl_add_bio_tg(q->td, tg, bio);
-	*biop = NULL;
+	throttled = true;
 
 	if (update_disptime) {
 		tg_update_disptime(td, tg);
 		throtl_schedule_next_dispatch(td);
 	}
 
-out:
+out_unlock:
 	spin_unlock_irq(q->queue_lock);
-	return 0;
+out:
+	return throttled;
 }
 
 int blk_throtl_init(struct request_queue *q)
diff --git a/block/blk.h b/block/blk.h
index 2b66dc21..c018dba 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -190,13 +190,13 @@ static inline int blk_do_io_stat(struct request *rq)
 }
 
 #ifdef CONFIG_BLK_DEV_THROTTLING
-extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
+extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
 extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
 #else /* CONFIG_BLK_DEV_THROTTLING */
-static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
+static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
-	return 0;
+	return false;
 }
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
-- 
cgit v1.1


From da8303c63b8de73619884382d6e573d44aae0810 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:33:05 +0200
Subject: block: make get_request[_wait]() fail if queue is dead

Currently get_request[_wait]() allocates request whether queue is dead
or not.  This patch makes get_request[_wait]() return NULL if @q is
dead.  blk_queue_bio() is updated to fail the submitted bio if request
allocation fails.  While at it, add docbook comments for
get_request[_wait]().

Note that the current code has rather unclear (there are spurious DEAD
tests scattered around) assumption that the owner of a queue
guarantees that no request travels block layer if the queue is dead
and this patch in itself doesn't change much; however, this will allow
fixing the broken assumption in the next patch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 54 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 16 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 6c491f2..3508751 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -709,10 +709,19 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
 	return true;
 }
 
-/*
- * Get a free request, queue_lock must be held.
- * Returns NULL on failure, with queue_lock held.
- * Returns !NULL on success, with queue_lock *not held*.
+/**
+ * get_request - get a free request
+ * @q: request_queue to allocate request from
+ * @rw_flags: RW and SYNC flags
+ * @bio: bio to allocate request for (can be %NULL)
+ * @gfp_mask: allocation mask
+ *
+ * Get a free request from @q.  This function may fail under memory
+ * pressure or if @q is dead.
+ *
+ * Must be callled with @q->queue_lock held and,
+ * Returns %NULL on failure, with @q->queue_lock held.
+ * Returns !%NULL on success, with @q->queue_lock *not held*.
  */
 static struct request *get_request(struct request_queue *q, int rw_flags,
 				   struct bio *bio, gfp_t gfp_mask)
@@ -723,6 +732,9 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	const bool is_sync = rw_is_sync(rw_flags) != 0;
 	int may_queue;
 
+	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+		return NULL;
+
 	may_queue = elv_may_queue(q, rw_flags);
 	if (may_queue == ELV_MQUEUE_NO)
 		goto rq_starved;
@@ -815,11 +827,18 @@ out:
 	return rq;
 }
 
-/*
- * No available requests for this queue, wait for some requests to become
- * available.
+/**
+ * get_request_wait - get a free request with retry
+ * @q: request_queue to allocate request from
+ * @rw_flags: RW and SYNC flags
+ * @bio: bio to allocate request for (can be %NULL)
  *
- * Called with q->queue_lock held, and returns with it unlocked.
+ * Get a free request from @q.  This function keeps retrying under memory
+ * pressure and fails iff @q is dead.
+ *
+ * Must be callled with @q->queue_lock held and,
+ * Returns %NULL on failure, with @q->queue_lock held.
+ * Returns !%NULL on success, with @q->queue_lock *not held*.
  */
 static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 					struct bio *bio)
@@ -833,6 +852,9 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 		struct io_context *ioc;
 		struct request_list *rl = &q->rq;
 
+		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+			return NULL;
+
 		prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
 				TASK_UNINTERRUPTIBLE);
 
@@ -863,19 +885,15 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 {
 	struct request *rq;
 
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
-		return NULL;
-
 	BUG_ON(rw != READ && rw != WRITE);
 
 	spin_lock_irq(q->queue_lock);
-	if (gfp_mask & __GFP_WAIT) {
+	if (gfp_mask & __GFP_WAIT)
 		rq = get_request_wait(q, rw, NULL);
-	} else {
+	else
 		rq = get_request(q, rw, NULL, gfp_mask);
-		if (!rq)
-			spin_unlock_irq(q->queue_lock);
-	}
+	if (!rq)
+		spin_unlock_irq(q->queue_lock);
 	/* q->queue_lock is unlocked at this point */
 
 	return rq;
@@ -1299,6 +1317,10 @@ get_rq:
 	 * Returns with the queue unlocked.
 	 */
 	req = get_request_wait(q, rw_flags, bio);
+	if (unlikely(!req)) {
+		bio_endio(bio, -ENODEV);	/* @q is dead */
+		goto out_unlock;
+	}
 
 	/*
 	 * After dropping the lock and possibly sleeping here, our request
-- 
cgit v1.1


From bd87b5898a72b1aef6acf3705c61c9f6372adf0c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:33:08 +0200
Subject: block: drop @tsk from attempt_plug_merge() and explain sync rules

attempt_plug_merge() accesses elevator without holding queue_lock and
may call into ->elevator_bio_merge_fn().  The elvator is guaranteed to
be valid because it's accessed iff the plugged list has requests and
elevator is never exited with live requests, so as long as the
elevator method can deal with unlocked access, this is safe.

Explain the sync rules around attempt_plug_merge() and drop the
unnecessary @tsk parameter.

This patch doesn't introduce any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c         | 28 +++++++++++++++++++++-------
 include/linux/elevator.h |  6 ++++++
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3508751..034cbb2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1203,18 +1203,32 @@ static bool bio_attempt_front_merge(struct request_queue *q,
 	return true;
 }
 
-/*
- * Attempts to merge with the plugged list in the current process. Returns
- * true if merge was successful, otherwise false.
+/**
+ * attempt_plug_merge - try to merge with %current's plugged list
+ * @q: request_queue new bio is being queued at
+ * @bio: new bio being queued
+ * @request_count: out parameter for number of traversed plugged requests
+ *
+ * Determine whether @bio being queued on @q can be merged with a request
+ * on %current's plugged list.  Returns %true if merge was successful,
+ * otherwise %false.
+ *
+ * This function is called without @q->queue_lock; however, elevator is
+ * accessed iff there already are requests on the plugged list which in
+ * turn guarantees validity of the elevator.
+ *
+ * Note that, on successful merge, elevator operation
+ * elevator_bio_merged_fn() will be called without queue lock.  Elevator
+ * must be ready for this.
  */
-static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
-			       struct bio *bio, unsigned int *request_count)
+static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
+			       unsigned int *request_count)
 {
 	struct blk_plug *plug;
 	struct request *rq;
 	bool ret = false;
 
-	plug = tsk->plug;
+	plug = current->plug;
 	if (!plug)
 		goto out;
 	*request_count = 0;
@@ -1282,7 +1296,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 * Check if we can merge with the plugged list before grabbing
 	 * any locks.
 	 */
-	if (attempt_plug_merge(current, q, bio, &request_count))
+	if (attempt_plug_merge(q, bio, &request_count))
 		return;
 
 	spin_lock_irq(q->queue_lock);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index d800d51..1d0f7a2 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -38,6 +38,12 @@ struct elevator_ops
 	elevator_merged_fn *elevator_merged_fn;
 	elevator_merge_req_fn *elevator_merge_req_fn;
 	elevator_allow_merge_fn *elevator_allow_merge_fn;
+
+	/*
+	 * Used for both plugged list and elevator merging and in the
+	 * former case called without queue_lock.  Read comment on top of
+	 * attempt_plug_merge() for details.
+	 */
 	elevator_bio_merged_fn *elevator_bio_merged_fn;
 
 	elevator_dispatch_fn *elevator_dispatch_fn;
-- 
cgit v1.1


From c9a929dde3913780b5c416f4bb9d9ed804f509ce Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:42:16 +0200
Subject: block: fix request_queue lifetime handling by making
 blk_queue_cleanup() properly shutdown

request_queue is refcounted but actually depdends on lifetime
management from the queue owner - on blk_cleanup_queue(), block layer
expects that there's no request passing through request_queue and no
new one will.

This is fundamentally broken.  The queue owner (e.g. SCSI layer)
doesn't have a way to know whether there are other active users before
calling blk_cleanup_queue() and other users (e.g. bsg) don't have any
guarantee that the queue is and would stay valid while it's holding a
reference.

With delay added in blk_queue_bio() before queue_lock is grabbed, the
following oops can be easily triggered when a device is removed with
in-flight IOs.

 sd 0:0:1:0: [sdb] Stopping disk
 ata1.01: disabled
 general protection fault: 0000 [#1] PREEMPT SMP
 CPU 2
 Modules linked in:

 Pid: 648, comm: test_rawio Not tainted 3.1.0-rc3-work+ #56 Bochs Bochs
 RIP: 0010:[<ffffffff8137d651>]  [<ffffffff8137d651>] elv_rqhash_find+0x61/0x100
 ...
 Process test_rawio (pid: 648, threadinfo ffff880019efa000, task ffff880019ef8a80)
 ...
 Call Trace:
  [<ffffffff8137d774>] elv_merge+0x84/0xe0
  [<ffffffff81385b54>] blk_queue_bio+0xf4/0x400
  [<ffffffff813838ea>] generic_make_request+0xca/0x100
  [<ffffffff81383994>] submit_bio+0x74/0x100
  [<ffffffff811c53ec>] dio_bio_submit+0xbc/0xc0
  [<ffffffff811c610e>] __blockdev_direct_IO+0x92e/0xb40
  [<ffffffff811c39f7>] blkdev_direct_IO+0x57/0x60
  [<ffffffff8113b1c5>] generic_file_aio_read+0x6d5/0x760
  [<ffffffff8118c1ca>] do_sync_read+0xda/0x120
  [<ffffffff8118ce55>] vfs_read+0xc5/0x180
  [<ffffffff8118cfaa>] sys_pread64+0x9a/0xb0
  [<ffffffff81afaf6b>] system_call_fastpath+0x16/0x1b

This happens because blk_queue_cleanup() destroys the queue and
elevator whether IOs are in progress or not and DEAD tests are
sprinkled in the request processing path without proper
synchronization.

Similar problem exists for blk-throtl.  On queue cleanup, blk-throtl
is shutdown whether it has requests in it or not.  Depending on
timing, it either oopses or throttled bios are lost putting tasks
which are waiting for bio completion into eternal D state.

The way it should work is having the usual clear distinction between
shutdown and release.  Shutdown drains all currently pending requests,
marks the queue dead, and performs partial teardown of the now
unnecessary part of the queue.  Even after shutdown is complete,
reference holders are still allowed to issue requests to the queue
although they will be immmediately failed.  The rest of teardown
happens on release.

This patch makes the following changes to make blk_queue_cleanup()
behave as proper shutdown.

* QUEUE_FLAG_DEAD is now set while holding both q->exit_mutex and
  queue_lock.

* Unsynchronized DEAD check in generic_make_request_checks() removed.
  This couldn't make any meaningful difference as the queue could die
  after the check.

* blk_drain_queue() updated such that it can drain all requests and is
  now called during cleanup.

* blk_throtl updated such that it checks DEAD on grabbing queue_lock,
  drains all throttled bios during cleanup and free td when queue is
  released.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c     | 57 +++++++++++++++++++++++++++++++++-------------------
 block/blk-sysfs.c    |  1 +
 block/blk-throttle.c | 50 +++++++++++++++++++++++++++++++++++++++------
 block/blk.h          |  6 +++++-
 block/elevator.c     |  2 +-
 5 files changed, 87 insertions(+), 29 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 034cbb2..7e15235 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -349,11 +349,13 @@ EXPORT_SYMBOL(blk_put_queue);
 /**
  * blk_drain_queue - drain requests from request_queue
  * @q: queue to drain
+ * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
  *
- * Drain ELV_PRIV requests from @q.  The caller is responsible for ensuring
- * that no new requests which need to be drained are queued.
+ * Drain requests from @q.  If @drain_all is set, all requests are drained.
+ * If not, only ELVPRIV requests are drained.  The caller is responsible
+ * for ensuring that no new requests which need to be drained are queued.
  */
-void blk_drain_queue(struct request_queue *q)
+void blk_drain_queue(struct request_queue *q, bool drain_all)
 {
 	while (true) {
 		int nr_rqs;
@@ -361,9 +363,15 @@ void blk_drain_queue(struct request_queue *q)
 		spin_lock_irq(q->queue_lock);
 
 		elv_drain_elevator(q);
+		if (drain_all)
+			blk_throtl_drain(q);
 
 		__blk_run_queue(q);
-		nr_rqs = q->rq.elvpriv;
+
+		if (drain_all)
+			nr_rqs = q->rq.count[0] + q->rq.count[1];
+		else
+			nr_rqs = q->rq.elvpriv;
 
 		spin_unlock_irq(q->queue_lock);
 
@@ -373,30 +381,40 @@ void blk_drain_queue(struct request_queue *q)
 	}
 }
 
-/*
- * Note: If a driver supplied the queue lock, it is disconnected
- * by this function. The actual state of the lock doesn't matter
- * here as the request_queue isn't accessible after this point
- * (QUEUE_FLAG_DEAD is set) and no other requests will be queued.
+/**
+ * blk_cleanup_queue - shutdown a request queue
+ * @q: request queue to shutdown
+ *
+ * Mark @q DEAD, drain all pending requests, destroy and put it.  All
+ * future requests will be failed immediately with -ENODEV.
  */
 void blk_cleanup_queue(struct request_queue *q)
 {
-	/*
-	 * We know we have process context here, so we can be a little
-	 * cautious and ensure that pending block actions on this device
-	 * are done before moving on. Going into this function, we should
-	 * not have processes doing IO to this device.
-	 */
-	blk_sync_queue(q);
+	spinlock_t *lock = q->queue_lock;
 
-	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
+	/* mark @q DEAD, no new request or merges will be allowed afterwards */
 	mutex_lock(&q->sysfs_lock);
 	queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
-	mutex_unlock(&q->sysfs_lock);
+
+	spin_lock_irq(lock);
+	queue_flag_set(QUEUE_FLAG_NOMERGES, q);
+	queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
+	queue_flag_set(QUEUE_FLAG_DEAD, q);
 
 	if (q->queue_lock != &q->__queue_lock)
 		q->queue_lock = &q->__queue_lock;
 
+	spin_unlock_irq(lock);
+	mutex_unlock(&q->sysfs_lock);
+
+	/* drain all requests queued before DEAD marking */
+	blk_drain_queue(q, true);
+
+	/* @q won't process any more request, flush async actions */
+	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
+	blk_sync_queue(q);
+
+	/* @q is and will stay empty, shutdown and put */
 	blk_put_queue(q);
 }
 EXPORT_SYMBOL(blk_cleanup_queue);
@@ -1509,9 +1527,6 @@ generic_make_request_checks(struct bio *bio)
 		goto end_io;
 	}
 
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
-		goto end_io;
-
 	part = bio->bi_bdev->bd_part;
 	if (should_fail_request(part, bio->bi_size) ||
 	    should_fail_request(&part_to_disk(part)->part0,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index a8eff5f..e7f9f65 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -490,6 +490,7 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
+	blk_throtl_release(q);
 	blk_trace_shutdown(q);
 
 	bdi_destroy(&q->backing_dev_info);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 900a0c9..8edb949 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -309,6 +309,10 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	struct blkio_cgroup *blkcg;
 	struct request_queue *q = td->queue;
 
+	/* no throttling for dead queue */
+	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+		return NULL;
+
 	rcu_read_lock();
 	blkcg = task_blkio_cgroup(current);
 	tg = throtl_find_tg(td, blkcg);
@@ -1001,11 +1005,6 @@ static void throtl_release_tgs(struct throtl_data *td)
 	}
 }
 
-static void throtl_td_free(struct throtl_data *td)
-{
-	kfree(td);
-}
-
 /*
  * Blk cgroup controller notification saying that blkio_group object is being
  * delinked as associated cgroup object is going away. That also means that
@@ -1204,6 +1203,41 @@ out:
 	return throttled;
 }
 
+/**
+ * blk_throtl_drain - drain throttled bios
+ * @q: request_queue to drain throttled bios for
+ *
+ * Dispatch all currently throttled bios on @q through ->make_request_fn().
+ */
+void blk_throtl_drain(struct request_queue *q)
+	__releases(q->queue_lock) __acquires(q->queue_lock)
+{
+	struct throtl_data *td = q->td;
+	struct throtl_rb_root *st = &td->tg_service_tree;
+	struct throtl_grp *tg;
+	struct bio_list bl;
+	struct bio *bio;
+
+	lockdep_is_held(q->queue_lock);
+
+	bio_list_init(&bl);
+
+	while ((tg = throtl_rb_first(st))) {
+		throtl_dequeue_tg(td, tg);
+
+		while ((bio = bio_list_peek(&tg->bio_lists[READ])))
+			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
+		while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
+			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
+	}
+	spin_unlock_irq(q->queue_lock);
+
+	while ((bio = bio_list_pop(&bl)))
+		generic_make_request(bio);
+
+	spin_lock_irq(q->queue_lock);
+}
+
 int blk_throtl_init(struct request_queue *q)
 {
 	struct throtl_data *td;
@@ -1276,7 +1310,11 @@ void blk_throtl_exit(struct request_queue *q)
 	 * it.
 	 */
 	throtl_shutdown_wq(q);
-	throtl_td_free(td);
+}
+
+void blk_throtl_release(struct request_queue *q)
+{
+	kfree(q->td);
 }
 
 static int __init throtl_init(void)
diff --git a/block/blk.h b/block/blk.h
index c018dba..3f6551b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -15,7 +15,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
 int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 		      struct bio *bio);
-void blk_drain_queue(struct request_queue *q);
+void blk_drain_queue(struct request_queue *q, bool drain_all);
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
 bool __blk_end_bidi_request(struct request *rq, int error,
@@ -191,15 +191,19 @@ static inline int blk_do_io_stat(struct request *rq)
 
 #ifdef CONFIG_BLK_DEV_THROTTLING
 extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
+extern void blk_throtl_drain(struct request_queue *q);
 extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
+extern void blk_throtl_release(struct request_queue *q);
 #else /* CONFIG_BLK_DEV_THROTTLING */
 static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
 	return false;
 }
+static inline void blk_throtl_drain(struct request_queue *q) { }
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
+static inline void blk_throtl_release(struct request_queue *q) { }
 #endif /* CONFIG_BLK_DEV_THROTTLING */
 
 #endif /* BLK_INTERNAL_H */
diff --git a/block/elevator.c b/block/elevator.c
index 74a277f..66343d6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -626,7 +626,7 @@ void elv_quiesce_start(struct request_queue *q)
 	queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
 	spin_unlock_irq(q->queue_lock);
 
-	blk_drain_queue(q);
+	blk_drain_queue(q, false);
 }
 
 void elv_quiesce_end(struct request_queue *q)
-- 
cgit v1.1


From 3bcfeaf93f44112053e1c36aa681d9efc1185ddc Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Thu, 20 Oct 2011 21:24:30 +0200
Subject: block: initialize the bounce pool if high memory may be added later

init_emergency_pool() does not create the page pool for bouncing block
requests if the current count of high pages is zero.  If high memory
may be added later (either via memory hotplug or a balloon driver in a
virtualized system) then a oops occurs if a request with a high page
need bouncing because the pool does not exist.

So, always create the pool if memory hotplug is enabled and change the
test so it's valid even if all high pages are currently in the balloon
(the balloon drivers adjust totalhigh_pages but not max_pfn).

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 mm/bounce.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/mm/bounce.c b/mm/bounce.c
index 1481de6..434fb4f 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
+#include <linux/bootmem.h>
 #include <asm/tlbflush.h>
 
 #include <trace/events/block.h>
@@ -26,12 +27,10 @@ static mempool_t *page_pool, *isa_page_pool;
 #ifdef CONFIG_HIGHMEM
 static __init int init_emergency_pool(void)
 {
-	struct sysinfo i;
-	si_meminfo(&i);
-	si_swapinfo(&i);
-
-	if (!i.totalhigh)
+#ifndef CONFIG_MEMORY_HOTPLUG
+	if (max_pfn <= max_low_pfn)
 		return 0;
+#endif
 
 	page_pool = mempool_create_page_pool(POOL_SIZE, 0);
 	BUG_ON(!page_pool);
-- 
cgit v1.1


From e890413af4c2dfebf5432ef30cc70cb11dad3213 Mon Sep 17 00:00:00 2001
From: Jie Liu <jeff.liu@oracle.com>
Date: Mon, 24 Oct 2011 16:08:38 +0200
Subject: block: fix a typo in the blk-cgroup.h file

byptes -> bytes.

Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index a71d290..6f3ace7 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -188,7 +188,7 @@ struct blkio_policy_node {
 	union {
 		unsigned int weight;
 		/*
-		 * Rate read/write in terms of byptes per second
+		 * Rate read/write in terms of bytes per second
 		 * Whether this rate represents read or write is determined
 		 * by file type "fileid".
 		 */
-- 
cgit v1.1


From 9562ad9ab36df7ccef920d119f3b5100025db95f Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Mon, 24 Oct 2011 16:11:30 +0200
Subject: block: Remove the control of complete cpu from bio.

bio originally has the functionality to set the complete cpu, but
it is broken.

Chirstoph said that "This code is unused, and from the all the
discussions lately pretty obviously broken.  The only thing keeping
it serves is creating more confusion and possibly more bugs."

And Jens replied with "We can kill bio_set_completion_cpu(). I'm fine
with leaving cpu control to the request based drivers, they are the
only ones that can toggle the setting anyway".

So this patch tries to remove all the work of controling complete cpu
from a bio.

Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c          |  4 +---
 drivers/md/raid1.c        |  1 -
 fs/bio.c                  |  1 -
 include/linux/bio.h       |  8 --------
 include/linux/blk_types.h | 11 ++++-------
 5 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 7e15235..da69793 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1276,7 +1276,6 @@ out:
 
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
-	req->cpu = bio->bi_comp_cpu;
 	req->cmd_type = REQ_TYPE_FS;
 
 	req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK;
@@ -1362,8 +1361,7 @@ get_rq:
 	 */
 	init_request_from_bio(req, bio);
 
-	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
-	    bio_flagged(bio, BIO_CPU_AFFINE))
+	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
 		req->cpu = raw_smp_processor_id();
 
 	plug = current->plug;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d4ddfa6..2948a52 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2172,7 +2172,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		bio->bi_next = NULL;
 		bio->bi_flags &= ~(BIO_POOL_MASK-1);
 		bio->bi_flags |= 1 << BIO_UPTODATE;
-		bio->bi_comp_cpu = -1;
 		bio->bi_rw = READ;
 		bio->bi_vcnt = 0;
 		bio->bi_idx = 0;
diff --git a/fs/bio.c b/fs/bio.c
index 9bfade8..41c93c7 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -255,7 +255,6 @@ void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
-	bio->bi_comp_cpu = -1;
 	atomic_set(&bio->bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ce33e68..a3c071c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -269,14 +269,6 @@ extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
 
 /*
- * Allow queuer to specify a completion CPU for this bio
- */
-static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu)
-{
-	bio->bi_comp_cpu = cpu;
-}
-
-/*
  * bio_set is used to allow other portions of the IO system to
  * allocate their own private memory pools for bio and iovec structures.
  * These memory pools in turn all allocate from the bio_slab
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 71fc53b..4053cbd 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -59,8 +59,6 @@ struct bio {
 
 	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */
 
-	unsigned int		bi_comp_cpu;	/* completion CPU */
-
 	atomic_t		bi_cnt;		/* pin count */
 
 	struct bio_vec		*bi_io_vec;	/* the actual vec list */
@@ -93,11 +91,10 @@ struct bio {
 #define BIO_BOUNCED	5	/* bio is a bounce bio */
 #define BIO_USER_MAPPED 6	/* contains user pages */
 #define BIO_EOPNOTSUPP	7	/* not supported */
-#define BIO_CPU_AFFINE	8	/* complete bio on same CPU as submitted */
-#define BIO_NULL_MAPPED 9	/* contains invalid user pages */
-#define BIO_FS_INTEGRITY 10	/* fs owns integrity data, not block layer */
-#define BIO_QUIET	11	/* Make BIO Quiet */
-#define BIO_MAPPED_INTEGRITY 12/* integrity metadata has been remapped */
+#define BIO_NULL_MAPPED 8	/* contains invalid user pages */
+#define BIO_FS_INTEGRITY 9	/* fs owns integrity data, not block layer */
+#define BIO_QUIET	10	/* Make BIO Quiet */
+#define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */
 #define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
 
 /*
-- 
cgit v1.1


From 834f9f61a525d2f6d3d0c93894e26326c8d3ceed Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Mon, 17 Oct 2011 12:57:22 +0200
Subject: blk-flush: fix invalid BUG_ON in blk_insert_flush

A user reported a regression due to commit
4853abaae7e4a2af938115ce9071ef8684fb7af4 (block: fix flush
machinery for stacking drivers with differring flush flags).
Part of the problem is that blk_insert_flush required a
single bio be attached to the request.  In reality, having
no attached bio is also a valid case, as can be observed with
an empty flush.

[1] http://www.redhat.com/archives/dm-devel/2011-September/msg00154.html

Reported-by: Christophe Saout <christophe@saout.de>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com
Acked-by: Tejun Heo <tj@kernel.org>

Stable note: 3.1
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-flush.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 491eb30..89ae3b9 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -320,7 +320,7 @@ void blk_insert_flush(struct request *rq)
 		return;
 	}
 
-	BUG_ON(!rq->bio || rq->bio != rq->biotail);
+	BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
 
 	/*
 	 * If there's data but flush is not necessary, the request can be
-- 
cgit v1.1


From e67b77c791ca2778198c9e7088f3266ed2da7a55 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Mon, 17 Oct 2011 12:57:23 +0200
Subject: blk-flush: move the queue kick into

A dm-multipath user reported[1] a problem when trying to boot
a kernel with commit 4853abaae7e4a2af938115ce9071ef8684fb7af4
(block: fix flush machinery for stacking drivers with differring
flush flags) applied.  It turns out that an empty flush request
can be sent into blk_insert_flush.  When the BUG_ON was fixed
to allow for this, I/O on the underlying device would stall.  The
reason is that blk_insert_cloned_request does not kick the queue.
In the aforementioned commit, I had added a special case to
kick the queue if data was sent down but the queue flags did
not require a flush.  A better solution is to push the queue
kick up into blk_insert_cloned_request.

This patch, along with a follow-on which fixes the BUG_ON, fixes
the issue reported.

[1] http://www.redhat.com/archives/dm-devel/2011-September/msg00154.html

Reported-by: Christophe Saout <christophe@saout.de>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>

Stable note: 3.1
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c  | 2 ++
 block/blk-flush.c | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d34433a..795154e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1725,6 +1725,8 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 		where = ELEVATOR_INSERT_FLUSH;
 
 	add_acct_request(q, rq, where);
+	if (where == ELEVATOR_INSERT_FLUSH)
+		__blk_run_queue(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	return 0;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 89ae3b9..720ad60 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -330,7 +330,6 @@ void blk_insert_flush(struct request *rq)
 	if ((policy & REQ_FSEQ_DATA) &&
 	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
 		list_add_tail(&rq->queuelist, &q->queue_head);
-		blk_run_queue_async(q);
 		return;
 	}
 
-- 
cgit v1.1


From f992ae801a7dec34a4ed99a6598bbbbfb82af4fb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 17 Oct 2011 13:42:43 +0200
Subject: block: make gendisk hold a reference to its queue

The following command sequence triggers an oops.

# mount /dev/sdb1 /mnt
# echo 1 > /sys/class/scsi_device/0\:0\:1\:0/device/delete
# umount /mnt

 general protection fault: 0000 [#1] PREEMPT SMP
 CPU 2
 Modules linked in:

 Pid: 791, comm: umount Not tainted 3.1.0-rc3-work+ #8 Bochs Bochs
 RIP: 0010:[<ffffffff810d0879>]  [<ffffffff810d0879>] __lock_acquire+0x389/0x1d60
...
 Call Trace:
  [<ffffffff810d2845>] lock_acquire+0x95/0x140
  [<ffffffff81aed87b>] _raw_spin_lock+0x3b/0x50
  [<ffffffff811573bc>] bdi_lock_two+0x5c/0x70
  [<ffffffff811c2f6c>] bdev_inode_switch_bdi+0x4c/0xf0
  [<ffffffff811c3fcb>] __blkdev_put+0x11b/0x1d0
  [<ffffffff811c4010>] __blkdev_put+0x160/0x1d0
  [<ffffffff811c40df>] blkdev_put+0x5f/0x190
  [<ffffffff8118f18d>] kill_block_super+0x4d/0x80
  [<ffffffff8118f4a5>] deactivate_locked_super+0x45/0x70
  [<ffffffff8119003a>] deactivate_super+0x4a/0x70
  [<ffffffff811ac4ad>] mntput_no_expire+0xed/0x130
  [<ffffffff811acf2e>] sys_umount+0x7e/0x3a0
  [<ffffffff81aeeeab>] system_call_fastpath+0x16/0x1b

This is because bdev holds on to disk but disk doesn't pin the
associated queue.  If a SCSI device is removed while the device is
still open, the sdev puts the base reference to the queue on release.
When the bdev is finally released, the associated queue is already
gone along with the bdi and bdev_inode_switch_bdi() ends up
dereferencing already freed bdi.

Even if it were not for this bug, disk not holding onto the associated
queue is very unusual and error-prone.

Fix it by making add_disk() take an extra reference to its queue and
put it on disk_release() and ensuring that disk and its fops owner are
put in that order after all accesses to the disk and queue are
complete.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c  |  8 ++++++++
 fs/block_dev.c | 13 ++++++++-----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index e2f6790..d261b73 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -611,6 +611,12 @@ void add_disk(struct gendisk *disk)
 	register_disk(disk);
 	blk_register_queue(disk);
 
+	/*
+	 * Take an extra ref on queue which will be put on disk_release()
+	 * so that it sticks around as long as @disk is there.
+	 */
+	WARN_ON_ONCE(blk_get_queue(disk->queue));
+
 	retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
 				   "bdi");
 	WARN_ON(retval);
@@ -1095,6 +1101,8 @@ static void disk_release(struct device *dev)
 	disk_replace_part_tbl(disk, NULL);
 	free_part_stats(&disk->part0);
 	free_part_info(&disk->part0);
+	if (disk->queue)
+		blk_put_queue(disk->queue);
 	kfree(disk);
 }
 struct class block_class = {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 95f786e..1c44b8d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	struct gendisk *disk;
+	struct module *owner;
 	int ret;
 	int partno;
 	int perm = 0;
@@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk = get_gendisk(bdev->bd_dev, &partno);
 	if (!disk)
 		goto out;
+	owner = disk->fops->owner;
 
 	disk_block_events(disk);
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
@@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 					bdev->bd_disk = NULL;
 					mutex_unlock(&bdev->bd_mutex);
 					disk_unblock_events(disk);
-					module_put(disk->fops->owner);
 					put_disk(disk);
+					module_put(owner);
 					goto restart;
 				}
 			}
@@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_unlock_bdev;
 		}
 		/* only one opener holds refs to the module and disk */
-		module_put(disk->fops->owner);
 		put_disk(disk);
+		module_put(owner);
 	}
 	bdev->bd_openers++;
 	if (for_part)
@@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 	disk_unblock_events(disk);
-	module_put(disk->fops->owner);
 	put_disk(disk);
+	module_put(owner);
  out:
 	bdput(bdev);
 
@@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 	if (!bdev->bd_openers) {
 		struct module *owner = disk->fops->owner;
 
-		put_disk(disk);
-		module_put(owner);
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
 		if (bdev != bdev->bd_contains)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
+
+		put_disk(disk);
+		module_put(owner);
 	}
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
-- 
cgit v1.1


From 5e08159197b5b98a6648a172008de23f420e6c11 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Tue, 25 Oct 2011 10:20:05 +0200
Subject: block: warn if tag is greater than real_max_depth.

In case tag depth is reduced, it is max_depth not real_max_depth.
So we should allow a request with tag >= max_depth, but for a
tag >= real_max_depth, there really should be some problem.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-tag.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/block/blk-tag.c b/block/blk-tag.c
index ece65fc..e74d6d1 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -286,12 +286,14 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
 
 	BUG_ON(tag == -1);
 
-	if (unlikely(tag >= bqt->real_max_depth))
+	if (unlikely(tag >= bqt->max_depth)) {
 		/*
 		 * This can happen after tag depth has been reduced.
-		 * FIXME: how about a warning or info message here?
+		 * But tag shouldn't be larger than real_max_depth.
 		 */
+		WARN_ON(tag >= bqt->real_max_depth);
 		return;
+	}
 
 	list_del_init(&rq->queuelist);
 	rq->cmd_flags &= ~REQ_QUEUED;
-- 
cgit v1.1


From e060f00beee23568fe2c4faf1e88ff22edefd7b2 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Tue, 25 Oct 2011 15:48:12 +0200
Subject: blk-throttle: Free up policy node associated with deleted rule

If a rule is being deleted, free up associated policy node. Otherwise
that memory is leaked.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d61ec56..6155367 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1075,6 +1075,7 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
 
 	if (blkio_delete_rule_command(newpn)) {
 		blkio_policy_delete_node(pn);
+		kfree(pn);
 		spin_unlock_irq(&blkcg->lock);
 		goto update_io_group;
 	}
-- 
cgit v1.1


From a38eb630fa224d6fba8c14a4063174bc5e0f63bb Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Tue, 25 Oct 2011 15:48:12 +0200
Subject: blk-throttle: Take blkcg->lock while traversing blkcg->policy_list

blkcg->policy_list is protected by blkcg->lock. Its not rcu protected
list. So even for readers, they need to take blkcg->lock. There are
few functions which were reading the list without taking lock. Fix it.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 54 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 6155367..8f630ce 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -869,60 +869,86 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 			      dev_t dev)
 {
 	struct blkio_policy_node *pn;
+	unsigned long flags;
+	unsigned int weight;
+
+	spin_lock_irqsave(&blkcg->lock, flags);
 
 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
 				BLKIO_PROP_weight_device);
 	if (pn)
-		return pn->val.weight;
+		weight = pn->val.weight;
 	else
-		return blkcg->weight;
+		weight = blkcg->weight;
+
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+
+	return weight;
 }
 EXPORT_SYMBOL_GPL(blkcg_get_weight);
 
 uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
 {
 	struct blkio_policy_node *pn;
+	unsigned long flags;
+	uint64_t bps = -1;
 
+	spin_lock_irqsave(&blkcg->lock, flags);
 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
 				BLKIO_THROTL_read_bps_device);
 	if (pn)
-		return pn->val.bps;
-	else
-		return -1;
+		bps = pn->val.bps;
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+
+	return bps;
 }
 
 uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
 {
 	struct blkio_policy_node *pn;
+	unsigned long flags;
+	uint64_t bps = -1;
+
+	spin_lock_irqsave(&blkcg->lock, flags);
 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
 				BLKIO_THROTL_write_bps_device);
 	if (pn)
-		return pn->val.bps;
-	else
-		return -1;
+		bps = pn->val.bps;
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+
+	return bps;
 }
 
 unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
 {
 	struct blkio_policy_node *pn;
+	unsigned long flags;
+	unsigned int iops = -1;
 
+	spin_lock_irqsave(&blkcg->lock, flags);
 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
 				BLKIO_THROTL_read_iops_device);
 	if (pn)
-		return pn->val.iops;
-	else
-		return -1;
+		iops = pn->val.iops;
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+
+	return iops;
 }
 
 unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
 {
 	struct blkio_policy_node *pn;
+	unsigned long flags;
+	unsigned int iops = -1;
+
+	spin_lock_irqsave(&blkcg->lock, flags);
 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
 				BLKIO_THROTL_write_iops_device);
 	if (pn)
-		return pn->val.iops;
-	else
-		return -1;
+		iops = pn->val.iops;
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+
+	return iops;
 }
 
 /* Checks whether user asked for deleting a policy rule */
-- 
cgit v1.1


From 334c2b0b8b2ab186fa198413386cba41fffcb4f2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 25 Oct 2011 15:51:48 +0200
Subject: blk-throttle: use queue_is_locked() instead of lockdep_is_held()

We can't use the latter if !CONFIG_LOCKDEP.

Reported-by: Sedat Dilek <sedat.dilek@googlemail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-throttle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8edb949..4553245 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1218,7 +1218,7 @@ void blk_throtl_drain(struct request_queue *q)
 	struct bio_list bl;
 	struct bio *bio;
 
-	lockdep_is_held(q->queue_lock);
+	WARN_ON_ONCE(!queue_is_locked(q));
 
 	bio_list_init(&bl);
 
-- 
cgit v1.1


From 6dd9ad7df2019b1e33a372a501907db293ebcd0d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 3 Nov 2011 18:52:11 +0100
Subject: block: don't call blk_drain_queue() if elevator is not up

blk_cleanup_queue() may be called before elevator is set up on a
queue which triggers the following oops.

 BUG: unable to handle kernel NULL pointer dereference at           (null)
 IP: [<ffffffff8125a69c>] elv_drain_elevator+0x1c/0x70
 ...
 Pid: 830, comm: kworker/0:2 Not tainted 3.1.0-next-20111025_64+ #1590
 Bochs Bochs
 RIP: 0010:[<ffffffff8125a69c>]  [<ffffffff8125a69c>] elv_drain_elevator+0x1c/0x70
 ...
 Call Trace:
  [<ffffffff8125da92>] blk_drain_queue+0x42/0x70
  [<ffffffff8125db90>] blk_cleanup_queue+0xd0/0x1c0
  [<ffffffff81469640>] md_free+0x50/0x70
  [<ffffffff8126f43b>] kobject_release+0x8b/0x1d0
  [<ffffffff81270d56>] kref_put+0x36/0xa0
  [<ffffffff8126f2b7>] kobject_put+0x27/0x60
  [<ffffffff814693af>] mddev_delayed_delete+0x2f/0x40
  [<ffffffff81083450>] process_one_work+0x100/0x3b0
  [<ffffffff8108527f>] worker_thread+0x15f/0x3a0
  [<ffffffff81089937>] kthread+0x87/0x90
  [<ffffffff81621834>] kernel_thread_helper+0x4/0x10

Fix it by making blk_cleanup_queue() check whether q->elevator is set
up before invoking blk_drain_queue.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index f658711..f43c8a5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -407,8 +407,13 @@ void blk_cleanup_queue(struct request_queue *q)
 	spin_unlock_irq(lock);
 	mutex_unlock(&q->sysfs_lock);
 
-	/* drain all requests queued before DEAD marking */
-	blk_drain_queue(q, true);
+	/*
+	 * Drain all requests queued before DEAD marking.  The caller might
+	 * be trying to tear down @q before its elevator is initialized, in
+	 * which case we don't want to call into draining.
+	 */
+	if (q->elevator)
+		blk_drain_queue(q, true);
 
 	/* @q won't process any more request, flush async actions */
 	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
-- 
cgit v1.1