From b5dd2f6047ca108001328aac0e8588edd15f1778 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Wed, 31 Dec 2014 13:22:57 +0000
Subject: block: loop: improve performance via blk-mq

The conversion is a bit straightforward, and use work queue to
dispatch requests of loop block, and one big change is that requests
is submitted to backend file/device concurrently with work queue,
so throughput may get improved much. Given write requests over same
file are often run exclusively, so don't handle them concurrently for
avoiding extra context switch cost, possible lock contention and work
schedule cost. Also with blk-mq, there is opportunity to get loop I/O
merged before submitting to backend file/device.

In the following test:
	- base: v3.19-rc2-2041231
	- loop over file in ext4 file system on SSD disk
	- bs: 4k, libaio, io depth: 64, O_DIRECT, num of jobs: 1
	- throughput: IOPS

	------------------------------------------------------
	|            | base      | base with loop-mq | delta |
	------------------------------------------------------
	| randread   | 1740      | 25318             | +1355%|
	------------------------------------------------------
	| read       | 42196     | 51771             | +22.6%|
	-----------------------------------------------------
	| randwrite  | 35709     | 34624             | -3%   |
	-----------------------------------------------------
	| write      | 39137     | 40326             | +3%   |
	-----------------------------------------------------

So loop-mq can improve throughput for both read and randread, meantime,
performance of write and randwrite isn't hurted basically.

Another benefit is that loop driver code gets simplified
much after blk-mq conversion, and the patch can be thought as
cleanup too.

Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/loop.c | 327 +++++++++++++++++++++++++--------------------------
 drivers/block/loop.h |  17 ++-
 2 files changed, 174 insertions(+), 170 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6cb1beb..c678eb2 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -75,6 +75,7 @@
 #include <linux/sysfs.h>
 #include <linux/miscdevice.h>
 #include <linux/falloc.h>
+#include <linux/blk-mq.h>
 #include "loop.h"
 
 #include <asm/uaccess.h>
@@ -85,6 +86,8 @@ static DEFINE_MUTEX(loop_index_mutex);
 static int max_part;
 static int part_shift;
 
+static struct workqueue_struct *loop_wq;
+
 /*
  * Transfer functions
  */
@@ -466,109 +469,36 @@ out:
 	return ret;
 }
 
-/*
- * Add bio to back of pending list
- */
-static void loop_add_bio(struct loop_device *lo, struct bio *bio)
-{
-	lo->lo_bio_count++;
-	bio_list_add(&lo->lo_bio_list, bio);
-}
-
-/*
- * Grab first pending buffer
- */
-static struct bio *loop_get_bio(struct loop_device *lo)
-{
-	lo->lo_bio_count--;
-	return bio_list_pop(&lo->lo_bio_list);
-}
-
-static void loop_make_request(struct request_queue *q, struct bio *old_bio)
-{
-	struct loop_device *lo = q->queuedata;
-	int rw = bio_rw(old_bio);
-
-	if (rw == READA)
-		rw = READ;
-
-	BUG_ON(!lo || (rw != READ && rw != WRITE));
-
-	spin_lock_irq(&lo->lo_lock);
-	if (lo->lo_state != Lo_bound)
-		goto out;
-	if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
-		goto out;
-	if (lo->lo_bio_count >= q->nr_congestion_on)
-		wait_event_lock_irq(lo->lo_req_wait,
-				    lo->lo_bio_count < q->nr_congestion_off,
-				    lo->lo_lock);
-	loop_add_bio(lo, old_bio);
-	wake_up(&lo->lo_event);
-	spin_unlock_irq(&lo->lo_lock);
-	return;
-
-out:
-	spin_unlock_irq(&lo->lo_lock);
-	bio_io_error(old_bio);
-}
-
 struct switch_request {
 	struct file *file;
 	struct completion wait;
 };
 
-static void do_loop_switch(struct loop_device *, struct switch_request *);
-
-static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
+static inline int loop_handle_bio(struct loop_device *lo, struct bio *bio)
 {
-	if (unlikely(!bio->bi_bdev)) {
-		do_loop_switch(lo, bio->bi_private);
-		bio_put(bio);
-	} else {
-		int ret = do_bio_filebacked(lo, bio);
-		bio_endio(bio, ret);
-	}
+	return do_bio_filebacked(lo, bio);
 }
 
 /*
- * worker thread that handles reads/writes to file backed loop devices,
- * to avoid blocking in our make_request_fn. it also does loop decrypting
- * on reads for block backed loop, as that is too heavy to do from
- * b_end_io context where irqs may be disabled.
- *
- * Loop explanation:  loop_clr_fd() sets lo_state to Lo_rundown before
- * calling kthread_stop().  Therefore once kthread_should_stop() is
- * true, make_request will not place any more requests.  Therefore
- * once kthread_should_stop() is true and lo_bio is NULL, we are
- * done with the loop.
+ * Do the actual switch; called from the BIO completion routine
  */
-static int loop_thread(void *data)
+static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
 {
-	struct loop_device *lo = data;
-	struct bio *bio;
-
-	set_user_nice(current, MIN_NICE);
-
-	while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
-
-		wait_event_interruptible(lo->lo_event,
-				!bio_list_empty(&lo->lo_bio_list) ||
-				kthread_should_stop());
-
-		if (bio_list_empty(&lo->lo_bio_list))
-			continue;
-		spin_lock_irq(&lo->lo_lock);
-		bio = loop_get_bio(lo);
-		if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off)
-			wake_up(&lo->lo_req_wait);
-		spin_unlock_irq(&lo->lo_lock);
+	struct file *file = p->file;
+	struct file *old_file = lo->lo_backing_file;
+	struct address_space *mapping;
 
-		BUG_ON(!bio);
-		loop_handle_bio(lo, bio);
-	}
+	/* if no new file, only flush of queued bios requested */
+	if (!file)
+		return;
 
-	return 0;
+	mapping = file->f_mapping;
+	mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
+	lo->lo_backing_file = file;
+	lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
+		mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
+	lo->old_gfp_mask = mapping_gfp_mask(mapping);
+	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
 }
 
 /*
@@ -579,15 +509,18 @@ static int loop_thread(void *data)
 static int loop_switch(struct loop_device *lo, struct file *file)
 {
 	struct switch_request w;
-	struct bio *bio = bio_alloc(GFP_KERNEL, 0);
-	if (!bio)
-		return -ENOMEM;
-	init_completion(&w.wait);
+
 	w.file = file;
-	bio->bi_private = &w;
-	bio->bi_bdev = NULL;
-	loop_make_request(lo->lo_queue, bio);
-	wait_for_completion(&w.wait);
+
+	/* freeze queue and wait for completion of scheduled requests */
+	blk_mq_freeze_queue(lo->lo_queue);
+
+	/* do the switch action */
+	do_loop_switch(lo, &w);
+
+	/* unfreeze */
+	blk_mq_unfreeze_queue(lo->lo_queue);
+
 	return 0;
 }
 
@@ -596,39 +529,10 @@ static int loop_switch(struct loop_device *lo, struct file *file)
  */
 static int loop_flush(struct loop_device *lo)
 {
-	/* loop not yet configured, no running thread, nothing to flush */
-	if (!lo->lo_thread)
-		return 0;
-
 	return loop_switch(lo, NULL);
 }
 
 /*
- * Do the actual switch; called from the BIO completion routine
- */
-static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
-{
-	struct file *file = p->file;
-	struct file *old_file = lo->lo_backing_file;
-	struct address_space *mapping;
-
-	/* if no new file, only flush of queued bios requested */
-	if (!file)
-		goto out;
-
-	mapping = file->f_mapping;
-	mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
-	lo->lo_backing_file = file;
-	lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
-		mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
-	lo->old_gfp_mask = mapping_gfp_mask(mapping);
-	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
-out:
-	complete(&p->wait);
-}
-
-
-/*
  * loop_change_fd switched the backing store of a loopback device to
  * a new file. This is useful for operating system installers to free up
  * the original file and in High Availability environments to switch to
@@ -889,12 +793,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	lo->transfer = transfer_none;
 	lo->ioctl = NULL;
 	lo->lo_sizelimit = 0;
-	lo->lo_bio_count = 0;
 	lo->old_gfp_mask = mapping_gfp_mask(mapping);
 	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
 
-	bio_list_init(&lo->lo_bio_list);
-
 	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
 		blk_queue_flush(lo->lo_queue, REQ_FLUSH);
 
@@ -906,14 +807,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 
 	set_blocksize(bdev, lo_blocksize);
 
-	lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
-						lo->lo_number);
-	if (IS_ERR(lo->lo_thread)) {
-		error = PTR_ERR(lo->lo_thread);
-		goto out_clr;
-	}
 	lo->lo_state = Lo_bound;
-	wake_up_process(lo->lo_thread);
 	if (part_shift)
 		lo->lo_flags |= LO_FLAGS_PARTSCAN;
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
@@ -925,18 +819,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	bdgrab(bdev);
 	return 0;
 
-out_clr:
-	loop_sysfs_exit(lo);
-	lo->lo_thread = NULL;
-	lo->lo_device = NULL;
-	lo->lo_backing_file = NULL;
-	lo->lo_flags = 0;
-	set_capacity(lo->lo_disk, 0);
-	invalidate_bdev(bdev);
-	bd_set_size(bdev, 0);
-	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
-	mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
-	lo->lo_state = Lo_unbound;
  out_putf:
 	fput(file);
  out:
@@ -1012,11 +894,6 @@ static int loop_clr_fd(struct loop_device *lo)
 
 	spin_lock_irq(&lo->lo_lock);
 	lo->lo_state = Lo_rundown;
-	spin_unlock_irq(&lo->lo_lock);
-
-	kthread_stop(lo->lo_thread);
-
-	spin_lock_irq(&lo->lo_lock);
 	lo->lo_backing_file = NULL;
 	spin_unlock_irq(&lo->lo_lock);
 
@@ -1028,7 +905,6 @@ static int loop_clr_fd(struct loop_device *lo)
 	lo->lo_offset = 0;
 	lo->lo_sizelimit = 0;
 	lo->lo_encrypt_key_size = 0;
-	lo->lo_thread = NULL;
 	memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
 	memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
 	memset(lo->lo_file_name, 0, LO_NAME_SIZE);
@@ -1601,6 +1477,108 @@ int loop_unregister_transfer(int number)
 EXPORT_SYMBOL(loop_register_transfer);
 EXPORT_SYMBOL(loop_unregister_transfer);
 
+static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
+{
+	struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+
+	blk_mq_start_request(bd->rq);
+
+	if (cmd->rq->cmd_flags & REQ_WRITE) {
+		struct loop_device *lo = cmd->rq->q->queuedata;
+		bool need_sched = true;
+
+		spin_lock_irq(&lo->lo_lock);
+		if (lo->write_started)
+			need_sched = false;
+		else
+			lo->write_started = true;
+		list_add_tail(&cmd->list, &lo->write_cmd_head);
+		spin_unlock_irq(&lo->lo_lock);
+
+		if (need_sched)
+			queue_work(loop_wq, &lo->write_work);
+	} else {
+		queue_work(loop_wq, &cmd->read_work);
+	}
+
+	return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static void loop_handle_cmd(struct loop_cmd *cmd)
+{
+	const bool write = cmd->rq->cmd_flags & REQ_WRITE;
+	struct loop_device *lo = cmd->rq->q->queuedata;
+	int ret = -EIO;
+	struct bio *bio;
+
+	if (lo->lo_state != Lo_bound)
+		goto failed;
+
+	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
+		goto failed;
+
+	ret = 0;
+	__rq_for_each_bio(bio, cmd->rq)
+		ret |= loop_handle_bio(lo, bio);
+
+ failed:
+	if (ret)
+		cmd->rq->errors = -EIO;
+	blk_mq_complete_request(cmd->rq);
+}
+
+static void loop_queue_write_work(struct work_struct *work)
+{
+	struct loop_device *lo =
+		container_of(work, struct loop_device, write_work);
+	LIST_HEAD(cmd_list);
+
+	spin_lock_irq(&lo->lo_lock);
+ repeat:
+	list_splice_init(&lo->write_cmd_head, &cmd_list);
+	spin_unlock_irq(&lo->lo_lock);
+
+	while (!list_empty(&cmd_list)) {
+		struct loop_cmd *cmd = list_first_entry(&cmd_list,
+				struct loop_cmd, list);
+		list_del_init(&cmd->list);
+		loop_handle_cmd(cmd);
+	}
+
+	spin_lock_irq(&lo->lo_lock);
+	if (!list_empty(&lo->write_cmd_head))
+		goto repeat;
+	lo->write_started = false;
+	spin_unlock_irq(&lo->lo_lock);
+}
+
+static void loop_queue_read_work(struct work_struct *work)
+{
+	struct loop_cmd *cmd =
+		container_of(work, struct loop_cmd, read_work);
+
+	loop_handle_cmd(cmd);
+}
+
+static int loop_init_request(void *data, struct request *rq,
+		unsigned int hctx_idx, unsigned int request_idx,
+		unsigned int numa_node)
+{
+	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	cmd->rq = rq;
+	INIT_WORK(&cmd->read_work, loop_queue_read_work);
+
+	return 0;
+}
+
+static struct blk_mq_ops loop_mq_ops = {
+	.queue_rq       = loop_queue_rq,
+	.map_queue      = blk_mq_map_queue,
+	.init_request	= loop_init_request,
+};
+
 static int loop_add(struct loop_device **l, int i)
 {
 	struct loop_device *lo;
@@ -1627,16 +1605,28 @@ static int loop_add(struct loop_device **l, int i)
 	i = err;
 
 	err = -ENOMEM;
-	lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
-	if (!lo->lo_queue)
+	lo->tag_set.ops = &loop_mq_ops;
+	lo->tag_set.nr_hw_queues = 1;
+	lo->tag_set.queue_depth = 128;
+	lo->tag_set.numa_node = NUMA_NO_NODE;
+	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
+	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	lo->tag_set.driver_data = lo;
+
+	err = blk_mq_alloc_tag_set(&lo->tag_set);
+	if (err)
 		goto out_free_idr;
 
-	/*
-	 * set queue make_request_fn
-	 */
-	blk_queue_make_request(lo->lo_queue, loop_make_request);
+	lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
+	if (IS_ERR_OR_NULL(lo->lo_queue)) {
+		err = PTR_ERR(lo->lo_queue);
+		goto out_cleanup_tags;
+	}
 	lo->lo_queue->queuedata = lo;
 
+	INIT_LIST_HEAD(&lo->write_cmd_head);
+	INIT_WORK(&lo->write_work, loop_queue_write_work);
+
 	disk = lo->lo_disk = alloc_disk(1 << part_shift);
 	if (!disk)
 		goto out_free_queue;
@@ -1664,9 +1654,6 @@ static int loop_add(struct loop_device **l, int i)
 	disk->flags |= GENHD_FL_EXT_DEVT;
 	mutex_init(&lo->lo_ctl_mutex);
 	lo->lo_number		= i;
-	lo->lo_thread		= NULL;
-	init_waitqueue_head(&lo->lo_event);
-	init_waitqueue_head(&lo->lo_req_wait);
 	spin_lock_init(&lo->lo_lock);
 	disk->major		= LOOP_MAJOR;
 	disk->first_minor	= i << part_shift;
@@ -1680,6 +1667,8 @@ static int loop_add(struct loop_device **l, int i)
 
 out_free_queue:
 	blk_cleanup_queue(lo->lo_queue);
+out_cleanup_tags:
+	blk_mq_free_tag_set(&lo->tag_set);
 out_free_idr:
 	idr_remove(&loop_index_idr, i);
 out_free_dev:
@@ -1692,6 +1681,7 @@ static void loop_remove(struct loop_device *lo)
 {
 	del_gendisk(lo->lo_disk);
 	blk_cleanup_queue(lo->lo_queue);
+	blk_mq_free_tag_set(&lo->tag_set);
 	put_disk(lo->lo_disk);
 	kfree(lo);
 }
@@ -1875,6 +1865,13 @@ static int __init loop_init(void)
 		goto misc_out;
 	}
 
+	loop_wq = alloc_workqueue("kloopd",
+			WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0);
+	if (!loop_wq) {
+		err = -ENOMEM;
+		goto misc_out;
+	}
+
 	blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
 				  THIS_MODULE, loop_probe, NULL, NULL);
 
@@ -1912,6 +1909,8 @@ static void __exit loop_exit(void)
 	blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
 	unregister_blkdev(LOOP_MAJOR, "loop");
 
+	destroy_workqueue(loop_wq);
+
 	misc_deregister(&loop_misc);
 }
 
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 90df5d6..e20cdbb 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -13,6 +13,7 @@
 #include <linux/blkdev.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
+#include <linux/workqueue.h>
 #include <uapi/linux/loop.h>
 
 /* Possible states of device */
@@ -52,19 +53,23 @@ struct loop_device {
 	gfp_t		old_gfp_mask;
 
 	spinlock_t		lo_lock;
-	struct bio_list		lo_bio_list;
-	unsigned int		lo_bio_count;
+	struct list_head	write_cmd_head;
+	struct work_struct	write_work;
+	bool			write_started;
 	int			lo_state;
 	struct mutex		lo_ctl_mutex;
-	struct task_struct	*lo_thread;
-	wait_queue_head_t	lo_event;
-	/* wait queue for incoming requests */
-	wait_queue_head_t	lo_req_wait;
 
 	struct request_queue	*lo_queue;
+	struct blk_mq_tag_set	tag_set;
 	struct gendisk		*lo_disk;
 };
 
+struct loop_cmd {
+	struct work_struct read_work;
+	struct request *rq;
+	struct list_head list;
+};
+
 /* Support for loadable transfer modules */
 struct loop_func_table {
 	int number;	/* filter type */ 
-- 
cgit v1.1


From 301120134628c49346e5f23f7c00e7377013d5e0 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Wed, 31 Dec 2014 13:22:58 +0000
Subject: block: loop: say goodby to bio

Switch to block request completely.

Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/loop.c | 45 ++++++++++++++++++++-------------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index c678eb2..d958901 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -287,12 +287,12 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
 	return ret;
 }
 
-static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
+static int lo_send(struct loop_device *lo, struct request *rq, loff_t pos)
 {
 	int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t,
 			struct page *page);
 	struct bio_vec bvec;
-	struct bvec_iter iter;
+	struct req_iterator iter;
 	struct page *page = NULL;
 	int ret = 0;
 
@@ -306,7 +306,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
 		do_lo_send = do_lo_send_direct_write;
 	}
 
-	bio_for_each_segment(bvec, bio, iter) {
+	rq_for_each_segment(bvec, rq, iter) {
 		ret = do_lo_send(lo, &bvec, pos, page);
 		if (ret < 0)
 			break;
@@ -394,19 +394,22 @@ do_lo_receive(struct loop_device *lo,
 }
 
 static int
-lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
+lo_receive(struct loop_device *lo, struct request *rq, int bsize, loff_t pos)
 {
 	struct bio_vec bvec;
-	struct bvec_iter iter;
+	struct req_iterator iter;
 	ssize_t s;
 
-	bio_for_each_segment(bvec, bio, iter) {
+	rq_for_each_segment(bvec, rq, iter) {
 		s = do_lo_receive(lo, &bvec, bsize, pos);
 		if (s < 0)
 			return s;
 
 		if (s != bvec.bv_len) {
-			zero_fill_bio(bio);
+			struct bio *bio;
+
+			__rq_for_each_bio(bio, rq)
+				zero_fill_bio(bio);
 			break;
 		}
 		pos += bvec.bv_len;
@@ -414,17 +417,17 @@ lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
 	return 0;
 }
 
-static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
+static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 {
 	loff_t pos;
 	int ret;
 
-	pos = ((loff_t) bio->bi_iter.bi_sector << 9) + lo->lo_offset;
+	pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
 
-	if (bio_rw(bio) == WRITE) {
+	if (rq->cmd_flags & REQ_WRITE) {
 		struct file *file = lo->lo_backing_file;
 
-		if (bio->bi_rw & REQ_FLUSH) {
+		if (rq->cmd_flags & REQ_FLUSH) {
 			ret = vfs_fsync(file, 0);
 			if (unlikely(ret && ret != -EINVAL)) {
 				ret = -EIO;
@@ -438,7 +441,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
 		 * encryption is enabled, because it may give an attacker
 		 * useful information.
 		 */
-		if (bio->bi_rw & REQ_DISCARD) {
+		if (rq->cmd_flags & REQ_DISCARD) {
 			struct file *file = lo->lo_backing_file;
 			int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
 
@@ -448,22 +451,22 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
 				goto out;
 			}
 			ret = file->f_op->fallocate(file, mode, pos,
-						    bio->bi_iter.bi_size);
+						    blk_rq_bytes(rq));
 			if (unlikely(ret && ret != -EINVAL &&
 				     ret != -EOPNOTSUPP))
 				ret = -EIO;
 			goto out;
 		}
 
-		ret = lo_send(lo, bio, pos);
+		ret = lo_send(lo, rq, pos);
 
-		if ((bio->bi_rw & REQ_FUA) && !ret) {
+		if ((rq->cmd_flags & REQ_FUA) && !ret) {
 			ret = vfs_fsync(file, 0);
 			if (unlikely(ret && ret != -EINVAL))
 				ret = -EIO;
 		}
 	} else
-		ret = lo_receive(lo, bio, lo->lo_blocksize, pos);
+		ret = lo_receive(lo, rq, lo->lo_blocksize, pos);
 
 out:
 	return ret;
@@ -474,11 +477,6 @@ struct switch_request {
 	struct completion wait;
 };
 
-static inline int loop_handle_bio(struct loop_device *lo, struct bio *bio)
-{
-	return do_bio_filebacked(lo, bio);
-}
-
 /*
  * Do the actual switch; called from the BIO completion routine
  */
@@ -1510,7 +1508,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 	const bool write = cmd->rq->cmd_flags & REQ_WRITE;
 	struct loop_device *lo = cmd->rq->q->queuedata;
 	int ret = -EIO;
-	struct bio *bio;
 
 	if (lo->lo_state != Lo_bound)
 		goto failed;
@@ -1518,9 +1515,7 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
 		goto failed;
 
-	ret = 0;
-	__rq_for_each_bio(bio, cmd->rq)
-		ret |= loop_handle_bio(lo, bio);
+	ret = do_req_filebacked(lo, cmd->rq);
 
  failed:
 	if (ret)
-- 
cgit v1.1


From cf655d953422c846558a320ac9b3c8e659b68275 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Wed, 31 Dec 2014 13:22:59 +0000
Subject: block: loop: introduce lo_discard() and lo_req_flush()

No behaviour change, just move the handling for REQ_DISCARD
and REQ_FLUSH in these two functions.

Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/loop.c | 73 ++++++++++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 33 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d958901..018af27 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -417,6 +417,40 @@ lo_receive(struct loop_device *lo, struct request *rq, int bsize, loff_t pos)
 	return 0;
 }
 
+static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
+{
+	/*
+	 * We use punch hole to reclaim the free space used by the
+	 * image a.k.a. discard. However we do not support discard if
+	 * encryption is enabled, because it may give an attacker
+	 * useful information.
+	 */
+	struct file *file = lo->lo_backing_file;
+	int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+	int ret;
+
+	if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq));
+	if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
+		ret = -EIO;
+ out:
+	return ret;
+}
+
+static int lo_req_flush(struct loop_device *lo, struct request *rq)
+{
+	struct file *file = lo->lo_backing_file;
+	int ret = vfs_fsync(file, 0);
+	if (unlikely(ret && ret != -EINVAL))
+		ret = -EIO;
+
+	return ret;
+}
+
 static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 {
 	loff_t pos;
@@ -425,46 +459,19 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 	pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
 
 	if (rq->cmd_flags & REQ_WRITE) {
-		struct file *file = lo->lo_backing_file;
-
-		if (rq->cmd_flags & REQ_FLUSH) {
-			ret = vfs_fsync(file, 0);
-			if (unlikely(ret && ret != -EINVAL)) {
-				ret = -EIO;
-				goto out;
-			}
-		}
 
-		/*
-		 * We use punch hole to reclaim the free space used by the
-		 * image a.k.a. discard. However we do not support discard if
-		 * encryption is enabled, because it may give an attacker
-		 * useful information.
-		 */
+		if (rq->cmd_flags & REQ_FLUSH)
+			ret = lo_req_flush(lo, rq);
+
 		if (rq->cmd_flags & REQ_DISCARD) {
-			struct file *file = lo->lo_backing_file;
-			int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
-
-			if ((!file->f_op->fallocate) ||
-			    lo->lo_encrypt_key_size) {
-				ret = -EOPNOTSUPP;
-				goto out;
-			}
-			ret = file->f_op->fallocate(file, mode, pos,
-						    blk_rq_bytes(rq));
-			if (unlikely(ret && ret != -EINVAL &&
-				     ret != -EOPNOTSUPP))
-				ret = -EIO;
+			ret = lo_discard(lo, rq, pos);
 			goto out;
 		}
 
 		ret = lo_send(lo, rq, pos);
 
-		if ((rq->cmd_flags & REQ_FUA) && !ret) {
-			ret = vfs_fsync(file, 0);
-			if (unlikely(ret && ret != -EINVAL))
-				ret = -EIO;
-		}
+		if ((rq->cmd_flags & REQ_FUA) && !ret)
+			ret = lo_req_flush(lo, rq);
 	} else
 		ret = lo_receive(lo, rq, lo->lo_blocksize, pos);
 
-- 
cgit v1.1


From af65aa8ea78b296857f257bdc52338d03101813b Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Wed, 31 Dec 2014 13:23:00 +0000
Subject: block: loop: don't handle REQ_FUA explicitly

block core handles REQ_FUA by its flush state machine, so
won't do it in loop explicitly.

Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/loop.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 018af27..185a86d 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -459,23 +459,15 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 	pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
 
 	if (rq->cmd_flags & REQ_WRITE) {
-
 		if (rq->cmd_flags & REQ_FLUSH)
 			ret = lo_req_flush(lo, rq);
-
-		if (rq->cmd_flags & REQ_DISCARD) {
+		else if (rq->cmd_flags & REQ_DISCARD)
 			ret = lo_discard(lo, rq, pos);
-			goto out;
-		}
-
-		ret = lo_send(lo, rq, pos);
-
-		if ((rq->cmd_flags & REQ_FUA) && !ret)
-			ret = lo_req_flush(lo, rq);
+		else
+			ret = lo_send(lo, rq, pos);
 	} else
 		ret = lo_receive(lo, rq, lo->lo_blocksize, pos);
 
-out:
 	return ret;
 }
 
-- 
cgit v1.1


From 78e367a3601f35ea811e7f5660b7362afa2401fa Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 2 Jan 2015 15:20:25 -0700
Subject: loop: add blk-mq.h include

Looks like we pull it in through other ways on x86, but we fail
on sparc:

In file included from drivers/block/cryptoloop.c:30:0:
drivers/block/loop.h:63:24: error: field 'tag_set' has incomplete type
struct blk_mq_tag_set tag_set;

Add the include to loop.h, kill it from loop.c.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/loop.c | 1 -
 drivers/block/loop.h | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/block')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 185a86d..d1f168b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -75,7 +75,6 @@
 #include <linux/sysfs.h>
 #include <linux/miscdevice.h>
 #include <linux/falloc.h>
-#include <linux/blk-mq.h>
 #include "loop.h"
 
 #include <asm/uaccess.h>
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index e20cdbb..301c27f 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -11,6 +11,7 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
-- 
cgit v1.1


From 937af5ecd0591e84ee54180fa97dcbe9bbe5fed6 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <boaz@plexistor.com>
Date: Wed, 7 Jan 2015 18:07:56 +0200
Subject: brd: Fix all partitions BUGs

This patch fixes up brd's partitions scheme, now enjoying all worlds.

The MAIN fix here is that currently, if one fdisks some partitions,
a BAD bug will make all partitions point to the same start-end sector
ie: 0 - brd_size And an mkfs of any partition would trash the partition
table and the other partition.

Another fix is that "mount -U uuid" will not work if show_part was not
specified, because of the GENHD_FL_SUPPRESS_PARTITION_INFO flag.
We now always load without it and remove the show_part parameter.

[We remove Dmitry's new module-param part_show it is now always
 show]

So NOW the logic goes like this:
* max_part - Just says how many minors to reserve between ramX
  devices. In any way, there can be as many partition as requested.
  If minors between devices ends, then dynamic 259-major ids will
  be allocated on the fly.
  The default is now max_part=1, which means all partitions devt(s)
  will be from the dynamic (259) major-range.
  (If persistent partition minors is needed use max_part=X)
  For example with /dev/sdX max_part is hard coded 16.

* Creation of new devices on the fly still/always work:
  mknod /path/devnod b 1 X
  fdisk -l /path/devnod
  Will create a new device if [X / max_part] was not already
  created before. (Just as before)

  partitions on the dynamically created device will work as well
  Same logic applies with minors as with the pre-created ones.

TODO: dynamic grow of device size. So each device can have it's
      own size.

CC: Dmitry Monakhov <dmonakhov@openvz.org>
Tested-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/brd.c | 100 ++++++++++++++++++++--------------------------------
 1 file changed, 38 insertions(+), 62 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 89e90ec..a7463c959 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -438,19 +438,18 @@ static const struct block_device_operations brd_fops = {
 /*
  * And now the modules code and kernel interface.
  */
-static int rd_nr;
-int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
-static int max_part;
-static int part_shift;
-static int part_show = 0;
+static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
 module_param(rd_nr, int, S_IRUGO);
 MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
+
+int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
 module_param(rd_size, int, S_IRUGO);
 MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
+
+static int max_part = 1;
 module_param(max_part, int, S_IRUGO);
-MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk");
-module_param(part_show, int, S_IRUGO);
-MODULE_PARM_DESC(part_show, "Control RAM disk visibility in /proc/partitions");
+MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
+
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
 MODULE_ALIAS("rd");
@@ -496,16 +495,15 @@ static struct brd_device *brd_alloc(int i)
 	brd->brd_queue->limits.discard_zeroes_data = 1;
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
 
-	disk = brd->brd_disk = alloc_disk(1 << part_shift);
+	disk = brd->brd_disk = alloc_disk(max_part);
 	if (!disk)
 		goto out_free_queue;
 	disk->major		= RAMDISK_MAJOR;
-	disk->first_minor	= i << part_shift;
+	disk->first_minor	= i * max_part;
 	disk->fops		= &brd_fops;
 	disk->private_data	= brd;
 	disk->queue		= brd->brd_queue;
-	if (!part_show)
-		disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
+	disk->flags		= GENHD_FL_EXT_DEVT;
 	sprintf(disk->disk_name, "ram%d", i);
 	set_capacity(disk, rd_size * 2);
 
@@ -527,10 +525,11 @@ static void brd_free(struct brd_device *brd)
 	kfree(brd);
 }
 
-static struct brd_device *brd_init_one(int i)
+static struct brd_device *brd_init_one(int i, bool *new)
 {
 	struct brd_device *brd;
 
+	*new = false;
 	list_for_each_entry(brd, &brd_devices, brd_list) {
 		if (brd->brd_number == i)
 			goto out;
@@ -541,6 +540,7 @@ static struct brd_device *brd_init_one(int i)
 		add_disk(brd->brd_disk);
 		list_add_tail(&brd->brd_list, &brd_devices);
 	}
+	*new = true;
 out:
 	return brd;
 }
@@ -556,70 +556,46 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data)
 {
 	struct brd_device *brd;
 	struct kobject *kobj;
+	bool new;
 
 	mutex_lock(&brd_devices_mutex);
-	brd = brd_init_one(MINOR(dev) >> part_shift);
+	brd = brd_init_one(MINOR(dev) / max_part, &new);
 	kobj = brd ? get_disk(brd->brd_disk) : NULL;
 	mutex_unlock(&brd_devices_mutex);
 
-	*part = 0;
+	if (new)
+		*part = 0;
+
 	return kobj;
 }
 
 static int __init brd_init(void)
 {
-	int i, nr;
-	unsigned long range;
 	struct brd_device *brd, *next;
+	int i;
 
 	/*
 	 * brd module now has a feature to instantiate underlying device
 	 * structure on-demand, provided that there is an access dev node.
-	 * However, this will not work well with user space tool that doesn't
-	 * know about such "feature".  In order to not break any existing
-	 * tool, we do the following:
 	 *
-	 * (1) if rd_nr is specified, create that many upfront, and this
-	 *     also becomes a hard limit.
-	 * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT
-	 *     (default 16) rd device on module load, user can further
-	 *     extend brd device by create dev node themselves and have
-	 *     kernel automatically instantiate actual device on-demand.
+	 * (1) if rd_nr is specified, create that many upfront. else
+	 *     it defaults to CONFIG_BLK_DEV_RAM_COUNT
+	 * (2) User can further extend brd devices by create dev node themselves
+	 *     and have kernel automatically instantiate actual device
+	 *     on-demand. Example:
+	 *		mknod /path/devnod_name b 1 X	# 1 is the rd major
+	 *		fdisk -l /path/devnod_name
+	 *	If (X / max_part) was not already created it will be created
+	 *	dynamically.
 	 */
 
-	part_shift = 0;
-	if (max_part > 0) {
-		part_shift = fls(max_part);
-
-		/*
-		 * Adjust max_part according to part_shift as it is exported
-		 * to user space so that user can decide correct minor number
-		 * if [s]he want to create more devices.
-		 *
-		 * Note that -1 is required because partition 0 is reserved
-		 * for the whole disk.
-		 */
-		max_part = (1UL << part_shift) - 1;
-	}
-
-	if ((1UL << part_shift) > DISK_MAX_PARTS)
-		return -EINVAL;
-
-	if (rd_nr > 1UL << (MINORBITS - part_shift))
-		return -EINVAL;
-
-	if (rd_nr) {
-		nr = rd_nr;
-		range = rd_nr << part_shift;
-	} else {
-		nr = CONFIG_BLK_DEV_RAM_COUNT;
-		range = 1UL << MINORBITS;
-	}
-
 	if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
 		return -EIO;
 
-	for (i = 0; i < nr; i++) {
+	if (unlikely(!max_part))
+		max_part = 1;
+
+	for (i = 0; i < rd_nr; i++) {
 		brd = brd_alloc(i);
 		if (!brd)
 			goto out_free;
@@ -631,10 +607,10 @@ static int __init brd_init(void)
 	list_for_each_entry(brd, &brd_devices, brd_list)
 		add_disk(brd->brd_disk);
 
-	blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
+	blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
 				  THIS_MODULE, brd_probe, NULL, NULL);
 
-	printk(KERN_INFO "brd: module loaded\n");
+	pr_info("brd: module loaded\n");
 	return 0;
 
 out_free:
@@ -644,21 +620,21 @@ out_free:
 	}
 	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
 
+	pr_info("brd: module NOT loaded !!!\n");
 	return -ENOMEM;
 }
 
 static void __exit brd_exit(void)
 {
-	unsigned long range;
 	struct brd_device *brd, *next;
 
-	range = rd_nr ? rd_nr << part_shift : 1UL << MINORBITS;
-
 	list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
 		brd_del_one(brd);
 
-	blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
+	blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
 	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+
+	pr_info("brd: module unloaded\n");
 }
 
 module_init(brd_init);
-- 
cgit v1.1


From c8fa31730fc72a83bad762112a872675b3e76886 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <boaz@plexistor.com>
Date: Wed, 7 Jan 2015 18:09:38 +0200
Subject: brd: Request from fdisk 4k alignment

Because of the direct_access() API which returns a PFN. partitions
better start on 4K boundary, else offset ZERO of a partition will
not be aligned and blk_direct_access() will fail the call.

By setting blk_queue_physical_block_size(PAGE_SIZE) we can communicate
this to fdisk and friends.

The call to blk_queue_physical_block_size() is harmless and will
not affect the Kernel behavior in any way. It is only for
communication to user-mode.

before this patch running fdisk on a default size brd of 4M
the first sector offered is 34 (BAD), but after this patch it
will be 40, ie 8 sectors aligned. Also when entering some random
partition sizes the next partition-start sector is offered 8 sectors
aligned after this patch. (Please note that with fdisk the user
can still enter bad values, only the offered default values will
be correct)

Note that with bdev-size > 4M fdisk will try to align on a 1M
boundary (above first-sector will be 2048), in any case.

CC: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/brd.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'drivers/block')

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index a7463c959..c01b921 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -486,10 +486,19 @@ static struct brd_device *brd_alloc(int i)
 	brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
 	if (!brd->brd_queue)
 		goto out_free_dev;
+
 	blk_queue_make_request(brd->brd_queue, brd_make_request);
 	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
 	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
 
+	/* This is so fdisk will align partitions on 4k, because of
+	 * direct_access API needing 4k alignment, returning a PFN
+	 * (This is only a problem on very small devices <= 4M,
+	 *  otherwise fdisk will align on 1M. Regardless this call
+	 *  is harmless)
+	 */
+	blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
+
 	brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
 	brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
 	brd->brd_queue->limits.discard_zeroes_data = 1;
-- 
cgit v1.1


From 227290b4690510148d1f40f277c6c3a181fcb440 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 16 Jan 2015 16:02:24 -0700
Subject: null_blk: suppress invalid partition info

null_blk is partitionable, but it doesn't store any of the info. When
it is loaded, you would normally see:

[1226739.343608]  nullb0: unknown partition table
[1226739.343746]  nullb1: unknown partition table

which can confuse some people. Add the appropriate gendisk flag
to suppress this info.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/block')

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index aa2224a..65cd61a 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -579,7 +579,7 @@ static int null_add_dev(void)
 	sector_div(size, bs);
 	set_capacity(disk, size);
 
-	disk->flags |= GENHD_FL_EXT_DEVT;
+	disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
 	disk->major		= null_major;
 	disk->first_minor	= nullb->index;
 	disk->fops		= &null_fops;
-- 
cgit v1.1


From 121c7ad4ef3f3a9e697c4b8b78d74e4d6847d9e4 Mon Sep 17 00:00:00 2001
From: kaoudis <kaoudis@colorado.edu>
Date: Wed, 14 Jan 2015 21:01:58 -0700
Subject: NVMe: within nvme_free_queues(), delete RCU sychro/deferred free

Converting from to blk-queue got rid of the driver's RCU
locking-on-queue, so removing unnecessary RCU locking-on-queue
artefacts.

Reviewed-by: Keith Busch <keith.busch@intel.com>

Signed-off-by: Kelly Nicole Kaoudis <kaoudis@colorado.edu>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nvme-core.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index f7d083b..f4aa641 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1108,21 +1108,14 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
 
 static void nvme_free_queues(struct nvme_dev *dev, int lowest)
 {
-	LLIST_HEAD(q_list);
-	struct nvme_queue *nvmeq, *next;
-	struct llist_node *entry;
 	int i;
 
 	for (i = dev->queue_count - 1; i >= lowest; i--) {
 		struct nvme_queue *nvmeq = dev->queues[i];
-		llist_add(&nvmeq->node, &q_list);
 		dev->queue_count--;
 		dev->queues[i] = NULL;
-	}
-	synchronize_rcu();
-	entry = llist_del_all(&q_list);
-	llist_for_each_entry_safe(nvmeq, next, entry, node)
 		nvme_free_queue(nvmeq);
+	}
 }
 
 /**
-- 
cgit v1.1


From ac3dd5bd128b1d1ce2a037775766f39d06a4848a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 22 Jan 2015 12:07:58 -0700
Subject: NVMe: avoid kmalloc/kfree for smaller IO

Currently we allocate an nvme_iod for each IO, which holds the
sg list, prps, and other IO related info. Set a threshold of
2 pages and/or 8KB of data, below which we can just embed this
in the per-command pdu in blk-mq. For any IO at or below
NVME_INT_PAGES and NVME_INT_BYTES, we save a kmalloc and kfree.

For higher IOPS, this saves up to 1% of CPU time.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
---
 drivers/block/nvme-core.c | 119 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 88 insertions(+), 31 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index f4aa641..3eaa0be 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -144,8 +144,37 @@ struct nvme_cmd_info {
 	void *ctx;
 	int aborted;
 	struct nvme_queue *nvmeq;
+	struct nvme_iod iod[0];
 };
 
+/*
+ * Max size of iod being embedded in the request payload
+ */
+#define NVME_INT_PAGES		2
+#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->page_size)
+
+/*
+ * Will slightly overestimate the number of pages needed.  This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_npages(unsigned size, struct nvme_dev *dev)
+{
+	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
+	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+{
+	unsigned int ret = sizeof(struct nvme_cmd_info);
+
+	ret += sizeof(struct nvme_iod);
+	ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
+	ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
+
+	return ret;
+}
+
 static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 				unsigned int hctx_idx)
 {
@@ -217,6 +246,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
 	cmd->aborted = 0;
 }
 
+static void *iod_get_private(struct nvme_iod *iod)
+{
+	return (void *) (iod->private & ~0x1UL);
+}
+
+/*
+ * If bit 0 is set, the iod is embedded in the request payload.
+ */
+static bool iod_should_kfree(struct nvme_iod *iod)
+{
+	return (iod->private & 0x01) == 0;
+}
+
 /* Special values must be less than 0x1000 */
 #define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
 #define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
@@ -360,35 +402,53 @@ static __le64 **iod_list(struct nvme_iod *iod)
 	return ((void *)iod) + iod->offset;
 }
 
-/*
- * Will slightly overestimate the number of pages needed.  This is OK
- * as it only leads to a small amount of wasted memory for the lifetime of
- * the I/O.
- */
-static int nvme_npages(unsigned size, struct nvme_dev *dev)
+static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
+			    unsigned nseg, unsigned long private)
 {
-	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
-	return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
+	iod->private = private;
+	iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+	iod->npages = -1;
+	iod->length = nbytes;
+	iod->nents = 0;
 }
 
 static struct nvme_iod *
-nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
+__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
+		 unsigned long priv, gfp_t gfp)
 {
 	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
-				sizeof(__le64 *) * nvme_npages(nbytes, dev) +
+				sizeof(__le64 *) * nvme_npages(bytes, dev) +
 				sizeof(struct scatterlist) * nseg, gfp);
 
-	if (iod) {
-		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
-		iod->npages = -1;
-		iod->length = nbytes;
-		iod->nents = 0;
-		iod->first_dma = 0ULL;
-	}
+	if (iod)
+		iod_init(iod, bytes, nseg, priv);
 
 	return iod;
 }
 
+static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
+			               gfp_t gfp)
+{
+	unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
+                                                sizeof(struct nvme_dsm_range);
+	unsigned long mask = 0;
+	struct nvme_iod *iod;
+
+	if (rq->nr_phys_segments <= NVME_INT_PAGES &&
+	    size <= NVME_INT_BYTES(dev)) {
+		struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
+
+		iod = cmd->iod;
+		mask = 0x01;
+		iod_init(iod, size, rq->nr_phys_segments,
+				(unsigned long) rq | 0x01);
+		return iod;
+	}
+
+	return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
+				(unsigned long) rq, gfp);
+}
+
 void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 {
 	const int last_prp = dev->page_size / 8 - 1;
@@ -404,7 +464,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
 		prp_dma = next_prp_dma;
 	}
-	kfree(iod);
+
+	if (iod_should_kfree(iod))
+		kfree(iod);
 }
 
 static int nvme_error_status(u16 status)
@@ -423,7 +485,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 						struct nvme_completion *cqe)
 {
 	struct nvme_iod *iod = ctx;
-	struct request *req = iod->private;
+	struct request *req = iod_get_private(iod);
 	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
 
 	u16 status = le16_to_cpup(&cqe->status) >> 1;
@@ -579,7 +641,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
 							struct nvme_ns *ns)
 {
-	struct request *req = iod->private;
+	struct request *req = iod_get_private(iod);
 	struct nvme_command *cmnd;
 	u16 control = 0;
 	u32 dsmgmt = 0;
@@ -620,17 +682,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request *req = bd->rq;
 	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
 	struct nvme_iod *iod;
-	int psegs = req->nr_phys_segments;
 	enum dma_data_direction dma_dir;
-	unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
-						sizeof(struct nvme_dsm_range);
 
-	iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
+	iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
 	if (!iod)
 		return BLK_MQ_RQ_QUEUE_BUSY;
 
-	iod->private = req;
-
 	if (req->cmd_flags & REQ_DISCARD) {
 		void *range;
 		/*
@@ -645,10 +702,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			goto retry_cmd;
 		iod_list(iod)[0] = (__le64 *)range;
 		iod->npages = 0;
-	} else if (psegs) {
+	} else if (req->nr_phys_segments) {
 		dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
 
-		sg_init_table(iod->sg, psegs);
+		sg_init_table(iod->sg, req->nr_phys_segments);
 		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
 		if (!iod->nents)
 			goto error_cmd;
@@ -1362,7 +1419,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
 		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
 		dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
-		dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
+		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
 		dev->admin_tagset.driver_data = dev;
 
 		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
@@ -1483,7 +1540,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
 	}
 
 	err = -ENOMEM;
-	iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL);
+	iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
 	if (!iod)
 		goto put_pages;
 
@@ -2109,7 +2166,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
 	dev->tagset.queue_depth =
 				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
-	dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
+	dev->tagset.cmd_size = nvme_cmd_size(dev);
 	dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
 	dev->tagset.driver_data = dev;
 
-- 
cgit v1.1


From b7f120b211510b80cb72c1d790d9a4531271edfa Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 2 Feb 2015 17:08:45 +0100
Subject: floppy: Avoid manual call of device_create_file()

Use the static attribute groups assigned to the device instead of
calling device_create_file() after the device registration.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/block/floppy.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 56d46ff..a08cda9 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4112,6 +4112,13 @@ static ssize_t floppy_cmos_show(struct device *dev,
 
 static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
 
+static struct attribute *floppy_dev_attrs[] = {
+	&dev_attr_cmos.attr,
+	NULL
+};
+
+ATTRIBUTE_GROUPS(floppy_dev);
+
 static void floppy_device_release(struct device *dev)
 {
 }
@@ -4324,16 +4331,12 @@ static int __init do_floppy_init(void)
 		floppy_device[drive].name = floppy_device_name;
 		floppy_device[drive].id = drive;
 		floppy_device[drive].dev.release = floppy_device_release;
+		floppy_device[drive].dev.groups = floppy_dev_groups;
 
 		err = platform_device_register(&floppy_device[drive]);
 		if (err)
 			goto out_remove_drives;
 
-		err = device_create_file(&floppy_device[drive].dev,
-					 &dev_attr_cmos);
-		if (err)
-			goto out_unreg_platform_dev;
-
 		/* to be cleaned up... */
 		disks[drive]->private_data = (void *)(long)drive;
 		disks[drive]->flags |= GENHD_FL_REMOVABLE;
@@ -4343,13 +4346,10 @@ static int __init do_floppy_init(void)
 
 	return 0;
 
-out_unreg_platform_dev:
-	platform_device_unregister(&floppy_device[drive]);
 out_remove_drives:
 	while (drive--) {
 		if (floppy_available(drive)) {
 			del_gendisk(disks[drive]);
-			device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
 			platform_device_unregister(&floppy_device[drive]);
 		}
 	}
@@ -4594,7 +4594,6 @@ static void __exit floppy_module_exit(void)
 
 		if (floppy_available(drive)) {
 			del_gendisk(disks[drive]);
-			device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
 			platform_device_unregister(&floppy_device[drive]);
 		}
 		blk_cleanup_queue(disks[drive]->queue);
-- 
cgit v1.1


From 3bb8c98e5612f069010ad04e5f463389e2eb6563 Mon Sep 17 00:00:00 2001
From: Roger Pau Monne <roger.pau@citrix.com>
Date: Mon, 2 Feb 2015 11:28:21 +0000
Subject: xen-blkfront: fix accounting of reqs when migrating
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Current migration code uses blk_put_request in order to finish a request
before requeuing it. This function doesn't update the statistics of the
queue, which completely screws accounting. Use blk_end_request_all instead
which properly updates the statistics of the queue.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reported-and-Tested-by: Ouyang Zhaowei (Charles) <ouyangzhaowei@huawei.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: xen-devel@lists.xenproject.org
---
 drivers/block/xen-blkfront.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2236c6f..7f66d2e 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1511,7 +1511,7 @@ static int blkif_recover(struct blkfront_info *info)
 		merge_bio.tail = copy[i].request->biotail;
 		bio_list_merge(&bio_list, &merge_bio);
 		copy[i].request->bio = NULL;
-		blk_put_request(copy[i].request);
+		blk_end_request_all(copy[i].request, 0);
 	}
 
 	kfree(copy);
@@ -1534,7 +1534,7 @@ static int blkif_recover(struct blkfront_info *info)
 		req->bio = NULL;
 		if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
 			pr_alert("diskcache flush request found!\n");
-		__blk_put_request(info->rq, req);
+		__blk_end_request_all(req, 0);
 	}
 	spin_unlock_irq(&info->io_lock);
 
-- 
cgit v1.1


From b042a3ca949053231950a1b15f31cccca9e305f3 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Thu, 5 Feb 2015 17:09:56 +0000
Subject: xen-blkback: default to X86_32 ABI on x86
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prior to the existance of 64-bit backends using the X86_64 ABI,
frontends used the X86_32 ABI.  These old frontends do not specify the
ABI and when used with a 64-bit backend do not work.

On x86, default to the X86_32 ABI if one is not specified.  Backends
on ARM continue to default to their NATIVE ABI.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Roger Pau Monné <roger.pau@citrix.com>
---
 drivers/block/xen-blkback/common.h | 9 +++++++++
 drivers/block/xen-blkback/xenbus.c | 4 ++--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index f65b807..78b0411 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -214,6 +214,15 @@ enum blkif_protocol {
 	BLKIF_PROTOCOL_X86_64 = 3,
 };
 
+/*
+ * Default protocol if the frontend doesn't specify one.
+ */
+#ifdef CONFIG_X86
+#  define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32
+#else
+#  define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE
+#endif
+
 struct xen_vbd {
 	/* What the domain refers to this vbd as. */
 	blkif_vdev_t		handle;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 630a489..e3afe97 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -868,11 +868,11 @@ static int connect_ring(struct backend_info *be)
 		return err;
 	}
 
-	be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+	be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
 	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
 			    "%63s", protocol, NULL);
 	if (err)
-		strcpy(protocol, "unspecified, assuming native");
+		strcpy(protocol, "unspecified, assuming default");
 	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
 		be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
 	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
-- 
cgit v1.1