diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-12 14:30:53 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-12 14:30:53 -0800 |
commit | 8494bcf5b7c4b2416687e233dd34d4c6b6fe5653 (patch) | |
tree | cebb468e170e639ecfd61ddc5ebcba86c21105fa | |
parent | 3e12cefbe143b4947171ff92dd50024c4841e291 (diff) | |
parent | b042a3ca949053231950a1b15f31cccca9e305f3 (diff) | |
download | op-kernel-dev-8494bcf5b7c4b2416687e233dd34d4c6b6fe5653.zip op-kernel-dev-8494bcf5b7c4b2416687e233dd34d4c6b6fe5653.tar.gz |
Merge branch 'for-3.20/drivers' of git://git.kernel.dk/linux-block
Pull block driver changes from Jens Axboe:
"This contains:
- The 4k/partition fixes for brd from Boaz/Matthew.
- A few xen front/back block fixes from David Vrabel and Roger Pau
Monne.
- Floppy changes from Takashi, cleaning the device file creation.
- Switching libata to use the new blk-mq tagging policy, removing
code (and a suboptimal implementation) from libata. This will
throw you a merge conflict, since a bug in the original libata
tagging code was fixed since this code was branched. Trivial.
From Shaohua.
- Conversion of loop to blk-mq, from Ming Lei.
- Cleanup of the io_schedule() handling in bsg from Peter Zijlstra.
He claims it improves on unreadable code, which will cost him a
beer.
- Maintainer update or NDB, now handled by Markus Pargmann.
- NVMe:
- Optimization from me that avoids a kmalloc/kfree per IO for
smaller (<= 8KB) IO. This cuts about 1% of high IOPS CPU
overhead.
- Removal of (now) dead RCU code, a relic from before NVMe was
converted to blk-mq"
* 'for-3.20/drivers' of git://git.kernel.dk/linux-block:
xen-blkback: default to X86_32 ABI on x86
xen-blkfront: fix accounting of reqs when migrating
xen-blkback,xen-blkfront: add myself as maintainer
block: Simplify bsg complete all
floppy: Avoid manual call of device_create_file()
NVMe: avoid kmalloc/kfree for smaller IO
MAINTAINERS: Update NBD maintainer
libata: make sata_sil24 use fifo tag allocator
libata: move sas ata tag allocation to libata-scsi.c
libata: use blk taging
NVMe: within nvme_free_queues(), delete RCU sychro/deferred free
null_blk: suppress invalid partition info
brd: Request from fdisk 4k alignment
brd: Fix all partitions BUGs
axonram: Fix bug in direct_access
loop: add blk-mq.h include
block: loop: don't handle REQ_FUA explicitly
block: loop: introduce lo_discard() and lo_req_flush()
block: loop: say goodby to bio
block: loop: improve performance via blk-mq
-rw-r--r-- | MAINTAINERS | 4 | ||||
-rw-r--r-- | arch/powerpc/sysdev/axonram.c | 2 | ||||
-rw-r--r-- | block/bsg.c | 72 | ||||
-rw-r--r-- | drivers/ata/libata-core.c | 70 | ||||
-rw-r--r-- | drivers/ata/libata-scsi.c | 33 | ||||
-rw-r--r-- | drivers/ata/libata.h | 4 | ||||
-rw-r--r-- | drivers/ata/sata_sil24.c | 1 | ||||
-rw-r--r-- | drivers/block/brd.c | 109 | ||||
-rw-r--r-- | drivers/block/floppy.c | 17 | ||||
-rw-r--r-- | drivers/block/loop.c | 416 | ||||
-rw-r--r-- | drivers/block/loop.h | 18 | ||||
-rw-r--r-- | drivers/block/null_blk.c | 2 | ||||
-rw-r--r-- | drivers/block/nvme-core.c | 128 | ||||
-rw-r--r-- | drivers/block/xen-blkback/common.h | 9 | ||||
-rw-r--r-- | drivers/block/xen-blkback/xenbus.c | 4 | ||||
-rw-r--r-- | drivers/block/xen-blkfront.c | 4 | ||||
-rw-r--r-- | include/linux/libata.h | 5 | ||||
-rw-r--r-- | include/linux/nvme.h | 3 | ||||
-rw-r--r-- | include/linux/wait.h | 15 |
19 files changed, 476 insertions, 440 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 2299965..482bfc5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6642,9 +6642,10 @@ F: include/uapi/linux/netrom.h F: net/netrom/ NETWORK BLOCK DEVICE (NBD) -M: Paul Clements <Paul.Clements@steeleye.com> +M: Markus Pargmann <mpa@pengutronix.de> S: Maintained L: nbd-general@lists.sourceforge.net +T: git git://git.pengutronix.de/git/mpa/linux-nbd.git F: Documentation/blockdev/nbd.txt F: drivers/block/nbd.c F: include/linux/nbd.h @@ -10690,6 +10691,7 @@ F: drivers/pci/*xen* XEN BLOCK SUBSYSTEM M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +M: Roger Pau Monné <roger.pau@citrix.com> L: xen-devel@lists.xenproject.org (moderated for non-subscribers) S: Supported F: drivers/block/xen-blkback/* diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index 20f8afe..ee90db1 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -147,7 +147,7 @@ axon_ram_direct_access(struct block_device *device, sector_t sector, loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; *kaddr = (void *)(bank->ph_addr + offset); - *pfn = virt_to_phys(kaddr) >> PAGE_SHIFT; + *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT; return bank->size - offset; } diff --git a/block/bsg.c b/block/bsg.c index 276e869..d214e92 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -136,42 +136,6 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index) return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)]; } -static int bsg_io_schedule(struct bsg_device *bd) -{ - DEFINE_WAIT(wait); - int ret = 0; - - spin_lock_irq(&bd->lock); - - BUG_ON(bd->done_cmds > bd->queued_cmds); - - /* - * -ENOSPC or -ENODATA? I'm going for -ENODATA, meaning "I have no - * work to do", even though we return -ENOSPC after this same test - * during bsg_write() -- there, it means our buffer can't have more - * bsg_commands added to it, thus has no space left. - */ - if (bd->done_cmds == bd->queued_cmds) { - ret = -ENODATA; - goto unlock; - } - - if (!test_bit(BSG_F_BLOCK, &bd->flags)) { - ret = -EAGAIN; - goto unlock; - } - - prepare_to_wait(&bd->wq_done, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&bd->lock); - io_schedule(); - finish_wait(&bd->wq_done, &wait); - - return ret; -unlock: - spin_unlock_irq(&bd->lock); - return ret; -} - static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, struct sg_io_v4 *hdr, struct bsg_device *bd, fmode_t has_write_perm) @@ -482,6 +446,30 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, return ret; } +static bool bsg_complete(struct bsg_device *bd) +{ + bool ret = false; + bool spin; + + do { + spin_lock_irq(&bd->lock); + + BUG_ON(bd->done_cmds > bd->queued_cmds); + + /* + * All commands consumed. + */ + if (bd->done_cmds == bd->queued_cmds) + ret = true; + + spin = !test_bit(BSG_F_BLOCK, &bd->flags); + + spin_unlock_irq(&bd->lock); + } while (!ret && spin); + + return ret; +} + static int bsg_complete_all_commands(struct bsg_device *bd) { struct bsg_command *bc; @@ -492,17 +480,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd) /* * wait for all commands to complete */ - ret = 0; - do { - ret = bsg_io_schedule(bd); - /* - * look for -ENODATA specifically -- we'll sometimes get - * -ERESTARTSYS when we've taken a signal, but we can't - * return until we're done freeing the queue, so ignore - * it. The signal will get handled when we're done freeing - * the bsg_device. - */ - } while (ret != -ENODATA); + io_wait_event(bd->wq_done, bsg_complete(bd)); /* * discard done commands diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 4b0d5e7..4c35f08 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -1585,8 +1585,6 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, else tag = 0; - if (test_and_set_bit(tag, &ap->qc_allocated)) - BUG(); qc = __ata_qc_from_tag(ap, tag); qc->tag = tag; @@ -4722,69 +4720,36 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words) } /** - * ata_qc_new - Request an available ATA command, for queueing - * @ap: target port - * - * Some ATA host controllers may implement a queue depth which is less - * than ATA_MAX_QUEUE. So we shouldn't allocate a tag which is beyond - * the hardware limitation. + * ata_qc_new_init - Request an available ATA command, and initialize it + * @dev: Device from whom we request an available command structure * * LOCKING: * None. */ -static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) +struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag) { - struct ata_queued_cmd *qc = NULL; - unsigned int max_queue = ap->host->n_tags; - unsigned int i, tag; + struct ata_port *ap = dev->link->ap; + struct ata_queued_cmd *qc; /* no command while frozen */ if (unlikely(ap->pflags & ATA_PFLAG_FROZEN)) return NULL; - for (i = 0, tag = ap->last_tag + 1; i < max_queue; i++, tag++) { - if (ap->flags & ATA_FLAG_LOWTAG) - tag = i; - else - tag = tag < max_queue ? tag : 0; - - /* the last tag is reserved for internal command. */ - if (tag == ATA_TAG_INTERNAL) - continue; - - if (!test_and_set_bit(tag, &ap->qc_allocated)) { - qc = __ata_qc_from_tag(ap, tag); - qc->tag = tag; - ap->last_tag = tag; - break; - } + /* libsas case */ + if (!ap->scsi_host) { + tag = ata_sas_allocate_tag(ap); + if (tag < 0) + return NULL; } - return qc; -} - -/** - * ata_qc_new_init - Request an available ATA command, and initialize it - * @dev: Device from whom we request an available command structure - * - * LOCKING: - * None. - */ - -struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev) -{ - struct ata_port *ap = dev->link->ap; - struct ata_queued_cmd *qc; - - qc = ata_qc_new(ap); - if (qc) { - qc->scsicmd = NULL; - qc->ap = ap; - qc->dev = dev; + qc = __ata_qc_from_tag(ap, tag); + qc->tag = tag; + qc->scsicmd = NULL; + qc->ap = ap; + qc->dev = dev; - ata_qc_reinit(qc); - } + ata_qc_reinit(qc); return qc; } @@ -4811,7 +4776,8 @@ void ata_qc_free(struct ata_queued_cmd *qc) tag = qc->tag; if (likely(ata_tag_valid(tag))) { qc->tag = ATA_TAG_POISON; - clear_bit(tag, &ap->qc_allocated); + if (!ap->scsi_host) + ata_sas_free_tag(tag, ap); } } diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 2807293..b061ba2 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -756,7 +756,7 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev, { struct ata_queued_cmd *qc; - qc = ata_qc_new_init(dev); + qc = ata_qc_new_init(dev, cmd->request->tag); if (qc) { qc->scsicmd = cmd; qc->scsidone = cmd->scsi_done; @@ -3668,6 +3668,9 @@ int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht) */ shost->max_host_blocked = 1; + if (scsi_init_shared_tag_map(shost, host->n_tags)) + goto err_add; + rc = scsi_add_host_with_dma(ap->scsi_host, &ap->tdev, ap->host->dev); if (rc) @@ -4230,3 +4233,31 @@ int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap) return rc; } EXPORT_SYMBOL_GPL(ata_sas_queuecmd); + +int ata_sas_allocate_tag(struct ata_port *ap) +{ + unsigned int max_queue = ap->host->n_tags; + unsigned int i, tag; + + for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) { + if (ap->flags & ATA_FLAG_LOWTAG) + tag = 1; + else + tag = tag < max_queue ? tag : 0; + + /* the last tag is reserved for internal command. */ + if (tag == ATA_TAG_INTERNAL) + continue; + + if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) { + ap->sas_last_tag = tag; + return tag; + } + } + return -1; +} + +void ata_sas_free_tag(unsigned int tag, struct ata_port *ap) +{ + clear_bit(tag, &ap->sas_tag_allocated); +} diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 82ebe26..f840ca1 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -63,7 +63,7 @@ extern struct ata_link *ata_dev_phys_link(struct ata_device *dev); extern void ata_force_cbl(struct ata_port *ap); extern u64 ata_tf_to_lba(const struct ata_taskfile *tf); extern u64 ata_tf_to_lba48(const struct ata_taskfile *tf); -extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev); +extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag); extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev, u64 block, u32 n_block, unsigned int tf_flags, unsigned int tag); @@ -144,6 +144,8 @@ extern void ata_scsi_dev_rescan(struct work_struct *work); extern int ata_bus_probe(struct ata_port *ap); extern int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel, unsigned int id, u64 lun); +int ata_sas_allocate_tag(struct ata_port *ap); +void ata_sas_free_tag(unsigned int tag, struct ata_port *ap); /* libata-eh.c */ diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c index ea65594..ba2667f 100644 --- a/drivers/ata/sata_sil24.c +++ b/drivers/ata/sata_sil24.c @@ -388,6 +388,7 @@ static struct scsi_host_template sil24_sht = { .can_queue = SIL24_MAX_CMDS, .sg_tablesize = SIL24_MAX_SGE, .dma_boundary = ATA_DMA_BOUNDARY, + .tag_alloc_policy = BLK_TAG_ALLOC_FIFO, }; static struct ata_port_operations sil24_ops = { diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 89e90ec..c01b921 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -438,19 +438,18 @@ static const struct block_device_operations brd_fops = { /* * And now the modules code and kernel interface. */ -static int rd_nr; -int rd_size = CONFIG_BLK_DEV_RAM_SIZE; -static int max_part; -static int part_shift; -static int part_show = 0; +static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT; module_param(rd_nr, int, S_IRUGO); MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); + +int rd_size = CONFIG_BLK_DEV_RAM_SIZE; module_param(rd_size, int, S_IRUGO); MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); + +static int max_part = 1; module_param(max_part, int, S_IRUGO); -MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk"); -module_param(part_show, int, S_IRUGO); -MODULE_PARM_DESC(part_show, "Control RAM disk visibility in /proc/partitions"); +MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); + MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); MODULE_ALIAS("rd"); @@ -487,25 +486,33 @@ static struct brd_device *brd_alloc(int i) brd->brd_queue = blk_alloc_queue(GFP_KERNEL); if (!brd->brd_queue) goto out_free_dev; + blk_queue_make_request(brd->brd_queue, brd_make_request); blk_queue_max_hw_sectors(brd->brd_queue, 1024); blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); + /* This is so fdisk will align partitions on 4k, because of + * direct_access API needing 4k alignment, returning a PFN + * (This is only a problem on very small devices <= 4M, + * otherwise fdisk will align on 1M. Regardless this call + * is harmless) + */ + blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE); + brd->brd_queue->limits.discard_granularity = PAGE_SIZE; brd->brd_queue->limits.max_discard_sectors = UINT_MAX; brd->brd_queue->limits.discard_zeroes_data = 1; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue); - disk = brd->brd_disk = alloc_disk(1 << part_shift); + disk = brd->brd_disk = alloc_disk(max_part); if (!disk) goto out_free_queue; disk->major = RAMDISK_MAJOR; - disk->first_minor = i << part_shift; + disk->first_minor = i * max_part; disk->fops = &brd_fops; disk->private_data = brd; disk->queue = brd->brd_queue; - if (!part_show) - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "ram%d", i); set_capacity(disk, rd_size * 2); @@ -527,10 +534,11 @@ static void brd_free(struct brd_device *brd) kfree(brd); } -static struct brd_device *brd_init_one(int i) +static struct brd_device *brd_init_one(int i, bool *new) { struct brd_device *brd; + *new = false; list_for_each_entry(brd, &brd_devices, brd_list) { if (brd->brd_number == i) goto out; @@ -541,6 +549,7 @@ static struct brd_device *brd_init_one(int i) add_disk(brd->brd_disk); list_add_tail(&brd->brd_list, &brd_devices); } + *new = true; out: return brd; } @@ -556,70 +565,46 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data) { struct brd_device *brd; struct kobject *kobj; + bool new; mutex_lock(&brd_devices_mutex); - brd = brd_init_one(MINOR(dev) >> part_shift); + brd = brd_init_one(MINOR(dev) / max_part, &new); kobj = brd ? get_disk(brd->brd_disk) : NULL; mutex_unlock(&brd_devices_mutex); - *part = 0; + if (new) + *part = 0; + return kobj; } static int __init brd_init(void) { - int i, nr; - unsigned long range; struct brd_device *brd, *next; + int i; /* * brd module now has a feature to instantiate underlying device * structure on-demand, provided that there is an access dev node. - * However, this will not work well with user space tool that doesn't - * know about such "feature". In order to not break any existing - * tool, we do the following: * - * (1) if rd_nr is specified, create that many upfront, and this - * also becomes a hard limit. - * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT - * (default 16) rd device on module load, user can further - * extend brd device by create dev node themselves and have - * kernel automatically instantiate actual device on-demand. + * (1) if rd_nr is specified, create that many upfront. else + * it defaults to CONFIG_BLK_DEV_RAM_COUNT + * (2) User can further extend brd devices by create dev node themselves + * and have kernel automatically instantiate actual device + * on-demand. Example: + * mknod /path/devnod_name b 1 X # 1 is the rd major + * fdisk -l /path/devnod_name + * If (X / max_part) was not already created it will be created + * dynamically. */ - part_shift = 0; - if (max_part > 0) { - part_shift = fls(max_part); - - /* - * Adjust max_part according to part_shift as it is exported - * to user space so that user can decide correct minor number - * if [s]he want to create more devices. - * - * Note that -1 is required because partition 0 is reserved - * for the whole disk. - */ - max_part = (1UL << part_shift) - 1; - } - - if ((1UL << part_shift) > DISK_MAX_PARTS) - return -EINVAL; - - if (rd_nr > 1UL << (MINORBITS - part_shift)) - return -EINVAL; - - if (rd_nr) { - nr = rd_nr; - range = rd_nr << part_shift; - } else { - nr = CONFIG_BLK_DEV_RAM_COUNT; - range = 1UL << MINORBITS; - } - if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) return -EIO; - for (i = 0; i < nr; i++) { + if (unlikely(!max_part)) + max_part = 1; + + for (i = 0; i < rd_nr; i++) { brd = brd_alloc(i); if (!brd) goto out_free; @@ -631,10 +616,10 @@ static int __init brd_init(void) list_for_each_entry(brd, &brd_devices, brd_list) add_disk(brd->brd_disk); - blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range, + blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS, THIS_MODULE, brd_probe, NULL, NULL); - printk(KERN_INFO "brd: module loaded\n"); + pr_info("brd: module loaded\n"); return 0; out_free: @@ -644,21 +629,21 @@ out_free: } unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); + pr_info("brd: module NOT loaded !!!\n"); return -ENOMEM; } static void __exit brd_exit(void) { - unsigned long range; struct brd_device *brd, *next; - range = rd_nr ? rd_nr << part_shift : 1UL << MINORBITS; - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) brd_del_one(brd); - blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range); + blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS); unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); + + pr_info("brd: module unloaded\n"); } module_init(brd_init); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 56d46ff..a08cda9 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4112,6 +4112,13 @@ static ssize_t floppy_cmos_show(struct device *dev, static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL); +static struct attribute *floppy_dev_attrs[] = { + &dev_attr_cmos.attr, + NULL +}; + +ATTRIBUTE_GROUPS(floppy_dev); + static void floppy_device_release(struct device *dev) { } @@ -4324,16 +4331,12 @@ static int __init do_floppy_init(void) floppy_device[drive].name = floppy_device_name; floppy_device[drive].id = drive; floppy_device[drive].dev.release = floppy_device_release; + floppy_device[drive].dev.groups = floppy_dev_groups; err = platform_device_register(&floppy_device[drive]); if (err) goto out_remove_drives; - err = device_create_file(&floppy_device[drive].dev, - &dev_attr_cmos); - if (err) - goto out_unreg_platform_dev; - /* to be cleaned up... */ disks[drive]->private_data = (void *)(long)drive; disks[drive]->flags |= GENHD_FL_REMOVABLE; @@ -4343,13 +4346,10 @@ static int __init do_floppy_init(void) return 0; -out_unreg_platform_dev: - platform_device_unregister(&floppy_device[drive]); out_remove_drives: while (drive--) { if (floppy_available(drive)) { del_gendisk(disks[drive]); - device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos); platform_device_unregister(&floppy_device[drive]); } } @@ -4594,7 +4594,6 @@ static void __exit floppy_module_exit(void) if (floppy_available(drive)) { del_gendisk(disks[drive]); - device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos); platform_device_unregister(&floppy_device[drive]); } blk_cleanup_queue(disks[drive]->queue); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6cb1beb..d1f168b 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -85,6 +85,8 @@ static DEFINE_MUTEX(loop_index_mutex); static int max_part; static int part_shift; +static struct workqueue_struct *loop_wq; + /* * Transfer functions */ @@ -284,12 +286,12 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec, return ret; } -static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) +static int lo_send(struct loop_device *lo, struct request *rq, loff_t pos) { int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t, struct page *page); struct bio_vec bvec; - struct bvec_iter iter; + struct req_iterator iter; struct page *page = NULL; int ret = 0; @@ -303,7 +305,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) do_lo_send = do_lo_send_direct_write; } - bio_for_each_segment(bvec, bio, iter) { + rq_for_each_segment(bvec, rq, iter) { ret = do_lo_send(lo, &bvec, pos, page); if (ret < 0) break; @@ -391,19 +393,22 @@ do_lo_receive(struct loop_device *lo, } static int -lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) +lo_receive(struct loop_device *lo, struct request *rq, int bsize, loff_t pos) { struct bio_vec bvec; - struct bvec_iter iter; + struct req_iterator iter; ssize_t s; - bio_for_each_segment(bvec, bio, iter) { + rq_for_each_segment(bvec, rq, iter) { s = do_lo_receive(lo, &bvec, bsize, pos); if (s < 0) return s; if (s != bvec.bv_len) { - zero_fill_bio(bio); + struct bio *bio; + + __rq_for_each_bio(bio, rq) + zero_fill_bio(bio); break; } pos += bvec.bv_len; @@ -411,106 +416,58 @@ lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) return 0; } -static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) +static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos) { - loff_t pos; + /* + * We use punch hole to reclaim the free space used by the + * image a.k.a. discard. However we do not support discard if + * encryption is enabled, because it may give an attacker + * useful information. + */ + struct file *file = lo->lo_backing_file; + int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; int ret; - pos = ((loff_t) bio->bi_iter.bi_sector << 9) + lo->lo_offset; - - if (bio_rw(bio) == WRITE) { - struct file *file = lo->lo_backing_file; - - if (bio->bi_rw & REQ_FLUSH) { - ret = vfs_fsync(file, 0); - if (unlikely(ret && ret != -EINVAL)) { - ret = -EIO; - goto out; - } - } - - /* - * We use punch hole to reclaim the free space used by the - * image a.k.a. discard. However we do not support discard if - * encryption is enabled, because it may give an attacker - * useful information. - */ - if (bio->bi_rw & REQ_DISCARD) { - struct file *file = lo->lo_backing_file; - int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; - - if ((!file->f_op->fallocate) || - lo->lo_encrypt_key_size) { - ret = -EOPNOTSUPP; - goto out; - } - ret = file->f_op->fallocate(file, mode, pos, - bio->bi_iter.bi_size); - if (unlikely(ret && ret != -EINVAL && - ret != -EOPNOTSUPP)) - ret = -EIO; - goto out; - } - - ret = lo_send(lo, bio, pos); - - if ((bio->bi_rw & REQ_FUA) && !ret) { - ret = vfs_fsync(file, 0); - if (unlikely(ret && ret != -EINVAL)) - ret = -EIO; - } - } else - ret = lo_receive(lo, bio, lo->lo_blocksize, pos); + if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) { + ret = -EOPNOTSUPP; + goto out; + } -out: + ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); + if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) + ret = -EIO; + out: return ret; } -/* - * Add bio to back of pending list - */ -static void loop_add_bio(struct loop_device *lo, struct bio *bio) +static int lo_req_flush(struct loop_device *lo, struct request *rq) { - lo->lo_bio_count++; - bio_list_add(&lo->lo_bio_list, bio); -} + struct file *file = lo->lo_backing_file; + int ret = vfs_fsync(file, 0); + if (unlikely(ret && ret != -EINVAL)) + ret = -EIO; -/* - * Grab first pending buffer - */ -static struct bio *loop_get_bio(struct loop_device *lo) -{ - lo->lo_bio_count--; - return bio_list_pop(&lo->lo_bio_list); + return ret; } -static void loop_make_request(struct request_queue *q, struct bio *old_bio) +static int do_req_filebacked(struct loop_device *lo, struct request *rq) { - struct loop_device *lo = q->queuedata; - int rw = bio_rw(old_bio); - - if (rw == READA) - rw = READ; + loff_t pos; + int ret; - BUG_ON(!lo || (rw != READ && rw != WRITE)); + pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; - spin_lock_irq(&lo->lo_lock); - if (lo->lo_state != Lo_bound) - goto out; - if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) - goto out; - if (lo->lo_bio_count >= q->nr_congestion_on) - wait_event_lock_irq(lo->lo_req_wait, - lo->lo_bio_count < q->nr_congestion_off, - lo->lo_lock); - loop_add_bio(lo, old_bio); - wake_up(&lo->lo_event); - spin_unlock_irq(&lo->lo_lock); - return; + if (rq->cmd_flags & REQ_WRITE) { + if (rq->cmd_flags & REQ_FLUSH) + ret = lo_req_flush(lo, rq); + else if (rq->cmd_flags & REQ_DISCARD) + ret = lo_discard(lo, rq, pos); + else + ret = lo_send(lo, rq, pos); + } else + ret = lo_receive(lo, rq, lo->lo_blocksize, pos); -out: - spin_unlock_irq(&lo->lo_lock); - bio_io_error(old_bio); + return ret; } struct switch_request { @@ -518,57 +475,26 @@ struct switch_request { struct completion wait; }; -static void do_loop_switch(struct loop_device *, struct switch_request *); - -static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio) -{ - if (unlikely(!bio->bi_bdev)) { - do_loop_switch(lo, bio->bi_private); - bio_put(bio); - } else { - int ret = do_bio_filebacked(lo, bio); - bio_endio(bio, ret); - } -} - /* - * worker thread that handles reads/writes to file backed loop devices, - * to avoid blocking in our make_request_fn. it also does loop decrypting - * on reads for block backed loop, as that is too heavy to do from - * b_end_io context where irqs may be disabled. - * - * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before - * calling kthread_stop(). Therefore once kthread_should_stop() is - * true, make_request will not place any more requests. Therefore - * once kthread_should_stop() is true and lo_bio is NULL, we are - * done with the loop. + * Do the actual switch; called from the BIO completion routine */ -static int loop_thread(void *data) +static void do_loop_switch(struct loop_device *lo, struct switch_request *p) { - struct loop_device *lo = data; - struct bio *bio; - - set_user_nice(current, MIN_NICE); - - while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) { - - wait_event_interruptible(lo->lo_event, - !bio_list_empty(&lo->lo_bio_list) || - kthread_should_stop()); - - if (bio_list_empty(&lo->lo_bio_list)) - continue; - spin_lock_irq(&lo->lo_lock); - bio = loop_get_bio(lo); - if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off) - wake_up(&lo->lo_req_wait); - spin_unlock_irq(&lo->lo_lock); + struct file *file = p->file; + struct file *old_file = lo->lo_backing_file; + struct address_space *mapping; - BUG_ON(!bio); - loop_handle_bio(lo, bio); - } + /* if no new file, only flush of queued bios requested */ + if (!file) + return; - return 0; + mapping = file->f_mapping; + mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); + lo->lo_backing_file = file; + lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? + mapping->host->i_bdev->bd_block_size : PAGE_SIZE; + lo->old_gfp_mask = mapping_gfp_mask(mapping); + mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); } /* @@ -579,15 +505,18 @@ static int loop_thread(void *data) static int loop_switch(struct loop_device *lo, struct file *file) { struct switch_request w; - struct bio *bio = bio_alloc(GFP_KERNEL, 0); - if (!bio) - return -ENOMEM; - init_completion(&w.wait); + w.file = file; - bio->bi_private = &w; - bio->bi_bdev = NULL; - loop_make_request(lo->lo_queue, bio); - wait_for_completion(&w.wait); + + /* freeze queue and wait for completion of scheduled requests */ + blk_mq_freeze_queue(lo->lo_queue); + + /* do the switch action */ + do_loop_switch(lo, &w); + + /* unfreeze */ + blk_mq_unfreeze_queue(lo->lo_queue); + return 0; } @@ -596,39 +525,10 @@ static int loop_switch(struct loop_device *lo, struct file *file) */ static int loop_flush(struct loop_device *lo) { - /* loop not yet configured, no running thread, nothing to flush */ - if (!lo->lo_thread) - return 0; - return loop_switch(lo, NULL); } /* - * Do the actual switch; called from the BIO completion routine - */ -static void do_loop_switch(struct loop_device *lo, struct switch_request *p) -{ - struct file *file = p->file; - struct file *old_file = lo->lo_backing_file; - struct address_space *mapping; - - /* if no new file, only flush of queued bios requested */ - if (!file) - goto out; - - mapping = file->f_mapping; - mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); - lo->lo_backing_file = file; - lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? - mapping->host->i_bdev->bd_block_size : PAGE_SIZE; - lo->old_gfp_mask = mapping_gfp_mask(mapping); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); -out: - complete(&p->wait); -} - - -/* * loop_change_fd switched the backing store of a loopback device to * a new file. This is useful for operating system installers to free up * the original file and in High Availability environments to switch to @@ -889,12 +789,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->transfer = transfer_none; lo->ioctl = NULL; lo->lo_sizelimit = 0; - lo->lo_bio_count = 0; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - bio_list_init(&lo->lo_bio_list); - if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_flush(lo->lo_queue, REQ_FLUSH); @@ -906,14 +803,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, set_blocksize(bdev, lo_blocksize); - lo->lo_thread = kthread_create(loop_thread, lo, "loop%d", - lo->lo_number); - if (IS_ERR(lo->lo_thread)) { - error = PTR_ERR(lo->lo_thread); - goto out_clr; - } lo->lo_state = Lo_bound; - wake_up_process(lo->lo_thread); if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; if (lo->lo_flags & LO_FLAGS_PARTSCAN) @@ -925,18 +815,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, bdgrab(bdev); return 0; -out_clr: - loop_sysfs_exit(lo); - lo->lo_thread = NULL; - lo->lo_device = NULL; - lo->lo_backing_file = NULL; - lo->lo_flags = 0; - set_capacity(lo->lo_disk, 0); - invalidate_bdev(bdev); - bd_set_size(bdev, 0); - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask); - lo->lo_state = Lo_unbound; out_putf: fput(file); out: @@ -1012,11 +890,6 @@ static int loop_clr_fd(struct loop_device *lo) spin_lock_irq(&lo->lo_lock); lo->lo_state = Lo_rundown; - spin_unlock_irq(&lo->lo_lock); - - kthread_stop(lo->lo_thread); - - spin_lock_irq(&lo->lo_lock); lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); @@ -1028,7 +901,6 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_offset = 0; lo->lo_sizelimit = 0; lo->lo_encrypt_key_size = 0; - lo->lo_thread = NULL; memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); @@ -1601,6 +1473,105 @@ int loop_unregister_transfer(int number) EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); +static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + + blk_mq_start_request(bd->rq); + + if (cmd->rq->cmd_flags & REQ_WRITE) { + struct loop_device *lo = cmd->rq->q->queuedata; + bool need_sched = true; + + spin_lock_irq(&lo->lo_lock); + if (lo->write_started) + need_sched = false; + else + lo->write_started = true; + list_add_tail(&cmd->list, &lo->write_cmd_head); + spin_unlock_irq(&lo->lo_lock); + + if (need_sched) + queue_work(loop_wq, &lo->write_work); + } else { + queue_work(loop_wq, &cmd->read_work); + } + + return BLK_MQ_RQ_QUEUE_OK; +} + +static void loop_handle_cmd(struct loop_cmd *cmd) +{ + const bool write = cmd->rq->cmd_flags & REQ_WRITE; + struct loop_device *lo = cmd->rq->q->queuedata; + int ret = -EIO; + + if (lo->lo_state != Lo_bound) + goto failed; + + if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) + goto failed; + + ret = do_req_filebacked(lo, cmd->rq); + + failed: + if (ret) + cmd->rq->errors = -EIO; + blk_mq_complete_request(cmd->rq); +} + +static void loop_queue_write_work(struct work_struct *work) +{ + struct loop_device *lo = + container_of(work, struct loop_device, write_work); + LIST_HEAD(cmd_list); + + spin_lock_irq(&lo->lo_lock); + repeat: + list_splice_init(&lo->write_cmd_head, &cmd_list); + spin_unlock_irq(&lo->lo_lock); + + while (!list_empty(&cmd_list)) { + struct loop_cmd *cmd = list_first_entry(&cmd_list, + struct loop_cmd, list); + list_del_init(&cmd->list); + loop_handle_cmd(cmd); + } + + spin_lock_irq(&lo->lo_lock); + if (!list_empty(&lo->write_cmd_head)) + goto repeat; + lo->write_started = false; + spin_unlock_irq(&lo->lo_lock); +} + +static void loop_queue_read_work(struct work_struct *work) +{ + struct loop_cmd *cmd = + container_of(work, struct loop_cmd, read_work); + + loop_handle_cmd(cmd); +} + +static int loop_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); + + cmd->rq = rq; + INIT_WORK(&cmd->read_work, loop_queue_read_work); + + return 0; +} + +static struct blk_mq_ops loop_mq_ops = { + .queue_rq = loop_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = loop_init_request, +}; + static int loop_add(struct loop_device **l, int i) { struct loop_device *lo; @@ -1627,16 +1598,28 @@ static int loop_add(struct loop_device **l, int i) i = err; err = -ENOMEM; - lo->lo_queue = blk_alloc_queue(GFP_KERNEL); - if (!lo->lo_queue) + lo->tag_set.ops = &loop_mq_ops; + lo->tag_set.nr_hw_queues = 1; + lo->tag_set.queue_depth = 128; + lo->tag_set.numa_node = NUMA_NO_NODE; + lo->tag_set.cmd_size = sizeof(struct loop_cmd); + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + lo->tag_set.driver_data = lo; + + err = blk_mq_alloc_tag_set(&lo->tag_set); + if (err) goto out_free_idr; - /* - * set queue make_request_fn - */ - blk_queue_make_request(lo->lo_queue, loop_make_request); + lo->lo_queue = blk_mq_init_queue(&lo->tag_set); + if (IS_ERR_OR_NULL(lo->lo_queue)) { + err = PTR_ERR(lo->lo_queue); + goto out_cleanup_tags; + } lo->lo_queue->queuedata = lo; + INIT_LIST_HEAD(&lo->write_cmd_head); + INIT_WORK(&lo->write_work, loop_queue_write_work); + disk = lo->lo_disk = alloc_disk(1 << part_shift); if (!disk) goto out_free_queue; @@ -1664,9 +1647,6 @@ static int loop_add(struct loop_device **l, int i) disk->flags |= GENHD_FL_EXT_DEVT; mutex_init(&lo->lo_ctl_mutex); lo->lo_number = i; - lo->lo_thread = NULL; - init_waitqueue_head(&lo->lo_event); - init_waitqueue_head(&lo->lo_req_wait); spin_lock_init(&lo->lo_lock); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; @@ -1680,6 +1660,8 @@ static int loop_add(struct loop_device **l, int i) out_free_queue: blk_cleanup_queue(lo->lo_queue); +out_cleanup_tags: + blk_mq_free_tag_set(&lo->tag_set); out_free_idr: idr_remove(&loop_index_idr, i); out_free_dev: @@ -1692,6 +1674,7 @@ static void loop_remove(struct loop_device *lo) { del_gendisk(lo->lo_disk); blk_cleanup_queue(lo->lo_queue); + blk_mq_free_tag_set(&lo->tag_set); put_disk(lo->lo_disk); kfree(lo); } @@ -1875,6 +1858,13 @@ static int __init loop_init(void) goto misc_out; } + loop_wq = alloc_workqueue("kloopd", + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0); + if (!loop_wq) { + err = -ENOMEM; + goto misc_out; + } + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, THIS_MODULE, loop_probe, NULL, NULL); @@ -1912,6 +1902,8 @@ static void __exit loop_exit(void) blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); unregister_blkdev(LOOP_MAJOR, "loop"); + destroy_workqueue(loop_wq); + misc_deregister(&loop_misc); } diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 90df5d6..301c27f 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -11,8 +11,10 @@ #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/spinlock.h> #include <linux/mutex.h> +#include <linux/workqueue.h> #include <uapi/linux/loop.h> /* Possible states of device */ @@ -52,19 +54,23 @@ struct loop_device { gfp_t old_gfp_mask; spinlock_t lo_lock; - struct bio_list lo_bio_list; - unsigned int lo_bio_count; + struct list_head write_cmd_head; + struct work_struct write_work; + bool write_started; int lo_state; struct mutex lo_ctl_mutex; - struct task_struct *lo_thread; - wait_queue_head_t lo_event; - /* wait queue for incoming requests */ - wait_queue_head_t lo_req_wait; struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; struct gendisk *lo_disk; }; +struct loop_cmd { + struct work_struct read_work; + struct request *rq; + struct list_head list; +}; + /* Support for loadable transfer modules */ struct loop_func_table { int number; /* filter type */ diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index aa2224a..65cd61a 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -579,7 +579,7 @@ static int null_add_dev(void) sector_div(size, bs); set_capacity(disk, size); - disk->flags |= GENHD_FL_EXT_DEVT; + disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; disk->major = null_major; disk->first_minor = nullb->index; disk->fops = &null_fops; diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index d826bf3..cbdfbbf 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -144,8 +144,37 @@ struct nvme_cmd_info { void *ctx; int aborted; struct nvme_queue *nvmeq; + struct nvme_iod iod[0]; }; +/* + * Max size of iod being embedded in the request payload + */ +#define NVME_INT_PAGES 2 +#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size) + +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. + */ +static int nvme_npages(unsigned size, struct nvme_dev *dev) +{ + unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); + return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); +} + +static unsigned int nvme_cmd_size(struct nvme_dev *dev) +{ + unsigned int ret = sizeof(struct nvme_cmd_info); + + ret += sizeof(struct nvme_iod); + ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev); + ret += sizeof(struct scatterlist) * NVME_INT_PAGES; + + return ret; +} + static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -218,6 +247,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, blk_mq_start_request(blk_mq_rq_from_pdu(cmd)); } +static void *iod_get_private(struct nvme_iod *iod) +{ + return (void *) (iod->private & ~0x1UL); +} + +/* + * If bit 0 is set, the iod is embedded in the request payload. + */ +static bool iod_should_kfree(struct nvme_iod *iod) +{ + return (iod->private & 0x01) == 0; +} + /* Special values must be less than 0x1000 */ #define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) @@ -361,35 +403,53 @@ static __le64 **iod_list(struct nvme_iod *iod) return ((void *)iod) + iod->offset; } -/* - * Will slightly overestimate the number of pages needed. This is OK - * as it only leads to a small amount of wasted memory for the lifetime of - * the I/O. - */ -static int nvme_npages(unsigned size, struct nvme_dev *dev) +static inline void iod_init(struct nvme_iod *iod, unsigned nbytes, + unsigned nseg, unsigned long private) { - unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); - return DIV_ROUND_UP(8 * nprps, dev->page_size - 8); + iod->private = private; + iod->offset = offsetof(struct nvme_iod, sg[nseg]); + iod->npages = -1; + iod->length = nbytes; + iod->nents = 0; } static struct nvme_iod * -nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp) +__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev, + unsigned long priv, gfp_t gfp) { struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(nbytes, dev) + + sizeof(__le64 *) * nvme_npages(bytes, dev) + sizeof(struct scatterlist) * nseg, gfp); - if (iod) { - iod->offset = offsetof(struct nvme_iod, sg[nseg]); - iod->npages = -1; - iod->length = nbytes; - iod->nents = 0; - iod->first_dma = 0ULL; - } + if (iod) + iod_init(iod, bytes, nseg, priv); return iod; } +static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, + gfp_t gfp) +{ + unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) : + sizeof(struct nvme_dsm_range); + unsigned long mask = 0; + struct nvme_iod *iod; + + if (rq->nr_phys_segments <= NVME_INT_PAGES && + size <= NVME_INT_BYTES(dev)) { + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq); + + iod = cmd->iod; + mask = 0x01; + iod_init(iod, size, rq->nr_phys_segments, + (unsigned long) rq | 0x01); + return iod; + } + + return __nvme_alloc_iod(rq->nr_phys_segments, size, dev, + (unsigned long) rq, gfp); +} + void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { const int last_prp = dev->page_size / 8 - 1; @@ -405,7 +465,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); prp_dma = next_prp_dma; } - kfree(iod); + + if (iod_should_kfree(iod)) + kfree(iod); } static int nvme_error_status(u16 status) @@ -424,7 +486,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct nvme_iod *iod = ctx; - struct request *req = iod->private; + struct request *req = iod_get_private(iod); struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); u16 status = le16_to_cpup(&cqe->status) >> 1; @@ -585,7 +647,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, struct nvme_ns *ns) { - struct request *req = iod->private; + struct request *req = iod_get_private(iod); struct nvme_command *cmnd; u16 control = 0; u32 dsmgmt = 0; @@ -626,17 +688,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_iod *iod; - int psegs = req->nr_phys_segments; enum dma_data_direction dma_dir; - unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) : - sizeof(struct nvme_dsm_range); - iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC); + iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC); if (!iod) return BLK_MQ_RQ_QUEUE_BUSY; - iod->private = req; - if (req->cmd_flags & REQ_DISCARD) { void *range; /* @@ -651,10 +708,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, goto retry_cmd; iod_list(iod)[0] = (__le64 *)range; iod->npages = 0; - } else if (psegs) { + } else if (req->nr_phys_segments) { dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; - sg_init_table(iod->sg, psegs); + sg_init_table(iod->sg, req->nr_phys_segments); iod->nents = blk_rq_map_sg(req->q, req, iod->sg); if (!iod->nents) goto error_cmd; @@ -1137,21 +1194,14 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) static void nvme_free_queues(struct nvme_dev *dev, int lowest) { - LLIST_HEAD(q_list); - struct nvme_queue *nvmeq, *next; - struct llist_node *entry; int i; for (i = dev->queue_count - 1; i >= lowest; i--) { struct nvme_queue *nvmeq = dev->queues[i]; - llist_add(&nvmeq->node, &q_list); dev->queue_count--; dev->queues[i] = NULL; - } - synchronize_rcu(); - entry = llist_del_all(&q_list); - llist_for_each_entry_safe(nvmeq, next, entry, node) nvme_free_queue(nvmeq); + } } /** @@ -1408,7 +1458,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev); - dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->admin_tagset.cmd_size = nvme_cmd_size(dev); dev->admin_tagset.driver_data = dev; if (blk_mq_alloc_tag_set(&dev->admin_tagset)) @@ -1522,7 +1572,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, } err = -ENOMEM; - iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL); + iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL); if (!iod) goto put_pages; @@ -2148,7 +2198,7 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev); dev->tagset.queue_depth = min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; - dev->tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->tagset.cmd_size = nvme_cmd_size(dev); dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index cc90a84..375d288 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -214,6 +214,15 @@ enum blkif_protocol { BLKIF_PROTOCOL_X86_64 = 3, }; +/* + * Default protocol if the frontend doesn't specify one. + */ +#ifdef CONFIG_X86 +# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32 +#else +# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE +#endif + struct xen_vbd { /* What the domain refers to this vbd as. */ blkif_vdev_t handle; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 630a489..e3afe97 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -868,11 +868,11 @@ static int connect_ring(struct backend_info *be) return err; } - be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", "%63s", protocol, NULL); if (err) - strcpy(protocol, "unspecified, assuming native"); + strcpy(protocol, "unspecified, assuming default"); else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2236c6f..7f66d2e 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1511,7 +1511,7 @@ static int blkif_recover(struct blkfront_info *info) merge_bio.tail = copy[i].request->biotail; bio_list_merge(&bio_list, &merge_bio); copy[i].request->bio = NULL; - blk_put_request(copy[i].request); + blk_end_request_all(copy[i].request, 0); } kfree(copy); @@ -1534,7 +1534,7 @@ static int blkif_recover(struct blkfront_info *info) req->bio = NULL; if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) pr_alert("diskcache flush request found!\n"); - __blk_put_request(info->rq, req); + __blk_end_request_all(req, 0); } spin_unlock_irq(&info->io_lock); diff --git a/include/linux/libata.h b/include/linux/libata.h index 61df823..fc03efa 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -823,10 +823,10 @@ struct ata_port { unsigned int cbl; /* cable type; ATA_CBL_xxx */ struct ata_queued_cmd qcmd[ATA_MAX_QUEUE]; - unsigned long qc_allocated; + unsigned long sas_tag_allocated; /* for sas tag allocation only */ unsigned int qc_active; int nr_active_links; /* #links with active qcs */ - unsigned int last_tag; /* track next tag hw expects */ + unsigned int sas_last_tag; /* track next tag hw expects */ struct ata_link link; /* host default link */ struct ata_link *slave_link; /* see ata_slave_link_init() */ @@ -1352,6 +1352,7 @@ extern struct device_attribute *ata_common_sdev_attrs[]; .ioctl = ata_scsi_ioctl, \ .queuecommand = ata_scsi_queuecmd, \ .can_queue = ATA_DEF_QUEUE, \ + .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ .this_id = ATA_SHT_THIS_ID, \ .cmd_per_lun = ATA_SHT_CMD_PER_LUN, \ .emulated = ATA_SHT_EMULATED, \ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 258945f..19a5d4b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -132,13 +132,12 @@ struct nvme_ns { * allocated to store the PRP list. */ struct nvme_iod { - void *private; /* For the use of the submitter of the I/O */ + unsigned long private; /* For the use of the submitter of the I/O */ int npages; /* In the PRP list. 0 means small pool in use */ int offset; /* Of PRP list */ int nents; /* Used in scatterlist */ int length; /* Of data, in bytes */ dma_addr_t first_dma; - struct list_head node; struct scatterlist sg[0]; }; diff --git a/include/linux/wait.h b/include/linux/wait.h index 537d58e..2db8334 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -267,6 +267,21 @@ do { \ __wait_event(wq, condition); \ } while (0) +#define __io_wait_event(wq, condition) \ + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + io_schedule()) + +/* + * io_wait_event() -- like wait_event() but with io_schedule() + */ +#define io_wait_event(wq, condition) \ +do { \ + might_sleep(); \ + if (condition) \ + break; \ + __io_wait_event(wq, condition); \ +} while (0) + #define __wait_event_freezable(wq, condition) \ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule(); try_to_freeze()) |