diff options
Diffstat (limited to 'drivers')
54 files changed, 1788 insertions, 1387 deletions
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 4b0d5e7..4c35f08 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -1585,8 +1585,6 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, else tag = 0; - if (test_and_set_bit(tag, &ap->qc_allocated)) - BUG(); qc = __ata_qc_from_tag(ap, tag); qc->tag = tag; @@ -4722,69 +4720,36 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words) } /** - * ata_qc_new - Request an available ATA command, for queueing - * @ap: target port - * - * Some ATA host controllers may implement a queue depth which is less - * than ATA_MAX_QUEUE. So we shouldn't allocate a tag which is beyond - * the hardware limitation. + * ata_qc_new_init - Request an available ATA command, and initialize it + * @dev: Device from whom we request an available command structure * * LOCKING: * None. */ -static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) +struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag) { - struct ata_queued_cmd *qc = NULL; - unsigned int max_queue = ap->host->n_tags; - unsigned int i, tag; + struct ata_port *ap = dev->link->ap; + struct ata_queued_cmd *qc; /* no command while frozen */ if (unlikely(ap->pflags & ATA_PFLAG_FROZEN)) return NULL; - for (i = 0, tag = ap->last_tag + 1; i < max_queue; i++, tag++) { - if (ap->flags & ATA_FLAG_LOWTAG) - tag = i; - else - tag = tag < max_queue ? tag : 0; - - /* the last tag is reserved for internal command. */ - if (tag == ATA_TAG_INTERNAL) - continue; - - if (!test_and_set_bit(tag, &ap->qc_allocated)) { - qc = __ata_qc_from_tag(ap, tag); - qc->tag = tag; - ap->last_tag = tag; - break; - } + /* libsas case */ + if (!ap->scsi_host) { + tag = ata_sas_allocate_tag(ap); + if (tag < 0) + return NULL; } - return qc; -} - -/** - * ata_qc_new_init - Request an available ATA command, and initialize it - * @dev: Device from whom we request an available command structure - * - * LOCKING: - * None. 
- */ - -struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev) -{ - struct ata_port *ap = dev->link->ap; - struct ata_queued_cmd *qc; - - qc = ata_qc_new(ap); - if (qc) { - qc->scsicmd = NULL; - qc->ap = ap; - qc->dev = dev; + qc = __ata_qc_from_tag(ap, tag); + qc->tag = tag; + qc->scsicmd = NULL; + qc->ap = ap; + qc->dev = dev; - ata_qc_reinit(qc); - } + ata_qc_reinit(qc); return qc; } @@ -4811,7 +4776,8 @@ void ata_qc_free(struct ata_queued_cmd *qc) tag = qc->tag; if (likely(ata_tag_valid(tag))) { qc->tag = ATA_TAG_POISON; - clear_bit(tag, &ap->qc_allocated); + if (!ap->scsi_host) + ata_sas_free_tag(tag, ap); } } diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 2807293..b061ba2 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -756,7 +756,7 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev, { struct ata_queued_cmd *qc; - qc = ata_qc_new_init(dev); + qc = ata_qc_new_init(dev, cmd->request->tag); if (qc) { qc->scsicmd = cmd; qc->scsidone = cmd->scsi_done; @@ -3668,6 +3668,9 @@ int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht) */ shost->max_host_blocked = 1; + if (scsi_init_shared_tag_map(shost, host->n_tags)) + goto err_add; + rc = scsi_add_host_with_dma(ap->scsi_host, &ap->tdev, ap->host->dev); if (rc) @@ -4230,3 +4233,31 @@ int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap) return rc; } EXPORT_SYMBOL_GPL(ata_sas_queuecmd); + +int ata_sas_allocate_tag(struct ata_port *ap) +{ + unsigned int max_queue = ap->host->n_tags; + unsigned int i, tag; + + for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) { + if (ap->flags & ATA_FLAG_LOWTAG) + tag = 1; + else + tag = tag < max_queue ? tag : 0; + + /* the last tag is reserved for internal command. 
*/ + if (tag == ATA_TAG_INTERNAL) + continue; + + if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) { + ap->sas_last_tag = tag; + return tag; + } + } + return -1; +} + +void ata_sas_free_tag(unsigned int tag, struct ata_port *ap) +{ + clear_bit(tag, &ap->sas_tag_allocated); +} diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 82ebe26..f840ca1 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -63,7 +63,7 @@ extern struct ata_link *ata_dev_phys_link(struct ata_device *dev); extern void ata_force_cbl(struct ata_port *ap); extern u64 ata_tf_to_lba(const struct ata_taskfile *tf); extern u64 ata_tf_to_lba48(const struct ata_taskfile *tf); -extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev); +extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag); extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev, u64 block, u32 n_block, unsigned int tf_flags, unsigned int tag); @@ -144,6 +144,8 @@ extern void ata_scsi_dev_rescan(struct work_struct *work); extern int ata_bus_probe(struct ata_port *ap); extern int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel, unsigned int id, u64 lun); +int ata_sas_allocate_tag(struct ata_port *ap); +void ata_sas_free_tag(unsigned int tag, struct ata_port *ap); /* libata-eh.c */ diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c index ea65594..ba2667f 100644 --- a/drivers/ata/sata_sil24.c +++ b/drivers/ata/sata_sil24.c @@ -388,6 +388,7 @@ static struct scsi_host_template sil24_sht = { .can_queue = SIL24_MAX_CMDS, .sg_tablesize = SIL24_MAX_SGE, .dma_boundary = ATA_DMA_BOUNDARY, + .tag_alloc_policy = BLK_TAG_ALLOC_FIFO, }; static struct ata_port_operations sil24_ops = { diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 3598110..c01b921 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -370,25 +370,25 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, } #ifdef CONFIG_BLK_DEV_XIP -static 
int brd_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, unsigned long *pfn) +static long brd_direct_access(struct block_device *bdev, sector_t sector, + void **kaddr, unsigned long *pfn, long size) { struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; if (!brd) return -ENODEV; - if (sector & (PAGE_SECTORS-1)) - return -EINVAL; - if (sector + PAGE_SECTORS > get_capacity(bdev->bd_disk)) - return -ERANGE; page = brd_insert_page(brd, sector); if (!page) return -ENOSPC; *kaddr = page_address(page); *pfn = page_to_pfn(page); - return 0; + /* + * TODO: If size > PAGE_SIZE, we could look to see if the next page in + * the file happens to be mapped to the next page of physical RAM. + */ + return PAGE_SIZE; } #endif @@ -438,19 +438,18 @@ static const struct block_device_operations brd_fops = { /* * And now the modules code and kernel interface. */ -static int rd_nr; -int rd_size = CONFIG_BLK_DEV_RAM_SIZE; -static int max_part; -static int part_shift; -static int part_show = 0; +static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT; module_param(rd_nr, int, S_IRUGO); MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); + +int rd_size = CONFIG_BLK_DEV_RAM_SIZE; module_param(rd_size, int, S_IRUGO); MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); + +static int max_part = 1; module_param(max_part, int, S_IRUGO); -MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk"); -module_param(part_show, int, S_IRUGO); -MODULE_PARM_DESC(part_show, "Control RAM disk visibility in /proc/partitions"); +MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); + MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); MODULE_ALIAS("rd"); @@ -487,25 +486,33 @@ static struct brd_device *brd_alloc(int i) brd->brd_queue = blk_alloc_queue(GFP_KERNEL); if (!brd->brd_queue) goto out_free_dev; + blk_queue_make_request(brd->brd_queue, brd_make_request); blk_queue_max_hw_sectors(brd->brd_queue, 1024); 
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); + /* This is so fdisk will align partitions on 4k, because of + * direct_access API needing 4k alignment, returning a PFN + * (This is only a problem on very small devices <= 4M, + * otherwise fdisk will align on 1M. Regardless this call + * is harmless) + */ + blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE); + brd->brd_queue->limits.discard_granularity = PAGE_SIZE; brd->brd_queue->limits.max_discard_sectors = UINT_MAX; brd->brd_queue->limits.discard_zeroes_data = 1; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue); - disk = brd->brd_disk = alloc_disk(1 << part_shift); + disk = brd->brd_disk = alloc_disk(max_part); if (!disk) goto out_free_queue; disk->major = RAMDISK_MAJOR; - disk->first_minor = i << part_shift; + disk->first_minor = i * max_part; disk->fops = &brd_fops; disk->private_data = brd; disk->queue = brd->brd_queue; - if (!part_show) - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "ram%d", i); set_capacity(disk, rd_size * 2); @@ -527,10 +534,11 @@ static void brd_free(struct brd_device *brd) kfree(brd); } -static struct brd_device *brd_init_one(int i) +static struct brd_device *brd_init_one(int i, bool *new) { struct brd_device *brd; + *new = false; list_for_each_entry(brd, &brd_devices, brd_list) { if (brd->brd_number == i) goto out; @@ -541,6 +549,7 @@ static struct brd_device *brd_init_one(int i) add_disk(brd->brd_disk); list_add_tail(&brd->brd_list, &brd_devices); } + *new = true; out: return brd; } @@ -556,70 +565,46 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data) { struct brd_device *brd; struct kobject *kobj; + bool new; mutex_lock(&brd_devices_mutex); - brd = brd_init_one(MINOR(dev) >> part_shift); + brd = brd_init_one(MINOR(dev) / max_part, &new); kobj = brd ? 
get_disk(brd->brd_disk) : NULL; mutex_unlock(&brd_devices_mutex); - *part = 0; + if (new) + *part = 0; + return kobj; } static int __init brd_init(void) { - int i, nr; - unsigned long range; struct brd_device *brd, *next; + int i; /* * brd module now has a feature to instantiate underlying device * structure on-demand, provided that there is an access dev node. - * However, this will not work well with user space tool that doesn't - * know about such "feature". In order to not break any existing - * tool, we do the following: * - * (1) if rd_nr is specified, create that many upfront, and this - * also becomes a hard limit. - * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT - * (default 16) rd device on module load, user can further - * extend brd device by create dev node themselves and have - * kernel automatically instantiate actual device on-demand. + * (1) if rd_nr is specified, create that many upfront. else + * it defaults to CONFIG_BLK_DEV_RAM_COUNT + * (2) User can further extend brd devices by create dev node themselves + * and have kernel automatically instantiate actual device + * on-demand. Example: + * mknod /path/devnod_name b 1 X # 1 is the rd major + * fdisk -l /path/devnod_name + * If (X / max_part) was not already created it will be created + * dynamically. */ - part_shift = 0; - if (max_part > 0) { - part_shift = fls(max_part); - - /* - * Adjust max_part according to part_shift as it is exported - * to user space so that user can decide correct minor number - * if [s]he want to create more devices. - * - * Note that -1 is required because partition 0 is reserved - * for the whole disk. 
- */ - max_part = (1UL << part_shift) - 1; - } - - if ((1UL << part_shift) > DISK_MAX_PARTS) - return -EINVAL; - - if (rd_nr > 1UL << (MINORBITS - part_shift)) - return -EINVAL; - - if (rd_nr) { - nr = rd_nr; - range = rd_nr << part_shift; - } else { - nr = CONFIG_BLK_DEV_RAM_COUNT; - range = 1UL << MINORBITS; - } - if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) return -EIO; - for (i = 0; i < nr; i++) { + if (unlikely(!max_part)) + max_part = 1; + + for (i = 0; i < rd_nr; i++) { brd = brd_alloc(i); if (!brd) goto out_free; @@ -631,10 +616,10 @@ static int __init brd_init(void) list_for_each_entry(brd, &brd_devices, brd_list) add_disk(brd->brd_disk); - blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range, + blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS, THIS_MODULE, brd_probe, NULL, NULL); - printk(KERN_INFO "brd: module loaded\n"); + pr_info("brd: module loaded\n"); return 0; out_free: @@ -644,21 +629,21 @@ out_free: } unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); + pr_info("brd: module NOT loaded !!!\n"); return -ENOMEM; } static void __exit brd_exit(void) { - unsigned long range; struct brd_device *brd, *next; - range = rd_nr ? 
rd_nr << part_shift : 1UL << MINORBITS; - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) brd_del_one(brd); - blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range); + blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS); unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); + + pr_info("brd: module unloaded\n"); } module_init(brd_init); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index d169b4a..cee2035 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1388,7 +1388,7 @@ int drbd_submit_peer_request(struct drbd_device *device, list_add_tail(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); if (blkdev_issue_zeroout(device->ldev->backing_bdev, - sector, data_size >> 9, GFP_NOIO)) + sector, data_size >> 9, GFP_NOIO, false)) peer_req->flags |= EE_WAS_ERROR; drbd_endio_write_sec_final(peer_req); return 0; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 56d46ff..a08cda9 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4112,6 +4112,13 @@ static ssize_t floppy_cmos_show(struct device *dev, static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL); +static struct attribute *floppy_dev_attrs[] = { + &dev_attr_cmos.attr, + NULL +}; + +ATTRIBUTE_GROUPS(floppy_dev); + static void floppy_device_release(struct device *dev) { } @@ -4324,16 +4331,12 @@ static int __init do_floppy_init(void) floppy_device[drive].name = floppy_device_name; floppy_device[drive].id = drive; floppy_device[drive].dev.release = floppy_device_release; + floppy_device[drive].dev.groups = floppy_dev_groups; err = platform_device_register(&floppy_device[drive]); if (err) goto out_remove_drives; - err = device_create_file(&floppy_device[drive].dev, - &dev_attr_cmos); - if (err) - goto out_unreg_platform_dev; - /* to be cleaned up... 
*/ disks[drive]->private_data = (void *)(long)drive; disks[drive]->flags |= GENHD_FL_REMOVABLE; @@ -4343,13 +4346,10 @@ static int __init do_floppy_init(void) return 0; -out_unreg_platform_dev: - platform_device_unregister(&floppy_device[drive]); out_remove_drives: while (drive--) { if (floppy_available(drive)) { del_gendisk(disks[drive]); - device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos); platform_device_unregister(&floppy_device[drive]); } } @@ -4594,7 +4594,6 @@ static void __exit floppy_module_exit(void) if (floppy_available(drive)) { del_gendisk(disks[drive]); - device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos); platform_device_unregister(&floppy_device[drive]); } blk_cleanup_queue(disks[drive]->queue); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6cb1beb..d1f168b 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -85,6 +85,8 @@ static DEFINE_MUTEX(loop_index_mutex); static int max_part; static int part_shift; +static struct workqueue_struct *loop_wq; + /* * Transfer functions */ @@ -284,12 +286,12 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec, return ret; } -static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) +static int lo_send(struct loop_device *lo, struct request *rq, loff_t pos) { int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t, struct page *page); struct bio_vec bvec; - struct bvec_iter iter; + struct req_iterator iter; struct page *page = NULL; int ret = 0; @@ -303,7 +305,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) do_lo_send = do_lo_send_direct_write; } - bio_for_each_segment(bvec, bio, iter) { + rq_for_each_segment(bvec, rq, iter) { ret = do_lo_send(lo, &bvec, pos, page); if (ret < 0) break; @@ -391,19 +393,22 @@ do_lo_receive(struct loop_device *lo, } static int -lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) +lo_receive(struct loop_device *lo, struct request *rq, int 
bsize, loff_t pos) { struct bio_vec bvec; - struct bvec_iter iter; + struct req_iterator iter; ssize_t s; - bio_for_each_segment(bvec, bio, iter) { + rq_for_each_segment(bvec, rq, iter) { s = do_lo_receive(lo, &bvec, bsize, pos); if (s < 0) return s; if (s != bvec.bv_len) { - zero_fill_bio(bio); + struct bio *bio; + + __rq_for_each_bio(bio, rq) + zero_fill_bio(bio); break; } pos += bvec.bv_len; @@ -411,106 +416,58 @@ lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) return 0; } -static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) +static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos) { - loff_t pos; + /* + * We use punch hole to reclaim the free space used by the + * image a.k.a. discard. However we do not support discard if + * encryption is enabled, because it may give an attacker + * useful information. + */ + struct file *file = lo->lo_backing_file; + int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; int ret; - pos = ((loff_t) bio->bi_iter.bi_sector << 9) + lo->lo_offset; - - if (bio_rw(bio) == WRITE) { - struct file *file = lo->lo_backing_file; - - if (bio->bi_rw & REQ_FLUSH) { - ret = vfs_fsync(file, 0); - if (unlikely(ret && ret != -EINVAL)) { - ret = -EIO; - goto out; - } - } - - /* - * We use punch hole to reclaim the free space used by the - * image a.k.a. discard. However we do not support discard if - * encryption is enabled, because it may give an attacker - * useful information. 
- */ - if (bio->bi_rw & REQ_DISCARD) { - struct file *file = lo->lo_backing_file; - int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; - - if ((!file->f_op->fallocate) || - lo->lo_encrypt_key_size) { - ret = -EOPNOTSUPP; - goto out; - } - ret = file->f_op->fallocate(file, mode, pos, - bio->bi_iter.bi_size); - if (unlikely(ret && ret != -EINVAL && - ret != -EOPNOTSUPP)) - ret = -EIO; - goto out; - } - - ret = lo_send(lo, bio, pos); - - if ((bio->bi_rw & REQ_FUA) && !ret) { - ret = vfs_fsync(file, 0); - if (unlikely(ret && ret != -EINVAL)) - ret = -EIO; - } - } else - ret = lo_receive(lo, bio, lo->lo_blocksize, pos); + if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) { + ret = -EOPNOTSUPP; + goto out; + } -out: + ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); + if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) + ret = -EIO; + out: return ret; } -/* - * Add bio to back of pending list - */ -static void loop_add_bio(struct loop_device *lo, struct bio *bio) +static int lo_req_flush(struct loop_device *lo, struct request *rq) { - lo->lo_bio_count++; - bio_list_add(&lo->lo_bio_list, bio); -} + struct file *file = lo->lo_backing_file; + int ret = vfs_fsync(file, 0); + if (unlikely(ret && ret != -EINVAL)) + ret = -EIO; -/* - * Grab first pending buffer - */ -static struct bio *loop_get_bio(struct loop_device *lo) -{ - lo->lo_bio_count--; - return bio_list_pop(&lo->lo_bio_list); + return ret; } -static void loop_make_request(struct request_queue *q, struct bio *old_bio) +static int do_req_filebacked(struct loop_device *lo, struct request *rq) { - struct loop_device *lo = q->queuedata; - int rw = bio_rw(old_bio); - - if (rw == READA) - rw = READ; + loff_t pos; + int ret; - BUG_ON(!lo || (rw != READ && rw != WRITE)); + pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; - spin_lock_irq(&lo->lo_lock); - if (lo->lo_state != Lo_bound) - goto out; - if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) - goto out; - if 
(lo->lo_bio_count >= q->nr_congestion_on) - wait_event_lock_irq(lo->lo_req_wait, - lo->lo_bio_count < q->nr_congestion_off, - lo->lo_lock); - loop_add_bio(lo, old_bio); - wake_up(&lo->lo_event); - spin_unlock_irq(&lo->lo_lock); - return; + if (rq->cmd_flags & REQ_WRITE) { + if (rq->cmd_flags & REQ_FLUSH) + ret = lo_req_flush(lo, rq); + else if (rq->cmd_flags & REQ_DISCARD) + ret = lo_discard(lo, rq, pos); + else + ret = lo_send(lo, rq, pos); + } else + ret = lo_receive(lo, rq, lo->lo_blocksize, pos); -out: - spin_unlock_irq(&lo->lo_lock); - bio_io_error(old_bio); + return ret; } struct switch_request { @@ -518,57 +475,26 @@ struct switch_request { struct completion wait; }; -static void do_loop_switch(struct loop_device *, struct switch_request *); - -static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio) -{ - if (unlikely(!bio->bi_bdev)) { - do_loop_switch(lo, bio->bi_private); - bio_put(bio); - } else { - int ret = do_bio_filebacked(lo, bio); - bio_endio(bio, ret); - } -} - /* - * worker thread that handles reads/writes to file backed loop devices, - * to avoid blocking in our make_request_fn. it also does loop decrypting - * on reads for block backed loop, as that is too heavy to do from - * b_end_io context where irqs may be disabled. - * - * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before - * calling kthread_stop(). Therefore once kthread_should_stop() is - * true, make_request will not place any more requests. Therefore - * once kthread_should_stop() is true and lo_bio is NULL, we are - * done with the loop. 
+ * Do the actual switch; called from the BIO completion routine */ -static int loop_thread(void *data) +static void do_loop_switch(struct loop_device *lo, struct switch_request *p) { - struct loop_device *lo = data; - struct bio *bio; - - set_user_nice(current, MIN_NICE); - - while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) { - - wait_event_interruptible(lo->lo_event, - !bio_list_empty(&lo->lo_bio_list) || - kthread_should_stop()); - - if (bio_list_empty(&lo->lo_bio_list)) - continue; - spin_lock_irq(&lo->lo_lock); - bio = loop_get_bio(lo); - if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off) - wake_up(&lo->lo_req_wait); - spin_unlock_irq(&lo->lo_lock); + struct file *file = p->file; + struct file *old_file = lo->lo_backing_file; + struct address_space *mapping; - BUG_ON(!bio); - loop_handle_bio(lo, bio); - } + /* if no new file, only flush of queued bios requested */ + if (!file) + return; - return 0; + mapping = file->f_mapping; + mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); + lo->lo_backing_file = file; + lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? 
+ mapping->host->i_bdev->bd_block_size : PAGE_SIZE; + lo->old_gfp_mask = mapping_gfp_mask(mapping); + mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); } /* @@ -579,15 +505,18 @@ static int loop_thread(void *data) static int loop_switch(struct loop_device *lo, struct file *file) { struct switch_request w; - struct bio *bio = bio_alloc(GFP_KERNEL, 0); - if (!bio) - return -ENOMEM; - init_completion(&w.wait); + w.file = file; - bio->bi_private = &w; - bio->bi_bdev = NULL; - loop_make_request(lo->lo_queue, bio); - wait_for_completion(&w.wait); + + /* freeze queue and wait for completion of scheduled requests */ + blk_mq_freeze_queue(lo->lo_queue); + + /* do the switch action */ + do_loop_switch(lo, &w); + + /* unfreeze */ + blk_mq_unfreeze_queue(lo->lo_queue); + return 0; } @@ -596,39 +525,10 @@ static int loop_switch(struct loop_device *lo, struct file *file) */ static int loop_flush(struct loop_device *lo) { - /* loop not yet configured, no running thread, nothing to flush */ - if (!lo->lo_thread) - return 0; - return loop_switch(lo, NULL); } /* - * Do the actual switch; called from the BIO completion routine - */ -static void do_loop_switch(struct loop_device *lo, struct switch_request *p) -{ - struct file *file = p->file; - struct file *old_file = lo->lo_backing_file; - struct address_space *mapping; - - /* if no new file, only flush of queued bios requested */ - if (!file) - goto out; - - mapping = file->f_mapping; - mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); - lo->lo_backing_file = file; - lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? - mapping->host->i_bdev->bd_block_size : PAGE_SIZE; - lo->old_gfp_mask = mapping_gfp_mask(mapping); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); -out: - complete(&p->wait); -} - - -/* * loop_change_fd switched the backing store of a loopback device to * a new file. 
This is useful for operating system installers to free up * the original file and in High Availability environments to switch to @@ -889,12 +789,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->transfer = transfer_none; lo->ioctl = NULL; lo->lo_sizelimit = 0; - lo->lo_bio_count = 0; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - bio_list_init(&lo->lo_bio_list); - if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_flush(lo->lo_queue, REQ_FLUSH); @@ -906,14 +803,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, set_blocksize(bdev, lo_blocksize); - lo->lo_thread = kthread_create(loop_thread, lo, "loop%d", - lo->lo_number); - if (IS_ERR(lo->lo_thread)) { - error = PTR_ERR(lo->lo_thread); - goto out_clr; - } lo->lo_state = Lo_bound; - wake_up_process(lo->lo_thread); if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; if (lo->lo_flags & LO_FLAGS_PARTSCAN) @@ -925,18 +815,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, bdgrab(bdev); return 0; -out_clr: - loop_sysfs_exit(lo); - lo->lo_thread = NULL; - lo->lo_device = NULL; - lo->lo_backing_file = NULL; - lo->lo_flags = 0; - set_capacity(lo->lo_disk, 0); - invalidate_bdev(bdev); - bd_set_size(bdev, 0); - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask); - lo->lo_state = Lo_unbound; out_putf: fput(file); out: @@ -1012,11 +890,6 @@ static int loop_clr_fd(struct loop_device *lo) spin_lock_irq(&lo->lo_lock); lo->lo_state = Lo_rundown; - spin_unlock_irq(&lo->lo_lock); - - kthread_stop(lo->lo_thread); - - spin_lock_irq(&lo->lo_lock); lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); @@ -1028,7 +901,6 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_offset = 0; lo->lo_sizelimit = 0; lo->lo_encrypt_key_size = 0; - lo->lo_thread = NULL; memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, 
LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); @@ -1601,6 +1473,105 @@ int loop_unregister_transfer(int number) EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); +static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + + blk_mq_start_request(bd->rq); + + if (cmd->rq->cmd_flags & REQ_WRITE) { + struct loop_device *lo = cmd->rq->q->queuedata; + bool need_sched = true; + + spin_lock_irq(&lo->lo_lock); + if (lo->write_started) + need_sched = false; + else + lo->write_started = true; + list_add_tail(&cmd->list, &lo->write_cmd_head); + spin_unlock_irq(&lo->lo_lock); + + if (need_sched) + queue_work(loop_wq, &lo->write_work); + } else { + queue_work(loop_wq, &cmd->read_work); + } + + return BLK_MQ_RQ_QUEUE_OK; +} + +static void loop_handle_cmd(struct loop_cmd *cmd) +{ + const bool write = cmd->rq->cmd_flags & REQ_WRITE; + struct loop_device *lo = cmd->rq->q->queuedata; + int ret = -EIO; + + if (lo->lo_state != Lo_bound) + goto failed; + + if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) + goto failed; + + ret = do_req_filebacked(lo, cmd->rq); + + failed: + if (ret) + cmd->rq->errors = -EIO; + blk_mq_complete_request(cmd->rq); +} + +static void loop_queue_write_work(struct work_struct *work) +{ + struct loop_device *lo = + container_of(work, struct loop_device, write_work); + LIST_HEAD(cmd_list); + + spin_lock_irq(&lo->lo_lock); + repeat: + list_splice_init(&lo->write_cmd_head, &cmd_list); + spin_unlock_irq(&lo->lo_lock); + + while (!list_empty(&cmd_list)) { + struct loop_cmd *cmd = list_first_entry(&cmd_list, + struct loop_cmd, list); + list_del_init(&cmd->list); + loop_handle_cmd(cmd); + } + + spin_lock_irq(&lo->lo_lock); + if (!list_empty(&lo->write_cmd_head)) + goto repeat; + lo->write_started = false; + spin_unlock_irq(&lo->lo_lock); +} + +static void loop_queue_read_work(struct work_struct *work) +{ + struct loop_cmd *cmd = + 
container_of(work, struct loop_cmd, read_work); + + loop_handle_cmd(cmd); +} + +static int loop_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); + + cmd->rq = rq; + INIT_WORK(&cmd->read_work, loop_queue_read_work); + + return 0; +} + +static struct blk_mq_ops loop_mq_ops = { + .queue_rq = loop_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = loop_init_request, +}; + static int loop_add(struct loop_device **l, int i) { struct loop_device *lo; @@ -1627,16 +1598,28 @@ static int loop_add(struct loop_device **l, int i) i = err; err = -ENOMEM; - lo->lo_queue = blk_alloc_queue(GFP_KERNEL); - if (!lo->lo_queue) + lo->tag_set.ops = &loop_mq_ops; + lo->tag_set.nr_hw_queues = 1; + lo->tag_set.queue_depth = 128; + lo->tag_set.numa_node = NUMA_NO_NODE; + lo->tag_set.cmd_size = sizeof(struct loop_cmd); + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + lo->tag_set.driver_data = lo; + + err = blk_mq_alloc_tag_set(&lo->tag_set); + if (err) goto out_free_idr; - /* - * set queue make_request_fn - */ - blk_queue_make_request(lo->lo_queue, loop_make_request); + lo->lo_queue = blk_mq_init_queue(&lo->tag_set); + if (IS_ERR_OR_NULL(lo->lo_queue)) { + err = PTR_ERR(lo->lo_queue); + goto out_cleanup_tags; + } lo->lo_queue->queuedata = lo; + INIT_LIST_HEAD(&lo->write_cmd_head); + INIT_WORK(&lo->write_work, loop_queue_write_work); + disk = lo->lo_disk = alloc_disk(1 << part_shift); if (!disk) goto out_free_queue; @@ -1664,9 +1647,6 @@ static int loop_add(struct loop_device **l, int i) disk->flags |= GENHD_FL_EXT_DEVT; mutex_init(&lo->lo_ctl_mutex); lo->lo_number = i; - lo->lo_thread = NULL; - init_waitqueue_head(&lo->lo_event); - init_waitqueue_head(&lo->lo_req_wait); spin_lock_init(&lo->lo_lock); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; @@ -1680,6 +1660,8 @@ static int loop_add(struct loop_device **l, int i) 
out_free_queue: blk_cleanup_queue(lo->lo_queue); +out_cleanup_tags: + blk_mq_free_tag_set(&lo->tag_set); out_free_idr: idr_remove(&loop_index_idr, i); out_free_dev: @@ -1692,6 +1674,7 @@ static void loop_remove(struct loop_device *lo) { del_gendisk(lo->lo_disk); blk_cleanup_queue(lo->lo_queue); + blk_mq_free_tag_set(&lo->tag_set); put_disk(lo->lo_disk); kfree(lo); } @@ -1875,6 +1858,13 @@ static int __init loop_init(void) goto misc_out; } + loop_wq = alloc_workqueue("kloopd", + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0); + if (!loop_wq) { + err = -ENOMEM; + goto misc_out; + } + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, THIS_MODULE, loop_probe, NULL, NULL); @@ -1912,6 +1902,8 @@ static void __exit loop_exit(void) blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); unregister_blkdev(LOOP_MAJOR, "loop"); + destroy_workqueue(loop_wq); + misc_deregister(&loop_misc); } diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 90df5d6..301c27f 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -11,8 +11,10 @@ #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/spinlock.h> #include <linux/mutex.h> +#include <linux/workqueue.h> #include <uapi/linux/loop.h> /* Possible states of device */ @@ -52,19 +54,23 @@ struct loop_device { gfp_t old_gfp_mask; spinlock_t lo_lock; - struct bio_list lo_bio_list; - unsigned int lo_bio_count; + struct list_head write_cmd_head; + struct work_struct write_work; + bool write_started; int lo_state; struct mutex lo_ctl_mutex; - struct task_struct *lo_thread; - wait_queue_head_t lo_event; - /* wait queue for incoming requests */ - wait_queue_head_t lo_req_wait; struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; struct gendisk *lo_disk; }; +struct loop_cmd { + struct work_struct read_work; + struct request *rq; + struct list_head list; +}; + /* Support for loadable transfer modules */ struct loop_func_table { int number; /* filter type */ diff --git 
a/drivers/block/null_blk.c b/drivers/block/null_blk.c index aa2224a..65cd61a 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -579,7 +579,7 @@ static int null_add_dev(void) sector_div(size, bs); set_capacity(disk, size); - disk->flags |= GENHD_FL_EXT_DEVT; + disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; disk->major = null_major; disk->first_minor = nullb->index; disk->fops = &null_fops; diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index d826bf3..cbdfbbf 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -144,8 +144,37 @@ struct nvme_cmd_info { void *ctx; int aborted; struct nvme_queue *nvmeq; + struct nvme_iod iod[0]; }; +/* + * Max size of iod being embedded in the request payload + */ +#define NVME_INT_PAGES 2 +#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size) + +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. + */ +static int nvme_npages(unsigned size, struct nvme_dev *dev) +{ + unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); + return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); +} + +static unsigned int nvme_cmd_size(struct nvme_dev *dev) +{ + unsigned int ret = sizeof(struct nvme_cmd_info); + + ret += sizeof(struct nvme_iod); + ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev); + ret += sizeof(struct scatterlist) * NVME_INT_PAGES; + + return ret; +} + static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -218,6 +247,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, blk_mq_start_request(blk_mq_rq_from_pdu(cmd)); } +static void *iod_get_private(struct nvme_iod *iod) +{ + return (void *) (iod->private & ~0x1UL); +} + +/* + * If bit 0 is set, the iod is embedded in the request payload. 
+ */ +static bool iod_should_kfree(struct nvme_iod *iod) +{ + return (iod->private & 0x01) == 0; +} + /* Special values must be less than 0x1000 */ #define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) @@ -361,35 +403,53 @@ static __le64 **iod_list(struct nvme_iod *iod) return ((void *)iod) + iod->offset; } -/* - * Will slightly overestimate the number of pages needed. This is OK - * as it only leads to a small amount of wasted memory for the lifetime of - * the I/O. - */ -static int nvme_npages(unsigned size, struct nvme_dev *dev) +static inline void iod_init(struct nvme_iod *iod, unsigned nbytes, + unsigned nseg, unsigned long private) { - unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); - return DIV_ROUND_UP(8 * nprps, dev->page_size - 8); + iod->private = private; + iod->offset = offsetof(struct nvme_iod, sg[nseg]); + iod->npages = -1; + iod->length = nbytes; + iod->nents = 0; } static struct nvme_iod * -nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp) +__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev, + unsigned long priv, gfp_t gfp) { struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(nbytes, dev) + + sizeof(__le64 *) * nvme_npages(bytes, dev) + sizeof(struct scatterlist) * nseg, gfp); - if (iod) { - iod->offset = offsetof(struct nvme_iod, sg[nseg]); - iod->npages = -1; - iod->length = nbytes; - iod->nents = 0; - iod->first_dma = 0ULL; - } + if (iod) + iod_init(iod, bytes, nseg, priv); return iod; } +static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, + gfp_t gfp) +{ + unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? 
blk_rq_bytes(rq) : + sizeof(struct nvme_dsm_range); + unsigned long mask = 0; + struct nvme_iod *iod; + + if (rq->nr_phys_segments <= NVME_INT_PAGES && + size <= NVME_INT_BYTES(dev)) { + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq); + + iod = cmd->iod; + mask = 0x01; + iod_init(iod, size, rq->nr_phys_segments, + (unsigned long) rq | 0x01); + return iod; + } + + return __nvme_alloc_iod(rq->nr_phys_segments, size, dev, + (unsigned long) rq, gfp); +} + void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { const int last_prp = dev->page_size / 8 - 1; @@ -405,7 +465,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); prp_dma = next_prp_dma; } - kfree(iod); + + if (iod_should_kfree(iod)) + kfree(iod); } static int nvme_error_status(u16 status) @@ -424,7 +486,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct nvme_iod *iod = ctx; - struct request *req = iod->private; + struct request *req = iod_get_private(iod); struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); u16 status = le16_to_cpup(&cqe->status) >> 1; @@ -585,7 +647,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, struct nvme_ns *ns) { - struct request *req = iod->private; + struct request *req = iod_get_private(iod); struct nvme_command *cmnd; u16 control = 0; u32 dsmgmt = 0; @@ -626,17 +688,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_iod *iod; - int psegs = req->nr_phys_segments; enum dma_data_direction dma_dir; - unsigned size = !(req->cmd_flags & REQ_DISCARD) ? 
blk_rq_bytes(req) : - sizeof(struct nvme_dsm_range); - iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC); + iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC); if (!iod) return BLK_MQ_RQ_QUEUE_BUSY; - iod->private = req; - if (req->cmd_flags & REQ_DISCARD) { void *range; /* @@ -651,10 +708,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, goto retry_cmd; iod_list(iod)[0] = (__le64 *)range; iod->npages = 0; - } else if (psegs) { + } else if (req->nr_phys_segments) { dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; - sg_init_table(iod->sg, psegs); + sg_init_table(iod->sg, req->nr_phys_segments); iod->nents = blk_rq_map_sg(req->q, req, iod->sg); if (!iod->nents) goto error_cmd; @@ -1137,21 +1194,14 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) static void nvme_free_queues(struct nvme_dev *dev, int lowest) { - LLIST_HEAD(q_list); - struct nvme_queue *nvmeq, *next; - struct llist_node *entry; int i; for (i = dev->queue_count - 1; i >= lowest; i--) { struct nvme_queue *nvmeq = dev->queues[i]; - llist_add(&nvmeq->node, &q_list); dev->queue_count--; dev->queues[i] = NULL; - } - synchronize_rcu(); - entry = llist_del_all(&q_list); - llist_for_each_entry_safe(nvmeq, next, entry, node) nvme_free_queue(nvmeq); + } } /** @@ -1408,7 +1458,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev); - dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->admin_tagset.cmd_size = nvme_cmd_size(dev); dev->admin_tagset.driver_data = dev; if (blk_mq_alloc_tag_set(&dev->admin_tagset)) @@ -1522,7 +1572,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, } err = -ENOMEM; - iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL); + iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL); if (!iod) goto put_pages; @@ -2148,7 +2198,7 @@ static int nvme_dev_add(struct 
nvme_dev *dev) dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev); dev->tagset.queue_depth = min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; - dev->tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->tagset.cmd_size = nvme_cmd_size(dev); dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 79aa179..e229425 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -423,7 +423,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev) } /* switch queue to TCQ mode; allocate tag map */ - rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL); + rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL, BLK_TAG_ALLOC_FIFO); if (rc) { blk_cleanup_queue(q); put_disk(disk); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index cc90a84..375d288 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -214,6 +214,15 @@ enum blkif_protocol { BLKIF_PROTOCOL_X86_64 = 3, }; +/* + * Default protocol if the frontend doesn't specify one. + */ +#ifdef CONFIG_X86 +# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32 +#else +# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE +#endif + struct xen_vbd { /* What the domain refers to this vbd as. 
*/ blkif_vdev_t handle; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 630a489..e3afe97 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -868,11 +868,11 @@ static int connect_ring(struct backend_info *be) return err; } - be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", "%63s", protocol, NULL); if (err) - strcpy(protocol, "unspecified, assuming native"); + strcpy(protocol, "unspecified, assuming default"); else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d2cae5f..37779e4 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1511,7 +1511,7 @@ static int blkif_recover(struct blkfront_info *info) merge_bio.tail = copy[i].request->biotail; bio_list_merge(&bio_list, &merge_bio); copy[i].request->bio = NULL; - blk_put_request(copy[i].request); + blk_end_request_all(copy[i].request, 0); } kfree(copy); @@ -1534,7 +1534,7 @@ static int blkif_recover(struct blkfront_info *info) req->bio = NULL; if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) pr_alert("diskcache flush request found!\n"); - __blk_put_request(info->rq, req); + __blk_end_request_all(req, 0); } spin_unlock_irq(&info->io_lock); diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 4c58333..9a6b637 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -287,13 +287,24 @@ static unsigned long get_unmapped_area_mem(struct file *file, return pgoff << PAGE_SHIFT; } +/* permit direct mmap, for read, write or exec */ +static unsigned memory_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | + NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC; +} + +static unsigned 
zero_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_COPY; +} + /* can't do an in-place private mapping if there's no MMU */ static inline int private_mapping_ok(struct vm_area_struct *vma) { return vma->vm_flags & VM_MAYSHARE; } #else -#define get_unmapped_area_mem NULL static inline int private_mapping_ok(struct vm_area_struct *vma) { @@ -721,7 +732,10 @@ static const struct file_operations mem_fops = { .write = write_mem, .mmap = mmap_mem, .open = open_mem, +#ifndef CONFIG_MMU .get_unmapped_area = get_unmapped_area_mem, + .mmap_capabilities = memory_mmap_capabilities, +#endif }; #ifdef CONFIG_DEVKMEM @@ -731,7 +745,10 @@ static const struct file_operations kmem_fops = { .write = write_kmem, .mmap = mmap_kmem, .open = open_kmem, +#ifndef CONFIG_MMU .get_unmapped_area = get_unmapped_area_mem, + .mmap_capabilities = memory_mmap_capabilities, +#endif }; #endif @@ -760,16 +777,9 @@ static const struct file_operations zero_fops = { .read_iter = read_iter_zero, .aio_write = aio_write_zero, .mmap = mmap_zero, -}; - -/* - * capabilities for /dev/zero - * - permits private mappings, "copies" are taken of the source of zeros - * - no writeback happens - */ -static struct backing_dev_info zero_bdi = { - .name = "char/mem", - .capabilities = BDI_CAP_MAP_COPY | BDI_CAP_NO_ACCT_AND_WRITEBACK, +#ifndef CONFIG_MMU + .mmap_capabilities = zero_mmap_capabilities, +#endif }; static const struct file_operations full_fops = { @@ -783,22 +793,22 @@ static const struct memdev { const char *name; umode_t mode; const struct file_operations *fops; - struct backing_dev_info *dev_info; + fmode_t fmode; } devlist[] = { - [1] = { "mem", 0, &mem_fops, &directly_mappable_cdev_bdi }, + [1] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET }, #ifdef CONFIG_DEVKMEM - [2] = { "kmem", 0, &kmem_fops, &directly_mappable_cdev_bdi }, + [2] = { "kmem", 0, &kmem_fops, FMODE_UNSIGNED_OFFSET }, #endif - [3] = { "null", 0666, &null_fops, NULL }, + [3] = { "null", 0666, &null_fops, 0 }, #ifdef 
CONFIG_DEVPORT - [4] = { "port", 0, &port_fops, NULL }, + [4] = { "port", 0, &port_fops, 0 }, #endif - [5] = { "zero", 0666, &zero_fops, &zero_bdi }, - [7] = { "full", 0666, &full_fops, NULL }, - [8] = { "random", 0666, &random_fops, NULL }, - [9] = { "urandom", 0666, &urandom_fops, NULL }, + [5] = { "zero", 0666, &zero_fops, 0 }, + [7] = { "full", 0666, &full_fops, 0 }, + [8] = { "random", 0666, &random_fops, 0 }, + [9] = { "urandom", 0666, &urandom_fops, 0 }, #ifdef CONFIG_PRINTK - [11] = { "kmsg", 0644, &kmsg_fops, NULL }, + [11] = { "kmsg", 0644, &kmsg_fops, 0 }, #endif }; @@ -816,12 +826,7 @@ static int memory_open(struct inode *inode, struct file *filp) return -ENXIO; filp->f_op = dev->fops; - if (dev->dev_info) - filp->f_mapping->backing_dev_info = dev->dev_info; - - /* Is /dev/mem or /dev/kmem ? */ - if (dev->dev_info == &directly_mappable_cdev_bdi) - filp->f_mode |= FMODE_UNSIGNED_OFFSET; + filp->f_mode |= dev->fmode; if (dev->fops->open) return dev->fops->open(inode, filp); @@ -846,11 +851,6 @@ static struct class *mem_class; static int __init chr_dev_init(void) { int minor; - int err; - - err = bdi_init(&zero_bdi); - if (err) - return err; if (register_chrdev(MEM_MAJOR, "mem", &memory_fops)) printk("unable to get major %d for memory devs\n", MEM_MAJOR); diff --git a/drivers/char/raw.c b/drivers/char/raw.c index a24891b..6e29bf2 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -104,11 +104,9 @@ static int raw_release(struct inode *inode, struct file *filp) mutex_lock(&raw_mutex); bdev = raw_devices[minor].binding; - if (--raw_devices[minor].inuse == 0) { + if (--raw_devices[minor].inuse == 0) /* Here inode->i_mapping == bdev->bd_inode->i_mapping */ inode->i_mapping = &inode->i_data; - inode->i_mapping->backing_dev_info = &default_backing_dev_info; - } mutex_unlock(&raw_mutex); blkdev_put(bdev, filp->f_mode | FMODE_EXCL); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index c355a22..c396444 100644 --- a/drivers/md/Kconfig +++ 
b/drivers/md/Kconfig @@ -231,9 +231,8 @@ config DM_CRYPT transparently encrypts the data on it. You'll need to activate the ciphers you're going to use in the cryptoapi configuration. - Information on how to use dm-crypt can be found on - - <http://www.saout.de/misc/dm-crypt/> + For further information on dm-crypt and userspace tools see: + <http://code.google.com/p/cryptsetup/wiki/DMCrypt> To compile this code as a module, choose M here: the module will be called dm-crypt. diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 1695ee5..3a57679 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1619,7 +1619,9 @@ void bitmap_destroy(struct mddev *mddev) return; mutex_lock(&mddev->bitmap_info.mutex); + spin_lock(&mddev->lock); mddev->bitmap = NULL; /* disconnect from the md device */ + spin_unlock(&mddev->lock); mutex_unlock(&mddev->bitmap_info.mutex); if (mddev->thread) mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; @@ -2209,11 +2211,13 @@ __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t can_clear_show(struct mddev *mddev, char *page) { int len; + spin_lock(&mddev->lock); if (mddev->bitmap) len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? 
"false" : "true")); else len = sprintf(page, "\n"); + spin_unlock(&mddev->lock); return len; } @@ -2238,10 +2242,15 @@ __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); static ssize_t behind_writes_used_show(struct mddev *mddev, char *page) { + ssize_t ret; + spin_lock(&mddev->lock); if (mddev->bitmap == NULL) - return sprintf(page, "0\n"); - return sprintf(page, "%lu\n", - mddev->bitmap->behind_writes_used); + ret = sprintf(page, "0\n"); + else + ret = sprintf(page, "%lu\n", + mddev->bitmap->behind_writes_used); + spin_unlock(&mddev->lock); + return ret; } static ssize_t diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index c33b497..86dbbc7 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -11,6 +11,7 @@ #include <linux/device-mapper.h> #include <linux/dm-io.h> #include <linux/slab.h> +#include <linux/jiffies.h> #include <linux/vmalloc.h> #include <linux/shrinker.h> #include <linux/module.h> @@ -1739,7 +1740,7 @@ static unsigned get_max_age_hz(void) static bool older_than(struct dm_buffer *b, unsigned long age_hz) { - return (jiffies - b->last_accessed) >= age_hz; + return time_after_eq(jiffies, b->last_accessed + age_hz); } static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index e165053..7755af3 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -11,6 +11,7 @@ #include <linux/dm-io.h> #include <linux/dm-kcopyd.h> +#include <linux/jiffies.h> #include <linux/init.h> #include <linux/mempool.h> #include <linux/module.h> @@ -1562,8 +1563,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, static int need_commit_due_to_time(struct cache *cache) { - return jiffies < cache->last_commit_jiffies || - jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; + return !time_in_range(jiffies, cache->last_commit_jiffies, + cache->last_commit_jiffies + COMMIT_PERIOD); } static int 
commit_if_needed(struct cache *cache) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 73f791b..c8a18e4 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -639,8 +639,8 @@ static int check_name(const char *name) /* * On successful return, the caller must not attempt to acquire - * _hash_lock without first calling dm_table_put, because dm_table_destroy - * waits for this dm_table_put and could be called under this lock. + * _hash_lock without first calling dm_put_live_table, because dm_table_destroy + * waits for this dm_put_live_table and could be called under this lock. */ static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx) { diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index b953db6..03177ca 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -6,6 +6,7 @@ #include <linux/bio.h> #include <linux/slab.h> +#include <linux/jiffies.h> #include <linux/dm-dirty-log.h> #include <linux/device-mapper.h> #include <linux/dm-log-userspace.h> @@ -829,7 +830,7 @@ static int userspace_is_remote_recovering(struct dm_dirty_log *log, int r; uint64_t region64 = region; struct log_c *lc = log->context; - static unsigned long long limit; + static unsigned long limit; struct { int64_t is_recovering; uint64_t in_sync_hint; @@ -845,7 +846,7 @@ static int userspace_is_remote_recovering(struct dm_dirty_log *log, */ if (region < lc->in_sync_hint) return 0; - else if (jiffies < limit) + else if (time_after(limit, jiffies)) return 1; limit = jiffies + (HZ / 4); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 7b6b0f0..d376dc8 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -11,6 +11,7 @@ #include "dm-path-selector.h" #include "dm-uevent.h" +#include <linux/blkdev.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/mempool.h> @@ -378,18 +379,18 @@ static int __must_push_back(struct multipath *m) /* * 
Map cloned requests */ -static int multipath_map(struct dm_target *ti, struct request *clone, - union map_info *map_context) +static int __multipath_map(struct dm_target *ti, struct request *clone, + union map_info *map_context, + struct request *rq, struct request **__clone) { struct multipath *m = (struct multipath *) ti->private; int r = DM_MAPIO_REQUEUE; - size_t nr_bytes = blk_rq_bytes(clone); - unsigned long flags; + size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq); struct pgpath *pgpath; struct block_device *bdev; struct dm_mpath_io *mpio; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); /* Do we need to select a new pgpath? */ if (!m->current_pgpath || @@ -411,25 +412,61 @@ static int multipath_map(struct dm_target *ti, struct request *clone, /* ENOMEM, requeue */ goto out_unlock; - bdev = pgpath->path.dev->bdev; - clone->q = bdev_get_queue(bdev); - clone->rq_disk = bdev->bd_disk; - clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; mpio = map_context->ptr; mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; + + bdev = pgpath->path.dev->bdev; + + spin_unlock_irq(&m->lock); + + if (clone) { + /* Old request-based interface: allocated clone is passed in */ + clone->q = bdev_get_queue(bdev); + clone->rq_disk = bdev->bd_disk; + clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; + } else { + /* blk-mq request-based interface */ + *__clone = blk_get_request(bdev_get_queue(bdev), + rq_data_dir(rq), GFP_KERNEL); + if (IS_ERR(*__clone)) + /* ENOMEM, requeue */ + return r; + (*__clone)->bio = (*__clone)->biotail = NULL; + (*__clone)->rq_disk = bdev->bd_disk; + (*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT; + } + if (pgpath->pg->ps.type->start_io) pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, nr_bytes); - r = DM_MAPIO_REMAPPED; + return DM_MAPIO_REMAPPED; out_unlock: - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); return r; } +static int multipath_map(struct dm_target *ti, struct request *clone, + union 
map_info *map_context) +{ + return __multipath_map(ti, clone, map_context, NULL, NULL); +} + +static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, + union map_info *map_context, + struct request **clone) +{ + return __multipath_map(ti, NULL, map_context, rq, clone); +} + +static void multipath_release_clone(struct request *clone) +{ + blk_put_request(clone); +} + /* * If we run out of usable paths, should we queue I/O or error it? */ @@ -1666,11 +1703,13 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 7, 0}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, .map_rq = multipath_map, + .clone_and_map_rq = multipath_clone_and_map, + .release_clone_rq = multipath_release_clone, .rq_end_io = multipath_end_io, .presuspend = multipath_presuspend, .postsuspend = multipath_postsuspend, @@ -1694,16 +1733,15 @@ static int __init dm_multipath_init(void) r = dm_register_target(&multipath_target); if (r < 0) { DMERR("register failed %d", r); - kmem_cache_destroy(_mpio_cache); - return -EINVAL; + r = -EINVAL; + goto bad_register_target; } kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); if (!kmultipathd) { DMERR("failed to create workqueue kmpathd"); - dm_unregister_target(&multipath_target); - kmem_cache_destroy(_mpio_cache); - return -ENOMEM; + r = -ENOMEM; + goto bad_alloc_kmultipathd; } /* @@ -1716,16 +1754,23 @@ static int __init dm_multipath_init(void) WQ_MEM_RECLAIM); if (!kmpath_handlerd) { DMERR("failed to create workqueue kmpath_handlerd"); - destroy_workqueue(kmultipathd); - dm_unregister_target(&multipath_target); - kmem_cache_destroy(_mpio_cache); - return -ENOMEM; + r = -ENOMEM; + goto bad_alloc_kmpath_handlerd; } DMINFO("version %u.%u.%u loaded", multipath_target.version[0], multipath_target.version[1], multipath_target.version[2]); + return 0; + +bad_alloc_kmpath_handlerd: + 
destroy_workqueue(kmultipathd); +bad_alloc_kmultipathd: + dm_unregister_target(&multipath_target); +bad_register_target: + kmem_cache_destroy(_mpio_cache); + return r; } diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 07c0fa0..88e4c7f 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -746,13 +746,7 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) { struct raid_set *rs = container_of(cb, struct raid_set, callbacks); - if (rs->raid_type->level == 1) - return md_raid1_congested(&rs->md, bits); - - if (rs->raid_type->level == 10) - return md_raid10_congested(&rs->md, bits); - - return md_raid5_congested(&rs->md, bits); + return mddev_congested(&rs->md, bits); } /* @@ -1243,7 +1237,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) argv++; /* Skip over RAID params for now and find out # of devices */ - if (num_raid_params + 1 > argc) { + if (num_raid_params >= argc) { ti->error = "Arguments do not agree with counts given"; return -EINVAL; } @@ -1254,6 +1248,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } + argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ + if (argc != (num_raid_devs * 2)) { + ti->error = "Supplied RAID devices does not match the count given"; + return -EINVAL; + } + rs = context_alloc(ti, rt, (unsigned)num_raid_devs); if (IS_ERR(rs)) return PTR_ERR(rs); @@ -1262,16 +1262,8 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) if (ret) goto bad; - ret = -EINVAL; - - argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ argv += num_raid_params + 1; - if (argc != (num_raid_devs * 2)) { - ti->error = "Supplied RAID devices does not match the count given"; - goto bad; - } - ret = dev_parms(rs, argv); if (ret) goto bad; diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index d6e8817..808b841 100644 --- a/drivers/md/dm-snap-persistent.c +++ 
b/drivers/md/dm-snap-persistent.c @@ -200,16 +200,11 @@ err_area: static void free_area(struct pstore *ps) { - if (ps->area) - vfree(ps->area); + vfree(ps->area); ps->area = NULL; - - if (ps->zero_area) - vfree(ps->zero_area); + vfree(ps->zero_area); ps->zero_area = NULL; - - if (ps->header_area) - vfree(ps->header_area); + vfree(ps->header_area); ps->header_area = NULL; } @@ -605,8 +600,7 @@ static void persistent_dtr(struct dm_exception_store *store) free_area(ps); /* Allocated in persistent_read_metadata */ - if (ps->callbacks) - vfree(ps->callbacks); + vfree(ps->callbacks); kfree(ps); } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 3afae9e..6554d91 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -827,10 +827,11 @@ static int dm_table_set_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; + bool use_blk_mq = false; struct dm_target *tgt; struct dm_dev_internal *dd; struct list_head *devices; - unsigned live_md_type; + unsigned live_md_type = dm_get_md_type(t->md); for (i = 0; i < t->num_targets; i++) { tgt = t->targets + i; @@ -854,8 +855,8 @@ static int dm_table_set_type(struct dm_table *t) * Determine the type from the live device. * Default to bio-based if device is new. 
*/ - live_md_type = dm_get_md_type(t->md); - if (live_md_type == DM_TYPE_REQUEST_BASED) + if (live_md_type == DM_TYPE_REQUEST_BASED || + live_md_type == DM_TYPE_MQ_REQUEST_BASED) request_based = 1; else bio_based = 1; @@ -869,16 +870,6 @@ static int dm_table_set_type(struct dm_table *t) BUG_ON(!request_based); /* No targets in this table */ - /* Non-request-stackable devices can't be used for request-based dm */ - devices = dm_table_get_devices(t); - list_for_each_entry(dd, devices, list) { - if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev->bdev))) { - DMWARN("table load rejected: including" - " non-request-stackable devices"); - return -EINVAL; - } - } - /* * Request-based dm supports only tables that have a single target now. * To support multiple targets, request splitting support is needed, @@ -890,7 +881,37 @@ static int dm_table_set_type(struct dm_table *t) return -EINVAL; } - t->type = DM_TYPE_REQUEST_BASED; + /* Non-request-stackable devices can't be used for request-based dm */ + devices = dm_table_get_devices(t); + list_for_each_entry(dd, devices, list) { + struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); + + if (!blk_queue_stackable(q)) { + DMERR("table load rejected: including" + " non-request-stackable devices"); + return -EINVAL; + } + + if (q->mq_ops) + use_blk_mq = true; + } + + if (use_blk_mq) { + /* verify _all_ devices in the table are blk-mq devices */ + list_for_each_entry(dd, devices, list) + if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) { + DMERR("table load rejected: not all devices" + " are blk-mq request-stackable"); + return -EINVAL; + } + t->type = DM_TYPE_MQ_REQUEST_BASED; + + } else if (hybrid && list_empty(devices) && live_md_type != DM_TYPE_NONE) { + /* inherit live MD type */ + t->type = live_md_type; + + } else + t->type = DM_TYPE_REQUEST_BASED; return 0; } @@ -907,7 +928,15 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t) bool dm_table_request_based(struct dm_table *t) { - return 
dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; + unsigned table_type = dm_table_get_type(t); + + return (table_type == DM_TYPE_REQUEST_BASED || + table_type == DM_TYPE_MQ_REQUEST_BASED); +} + +bool dm_table_mq_request_based(struct dm_table *t) +{ + return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED; } static int dm_table_alloc_md_mempools(struct dm_table *t) @@ -1360,6 +1389,14 @@ static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev, return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); } +static int queue_supports_sg_gaps(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && !test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags); +} + static bool dm_table_all_devices_attribute(struct dm_table *t, iterate_devices_callout_fn func) { @@ -1480,6 +1517,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + if (dm_table_all_devices_attribute(t, queue_supports_sg_gaps)) + queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, q); + else + queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, q); + dm_table_set_integrity(t); /* diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 242e3ce..925ec1b 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -137,13 +137,26 @@ static int io_err_map_rq(struct dm_target *ti, struct request *clone, return -EIO; } +static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq, + union map_info *map_context, + struct request **clone) +{ + return -EIO; +} + +static void io_err_release_clone_rq(struct request *clone) +{ +} + static struct target_type error_target = { .name = "error", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, .map_rq = io_err_map_rq, + .clone_and_map_rq = io_err_clone_and_map_rq, + .release_clone_rq = 
io_err_release_clone_rq, }; int __init dm_target_init(void) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 43adbb8..79f6941 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1635,15 +1635,6 @@ int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, return r; } -int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result) -{ - down_read(&pmd->root_lock); - *result = pmd->data_block_size; - up_read(&pmd->root_lock); - - return 0; -} - int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) { int r = -EINVAL; diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 921d15e..fac01a9 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -182,8 +182,6 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); -int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result); - int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 07705ee..654773c 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -11,6 +11,7 @@ #include <linux/device-mapper.h> #include <linux/dm-io.h> #include <linux/dm-kcopyd.h> +#include <linux/jiffies.h> #include <linux/log2.h> #include <linux/list.h> #include <linux/rculist.h> @@ -1700,8 +1701,8 @@ static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell */ static int need_commit_due_to_time(struct pool *pool) { - return jiffies < pool->last_commit_jiffies || - jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; + return !time_in_range(jiffies, pool->last_commit_jiffies, + pool->last_commit_jiffies + COMMIT_PERIOD); } #define thin_pbd(node) rb_entry((node), 
struct dm_thin_endio_hook, rb_node) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2caf5b3..ec1444f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -20,6 +20,7 @@ #include <linux/hdreg.h> #include <linux/delay.h> #include <linux/wait.h> +#include <linux/kthread.h> #include <trace/events/block.h> @@ -78,7 +79,8 @@ struct dm_io { struct dm_rq_target_io { struct mapped_device *md; struct dm_target *ti; - struct request *orig, clone; + struct request *orig, *clone; + struct kthread_work work; int error; union map_info info; }; @@ -179,6 +181,7 @@ struct mapped_device { * io objects are allocated from here. */ mempool_t *io_pool; + mempool_t *rq_pool; struct bio_set *bs; @@ -210,6 +213,9 @@ struct mapped_device { unsigned internal_suspend_count; struct dm_stats stats; + + struct kthread_worker kworker; + struct task_struct *kworker_task; }; /* @@ -217,6 +223,7 @@ struct mapped_device { */ struct dm_md_mempools { mempool_t *io_pool; + mempool_t *rq_pool; struct bio_set *bs; }; @@ -231,6 +238,7 @@ struct table_device { #define RESERVED_MAX_IOS 1024 static struct kmem_cache *_io_cache; static struct kmem_cache *_rq_tio_cache; +static struct kmem_cache *_rq_cache; /* * Bio-based DM's mempools' reserved IOs set by the user. 
@@ -288,9 +296,14 @@ static int __init local_init(void) if (!_rq_tio_cache) goto out_free_io_cache; + _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request), + __alignof__(struct request), 0, NULL); + if (!_rq_cache) + goto out_free_rq_tio_cache; + r = dm_uevent_init(); if (r) - goto out_free_rq_tio_cache; + goto out_free_rq_cache; deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); if (!deferred_remove_workqueue) { @@ -312,6 +325,8 @@ out_free_workqueue: destroy_workqueue(deferred_remove_workqueue); out_uevent_exit: dm_uevent_exit(); +out_free_rq_cache: + kmem_cache_destroy(_rq_cache); out_free_rq_tio_cache: kmem_cache_destroy(_rq_tio_cache); out_free_io_cache: @@ -325,6 +340,7 @@ static void local_exit(void) flush_scheduled_work(); destroy_workqueue(deferred_remove_workqueue); + kmem_cache_destroy(_rq_cache); kmem_cache_destroy(_rq_tio_cache); kmem_cache_destroy(_io_cache); unregister_blkdev(_major, _name); @@ -577,6 +593,17 @@ static void free_rq_tio(struct dm_rq_target_io *tio) mempool_free(tio, tio->md->io_pool); } +static struct request *alloc_clone_request(struct mapped_device *md, + gfp_t gfp_mask) +{ + return mempool_alloc(md->rq_pool, gfp_mask); +} + +static void free_clone_request(struct mapped_device *md, struct request *rq) +{ + mempool_free(rq, md->rq_pool); +} + static int md_in_flight(struct mapped_device *md) { return atomic_read(&md->pending[READ]) + @@ -992,7 +1019,7 @@ static void end_clone_bio(struct bio *clone, int error) * the md may be freed in dm_put() at the end of this function. * Or do dm_get() before calling this function and dm_put() later. 
*/ -static void rq_completed(struct mapped_device *md, int rw, int run_queue) +static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { atomic_dec(&md->pending[rw]); @@ -1020,12 +1047,17 @@ static void free_rq_clone(struct request *clone) struct dm_rq_target_io *tio = clone->end_io_data; blk_rq_unprep_clone(clone); + if (clone->q && clone->q->mq_ops) + tio->ti->type->release_clone_rq(clone); + else + free_clone_request(tio->md, clone); free_rq_tio(tio); } /* * Complete the clone and the original request. - * Must be called without queue lock. + * Must be called without clone's queue lock held, + * see end_clone_request() for more details. */ static void dm_end_request(struct request *clone, int error) { @@ -1054,23 +1086,23 @@ static void dm_end_request(struct request *clone, int error) static void dm_unprep_request(struct request *rq) { - struct request *clone = rq->special; + struct dm_rq_target_io *tio = rq->special; + struct request *clone = tio->clone; rq->special = NULL; rq->cmd_flags &= ~REQ_DONTPREP; - free_rq_clone(clone); + if (clone) + free_rq_clone(clone); } /* * Requeue the original request of a clone. 
*/ -void dm_requeue_unmapped_request(struct request *clone) +static void dm_requeue_unmapped_original_request(struct mapped_device *md, + struct request *rq) { - int rw = rq_data_dir(clone); - struct dm_rq_target_io *tio = clone->end_io_data; - struct mapped_device *md = tio->md; - struct request *rq = tio->orig; + int rw = rq_data_dir(rq); struct request_queue *q = rq->q; unsigned long flags; @@ -1080,9 +1112,15 @@ void dm_requeue_unmapped_request(struct request *clone) blk_requeue_request(q, rq); spin_unlock_irqrestore(q->queue_lock, flags); - rq_completed(md, rw, 0); + rq_completed(md, rw, false); +} + +static void dm_requeue_unmapped_request(struct request *clone) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + + dm_requeue_unmapped_original_request(tio->md, tio->orig); } -EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); static void __stop_queue(struct request_queue *q) { @@ -1151,8 +1189,15 @@ static void dm_done(struct request *clone, int error, bool mapped) static void dm_softirq_done(struct request *rq) { bool mapped = true; - struct request *clone = rq->completion_data; - struct dm_rq_target_io *tio = clone->end_io_data; + struct dm_rq_target_io *tio = rq->special; + struct request *clone = tio->clone; + + if (!clone) { + blk_end_request_all(rq, tio->error); + rq_completed(tio->md, rq_data_dir(rq), false); + free_rq_tio(tio); + return; + } if (rq->cmd_flags & REQ_FAILED) mapped = false; @@ -1164,13 +1209,11 @@ static void dm_softirq_done(struct request *rq) * Complete the clone and the original request with the error status * through softirq context. 
*/ -static void dm_complete_request(struct request *clone, int error) +static void dm_complete_request(struct request *rq, int error) { - struct dm_rq_target_io *tio = clone->end_io_data; - struct request *rq = tio->orig; + struct dm_rq_target_io *tio = rq->special; tio->error = error; - rq->completion_data = clone; blk_complete_request(rq); } @@ -1178,40 +1221,40 @@ static void dm_complete_request(struct request *clone, int error) * Complete the not-mapped clone and the original request with the error status * through softirq context. * Target's rq_end_io() function isn't called. - * This may be used when the target's map_rq() function fails. + * This may be used when the target's map_rq() or clone_and_map_rq() functions fail. */ -void dm_kill_unmapped_request(struct request *clone, int error) +static void dm_kill_unmapped_request(struct request *rq, int error) { - struct dm_rq_target_io *tio = clone->end_io_data; - struct request *rq = tio->orig; - rq->cmd_flags |= REQ_FAILED; - dm_complete_request(clone, error); + dm_complete_request(rq, error); } -EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); /* - * Called with the queue lock held + * Called with the clone's queue lock held */ static void end_clone_request(struct request *clone, int error) { - /* - * For just cleaning up the information of the queue in which - * the clone was dispatched. - * The clone is *NOT* freed actually here because it is alloced from - * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. - */ - __blk_put_request(clone->q, clone); + struct dm_rq_target_io *tio = clone->end_io_data; + + if (!clone->q->mq_ops) { + /* + * For just cleaning up the information of the queue in which + * the clone was dispatched. + * The clone is *NOT* freed actually here because it is alloced + * from dm own mempool (REQ_ALLOCED isn't set). + */ + __blk_put_request(clone->q, clone); + } /* * Actual request completion is done in a softirq context which doesn't - * hold the queue lock. 
Otherwise, deadlock could occur because: + * hold the clone's queue lock. Otherwise, deadlock could occur because: * - another request may be submitted by the upper level driver * of the stacking during the completion * - the submission which requires queue lock may be done - * against this queue + * against this clone's queue */ - dm_complete_request(clone, error); + dm_complete_request(tio->orig, error); } /* @@ -1689,19 +1732,19 @@ static void dm_request(struct request_queue *q, struct bio *bio) _dm_request(q, bio); } -void dm_dispatch_request(struct request *rq) +static void dm_dispatch_clone_request(struct request *clone, struct request *rq) { int r; - if (blk_queue_io_stat(rq->q)) - rq->cmd_flags |= REQ_IO_STAT; + if (blk_queue_io_stat(clone->q)) + clone->cmd_flags |= REQ_IO_STAT; - rq->start_time = jiffies; - r = blk_insert_cloned_request(rq->q, rq); + clone->start_time = jiffies; + r = blk_insert_cloned_request(clone->q, clone); if (r) + /* must complete clone in terms of original request */ dm_complete_request(rq, r); } -EXPORT_SYMBOL_GPL(dm_dispatch_request); static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, void *data) @@ -1718,11 +1761,11 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, } static int setup_clone(struct request *clone, struct request *rq, - struct dm_rq_target_io *tio) + struct dm_rq_target_io *tio, gfp_t gfp_mask) { int r; - r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, + r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask, dm_rq_bio_constructor, tio); if (r) return r; @@ -1733,14 +1776,37 @@ static int setup_clone(struct request *clone, struct request *rq, clone->end_io = end_clone_request; clone->end_io_data = tio; + tio->clone = clone; + return 0; } static struct request *clone_rq(struct request *rq, struct mapped_device *md, - gfp_t gfp_mask) + struct dm_rq_target_io *tio, gfp_t gfp_mask) +{ + struct request *clone = alloc_clone_request(md, gfp_mask); + + if (!clone) + 
return NULL; + + blk_rq_init(NULL, clone); + if (setup_clone(clone, rq, tio, gfp_mask)) { + /* -ENOMEM */ + free_clone_request(md, clone); + return NULL; + } + + return clone; +} + +static void map_tio_request(struct kthread_work *work); + +static struct dm_rq_target_io *prep_tio(struct request *rq, + struct mapped_device *md, gfp_t gfp_mask) { - struct request *clone; struct dm_rq_target_io *tio; + int srcu_idx; + struct dm_table *table; tio = alloc_rq_tio(md, gfp_mask); if (!tio) @@ -1748,18 +1814,23 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md, tio->md = md; tio->ti = NULL; + tio->clone = NULL; tio->orig = rq; tio->error = 0; memset(&tio->info, 0, sizeof(tio->info)); - - clone = &tio->clone; - if (setup_clone(clone, rq, tio)) { - /* -ENOMEM */ - free_rq_tio(tio); - return NULL; + init_kthread_work(&tio->work, map_tio_request); + + table = dm_get_live_table(md, &srcu_idx); + if (!dm_table_mq_request_based(table)) { + if (!clone_rq(rq, md, tio, gfp_mask)) { + dm_put_live_table(md, srcu_idx); + free_rq_tio(tio); + return NULL; + } } + dm_put_live_table(md, srcu_idx); - return clone; + return tio; } /* @@ -1768,18 +1839,18 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md, static int dm_prep_fn(struct request_queue *q, struct request *rq) { struct mapped_device *md = q->queuedata; - struct request *clone; + struct dm_rq_target_io *tio; if (unlikely(rq->special)) { DMWARN("Already has something in rq->special."); return BLKPREP_KILL; } - clone = clone_rq(rq, md, GFP_ATOMIC); - if (!clone) + tio = prep_tio(rq, md, GFP_ATOMIC); + if (!tio) return BLKPREP_DEFER; - rq->special = clone; + rq->special = tio; rq->cmd_flags |= REQ_DONTPREP; return BLKPREP_OK; @@ -1787,17 +1858,36 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) /* * Returns: - * 0 : the request has been processed (not requeued) - * !0 : the request has been requeued + * 0 : the request has been processed + * 
DM_MAPIO_REQUEUE : the original request needs to be requeued + * < 0 : the request was completed due to failure */ -static int map_request(struct dm_target *ti, struct request *clone, +static int map_request(struct dm_target *ti, struct request *rq, struct mapped_device *md) { - int r, requeued = 0; - struct dm_rq_target_io *tio = clone->end_io_data; + int r; + struct dm_rq_target_io *tio = rq->special; + struct request *clone = NULL; + + if (tio->clone) { + clone = tio->clone; + r = ti->type->map_rq(ti, clone, &tio->info); + } else { + r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); + if (r < 0) { + /* The target wants to complete the I/O */ + dm_kill_unmapped_request(rq, r); + return r; + } + if (IS_ERR(clone)) + return DM_MAPIO_REQUEUE; + if (setup_clone(clone, rq, tio, GFP_KERNEL)) { + /* -ENOMEM */ + ti->type->release_clone_rq(clone); + return DM_MAPIO_REQUEUE; + } + } - tio->ti = ti; - r = ti->type->map_rq(ti, clone, &tio->info); switch (r) { case DM_MAPIO_SUBMITTED: /* The target has taken the I/O to submit by itself later */ @@ -1805,13 +1895,12 @@ static int map_request(struct dm_target *ti, struct request *clone, case DM_MAPIO_REMAPPED: /* The target has remapped the I/O so dispatch it */ trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), - blk_rq_pos(tio->orig)); - dm_dispatch_request(clone); + blk_rq_pos(rq)); + dm_dispatch_clone_request(clone, rq); break; case DM_MAPIO_REQUEUE: /* The target wants to requeue the I/O */ dm_requeue_unmapped_request(clone); - requeued = 1; break; default: if (r > 0) { @@ -1820,20 +1909,27 @@ static int map_request(struct dm_target *ti, struct request *clone, } /* The target wants to complete the I/O */ - dm_kill_unmapped_request(clone, r); - break; + dm_kill_unmapped_request(rq, r); + return r; } - return requeued; + return 0; } -static struct request *dm_start_request(struct mapped_device *md, struct request *orig) +static void map_tio_request(struct kthread_work *work) { - struct request *clone; + 
struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); + struct request *rq = tio->orig; + struct mapped_device *md = tio->md; + if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE) + dm_requeue_unmapped_original_request(md, rq); +} + +static void dm_start_request(struct mapped_device *md, struct request *orig) +{ blk_start_request(orig); - clone = orig->special; - atomic_inc(&md->pending[rq_data_dir(clone)]); + atomic_inc(&md->pending[rq_data_dir(orig)]); /* * Hold the md reference here for the in-flight I/O. @@ -1843,8 +1939,6 @@ static struct request *dm_start_request(struct mapped_device *md, struct request * See the comment in rq_completed() too. */ dm_get(md); - - return clone; } /* @@ -1857,7 +1951,8 @@ static void dm_request_fn(struct request_queue *q) int srcu_idx; struct dm_table *map = dm_get_live_table(md, &srcu_idx); struct dm_target *ti; - struct request *rq, *clone; + struct request *rq; + struct dm_rq_target_io *tio; sector_t pos; /* @@ -1879,34 +1974,29 @@ static void dm_request_fn(struct request_queue *q) ti = dm_table_find_target(map, pos); if (!dm_target_is_valid(ti)) { /* - * Must perform setup, that dm_done() requires, + * Must perform setup, that rq_completed() requires, * before calling dm_kill_unmapped_request */ DMERR_LIMIT("request attempted access beyond the end of device"); - clone = dm_start_request(md, rq); - dm_kill_unmapped_request(clone, -EIO); + dm_start_request(md, rq); + dm_kill_unmapped_request(rq, -EIO); continue; } if (ti->type->busy && ti->type->busy(ti)) goto delay_and_out; - clone = dm_start_request(md, rq); - - spin_unlock(q->queue_lock); - if (map_request(ti, clone, md)) - goto requeued; + dm_start_request(md, rq); + tio = rq->special; + /* Establish tio->ti before queuing work (map_tio_request) */ + tio->ti = ti; + queue_kthread_work(&md->kworker, &tio->work); BUG_ON(!irqs_disabled()); - spin_lock(q->queue_lock); } goto out; -requeued: - BUG_ON(!irqs_disabled()); - spin_lock(q->queue_lock); - 
delay_and_out: blk_delay_queue(q, HZ / 10); out: @@ -2092,6 +2182,7 @@ static struct mapped_device *alloc_dev(int minor) INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); init_completion(&md->kobj_holder.completion); + md->kworker_task = NULL; md->disk->major = _major; md->disk->first_minor = minor; @@ -2152,8 +2243,13 @@ static void free_dev(struct mapped_device *md) unlock_fs(md); bdput(md->bdev); destroy_workqueue(md->wq); + + if (md->kworker_task) + kthread_stop(md->kworker_task); if (md->io_pool) mempool_destroy(md->io_pool); + if (md->rq_pool) + mempool_destroy(md->rq_pool); if (md->bs) bioset_free(md->bs); blk_integrity_unregister(md->disk); @@ -2187,23 +2283,24 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) bioset_free(md->bs); md->bs = p->bs; p->bs = NULL; - } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) { - /* - * There's no need to reload with request-based dm - * because the size of front_pad doesn't change. - * Note for future: If you are to reload bioset, - * prep-ed requests in the queue may refer - * to bio from the old bioset, so you must walk - * through the queue to unprep. - */ } + /* + * There's no need to reload with request-based dm + * because the size of front_pad doesn't change. + * Note for future: If you are to reload bioset, + * prep-ed requests in the queue may refer + * to bio from the old bioset, so you must walk + * through the queue to unprep. 
+ */ goto out; } - BUG_ON(!p || md->io_pool || md->bs); + BUG_ON(!p || md->io_pool || md->rq_pool || md->bs); md->io_pool = p->io_pool; p->io_pool = NULL; + md->rq_pool = p->rq_pool; + p->rq_pool = NULL; md->bs = p->bs; p->bs = NULL; @@ -2406,6 +2503,14 @@ unsigned dm_get_md_type(struct mapped_device *md) return md->type; } +static bool dm_md_type_request_based(struct mapped_device *md) +{ + unsigned table_type = dm_get_md_type(md); + + return (table_type == DM_TYPE_REQUEST_BASED || + table_type == DM_TYPE_MQ_REQUEST_BASED); +} + struct target_type *dm_get_immutable_target_type(struct mapped_device *md) { return md->immutable_target_type; @@ -2443,6 +2548,11 @@ static int dm_init_request_based_queue(struct mapped_device *md) blk_queue_prep_rq(md->queue, dm_prep_fn); blk_queue_lld_busy(md->queue, dm_lld_busy); + /* Also initialize the request-based DM worker thread */ + init_kthread_worker(&md->kworker); + md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, + "kdmwork-%s", dm_device_name(md)); + elv_register_queue(md->queue); return 1; @@ -2453,8 +2563,7 @@ static int dm_init_request_based_queue(struct mapped_device *md) */ int dm_setup_md_queue(struct mapped_device *md) { - if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && - !dm_init_request_based_queue(md)) { + if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) { DMWARN("Cannot initialize queue for request-based mapped device"); return -EINVAL; } @@ -2533,6 +2642,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait) set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); + if (dm_request_based(md)) + flush_kthread_worker(&md->kworker); + if (!dm_suspended_md(md)) { dm_table_presuspend_targets(map); dm_table_postsuspend_targets(map); @@ -2776,8 +2888,10 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * Stop md->queue before flushing md->wq in case request-based * dm defers requests to md->wq from md->queue. 
*/ - if (dm_request_based(md)) + if (dm_request_based(md)) { stop_queue(md->queue); + flush_kthread_worker(&md->kworker); + } flush_workqueue(md->wq); @@ -3123,24 +3237,35 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u { struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); struct kmem_cache *cachep; - unsigned int pool_size; + unsigned int pool_size = 0; unsigned int front_pad; if (!pools) return NULL; - if (type == DM_TYPE_BIO_BASED) { + switch (type) { + case DM_TYPE_BIO_BASED: cachep = _io_cache; pool_size = dm_get_reserved_bio_based_ios(); front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); - } else if (type == DM_TYPE_REQUEST_BASED) { - cachep = _rq_tio_cache; + break; + case DM_TYPE_REQUEST_BASED: pool_size = dm_get_reserved_rq_based_ios(); + pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); + if (!pools->rq_pool) + goto out; + /* fall through to setup remaining rq-based pools */ + case DM_TYPE_MQ_REQUEST_BASED: + cachep = _rq_tio_cache; + if (!pool_size) + pool_size = dm_get_reserved_rq_based_ios(); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); /* per_bio_data_size is not used. See __bind_mempools(). 
*/ WARN_ON(per_bio_data_size != 0); - } else + break; + default: goto out; + } pools->io_pool = mempool_create_slab_pool(pool_size, cachep); if (!pools->io_pool) @@ -3169,6 +3294,9 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) if (pools->io_pool) mempool_destroy(pools->io_pool); + if (pools->rq_pool) + mempool_destroy(pools->rq_pool); + if (pools->bs) bioset_free(pools->bs); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 84b0f9e4..59f53e7 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -34,9 +34,10 @@ /* * Type of table and mapped_device's mempool */ -#define DM_TYPE_NONE 0 -#define DM_TYPE_BIO_BASED 1 -#define DM_TYPE_REQUEST_BASED 2 +#define DM_TYPE_NONE 0 +#define DM_TYPE_BIO_BASED 1 +#define DM_TYPE_REQUEST_BASED 2 +#define DM_TYPE_MQ_REQUEST_BASED 3 /* * List of devices that a metadevice uses and should open/close. @@ -73,6 +74,7 @@ int dm_table_any_busy_target(struct dm_table *t); unsigned dm_table_get_type(struct dm_table *t); struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); +bool dm_table_mq_request_based(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); @@ -99,7 +101,8 @@ int dm_setup_md_queue(struct mapped_device *md); /* * To check whether the target type is request-based or not (bio-based). 
*/ -#define dm_target_request_based(t) ((t)->type->map_rq != NULL) +#define dm_target_request_based(t) (((t)->type->map_rq != NULL) || \ + ((t)->type->clone_and_map_rq != NULL)) /* * To check whether the target type is a hybrid (capable of being diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index e8b4574..1277eb2 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -332,13 +332,11 @@ static int run(struct mddev *mddev) return 0; } -static int stop(struct mddev *mddev) +static void faulty_free(struct mddev *mddev, void *priv) { - struct faulty_conf *conf = mddev->private; + struct faulty_conf *conf = priv; kfree(conf); - mddev->private = NULL; - return 0; } static struct md_personality faulty_personality = @@ -348,7 +346,7 @@ static struct md_personality faulty_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = faulty_free, .status = status, .check_reshape = reshape, .size = faulty_size, diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 64713b7..fa7d577 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -34,7 +34,7 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) lo = 0; hi = mddev->raid_disks - 1; - conf = rcu_dereference(mddev->private); + conf = mddev->private; /* * Binary Search @@ -60,18 +60,16 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) * * Return amount of bytes we can take at this offset */ -static int linear_mergeable_bvec(struct request_queue *q, +static int linear_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct dev_info *dev0; unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int maxbytes = biovec->bv_len; struct request_queue *subq; - rcu_read_lock(); dev0 = which_dev(mddev, sector); maxsectors = dev0->end_sector - sector; subq = 
bdev_get_queue(dev0->rdev->bdev); @@ -81,7 +79,6 @@ static int linear_mergeable_bvec(struct request_queue *q, maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm, biovec)); } - rcu_read_unlock(); if (maxsectors < bio_sectors) maxsectors = 0; @@ -97,24 +94,18 @@ static int linear_mergeable_bvec(struct request_queue *q, return maxsectors << 9; } -static int linear_congested(void *data, int bits) +static int linear_congested(struct mddev *mddev, int bits) { - struct mddev *mddev = data; struct linear_conf *conf; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - - rcu_read_lock(); - conf = rcu_dereference(mddev->private); + conf = mddev->private; for (i = 0; i < mddev->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); } - rcu_read_unlock(); return ret; } @@ -123,12 +114,10 @@ static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disk struct linear_conf *conf; sector_t array_sectors; - rcu_read_lock(); - conf = rcu_dereference(mddev->private); + conf = mddev->private; WARN_ONCE(sectors || raid_disks, "%s does not support generic reshape\n", __func__); array_sectors = conf->array_sectors; - rcu_read_unlock(); return array_sectors; } @@ -217,10 +206,6 @@ static int linear_run (struct mddev *mddev) mddev->private = conf; md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); - blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); - mddev->queue->backing_dev_info.congested_fn = linear_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - ret = md_integrity_register(mddev); if (ret) { kfree(conf); @@ -252,38 +237,23 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) if (!newconf) return -ENOMEM; - oldconf = rcu_dereference_protected(mddev->private, - lockdep_is_held( - &mddev->reconfig_mutex)); + mddev_suspend(mddev); + oldconf = mddev->private; mddev->raid_disks++; - rcu_assign_pointer(mddev->private, 
newconf); + mddev->private = newconf; md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); + mddev_resume(mddev); revalidate_disk(mddev->gendisk); - kfree_rcu(oldconf, rcu); + kfree(oldconf); return 0; } -static int linear_stop (struct mddev *mddev) +static void linear_free(struct mddev *mddev, void *priv) { - struct linear_conf *conf = - rcu_dereference_protected(mddev->private, - lockdep_is_held( - &mddev->reconfig_mutex)); + struct linear_conf *conf = priv; - /* - * We do not require rcu protection here since - * we hold reconfig_mutex for both linear_add and - * linear_stop, so they cannot race. - * We should make sure any old 'conf's are properly - * freed though. - */ - rcu_barrier(); - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf); - mddev->private = NULL; - - return 0; } static void linear_make_request(struct mddev *mddev, struct bio *bio) @@ -299,16 +269,12 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) } do { - rcu_read_lock(); - tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; end_sector = tmp_dev->end_sector; data_offset = tmp_dev->rdev->data_offset; bio->bi_bdev = tmp_dev->rdev->bdev; - rcu_read_unlock(); - if (unlikely(bio->bi_iter.bi_sector >= end_sector || bio->bi_iter.bi_sector < start_sector)) goto out_of_bounds; @@ -355,6 +321,10 @@ static void linear_status (struct seq_file *seq, struct mddev *mddev) seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); } +static void linear_quiesce(struct mddev *mddev, int state) +{ +} + static struct md_personality linear_personality = { .name = "linear", @@ -362,10 +332,13 @@ static struct md_personality linear_personality = .owner = THIS_MODULE, .make_request = linear_make_request, .run = linear_run, - .stop = linear_stop, + .free = linear_free, .status = linear_status, .hot_add_disk = linear_add, .size = linear_size, + .quiesce 
= linear_quiesce, + .congested = linear_congested, + .mergeable_bvec = linear_mergeable_bvec, }; static int __init linear_init (void) diff --git a/drivers/md/md.c b/drivers/md/md.c index 709755f..c8d2bac 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -72,6 +72,7 @@ static struct workqueue_struct *md_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); +static void mddev_detach(struct mddev *mddev); /* * Default number of read corrections we'll attempt on an rdev @@ -292,8 +293,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio) /* mddev_suspend makes sure no new requests are submitted * to the device, and that any requests that have been submitted * are completely handled. - * Once ->stop is called and completes, the module will be completely - * unused. + * Once mddev_detach() is called and completes, the module will be + * completely unused. */ void mddev_suspend(struct mddev *mddev) { @@ -321,10 +322,47 @@ EXPORT_SYMBOL_GPL(mddev_resume); int mddev_congested(struct mddev *mddev, int bits) { - return mddev->suspended; + struct md_personality *pers = mddev->pers; + int ret = 0; + + rcu_read_lock(); + if (mddev->suspended) + ret = 1; + else if (pers && pers->congested) + ret = pers->congested(mddev, bits); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(mddev_congested); +static int md_congested(void *data, int bits) +{ + struct mddev *mddev = data; + return mddev_congested(mddev, bits); } -EXPORT_SYMBOL(mddev_congested); +static int md_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) +{ + struct mddev *mddev = q->queuedata; + int ret; + rcu_read_lock(); + if (mddev->suspended) { + /* Must always allow one vec */ + if (bvm->bi_size == 0) + ret = biovec->bv_len; + else + ret = 0; + } else { + struct md_personality *pers = mddev->pers; + if (pers && pers->mergeable_bvec) + ret = pers->mergeable_bvec(mddev, bvm, biovec); + else + ret = 
biovec->bv_len; + } + rcu_read_unlock(); + return ret; +} /* * Generic flush handling for md */ @@ -397,12 +435,12 @@ static void md_submit_flush_data(struct work_struct *ws) void md_flush_request(struct mddev *mddev, struct bio *bio) { - spin_lock_irq(&mddev->write_lock); + spin_lock_irq(&mddev->lock); wait_event_lock_irq(mddev->sb_wait, !mddev->flush_bio, - mddev->write_lock); + mddev->lock); mddev->flush_bio = bio; - spin_unlock_irq(&mddev->write_lock); + spin_unlock_irq(&mddev->lock); INIT_WORK(&mddev->flush_work, submit_flushes); queue_work(md_wq, &mddev->flush_work); @@ -465,7 +503,7 @@ void mddev_init(struct mddev *mddev) atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); - spin_lock_init(&mddev->write_lock); + spin_lock_init(&mddev->lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); @@ -552,32 +590,9 @@ static struct mddev *mddev_find(dev_t unit) goto retry; } -static inline int __must_check mddev_lock(struct mddev *mddev) -{ - return mutex_lock_interruptible(&mddev->reconfig_mutex); -} - -/* Sometimes we need to take the lock in a situation where - * failure due to interrupts is not acceptable. 
- */ -static inline void mddev_lock_nointr(struct mddev *mddev) -{ - mutex_lock(&mddev->reconfig_mutex); -} - -static inline int mddev_is_locked(struct mddev *mddev) -{ - return mutex_is_locked(&mddev->reconfig_mutex); -} - -static inline int mddev_trylock(struct mddev *mddev) -{ - return mutex_trylock(&mddev->reconfig_mutex); -} - static struct attribute_group md_redundancy_group; -static void mddev_unlock(struct mddev *mddev) +void mddev_unlock(struct mddev *mddev) { if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as @@ -619,6 +634,7 @@ static void mddev_unlock(struct mddev *mddev) md_wakeup_thread(mddev->thread); spin_unlock(&pers_lock); } +EXPORT_SYMBOL_GPL(mddev_unlock); static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) { @@ -2230,7 +2246,7 @@ repeat: return; } - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); mddev->utime = get_seconds(); @@ -2287,7 +2303,7 @@ repeat: } sync_sbs(mddev, nospares); - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", mdname(mddev), mddev->in_sync); @@ -2326,15 +2342,15 @@ repeat: md_super_wait(mddev); /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->in_sync != sync_req || test_bit(MD_CHANGE_DEVS, &mddev->flags)) { /* have to write it out again */ - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); goto repeat; } clear_bit(MD_CHANGE_PENDING, &mddev->flags); - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); @@ -2381,40 +2397,41 @@ state_show(struct md_rdev *rdev, char *page) { char *sep = ""; size_t len = 0; + unsigned long flags = ACCESS_ONCE(rdev->flags); - if (test_bit(Faulty, &rdev->flags) || + if (test_bit(Faulty, &flags) || 
rdev->badblocks.unacked_exist) { len+= sprintf(page+len, "%sfaulty",sep); sep = ","; } - if (test_bit(In_sync, &rdev->flags)) { + if (test_bit(In_sync, &flags)) { len += sprintf(page+len, "%sin_sync",sep); sep = ","; } - if (test_bit(WriteMostly, &rdev->flags)) { + if (test_bit(WriteMostly, &flags)) { len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; } - if (test_bit(Blocked, &rdev->flags) || + if (test_bit(Blocked, &flags) || (rdev->badblocks.unacked_exist - && !test_bit(Faulty, &rdev->flags))) { + && !test_bit(Faulty, &flags))) { len += sprintf(page+len, "%sblocked", sep); sep = ","; } - if (!test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags)) { + if (!test_bit(Faulty, &flags) && + !test_bit(In_sync, &flags)) { len += sprintf(page+len, "%sspare", sep); sep = ","; } - if (test_bit(WriteErrorSeen, &rdev->flags)) { + if (test_bit(WriteErrorSeen, &flags)) { len += sprintf(page+len, "%swrite_error", sep); sep = ","; } - if (test_bit(WantReplacement, &rdev->flags)) { + if (test_bit(WantReplacement, &flags)) { len += sprintf(page+len, "%swant_replacement", sep); sep = ","; } - if (test_bit(Replacement, &rdev->flags)) { + if (test_bit(Replacement, &flags)) { len += sprintf(page+len, "%sreplacement", sep); sep = ","; } @@ -2927,21 +2944,12 @@ rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); - struct mddev *mddev = rdev->mddev; - ssize_t rv; if (!entry->show) return -EIO; - - rv = mddev ? 
mddev_lock(mddev) : -EBUSY; - if (!rv) { - if (rdev->mddev == NULL) - rv = -EBUSY; - else - rv = entry->show(rdev, page); - mddev_unlock(mddev); - } - return rv; + if (!rdev->mddev) + return -EBUSY; + return entry->show(rdev, page); } static ssize_t @@ -3212,11 +3220,13 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) mddev->safemode_delay = 0; else { unsigned long old_delay = mddev->safemode_delay; - mddev->safemode_delay = (msec*HZ)/1000; - if (mddev->safemode_delay == 0) - mddev->safemode_delay = 1; - if (mddev->safemode_delay < old_delay || old_delay == 0) - md_safemode_timeout((unsigned long)mddev); + unsigned long new_delay = (msec*HZ)/1000; + + if (new_delay == 0) + new_delay = 1; + mddev->safemode_delay = new_delay; + if (new_delay < old_delay || old_delay == 0) + mod_timer(&mddev->safemode_timer, jiffies+1); } return len; } @@ -3226,41 +3236,52 @@ __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); static ssize_t level_show(struct mddev *mddev, char *page) { - struct md_personality *p = mddev->pers; + struct md_personality *p; + int ret; + spin_lock(&mddev->lock); + p = mddev->pers; if (p) - return sprintf(page, "%s\n", p->name); + ret = sprintf(page, "%s\n", p->name); else if (mddev->clevel[0]) - return sprintf(page, "%s\n", mddev->clevel); + ret = sprintf(page, "%s\n", mddev->clevel); else if (mddev->level != LEVEL_NONE) - return sprintf(page, "%d\n", mddev->level); + ret = sprintf(page, "%d\n", mddev->level); else - return 0; + ret = 0; + spin_unlock(&mddev->lock); + return ret; } static ssize_t level_store(struct mddev *mddev, const char *buf, size_t len) { char clevel[16]; - ssize_t rv = len; - struct md_personality *pers; + ssize_t rv; + size_t slen = len; + struct md_personality *pers, *oldpers; long level; - void *priv; + void *priv, *oldpriv; struct md_rdev *rdev; + if (slen == 0 || slen >= sizeof(clevel)) + return -EINVAL; + + rv = mddev_lock(mddev); + if (rv) + return rv; + if (mddev->pers == NULL) { 
- if (len == 0) - return 0; - if (len >= sizeof(mddev->clevel)) - return -ENOSPC; - strncpy(mddev->clevel, buf, len); - if (mddev->clevel[len-1] == '\n') - len--; - mddev->clevel[len] = 0; + strncpy(mddev->clevel, buf, slen); + if (mddev->clevel[slen-1] == '\n') + slen--; + mddev->clevel[slen] = 0; mddev->level = LEVEL_NONE; - return rv; + rv = len; + goto out_unlock; } + rv = -EROFS; if (mddev->ro) - return -EROFS; + goto out_unlock; /* request to change the personality. Need to ensure: * - array is not engaged in resync/recovery/reshape @@ -3268,25 +3289,25 @@ level_store(struct mddev *mddev, const char *buf, size_t len) * - new personality will access other array. */ + rv = -EBUSY; if (mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || mddev->reshape_position != MaxSector || mddev->sysfs_active) - return -EBUSY; + goto out_unlock; + rv = -EINVAL; if (!mddev->pers->quiesce) { printk(KERN_WARNING "md: %s: %s does not support online personality change\n", mdname(mddev), mddev->pers->name); - return -EINVAL; + goto out_unlock; } /* Now find the new personality */ - if (len == 0 || len >= sizeof(clevel)) - return -EINVAL; - strncpy(clevel, buf, len); - if (clevel[len-1] == '\n') - len--; - clevel[len] = 0; + strncpy(clevel, buf, slen); + if (clevel[slen-1] == '\n') + slen--; + clevel[slen] = 0; if (kstrtol(clevel, 10, &level)) level = LEVEL_NONE; @@ -3297,20 +3318,23 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (!pers || !try_module_get(pers->owner)) { spin_unlock(&pers_lock); printk(KERN_WARNING "md: personality %s not loaded\n", clevel); - return -EINVAL; + rv = -EINVAL; + goto out_unlock; } spin_unlock(&pers_lock); if (pers == mddev->pers) { /* Nothing to do! 
*/ module_put(pers->owner); - return rv; + rv = len; + goto out_unlock; } if (!pers->takeover) { module_put(pers->owner); printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", mdname(mddev), clevel); - return -EINVAL; + rv = -EINVAL; + goto out_unlock; } rdev_for_each(rdev, mddev) @@ -3330,30 +3354,29 @@ level_store(struct mddev *mddev, const char *buf, size_t len) module_put(pers->owner); printk(KERN_WARNING "md: %s: %s would not accept array\n", mdname(mddev), clevel); - return PTR_ERR(priv); + rv = PTR_ERR(priv); + goto out_unlock; } /* Looks like we have a winner */ mddev_suspend(mddev); - mddev->pers->stop(mddev); + mddev_detach(mddev); - if (mddev->pers->sync_request == NULL && - pers->sync_request != NULL) { - /* need to add the md_redundancy_group */ - if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) - printk(KERN_WARNING - "md: cannot register extra attributes for %s\n", - mdname(mddev)); - mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); - } - if (mddev->pers->sync_request != NULL && - pers->sync_request == NULL) { - /* need to remove the md_redundancy_group */ - if (mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; - } + spin_lock(&mddev->lock); + oldpers = mddev->pers; + oldpriv = mddev->private; + mddev->pers = pers; + mddev->private = priv; + strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + mddev->level = mddev->new_level; + mddev->layout = mddev->new_layout; + mddev->chunk_sectors = mddev->new_chunk_sectors; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + mddev->degraded = 0; + spin_unlock(&mddev->lock); - if (mddev->pers->sync_request == NULL && + if (oldpers->sync_request == NULL && mddev->external) { /* We are converting from a no-redundancy array * to a redundancy array and metadata is managed @@ -3367,6 +3390,24 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->safemode = 0; } + oldpers->free(mddev, oldpriv); + + if 
(oldpers->sync_request == NULL && + pers->sync_request != NULL) { + /* need to add the md_redundancy_group */ + if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + printk(KERN_WARNING + "md: cannot register extra attributes for %s\n", + mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); + } + if (oldpers->sync_request != NULL && + pers->sync_request == NULL) { + /* need to remove the md_redundancy_group */ + if (mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + } + rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; @@ -3392,17 +3433,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) } } - module_put(mddev->pers->owner); - mddev->pers = pers; - mddev->private = priv; - strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); - mddev->level = mddev->new_level; - mddev->layout = mddev->new_layout; - mddev->chunk_sectors = mddev->new_chunk_sectors; - mddev->delta_disks = 0; - mddev->reshape_backwards = 0; - mddev->degraded = 0; - if (mddev->pers->sync_request == NULL) { + if (pers->sync_request == NULL) { /* this is now an array without redundancy, so * it must always be in_sync */ @@ -3417,6 +3448,9 @@ level_store(struct mddev *mddev, const char *buf, size_t len) md_update_sb(mddev, 1); sysfs_notify(&mddev->kobj, NULL, "level"); md_new_event(mddev); + rv = len; +out_unlock: + mddev_unlock(mddev); return rv; } @@ -3439,28 +3473,32 @@ layout_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long n = simple_strtoul(buf, &e, 10); + int err; if (!*buf || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { - int err; if (mddev->pers->check_reshape == NULL) - return -EBUSY; - if (mddev->ro) - return -EROFS; - mddev->new_layout = n; - err = mddev->pers->check_reshape(mddev); - if (err) { - mddev->new_layout = mddev->layout; - return err; + err = -EBUSY; + else if (mddev->ro) + err = -EROFS; + else 
{ + mddev->new_layout = n; + err = mddev->pers->check_reshape(mddev); + if (err) + mddev->new_layout = mddev->layout; } } else { mddev->new_layout = n; if (mddev->reshape_position == MaxSector) mddev->layout = n; } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_layout = __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); @@ -3483,32 +3521,39 @@ static ssize_t raid_disks_store(struct mddev *mddev, const char *buf, size_t len) { char *e; - int rv = 0; + int err; unsigned long n = simple_strtoul(buf, &e, 10); if (!*buf || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) - rv = update_raid_disks(mddev, n); + err = update_raid_disks(mddev, n); else if (mddev->reshape_position != MaxSector) { struct md_rdev *rdev; int olddisks = mddev->raid_disks - mddev->delta_disks; + err = -EINVAL; rdev_for_each(rdev, mddev) { if (olddisks < n && rdev->data_offset < rdev->new_data_offset) - return -EINVAL; + goto out_unlock; if (olddisks > n && rdev->data_offset > rdev->new_data_offset) - return -EINVAL; + goto out_unlock; } + err = 0; mddev->delta_disks = n - olddisks; mddev->raid_disks = n; mddev->reshape_backwards = (mddev->delta_disks < 0); } else mddev->raid_disks = n; - return rv ? rv : len; +out_unlock: + mddev_unlock(mddev); + return err ? 
err : len; } static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); @@ -3527,30 +3572,34 @@ chunk_size_show(struct mddev *mddev, char *page) static ssize_t chunk_size_store(struct mddev *mddev, const char *buf, size_t len) { + int err; char *e; unsigned long n = simple_strtoul(buf, &e, 10); if (!*buf || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { - int err; if (mddev->pers->check_reshape == NULL) - return -EBUSY; - if (mddev->ro) - return -EROFS; - mddev->new_chunk_sectors = n >> 9; - err = mddev->pers->check_reshape(mddev); - if (err) { - mddev->new_chunk_sectors = mddev->chunk_sectors; - return err; + err = -EBUSY; + else if (mddev->ro) + err = -EROFS; + else { + mddev->new_chunk_sectors = n >> 9; + err = mddev->pers->check_reshape(mddev); + if (err) + mddev->new_chunk_sectors = mddev->chunk_sectors; } } else { mddev->new_chunk_sectors = n >> 9; if (mddev->reshape_position == MaxSector) mddev->chunk_sectors = n >> 9; } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_chunk_size = __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); @@ -3566,20 +3615,27 @@ resync_start_show(struct mddev *mddev, char *page) static ssize_t resync_start_store(struct mddev *mddev, const char *buf, size_t len) { + int err; char *e; unsigned long long n = simple_strtoull(buf, &e, 10); + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) - return -EBUSY; - if (cmd_match(buf, "none")) + err = -EBUSY; + else if (cmd_match(buf, "none")) n = MaxSector; else if (!*buf || (*e && *e != '\n')) - return -EINVAL; + err = -EINVAL; - mddev->recovery_cp = n; - if (mddev->pers) - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - return len; + if (!err) { + mddev->recovery_cp = n; + if (mddev->pers) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } + 
mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_resync_start = __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); @@ -3677,8 +3733,39 @@ static int restart_array(struct mddev *mddev); static ssize_t array_state_store(struct mddev *mddev, const char *buf, size_t len) { - int err = -EINVAL; + int err; enum array_state st = match_word(buf, array_states); + + if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { + /* don't take reconfig_mutex when toggling between + * clean and active + */ + spin_lock(&mddev->lock); + if (st == active) { + restart_array(mddev); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + wake_up(&mddev->sb_wait); + err = 0; + } else /* st == clean */ { + restart_array(mddev); + if (atomic_read(&mddev->writes_pending) == 0) { + if (mddev->in_sync == 0) { + mddev->in_sync = 1; + if (mddev->safemode == 1) + mddev->safemode = 0; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } + err = 0; + } else + err = -EBUSY; + } + spin_unlock(&mddev->lock); + return err; + } + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; switch(st) { case bad_word: break; @@ -3722,7 +3809,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) case clean: if (mddev->pers) { restart_array(mddev); - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (atomic_read(&mddev->writes_pending) == 0) { if (mddev->in_sync == 0) { mddev->in_sync = 1; @@ -3733,7 +3820,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) err = 0; } else err = -EBUSY; - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); } else err = -EINVAL; break; @@ -3754,14 +3841,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) /* these cannot be set */ break; } - if (err) - return err; - else { + + if (!err) { if (mddev->hold_active == UNTIL_IOCTL) mddev->hold_active = 0; sysfs_notify_dirent_safe(mddev->sysfs_state); - return len; } + 
mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_array_state = __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); @@ -3822,6 +3909,11 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; + flush_workqueue(md_misc_wq); + + err = mddev_lock(mddev); + if (err) + return err; if (mddev->persistent) { rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); @@ -3845,6 +3937,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) out: if (err) export_rdev(rdev); + mddev_unlock(mddev); return err ? err : len; } @@ -3856,7 +3949,11 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) { char *end; unsigned long chunk, end_chunk; + int err; + err = mddev_lock(mddev); + if (err) + return err; if (!mddev->bitmap) goto out; /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ @@ -3874,6 +3971,7 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) } bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ out: + mddev_unlock(mddev); return len; } @@ -3901,6 +3999,9 @@ size_store(struct mddev *mddev, const char *buf, size_t len) if (err < 0) return err; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { err = update_size(mddev, sectors); md_update_sb(mddev, 1); @@ -3911,6 +4012,7 @@ size_store(struct mddev *mddev, const char *buf, size_t len) else err = -ENOSPC; } + mddev_unlock(mddev); return err ? err : len; } @@ -3940,21 +4042,28 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) { int major, minor; char *e; + int err; /* Changing the details of 'external' metadata is * always permitted. Otherwise there must be * no devices attached to the array. 
*/ + + err = mddev_lock(mddev); + if (err) + return err; + err = -EBUSY; if (mddev->external && strncmp(buf, "external:", 9) == 0) ; else if (!list_empty(&mddev->disks)) - return -EBUSY; + goto out_unlock; + err = 0; if (cmd_match(buf, "none")) { mddev->persistent = 0; mddev->external = 0; mddev->major_version = 0; mddev->minor_version = 90; - return len; + goto out_unlock; } if (strncmp(buf, "external:", 9) == 0) { size_t namelen = len-9; @@ -3968,22 +4077,27 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) mddev->external = 1; mddev->major_version = 0; mddev->minor_version = 90; - return len; + goto out_unlock; } major = simple_strtoul(buf, &e, 10); + err = -EINVAL; if (e==buf || *e != '.') - return -EINVAL; + goto out_unlock; buf = e+1; minor = simple_strtoul(buf, &e, 10); if (e==buf || (*e && *e != '\n') ) - return -EINVAL; + goto out_unlock; + err = -ENOENT; if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) - return -ENOENT; + goto out_unlock; mddev->major_version = major; mddev->minor_version = minor; mddev->persistent = 1; mddev->external = 0; - return len; + err = 0; +out_unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_metadata = @@ -3993,20 +4107,21 @@ static ssize_t action_show(struct mddev *mddev, char *page) { char *type = "idle"; - if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + unsigned long recovery = mddev->recovery; + if (test_bit(MD_RECOVERY_FROZEN, &recovery)) type = "frozen"; - else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) type = "reshape"; - else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + else if 
(test_bit(MD_RECOVERY_SYNC, &recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) type = "resync"; - else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + else if (test_bit(MD_RECOVERY_CHECK, &recovery)) type = "check"; else type = "repair"; - } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) type = "recover"; } return sprintf(page, "%s\n", type); @@ -4027,7 +4142,10 @@ action_store(struct mddev *mddev, const char *page, size_t len) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); + if (mddev_lock(mddev) == 0) { + md_reap_sync_thread(mddev); + mddev_unlock(mddev); + } } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -4041,7 +4159,11 @@ action_store(struct mddev *mddev, const char *page, size_t len) int err; if (mddev->pers->start_reshape == NULL) return -EINVAL; - err = mddev->pers->start_reshape(mddev); + err = mddev_lock(mddev); + if (!err) { + err = mddev->pers->start_reshape(mddev); + mddev_unlock(mddev); + } if (err) return err; sysfs_notify(&mddev->kobj, NULL, "degraded"); @@ -4225,22 +4347,36 @@ static ssize_t min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; + int err; + int chunk; + if (kstrtoull(buf, 10, &min)) return -EINVAL; + + spin_lock(&mddev->lock); + err = -EINVAL; if (min > mddev->resync_max) - return -EINVAL; + goto out_unlock; + + err = -EBUSY; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; + goto out_unlock; /* Must be a multiple of chunk_size */ - if (mddev->chunk_sectors) { + chunk = mddev->chunk_sectors; + if (chunk) { sector_t temp = min; - if (sector_div(temp, mddev->chunk_sectors)) - return -EINVAL; + + err = -EINVAL; + if (sector_div(temp, chunk)) + goto out_unlock; } mddev->resync_min = min; + err = 0; - return len; +out_unlock: + 
spin_unlock(&mddev->lock); + return err ?: len; } static struct md_sysfs_entry md_min_sync = @@ -4258,29 +4394,42 @@ max_sync_show(struct mddev *mddev, char *page) static ssize_t max_sync_store(struct mddev *mddev, const char *buf, size_t len) { + int err; + spin_lock(&mddev->lock); if (strncmp(buf, "max", 3) == 0) mddev->resync_max = MaxSector; else { unsigned long long max; + int chunk; + + err = -EINVAL; if (kstrtoull(buf, 10, &max)) - return -EINVAL; + goto out_unlock; if (max < mddev->resync_min) - return -EINVAL; + goto out_unlock; + + err = -EBUSY; if (max < mddev->resync_max && mddev->ro == 0 && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; + goto out_unlock; /* Must be a multiple of chunk_size */ - if (mddev->chunk_sectors) { + chunk = mddev->chunk_sectors; + if (chunk) { sector_t temp = max; - if (sector_div(temp, mddev->chunk_sectors)) - return -EINVAL; + + err = -EINVAL; + if (sector_div(temp, chunk)) + goto out_unlock; } mddev->resync_max = max; } wake_up(&mddev->recovery_wait); - return len; + err = 0; +out_unlock: + spin_unlock(&mddev->lock); + return err ?: len; } static struct md_sysfs_entry md_max_sync = @@ -4297,14 +4446,20 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old = mddev->suspend_lo; + unsigned long long old; + int err; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) + goto unlock; + old = mddev->suspend_lo; mddev->suspend_lo = new; if (new >= old) /* Shrinking suspended region */ @@ -4314,7 +4469,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - return len; + err = 0; +unlock: + mddev_unlock(mddev); + return 
err ?: len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); @@ -4330,14 +4488,20 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old = mddev->suspend_hi; + unsigned long long old; + int err; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) + goto unlock; + old = mddev->suspend_hi; mddev->suspend_hi = new; if (new <= old) /* Shrinking suspended region */ @@ -4347,7 +4511,10 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - return len; + err = 0; +unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); @@ -4367,11 +4534,17 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { struct md_rdev *rdev; char *e; + int err; unsigned long long new = simple_strtoull(buf, &e, 10); - if (mddev->pers) - return -EBUSY; + if (buf == e || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + err = -EBUSY; + if (mddev->pers) + goto unlock; mddev->reshape_position = new; mddev->delta_disks = 0; mddev->reshape_backwards = 0; @@ -4380,7 +4553,10 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len) mddev->new_chunk_sectors = mddev->chunk_sectors; rdev_for_each(rdev, mddev) rdev->new_data_offset = rdev->data_offset; - return len; + err = 0; +unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_reshape_position = @@ -4398,6 +4574,8 @@ static ssize_t reshape_direction_store(struct mddev 
*mddev, const char *buf, size_t len) { int backwards = 0; + int err; + if (cmd_match(buf, "forwards")) backwards = 0; else if (cmd_match(buf, "backwards")) @@ -4407,16 +4585,19 @@ reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->reshape_backwards == backwards) return len; + err = mddev_lock(mddev); + if (err) + return err; /* check if we are allowed to change */ if (mddev->delta_disks) - return -EBUSY; - - if (mddev->persistent && + err = -EBUSY; + else if (mddev->persistent && mddev->major_version == 0) - return -EINVAL; - - mddev->reshape_backwards = backwards; - return len; + err = -EINVAL; + else + mddev->reshape_backwards = backwards; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_reshape_direction = @@ -4437,6 +4618,11 @@ static ssize_t array_size_store(struct mddev *mddev, const char *buf, size_t len) { sector_t sectors; + int err; + + err = mddev_lock(mddev); + if (err) + return err; if (strncmp(buf, "default", 7) == 0) { if (mddev->pers) @@ -4447,19 +4633,22 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len) mddev->external_size = 0; } else { if (strict_blocks_to_sectors(buf, &sectors) < 0) - return -EINVAL; - if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) - return -E2BIG; - - mddev->external_size = 1; + err = -EINVAL; + else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) + err = -E2BIG; + else + mddev->external_size = 1; } - mddev->array_sectors = sectors; - if (mddev->pers) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + if (!err) { + mddev->array_sectors = sectors; + if (mddev->pers) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_array_size = @@ -4523,11 +4712,7 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) mddev_get(mddev); 
spin_unlock(&all_mddevs_lock); - rv = mddev_lock(mddev); - if (!rv) { - rv = entry->show(mddev, page); - mddev_unlock(mddev); - } + rv = entry->show(mddev, page); mddev_put(mddev); return rv; } @@ -4551,13 +4736,7 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, } mddev_get(mddev); spin_unlock(&all_mddevs_lock); - if (entry->store == new_dev_store) - flush_workqueue(md_misc_wq); - rv = mddev_lock(mddev); - if (!rv) { - rv = entry->store(mddev, page, length); - mddev_unlock(mddev); - } + rv = entry->store(mddev, page, length); mddev_put(mddev); return rv; } @@ -4825,7 +5004,6 @@ int md_run(struct mddev *mddev) mddev->clevel); return -EINVAL; } - mddev->pers = pers; spin_unlock(&pers_lock); if (mddev->level != pers->level) { mddev->level = pers->level; @@ -4836,7 +5014,6 @@ int md_run(struct mddev *mddev) if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ - mddev->pers = NULL; module_put(pers->owner); return -EINVAL; } @@ -4880,35 +5057,38 @@ int md_run(struct mddev *mddev) if (start_readonly && mddev->ro == 0) mddev->ro = 2; /* read-only, but switch on first write */ - err = mddev->pers->run(mddev); + err = pers->run(mddev); if (err) printk(KERN_ERR "md: pers->run() failed ...\n"); - else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { + else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { WARN_ONCE(!mddev->external_size, "%s: default size too small," " but 'external_size' not in effect?\n", __func__); printk(KERN_ERR "md: invalid array_size %llu > default size %llu\n", (unsigned long long)mddev->array_sectors / 2, - (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); + (unsigned long long)pers->size(mddev, 0, 0) / 2); err = -EINVAL; - mddev->pers->stop(mddev); } - if (err == 0 && mddev->pers->sync_request && + if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { err = bitmap_create(mddev); - if (err) { + if (err) 
printk(KERN_ERR "%s: failed to create bitmap (%d)\n", mdname(mddev), err); - mddev->pers->stop(mddev); - } } if (err) { - module_put(mddev->pers->owner); - mddev->pers = NULL; + mddev_detach(mddev); + pers->free(mddev, mddev->private); + module_put(pers->owner); bitmap_destroy(mddev); return err; } - if (mddev->pers->sync_request) { + if (mddev->queue) { + mddev->queue->backing_dev_info.congested_data = mddev; + mddev->queue->backing_dev_info.congested_fn = md_congested; + blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec); + } + if (pers->sync_request) { if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_redundancy_group)) printk(KERN_WARNING @@ -4927,7 +5107,10 @@ int md_run(struct mddev *mddev) mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; smp_wmb(); + spin_lock(&mddev->lock); + mddev->pers = pers; mddev->ready = 1; + spin_unlock(&mddev->lock); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) if (sysfs_link_rdev(mddev, rdev)) @@ -5070,14 +5253,38 @@ void md_stop_writes(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_stop_writes); +static void mddev_detach(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + /* wait for behind writes to complete */ + if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n", + mdname(mddev)); + /* need to kick something here to make sure I/O goes? 
*/ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } + if (mddev->pers && mddev->pers->quiesce) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + md_unregister_thread(&mddev->thread); + if (mddev->queue) + blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ +} + static void __md_stop(struct mddev *mddev) { + struct md_personality *pers = mddev->pers; + mddev_detach(mddev); + spin_lock(&mddev->lock); mddev->ready = 0; - mddev->pers->stop(mddev); - if (mddev->pers->sync_request && mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; - module_put(mddev->pers->owner); mddev->pers = NULL; + spin_unlock(&mddev->lock); + pers->free(mddev, mddev->private); + if (pers->sync_request && mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } @@ -5226,8 +5433,11 @@ static int do_md_stop(struct mddev *mddev, int mode, bitmap_destroy(mddev); if (mddev->bitmap_info.file) { - fput(mddev->bitmap_info.file); + struct file *f = mddev->bitmap_info.file; + spin_lock(&mddev->lock); mddev->bitmap_info.file = NULL; + spin_unlock(&mddev->lock); + fput(f); } mddev->bitmap_info.offset = 0; @@ -5436,37 +5646,31 @@ static int get_array_info(struct mddev *mddev, void __user *arg) static int get_bitmap_file(struct mddev *mddev, void __user * arg) { mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ - char *ptr, *buf = NULL; - int err = -ENOMEM; + char *ptr; + int err; file = kmalloc(sizeof(*file), GFP_NOIO); - if (!file) - goto out; + return -ENOMEM; + err = 0; + spin_lock(&mddev->lock); /* bitmap disabled, zero the first byte and copy out */ - if (!mddev->bitmap || !mddev->bitmap->storage.file) { + if (!mddev->bitmap_info.file) file->pathname[0] = '\0'; - goto copy_out; - } - - buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); - if (!buf) - goto out; - - ptr = 
d_path(&mddev->bitmap->storage.file->f_path, - buf, sizeof(file->pathname)); - if (IS_ERR(ptr)) - goto out; - - strcpy(file->pathname, ptr); + else if ((ptr = d_path(&mddev->bitmap_info.file->f_path, + file->pathname, sizeof(file->pathname))), + IS_ERR(ptr)) + err = PTR_ERR(ptr); + else + memmove(file->pathname, ptr, + sizeof(file->pathname)-(ptr-file->pathname)); + spin_unlock(&mddev->lock); -copy_out: - err = 0; - if (copy_to_user(arg, file, sizeof(*file))) + if (err == 0 && + copy_to_user(arg, file, sizeof(*file))) err = -EFAULT; -out: - kfree(buf); + kfree(file); return err; } @@ -5789,22 +5993,24 @@ static int set_bitmap_file(struct mddev *mddev, int fd) if (fd >= 0) { struct inode *inode; - if (mddev->bitmap) + struct file *f; + + if (mddev->bitmap || mddev->bitmap_info.file) return -EEXIST; /* cannot add when bitmap is present */ - mddev->bitmap_info.file = fget(fd); + f = fget(fd); - if (mddev->bitmap_info.file == NULL) { + if (f == NULL) { printk(KERN_ERR "%s: error: failed to get bitmap file\n", mdname(mddev)); return -EBADF; } - inode = mddev->bitmap_info.file->f_mapping->host; + inode = f->f_mapping->host; if (!S_ISREG(inode->i_mode)) { printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", mdname(mddev)); err = -EBADF; - } else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) { + } else if (!(f->f_mode & FMODE_WRITE)) { printk(KERN_ERR "%s: error: bitmap file must open for write\n", mdname(mddev)); err = -EBADF; @@ -5814,10 +6020,10 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = -EBUSY; } if (err) { - fput(mddev->bitmap_info.file); - mddev->bitmap_info.file = NULL; + fput(f); return err; } + mddev->bitmap_info.file = f; mddev->bitmap_info.offset = 0; /* file overrides offset */ } else if (mddev->bitmap == NULL) return -ENOENT; /* cannot remove what isn't there */ @@ -5836,9 +6042,13 @@ static int set_bitmap_file(struct mddev *mddev, int fd) mddev->pers->quiesce(mddev, 0); } if (fd < 0) { - if (mddev->bitmap_info.file) 
- fput(mddev->bitmap_info.file); - mddev->bitmap_info.file = NULL; + struct file *f = mddev->bitmap_info.file; + if (f) { + spin_lock(&mddev->lock); + mddev->bitmap_info.file = NULL; + spin_unlock(&mddev->lock); + fput(f); + } } return err; @@ -6251,6 +6461,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, case SET_DISK_FAULTY: err = set_disk_faulty(mddev, new_decode_dev(arg)); goto out; + + case GET_BITMAP_FILE: + err = get_bitmap_file(mddev, argp); + goto out; + } if (cmd == ADD_NEW_DISK) @@ -6342,10 +6557,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, * Commands even a read-only array can execute: */ switch (cmd) { - case GET_BITMAP_FILE: - err = get_bitmap_file(mddev, argp); - goto unlock; - case RESTART_ARRAY_RW: err = restart_array(mddev); goto unlock; @@ -6873,9 +7084,7 @@ static int md_seq_show(struct seq_file *seq, void *v) return 0; } - if (mddev_lock(mddev) < 0) - return -EINTR; - + spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : %sactive", mdname(mddev), mddev->pers ? 
"" : "in"); @@ -6888,7 +7097,8 @@ static int md_seq_show(struct seq_file *seq, void *v) } sectors = 0; - rdev_for_each(rdev, mddev) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); @@ -6904,6 +7114,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "(R)"); sectors += rdev->sectors; } + rcu_read_unlock(); if (!list_empty(&mddev->disks)) { if (mddev->pers) @@ -6946,7 +7157,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } - mddev_unlock(mddev); + spin_unlock(&mddev->lock); return 0; } @@ -7102,7 +7313,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) if (mddev->safemode == 1) mddev->safemode = 0; if (mddev->in_sync) { - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); @@ -7110,7 +7321,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) md_wakeup_thread(mddev->thread); did_change = 1; } - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); } if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); @@ -7148,7 +7359,7 @@ int md_allow_write(struct mddev *mddev) if (!mddev->pers->sync_request) return 0; - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); @@ -7156,11 +7367,11 @@ int md_allow_write(struct mddev *mddev) if (mddev->safemode_delay && mddev->safemode == 0) mddev->safemode = 1; - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); md_update_sb(mddev, 0); sysfs_notify_dirent_safe(mddev->sysfs_state); } else - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) return -EAGAIN; @@ -7513,6 +7724,7 @@ void md_do_sync(struct md_thread *thread) skip: set_bit(MD_CHANGE_DEVS, &mddev->flags); + 
spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) @@ -7521,6 +7733,8 @@ void md_do_sync(struct md_thread *thread) } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; mddev->curr_resync = 0; + spin_unlock(&mddev->lock); + wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -7688,7 +7902,7 @@ void md_check_recovery(struct mddev *mddev) if (!mddev->external) { int did_change = 0; - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && @@ -7699,7 +7913,7 @@ void md_check_recovery(struct mddev *mddev) } if (mddev->safemode == 1) mddev->safemode = 0; - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); } @@ -7721,7 +7935,9 @@ void md_check_recovery(struct mddev *mddev) * any transients in the value of "sync_action". */ mddev->curr_resync_completed = 0; + spin_lock(&mddev->lock); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + spin_unlock(&mddev->lock); /* Clear some bits that don't mean anything, but * might be left set */ diff --git a/drivers/md/md.h b/drivers/md/md.h index 03cec5b..318ca8f 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -386,7 +386,18 @@ struct mddev { struct work_struct del_work; /* used for delayed sysfs removal */ - spinlock_t write_lock; + /* "lock" protects: + * flush_bio transition from NULL to !NULL + * rdev superblocks, events + * clearing MD_CHANGE_* + * in_sync - and related safemode and MD_CHANGE changes + * pers (also protected by reconfig_mutex and pending IO). 
+ * clearing ->bitmap + * clearing ->bitmap_info.file + * changing ->resync_{min,max} + * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max}) + */ + spinlock_t lock; wait_queue_head_t sb_wait; /* for waiting on superblock updates */ atomic_t pending_writes; /* number of active superblock writes */ @@ -439,13 +450,30 @@ struct mddev { void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); }; -static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) +static inline int __must_check mddev_lock(struct mddev *mddev) { - int faulty = test_bit(Faulty, &rdev->flags); - if (atomic_dec_and_test(&rdev->nr_pending) && faulty) - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + return mutex_lock_interruptible(&mddev->reconfig_mutex); +} + +/* Sometimes we need to take the lock in a situation where + * failure due to interrupts is not acceptable. + */ +static inline void mddev_lock_nointr(struct mddev *mddev) +{ + mutex_lock(&mddev->reconfig_mutex); +} + +static inline int mddev_is_locked(struct mddev *mddev) +{ + return mutex_is_locked(&mddev->reconfig_mutex); } +static inline int mddev_trylock(struct mddev *mddev) +{ + return mutex_trylock(&mddev->reconfig_mutex); +} +extern void mddev_unlock(struct mddev *mddev); + static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) { atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); @@ -459,7 +487,7 @@ struct md_personality struct module *owner; void (*make_request)(struct mddev *mddev, struct bio *bio); int (*run)(struct mddev *mddev); - int (*stop)(struct mddev *mddev); + void (*free)(struct mddev *mddev, void *priv); void (*status)(struct seq_file *seq, struct mddev *mddev); /* error_handler must set ->faulty and clear ->in_sync * if appropriate, and should abort recovery if needed @@ -490,6 +518,13 @@ struct md_personality * array. */ void *(*takeover) (struct mddev *mddev); + /* congested implements bdi.congested_fn(). 
+ * Will not be called while array is 'suspended' */ + int (*congested)(struct mddev *mddev, int bits); + /* mergeable_bvec is use to implement ->merge_bvec_fn */ + int (*mergeable_bvec)(struct mddev *mddev, + struct bvec_merge_data *bvm, + struct bio_vec *biovec); }; struct md_sysfs_entry { @@ -624,4 +659,14 @@ static inline int mddev_check_plugged(struct mddev *mddev) return !!blk_check_plugged(md_unplug, mddev, sizeof(struct blk_plug_cb)); } + +static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) +{ + int faulty = test_bit(Faulty, &rdev->flags); + if (atomic_dec_and_test(&rdev->nr_pending) && faulty) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } +} + #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 399272f..ac3ede2 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -153,15 +153,11 @@ static void multipath_status (struct seq_file *seq, struct mddev *mddev) seq_printf (seq, "]"); } -static int multipath_congested(void *data, int bits) +static int multipath_congested(struct mddev *mddev, int bits) { - struct mddev *mddev = data; struct mpconf *conf = mddev->private; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - rcu_read_lock(); for (i = 0; i < mddev->raid_disks ; i++) { struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); @@ -403,7 +399,7 @@ static int multipath_run (struct mddev *mddev) /* * copy the already verified devices into our private MULTIPATH * bookkeeping area. 
[whatever we allocate in multipath_run(), - * should be freed in multipath_stop()] + * should be freed in multipath_free()] */ conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); @@ -489,9 +485,6 @@ static int multipath_run (struct mddev *mddev) */ md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); - mddev->queue->backing_dev_info.congested_fn = multipath_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - if (md_integrity_register(mddev)) goto out_free_conf; @@ -507,17 +500,13 @@ out: return -EIO; } -static int multipath_stop (struct mddev *mddev) +static void multipath_free(struct mddev *mddev, void *priv) { - struct mpconf *conf = mddev->private; + struct mpconf *conf = priv; - md_unregister_thread(&mddev->thread); - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ mempool_destroy(conf->pool); kfree(conf->multipaths); kfree(conf); - mddev->private = NULL; - return 0; } static struct md_personality multipath_personality = @@ -527,12 +516,13 @@ static struct md_personality multipath_personality = .owner = THIS_MODULE, .make_request = multipath_make_request, .run = multipath_run, - .stop = multipath_stop, + .free = multipath_free, .status = multipath_status, .error_handler = multipath_error, .hot_add_disk = multipath_add_disk, .hot_remove_disk= multipath_remove_disk, .size = multipath_size, + .congested = multipath_congested, }; static int __init multipath_init (void) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ba6b85d..a13f738 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -25,17 +25,13 @@ #include "raid0.h" #include "raid5.h" -static int raid0_congested(void *data, int bits) +static int raid0_congested(struct mddev *mddev, int bits) { - struct mddev *mddev = data; struct r0conf *conf = mddev->private; struct md_rdev **devlist = conf->devlist; int raid_disks = conf->strip_zone[0].nb_dev; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - for (i = 0; i < raid_disks && !ret ; i++) { 
struct request_queue *q = bdev_get_queue(devlist[i]->bdev); @@ -263,8 +259,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) mdname(mddev), (unsigned long long)smallest->sectors); } - mddev->queue->backing_dev_info.congested_fn = raid0_congested; - mddev->queue->backing_dev_info.congested_data = mddev; /* * now since we have the hard sector sizes, we can make sure @@ -356,17 +350,16 @@ static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, /** * raid0_mergeable_bvec -- tell bio layer if two requests can be merged - * @q: request queue + * @mddev: the md device * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset */ -static int raid0_mergeable_bvec(struct request_queue *q, +static int raid0_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct r0conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); sector_t sector_offset = sector; @@ -422,7 +415,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks return array_sectors; } -static int raid0_stop(struct mddev *mddev); +static void raid0_free(struct mddev *mddev, void *priv); static int raid0_run(struct mddev *mddev) { @@ -471,26 +464,22 @@ static int raid0_run(struct mddev *mddev) mddev->queue->backing_dev_info.ra_pages = 2* stripe; } - blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); dump_zones(mddev); ret = md_integrity_register(mddev); if (ret) - raid0_stop(mddev); + raid0_free(mddev, conf); return ret; } -static int raid0_stop(struct mddev *mddev) +static void raid0_free(struct mddev *mddev, void *priv) { - struct r0conf *conf = mddev->private; + struct r0conf *conf = priv; - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf->strip_zone); kfree(conf->devlist); kfree(conf); - 
mddev->private = NULL; - return 0; } /* @@ -724,11 +713,13 @@ static struct md_personality raid0_personality= .owner = THIS_MODULE, .make_request = raid0_make_request, .run = raid0_run, - .stop = raid0_stop, + .free = raid0_free, .status = raid0_status, .size = raid0_size, .takeover = raid0_takeover, .quiesce = raid0_quiesce, + .congested = raid0_congested, + .mergeable_bvec = raid0_mergeable_bvec, }; static int __init raid0_init (void) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 40b35be..5dd0c2e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -701,11 +701,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect return best_disk; } -static int raid1_mergeable_bvec(struct request_queue *q, +static int raid1_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct r1conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max = biovec->bv_len; @@ -734,7 +733,7 @@ static int raid1_mergeable_bvec(struct request_queue *q, } -int md_raid1_congested(struct mddev *mddev, int bits) +static int raid1_congested(struct mddev *mddev, int bits) { struct r1conf *conf = mddev->private; int i, ret = 0; @@ -763,15 +762,6 @@ int md_raid1_congested(struct mddev *mddev, int bits) rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(md_raid1_congested); - -static int raid1_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid1_congested(mddev, bits); -} static void flush_pending_writes(struct r1conf *conf) { @@ -2882,7 +2872,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) return ERR_PTR(err); } -static int stop(struct mddev *mddev); +static void raid1_free(struct mddev *mddev, void *priv); static int run(struct mddev *mddev) { struct r1conf *conf; @@ -2904,7 +2894,7 @@ static int run(struct mddev *mddev) /* * copy the already verified 
devices into our private RAID1 * bookkeeping area. [whatever we allocate in run(), - * should be freed in stop()] + * should be freed in raid1_free()] */ if (mddev->private == NULL) conf = setup_conf(mddev); @@ -2955,10 +2945,6 @@ static int run(struct mddev *mddev) md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); if (mddev->queue) { - mddev->queue->backing_dev_info.congested_fn = raid1_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); - if (discard_supported) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); @@ -2968,37 +2954,23 @@ static int run(struct mddev *mddev) } ret = md_integrity_register(mddev); - if (ret) - stop(mddev); + if (ret) { + md_unregister_thread(&mddev->thread); + raid1_free(mddev, conf); + } return ret; } -static int stop(struct mddev *mddev) +static void raid1_free(struct mddev *mddev, void *priv) { - struct r1conf *conf = mddev->private; - struct bitmap *bitmap = mddev->bitmap; + struct r1conf *conf = priv; - /* wait for behind writes to complete */ - if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", - mdname(mddev)); - /* need to kick something here to make sure I/O goes? 
*/ - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); - } - - freeze_array(conf, 0); - unfreeze_array(conf); - - md_unregister_thread(&mddev->thread); if (conf->r1bio_pool) mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); kfree(conf); - mddev->private = NULL; - return 0; } static int raid1_resize(struct mddev *mddev, sector_t sectors) @@ -3181,7 +3153,7 @@ static struct md_personality raid1_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid1_free, .status = status, .error_handler = error, .hot_add_disk = raid1_add_disk, @@ -3193,6 +3165,8 @@ static struct md_personality raid1_personality = .check_reshape = raid1_reshape, .quiesce = raid1_quiesce, .takeover = raid1_takeover, + .congested = raid1_congested, + .mergeable_bvec = raid1_mergeable_bvec, }; static int __init raid_init(void) diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 33bda55..14ebb28 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -170,7 +170,4 @@ struct r1bio { */ #define R1BIO_MadeGood 7 #define R1BIO_WriteError 8 - -extern int md_raid1_congested(struct mddev *mddev, int bits); - #endif diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 32e282f..b8d76b1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -674,7 +674,7 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) /** * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged - * @q: request queue + * @mddev: the md device * @bvm: properties of new bio * @biovec: the request that could be merged to it. * @@ -682,11 +682,10 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) * This requires checking for end-of-chunk if near_copies != raid_disks, * and for subordinate merge_bvec_fns if merge_check_needed. 
*/ -static int raid10_mergeable_bvec(struct request_queue *q, +static int raid10_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct r10conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; @@ -910,7 +909,7 @@ retry: return rdev; } -int md_raid10_congested(struct mddev *mddev, int bits) +static int raid10_congested(struct mddev *mddev, int bits) { struct r10conf *conf = mddev->private; int i, ret = 0; @@ -934,15 +933,6 @@ int md_raid10_congested(struct mddev *mddev, int bits) rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(md_raid10_congested); - -static int raid10_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid10_congested(mddev, bits); -} static void flush_pending_writes(struct r10conf *conf) { @@ -3757,8 +3747,6 @@ static int run(struct mddev *mddev) if (mddev->queue) { int stripe = conf->geo.raid_disks * ((mddev->chunk_sectors << 9) / PAGE_SIZE); - mddev->queue->backing_dev_info.congested_fn = raid10_congested; - mddev->queue->backing_dev_info.congested_data = mddev; /* Calculate max read-ahead size. * We need to readahead at least twice a whole stripe.... 
@@ -3767,7 +3755,6 @@ static int run(struct mddev *mddev) stripe /= conf->geo.near_copies; if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); } if (md_integrity_register(mddev)) @@ -3811,17 +3798,9 @@ out: return -EIO; } -static int stop(struct mddev *mddev) +static void raid10_free(struct mddev *mddev, void *priv) { - struct r10conf *conf = mddev->private; - - raise_barrier(conf, 0); - lower_barrier(conf); - - md_unregister_thread(&mddev->thread); - if (mddev->queue) - /* the unplug fn references 'conf'*/ - blk_sync_queue(mddev->queue); + struct r10conf *conf = priv; if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); @@ -3830,8 +3809,6 @@ static int stop(struct mddev *mddev) kfree(conf->mirrors_old); kfree(conf->mirrors_new); kfree(conf); - mddev->private = NULL; - return 0; } static void raid10_quiesce(struct mddev *mddev, int state) @@ -3895,7 +3872,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) return 0; } -static void *raid10_takeover_raid0(struct mddev *mddev) +static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) { struct md_rdev *rdev; struct r10conf *conf; @@ -3905,6 +3882,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev) mdname(mddev)); return ERR_PTR(-EINVAL); } + sector_div(size, devs); /* Set new parameters */ mddev->new_level = 10; @@ -3915,12 +3893,15 @@ static void *raid10_takeover_raid0(struct mddev *mddev) mddev->raid_disks *= 2; /* make sure it will be not marked as dirty */ mddev->recovery_cp = MaxSector; + mddev->dev_sectors = size; conf = setup_conf(mddev); if (!IS_ERR(conf)) { rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0) + if (rdev->raid_disk >= 0) { rdev->new_raid_disk = rdev->raid_disk * 2; + rdev->sectors = size; + } conf->barrier = 1; } @@ -3943,7 +3924,9 @@ static void *raid10_takeover(struct mddev *mddev) mdname(mddev)); return 
ERR_PTR(-EINVAL); } - return raid10_takeover_raid0(mddev); + return raid10_takeover_raid0(mddev, + raid0_conf->strip_zone->zone_end, + raid0_conf->strip_zone->nb_dev); } return ERR_PTR(-EINVAL); } @@ -4713,7 +4696,7 @@ static struct md_personality raid10_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid10_free, .status = status, .error_handler = error, .hot_add_disk = raid10_add_disk, @@ -4727,6 +4710,8 @@ static struct md_personality raid10_personality = .check_reshape = raid10_check_reshape, .start_reshape = raid10_start_reshape, .finish_reshape = raid10_finish_reshape, + .congested = raid10_congested, + .mergeable_bvec = raid10_mergeable_bvec, }; static int __init raid_init(void) diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 157d69e..5ee6473 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -150,7 +150,4 @@ enum r10bio_state { */ R10BIO_Previous, }; - -extern int md_raid10_congested(struct mddev *mddev, int bits); - #endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b98765f..aa76865 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -296,12 +296,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state) && - !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); - if (atomic_read(&conf->preread_active_stripes) - < IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) list_add_tail(&sh->lru, &conf->bitmap_list); else { @@ -2898,31 +2895,102 @@ static int want_replace(struct stripe_head *sh, int disk_idx) * Returns 1 when no more member devices need to be checked, otherwise returns 
* 0 to tell the loop in handle_stripe_fill to continue */ -static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, - int disk_idx, int disks) + +static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], &sh->dev[s->failed_num[1]] }; + int i; + + + if (test_bit(R5_LOCKED, &dev->flags) || + test_bit(R5_UPTODATE, &dev->flags)) + /* No point reading this as we already have it or have + * decided to get it. + */ + return 0; + + if (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) + /* We need this block to directly satisfy a request */ + return 1; + + if (s->syncing || s->expanding || + (s->replacing && want_replace(sh, disk_idx))) + /* When syncing, or expanding we read everything. + * When replacing, we need the replaced block. + */ + return 1; + + if ((s->failed >= 1 && fdev[0]->toread) || + (s->failed >= 2 && fdev[1]->toread)) + /* If we want to read from a failed device, then + * we need to actually read every other device. + */ + return 1; + + /* Sometimes neither read-modify-write nor reconstruct-write + * cycles can work. In those cases we read every block we + * can. Then the parity-update is certain to have enough to + * work with. + * This can only be a problem when we need to write something, + * and some device has failed. If either of those tests + * fail we need look no further. + */ + if (!s->failed || !s->to_write) + return 0; + + if (test_bit(R5_Insync, &dev->flags) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + /* Pre-reads at not permitted until after short delay + * to gather multiple requests. However if this + * device is no Insync, the block could only be be computed + * and there is no need to delay that. 
+ */ + return 0; + + for (i = 0; i < s->failed; i++) { + if (fdev[i]->towrite && + !test_bit(R5_UPTODATE, &fdev[i]->flags) && + !test_bit(R5_OVERWRITE, &fdev[i]->flags)) + /* If we have a partial write to a failed + * device, then we will need to reconstruct + * the content of that device, so all other + * devices must be read. + */ + return 1; + } + + /* If we are forced to do a reconstruct-write, either because + * the current RAID6 implementation only supports that, or + * or because parity cannot be trusted and we are currently + * recovering it, there is extra need to be careful. + * If one of the devices that we would need to read, because + * it is not being overwritten (and maybe not written at all) + * is missing/faulty, then we need to read everything we can. + */ + if (sh->raid_conf->level != 6 && + sh->sector < sh->raid_conf->mddev->recovery_cp) + /* reconstruct-write isn't being forced */ + return 0; + for (i = 0; i < s->failed; i++) { + if (!test_bit(R5_UPTODATE, &fdev[i]->flags) && + !test_bit(R5_OVERWRITE, &fdev[i]->flags)) + return 1; + } + + return 0; +} + +static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) +{ + struct r5dev *dev = &sh->dev[disk_idx]; /* is the data in this block needed, and can we get it? 
*/ - if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || - (s->replacing && want_replace(sh, disk_idx)) || - (s->failed >= 1 && fdev[0]->toread) || - (s->failed >= 2 && fdev[1]->toread) || - (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && - (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && - !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || - ((sh->raid_conf->level == 6 || - sh->sector >= sh->raid_conf->mddev->recovery_cp) - && s->failed && s->to_write && - (s->to_write - s->non_overwrite < - sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) && - (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) { + if (need_this_block(sh, s, disk_idx, disks)) { /* we would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ @@ -4081,7 +4149,7 @@ static void activate_bit_delay(struct r5conf *conf, } } -int md_raid5_congested(struct mddev *mddev, int bits) +static int raid5_congested(struct mddev *mddev, int bits) { struct r5conf *conf = mddev->private; @@ -4098,24 +4166,14 @@ int md_raid5_congested(struct mddev *mddev, int bits) return 0; } -EXPORT_SYMBOL_GPL(md_raid5_congested); - -static int raid5_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid5_congested(mddev, bits); -} /* We want read requests to align with chunks where possible, * but write requests don't need to. 
*/ -static int raid5_mergeable_bvec(struct request_queue *q, +static int raid5_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_sectors; @@ -5296,11 +5354,14 @@ static void raid5d(struct md_thread *thread) static ssize_t raid5_show_stripe_cache_size(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->max_nr_stripes); - else - return 0; + ret = sprintf(page, "%d\n", conf->max_nr_stripes); + spin_unlock(&mddev->lock); + return ret; } int @@ -5339,21 +5400,25 @@ EXPORT_SYMBOL(raid5_set_cache_size); static ssize_t raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; int err; if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - err = raid5_set_cache_size(mddev, new); + err = mddev_lock(mddev); if (err) return err; - return len; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else + err = raid5_set_cache_size(mddev, new); + mddev_unlock(mddev); + + return err ?: len; } static struct md_sysfs_entry @@ -5364,29 +5429,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, static ssize_t raid5_show_preread_threshold(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->bypass_threshold); - else - return 0; + ret = sprintf(page, "%d\n", conf->bypass_threshold); + spin_unlock(&mddev->lock); + return ret; } static ssize_t raid5_store_preread_threshold(struct mddev 
*mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - if (new > conf->max_nr_stripes) - return -EINVAL; - conf->bypass_threshold = new; - return len; + + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new > conf->max_nr_stripes) + err = -EINVAL; + else + conf->bypass_threshold = new; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry @@ -5398,39 +5474,48 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, static ssize_t raid5_show_skip_copy(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->skip_copy); - else - return 0; + ret = sprintf(page, "%d\n", conf->skip_copy); + spin_unlock(&mddev->lock); + return ret; } static ssize_t raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; new = !!new; - if (new == conf->skip_copy) - return len; - mddev_suspend(mddev); - conf->skip_copy = new; - if (new) - mddev->queue->backing_dev_info.capabilities |= - BDI_CAP_STABLE_WRITES; - else - mddev->queue->backing_dev_info.capabilities &= - ~BDI_CAP_STABLE_WRITES; - mddev_resume(mddev); - return len; + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new != conf->skip_copy) { + mddev_suspend(mddev); + conf->skip_copy = new; + if (new) + mddev->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; + else + 
mddev->queue->backing_dev_info.capabilities &= + ~BDI_CAP_STABLE_WRITES; + mddev_resume(mddev); + } + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry @@ -5454,11 +5539,14 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active); static ssize_t raid5_show_group_thread_cnt(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->worker_cnt_per_group); - else - return 0; + ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); + spin_unlock(&mddev->lock); + return ret; } static int alloc_thread_groups(struct r5conf *conf, int cnt, @@ -5468,7 +5556,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt, static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; int err; struct r5worker_group *new_groups, *old_groups; @@ -5476,41 +5564,41 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - if (new == conf->worker_cnt_per_group) - return len; - - mddev_suspend(mddev); + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new != conf->worker_cnt_per_group) { + mddev_suspend(mddev); - old_groups = conf->worker_groups; - if (old_groups) - flush_workqueue(raid5_wq); + old_groups = conf->worker_groups; + if (old_groups) + flush_workqueue(raid5_wq); - err = alloc_thread_groups(conf, new, - &group_cnt, &worker_cnt_per_group, - &new_groups); - if (!err) { - spin_lock_irq(&conf->device_lock); - conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; - conf->worker_groups = new_groups; - spin_unlock_irq(&conf->device_lock); + err = 
alloc_thread_groups(conf, new, + &group_cnt, &worker_cnt_per_group, + &new_groups); + if (!err) { + spin_lock_irq(&conf->device_lock); + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_groups = new_groups; + spin_unlock_irq(&conf->device_lock); - if (old_groups) - kfree(old_groups[0].workers); - kfree(old_groups); + if (old_groups) + kfree(old_groups[0].workers); + kfree(old_groups); + } + mddev_resume(mddev); } + mddev_unlock(mddev); - mddev_resume(mddev); - - if (err) - return err; - return len; + return err ?: len; } static struct md_sysfs_entry @@ -6178,11 +6266,6 @@ static int run(struct mddev *mddev) if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); - - mddev->queue->backing_dev_info.congested_data = mddev; - mddev->queue->backing_dev_info.congested_fn = raid5_congested; - chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * @@ -6260,17 +6343,12 @@ abort: return -EIO; } -static int stop(struct mddev *mddev) +static void raid5_free(struct mddev *mddev, void *priv) { - struct r5conf *conf = mddev->private; + struct r5conf *conf = priv; - md_unregister_thread(&mddev->thread); - if (mddev->queue) - mddev->queue->backing_dev_info.congested_fn = NULL; free_conf(conf); - mddev->private = NULL; mddev->to_remove = &raid5_attrs_group; - return 0; } static void status(struct seq_file *seq, struct mddev *mddev) @@ -7044,7 +7122,7 @@ static struct md_personality raid6_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, @@ -7058,6 +7136,8 @@ static struct md_personality raid6_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid6_takeover, + .congested = 
raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid5_personality = { @@ -7066,7 +7146,7 @@ static struct md_personality raid5_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, @@ -7080,6 +7160,8 @@ static struct md_personality raid5_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid5_takeover, + .congested = raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid4_personality = @@ -7089,7 +7171,7 @@ static struct md_personality raid4_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, @@ -7103,6 +7185,8 @@ static struct md_personality raid4_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid4_takeover, + .congested = raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static int __init raid5_init(void) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index d59f5ca..983e18a 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -558,7 +558,6 @@ static inline int algorithm_is_DDF(int layout) return layout >= 8 && layout <= 10; } -extern int md_raid5_congested(struct mddev *mddev, int bits); extern void md_raid5_kick_device(struct r5conf *conf); extern int raid5_set_cache_size(struct mddev *mddev, int size); #endif diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 5356395..55fa27e 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -49,7 +49,6 @@ static DEFINE_MUTEX(mtd_mutex); */ struct mtd_file_info { struct mtd_info *mtd; - struct inode *ino; enum mtd_file_modes mode; }; @@ -59,10 +58,6 @@ static loff_t mtdchar_lseek(struct file *file, loff_t offset, int orig) return 
fixed_size_llseek(file, offset, orig, mfi->mtd->size); } -static int count; -static struct vfsmount *mnt; -static struct file_system_type mtd_inodefs_type; - static int mtdchar_open(struct inode *inode, struct file *file) { int minor = iminor(inode); @@ -70,7 +65,6 @@ static int mtdchar_open(struct inode *inode, struct file *file) int ret = 0; struct mtd_info *mtd; struct mtd_file_info *mfi; - struct inode *mtd_ino; pr_debug("MTD_open\n"); @@ -78,10 +72,6 @@ static int mtdchar_open(struct inode *inode, struct file *file) if ((file->f_mode & FMODE_WRITE) && (minor & 1)) return -EACCES; - ret = simple_pin_fs(&mtd_inodefs_type, &mnt, &count); - if (ret) - return ret; - mutex_lock(&mtd_mutex); mtd = get_mtd_device(NULL, devnum); @@ -95,43 +85,26 @@ static int mtdchar_open(struct inode *inode, struct file *file) goto out1; } - mtd_ino = iget_locked(mnt->mnt_sb, devnum); - if (!mtd_ino) { - ret = -ENOMEM; - goto out1; - } - if (mtd_ino->i_state & I_NEW) { - mtd_ino->i_private = mtd; - mtd_ino->i_mode = S_IFCHR; - mtd_ino->i_data.backing_dev_info = mtd->backing_dev_info; - unlock_new_inode(mtd_ino); - } - file->f_mapping = mtd_ino->i_mapping; - /* You can't open it RW if it's not a writeable device */ if ((file->f_mode & FMODE_WRITE) && !(mtd->flags & MTD_WRITEABLE)) { ret = -EACCES; - goto out2; + goto out1; } mfi = kzalloc(sizeof(*mfi), GFP_KERNEL); if (!mfi) { ret = -ENOMEM; - goto out2; + goto out1; } - mfi->ino = mtd_ino; mfi->mtd = mtd; file->private_data = mfi; mutex_unlock(&mtd_mutex); return 0; -out2: - iput(mtd_ino); out1: put_mtd_device(mtd); out: mutex_unlock(&mtd_mutex); - simple_release_fs(&mnt, &count); return ret; } /* mtdchar_open */ @@ -148,12 +121,9 @@ static int mtdchar_close(struct inode *inode, struct file *file) if ((file->f_mode & FMODE_WRITE)) mtd_sync(mtd); - iput(mfi->ino); - put_mtd_device(mtd); file->private_data = NULL; kfree(mfi); - simple_release_fs(&mnt, &count); return 0; } /* mtdchar_close */ @@ -1117,6 +1087,13 @@ static unsigned long 
mtdchar_get_unmapped_area(struct file *file, ret = mtd_get_unmapped_area(mtd, len, offset, flags); return ret == -EOPNOTSUPP ? -ENODEV : ret; } + +static unsigned mtdchar_mmap_capabilities(struct file *file) +{ + struct mtd_file_info *mfi = file->private_data; + + return mtd_mmap_capabilities(mfi->mtd); +} #endif /* @@ -1160,27 +1137,10 @@ static const struct file_operations mtd_fops = { .mmap = mtdchar_mmap, #ifndef CONFIG_MMU .get_unmapped_area = mtdchar_get_unmapped_area, + .mmap_capabilities = mtdchar_mmap_capabilities, #endif }; -static const struct super_operations mtd_ops = { - .drop_inode = generic_delete_inode, - .statfs = simple_statfs, -}; - -static struct dentry *mtd_inodefs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_pseudo(fs_type, "mtd_inode:", &mtd_ops, NULL, MTD_INODE_FS_MAGIC); -} - -static struct file_system_type mtd_inodefs_type = { - .name = "mtd_inodefs", - .mount = mtd_inodefs_mount, - .kill_sb = kill_anon_super, -}; -MODULE_ALIAS_FS("mtd_inodefs"); - int __init init_mtdchar(void) { int ret; @@ -1193,23 +1153,11 @@ int __init init_mtdchar(void) return ret; } - ret = register_filesystem(&mtd_inodefs_type); - if (ret) { - pr_err("Can't register mtd_inodefs filesystem, error %d\n", - ret); - goto err_unregister_chdev; - } - - return ret; - -err_unregister_chdev: - __unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd"); return ret; } void __exit cleanup_mtdchar(void) { - unregister_filesystem(&mtd_inodefs_type); __unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd"); } diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index b900056..eacc3aa 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -732,8 +732,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c concat->mtd.ecc_stats.badblocks = subdev[0]->ecc_stats.badblocks; - concat->mtd.backing_dev_info = subdev[0]->backing_dev_info; - concat->subdev[0] = 
subdev[0]; for (i = 1; i < num_devs; i++) { @@ -761,14 +759,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c subdev[i]->flags & MTD_WRITEABLE; } - /* only permit direct mapping if the BDIs are all the same - * - copy-mapping is still permitted - */ - if (concat->mtd.backing_dev_info != - subdev[i]->backing_dev_info) - concat->mtd.backing_dev_info = - &default_backing_dev_info; - concat->mtd.size += subdev[i]->size; concat->mtd.ecc_stats.badblocks += subdev[i]->ecc_stats.badblocks; diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 4c61187..0ec4d6e 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -43,33 +43,7 @@ #include "mtdcore.h" -/* - * backing device capabilities for non-mappable devices (such as NAND flash) - * - permits private mappings, copies are taken of the data - */ -static struct backing_dev_info mtd_bdi_unmappable = { - .capabilities = BDI_CAP_MAP_COPY, -}; - -/* - * backing device capabilities for R/O mappable devices (such as ROM) - * - permits private mappings, copies are taken of the data - * - permits non-writable shared mappings - */ -static struct backing_dev_info mtd_bdi_ro_mappable = { - .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT | - BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP), -}; - -/* - * backing device capabilities for writable mappable devices (such as RAM) - * - permits private mappings, copies are taken of the data - * - permits non-writable shared mappings - */ -static struct backing_dev_info mtd_bdi_rw_mappable = { - .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT | - BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP | - BDI_CAP_WRITE_MAP), +static struct backing_dev_info mtd_bdi = { }; static int mtd_cls_suspend(struct device *dev, pm_message_t state); @@ -365,6 +339,23 @@ static struct device_type mtd_devtype = { .release = mtd_release, }; +#ifndef CONFIG_MMU +unsigned mtd_mmap_capabilities(struct mtd_info *mtd) +{ + switch (mtd->type) { + case MTD_RAM: + return 
NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC | + NOMMU_MAP_READ | NOMMU_MAP_WRITE; + case MTD_ROM: + return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC | + NOMMU_MAP_READ; + default: + return NOMMU_MAP_COPY; + } +} +EXPORT_SYMBOL_GPL(mtd_mmap_capabilities); +#endif + /** * add_mtd_device - register an MTD device * @mtd: pointer to new MTD device info structure @@ -380,19 +371,7 @@ int add_mtd_device(struct mtd_info *mtd) struct mtd_notifier *not; int i, error; - if (!mtd->backing_dev_info) { - switch (mtd->type) { - case MTD_RAM: - mtd->backing_dev_info = &mtd_bdi_rw_mappable; - break; - case MTD_ROM: - mtd->backing_dev_info = &mtd_bdi_ro_mappable; - break; - default: - mtd->backing_dev_info = &mtd_bdi_unmappable; - break; - } - } + mtd->backing_dev_info = &mtd_bdi; BUG_ON(mtd->writesize == 0); mutex_lock(&mtd_table_mutex); @@ -1237,17 +1216,9 @@ static int __init init_mtd(void) if (ret) goto err_reg; - ret = mtd_bdi_init(&mtd_bdi_unmappable, "mtd-unmap"); - if (ret) - goto err_bdi1; - - ret = mtd_bdi_init(&mtd_bdi_ro_mappable, "mtd-romap"); - if (ret) - goto err_bdi2; - - ret = mtd_bdi_init(&mtd_bdi_rw_mappable, "mtd-rwmap"); + ret = mtd_bdi_init(&mtd_bdi, "mtd"); if (ret) - goto err_bdi3; + goto err_bdi; proc_mtd = proc_create("mtd", 0, NULL, &mtd_proc_ops); @@ -1260,11 +1231,7 @@ static int __init init_mtd(void) out_procfs: if (proc_mtd) remove_proc_entry("mtd", NULL); -err_bdi3: - bdi_destroy(&mtd_bdi_ro_mappable); -err_bdi2: - bdi_destroy(&mtd_bdi_unmappable); -err_bdi1: +err_bdi: class_unregister(&mtd_class); err_reg: pr_err("Error registering mtd class or bdi: %d\n", ret); @@ -1277,9 +1244,7 @@ static void __exit cleanup_mtd(void) if (proc_mtd) remove_proc_entry("mtd", NULL); class_unregister(&mtd_class); - bdi_destroy(&mtd_bdi_unmappable); - bdi_destroy(&mtd_bdi_ro_mappable); - bdi_destroy(&mtd_bdi_rw_mappable); + bdi_destroy(&mtd_bdi); } module_init(init_mtd); diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index a3e3a7d..e779de3 
100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -378,7 +378,6 @@ static struct mtd_part *allocate_partition(struct mtd_info *master, slave->mtd.name = name; slave->mtd.owner = master->owner; - slave->mtd.backing_dev_info = master->backing_dev_info; /* NOTE: we don't arrange MTDs as a tree; it'd be error-prone * to have the same data be in two different partitions. diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 7f90022..96128cb 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -28,8 +28,8 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct gendisk *disk, fmode_t mode); static void dcssblk_make_request(struct request_queue *q, struct bio *bio); -static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum, - void **kaddr, unsigned long *pfn); +static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, + void **kaddr, unsigned long *pfn, long size); static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; @@ -877,25 +877,22 @@ fail: bio_io_error(bio); } -static int +static long dcssblk_direct_access (struct block_device *bdev, sector_t secnum, - void **kaddr, unsigned long *pfn) + void **kaddr, unsigned long *pfn, long size) { struct dcssblk_dev_info *dev_info; - unsigned long pgoff; + unsigned long offset, dev_sz; dev_info = bdev->bd_disk->private_data; if (!dev_info) return -ENODEV; - if (secnum % (PAGE_SIZE/512)) - return -EINVAL; - pgoff = secnum / (PAGE_SIZE / 512); - if ((pgoff+1)*PAGE_SIZE-1 > dev_info->end - dev_info->start) - return -ERANGE; - *kaddr = (void *) (dev_info->start+pgoff*PAGE_SIZE); + dev_sz = dev_info->end - dev_info->start; + offset = secnum * 512; + *kaddr = (void *) (dev_info->start + offset); *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT; - return 0; + return dev_sz - offset; } static void diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 17bb541..54d7a6c 100644 --- 
a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2197,6 +2197,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost) shost->tag_set.cmd_size = cmd_size; shost->tag_set.numa_node = NUMA_NO_NODE; shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + shost->tag_set.flags |= + BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy); shost->tag_set.driver_data = shost; return blk_mq_alloc_tag_set(&shost->tag_set); diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 0deb385..9c0a520 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -277,7 +277,8 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, if (!shost_use_blk_mq(sdev->host) && (shost->bqt || shost->hostt->use_blk_tags)) { blk_queue_init_tags(sdev->request_queue, - sdev->host->cmd_per_lun, shost->bqt); + sdev->host->cmd_per_lun, shost->bqt, + shost->hostt->tag_alloc_policy); } scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun); diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index a668c88..0cbc1fb 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1719,22 +1719,19 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) } if (iov_count) { - int len, size = sizeof(struct sg_iovec) * iov_count; + int size = sizeof(struct iovec) * iov_count; struct iovec *iov; + struct iov_iter i; iov = memdup_user(hp->dxferp, size); if (IS_ERR(iov)) return PTR_ERR(iov); - len = iov_length(iov, iov_count); - if (hp->dxfer_len < len) { - iov_count = iov_shorten(iov, iov_count, hp->dxfer_len); - len = hp->dxfer_len; - } + iov_iter_init(&i, rw, iov, iov_count, + min_t(size_t, hp->dxfer_len, + iov_length(iov, iov_count))); - res = blk_rq_map_user_iov(q, rq, md, (struct sg_iovec *)iov, - iov_count, - len, GFP_ATOMIC); + res = blk_rq_map_user_iov(q, rq, md, &i, GFP_ATOMIC); kfree(iov); } else res = blk_rq_map_user(q, rq, md, hp->dxferp, diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c 
b/drivers/staging/lustre/lustre/llite/llite_lib.c index a3367bf..45aaa1c 100644 --- a/drivers/staging/lustre/lustre/llite/llite_lib.c +++ b/drivers/staging/lustre/lustre/llite/llite_lib.c @@ -987,7 +987,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) if (err) goto out_free; lsi->lsi_flags |= LSI_BDI_INITIALIZED; - lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY; + lsi->lsi_bdi.capabilities = 0; err = ll_bdi_register(&lsi->lsi_bdi); if (err) goto out_free; @@ -1812,10 +1812,6 @@ void ll_read_inode2(struct inode *inode, void *opaque) /* OIDEBUG(inode); */ - /* initializing backing dev info. */ - inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi; - - if (S_ISREG(inode->i_mode)) { struct ll_sb_info *sbi = ll_i2sbi(inode); |