summary refs log tree commit diff stats
path: root/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'drivers')
-rw-r--r-- drivers/ata/libata-core.c 70
-rw-r--r-- drivers/ata/libata-scsi.c 33
-rw-r--r-- drivers/ata/libata.h 4
-rw-r--r-- drivers/ata/sata_sil24.c 1
-rw-r--r-- drivers/block/brd.c 123
-rw-r--r-- drivers/block/drbd/drbd_receiver.c 2
-rw-r--r-- drivers/block/floppy.c 17
-rw-r--r-- drivers/block/loop.c 416
-rw-r--r-- drivers/block/loop.h 18
-rw-r--r-- drivers/block/null_blk.c 2
-rw-r--r-- drivers/block/nvme-core.c 128
-rw-r--r-- drivers/block/osdblk.c 2
-rw-r--r-- drivers/block/xen-blkback/common.h 9
-rw-r--r-- drivers/block/xen-blkback/xenbus.c 4
-rw-r--r-- drivers/block/xen-blkfront.c 4
-rw-r--r-- drivers/char/mem.c 64
-rw-r--r-- drivers/char/raw.c 4
-rw-r--r-- drivers/md/Kconfig 5
-rw-r--r-- drivers/md/bitmap.c 15
-rw-r--r-- drivers/md/dm-bufio.c 3
-rw-r--r-- drivers/md/dm-cache-target.c 5
-rw-r--r-- drivers/md/dm-ioctl.c 4
-rw-r--r-- drivers/md/dm-log-userspace-base.c 5
-rw-r--r-- drivers/md/dm-mpath.c 87
-rw-r--r-- drivers/md/dm-raid.c 24
-rw-r--r-- drivers/md/dm-snap-persistent.c 14
-rw-r--r-- drivers/md/dm-table.c 72
-rw-r--r-- drivers/md/dm-target.c 15
-rw-r--r-- drivers/md/dm-thin-metadata.c 9
-rw-r--r-- drivers/md/dm-thin-metadata.h 2
-rw-r--r-- drivers/md/dm-thin.c 5
-rw-r--r-- drivers/md/dm.c 346
-rw-r--r-- drivers/md/dm.h 11
-rw-r--r-- drivers/md/faulty.c 8
-rw-r--r-- drivers/md/linear.c 67
-rw-r--r-- drivers/md/md.c 816
-rw-r--r-- drivers/md/md.h 57
-rw-r--r-- drivers/md/multipath.c 22
-rw-r--r-- drivers/md/raid0.c 29
-rw-r--r-- drivers/md/raid1.c 52
-rw-r--r-- drivers/md/raid1.h 3
-rw-r--r-- drivers/md/raid10.c 49
-rw-r--r-- drivers/md/raid10.h 3
-rw-r--r-- drivers/md/raid5.c 334
-rw-r--r-- drivers/md/raid5.h 1
-rw-r--r-- drivers/mtd/mtdchar.c 72
-rw-r--r-- drivers/mtd/mtdconcat.c 10
-rw-r--r-- drivers/mtd/mtdcore.c 81
-rw-r--r-- drivers/mtd/mtdpart.c 1
-rw-r--r-- drivers/s390/block/dcssblk.c 21
-rw-r--r-- drivers/scsi/scsi_lib.c 2
-rw-r--r-- drivers/scsi/scsi_scan.c 3
-rw-r--r-- drivers/scsi/sg.c 15
-rw-r--r-- drivers/staging/lustre/lustre/llite/llite_lib.c 6
54 files changed, 1788 insertions, 1387 deletions
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 4b0d5e7..4c35f08 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -1585,8 +1585,6 @@ unsigned ata_exec_internal_sg(struct ata_device *dev,
else
tag = 0;
- if (test_and_set_bit(tag, &ap->qc_allocated))
- BUG();
qc = __ata_qc_from_tag(ap, tag);
qc->tag = tag;
@@ -4722,69 +4720,36 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
}
/**
- * ata_qc_new - Request an available ATA command, for queueing
- * @ap: target port
- *
- * Some ATA host controllers may implement a queue depth which is less
- * than ATA_MAX_QUEUE. So we shouldn't allocate a tag which is beyond
- * the hardware limitation.
+ * ata_qc_new_init - Request an available ATA command, and initialize it
+ * @dev: Device from whom we request an available command structure
*
* LOCKING:
* None.
*/
-static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
+struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag)
{
- struct ata_queued_cmd *qc = NULL;
- unsigned int max_queue = ap->host->n_tags;
- unsigned int i, tag;
+ struct ata_port *ap = dev->link->ap;
+ struct ata_queued_cmd *qc;
/* no command while frozen */
if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
return NULL;
- for (i = 0, tag = ap->last_tag + 1; i < max_queue; i++, tag++) {
- if (ap->flags & ATA_FLAG_LOWTAG)
- tag = i;
- else
- tag = tag < max_queue ? tag : 0;
-
- /* the last tag is reserved for internal command. */
- if (tag == ATA_TAG_INTERNAL)
- continue;
-
- if (!test_and_set_bit(tag, &ap->qc_allocated)) {
- qc = __ata_qc_from_tag(ap, tag);
- qc->tag = tag;
- ap->last_tag = tag;
- break;
- }
+ /* libsas case */
+ if (!ap->scsi_host) {
+ tag = ata_sas_allocate_tag(ap);
+ if (tag < 0)
+ return NULL;
}
- return qc;
-}
-
-/**
- * ata_qc_new_init - Request an available ATA command, and initialize it
- * @dev: Device from whom we request an available command structure
- *
- * LOCKING:
- * None.
- */
-
-struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev)
-{
- struct ata_port *ap = dev->link->ap;
- struct ata_queued_cmd *qc;
-
- qc = ata_qc_new(ap);
- if (qc) {
- qc->scsicmd = NULL;
- qc->ap = ap;
- qc->dev = dev;
+ qc = __ata_qc_from_tag(ap, tag);
+ qc->tag = tag;
+ qc->scsicmd = NULL;
+ qc->ap = ap;
+ qc->dev = dev;
- ata_qc_reinit(qc);
- }
+ ata_qc_reinit(qc);
return qc;
}
@@ -4811,7 +4776,8 @@ void ata_qc_free(struct ata_queued_cmd *qc)
tag = qc->tag;
if (likely(ata_tag_valid(tag))) {
qc->tag = ATA_TAG_POISON;
- clear_bit(tag, &ap->qc_allocated);
+ if (!ap->scsi_host)
+ ata_sas_free_tag(tag, ap);
}
}
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 2807293..b061ba2 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -756,7 +756,7 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev,
{
struct ata_queued_cmd *qc;
- qc = ata_qc_new_init(dev);
+ qc = ata_qc_new_init(dev, cmd->request->tag);
if (qc) {
qc->scsicmd = cmd;
qc->scsidone = cmd->scsi_done;
@@ -3668,6 +3668,9 @@ int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht)
*/
shost->max_host_blocked = 1;
+ if (scsi_init_shared_tag_map(shost, host->n_tags))
+ goto err_add;
+
rc = scsi_add_host_with_dma(ap->scsi_host,
&ap->tdev, ap->host->dev);
if (rc)
@@ -4230,3 +4233,31 @@ int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap)
return rc;
}
EXPORT_SYMBOL_GPL(ata_sas_queuecmd);
+
+int ata_sas_allocate_tag(struct ata_port *ap)
+{
+ unsigned int max_queue = ap->host->n_tags;
+ unsigned int i, tag;
+
+ for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) {
+ if (ap->flags & ATA_FLAG_LOWTAG)
+ tag = 1;
+ else
+ tag = tag < max_queue ? tag : 0;
+
+ /* the last tag is reserved for internal command. */
+ if (tag == ATA_TAG_INTERNAL)
+ continue;
+
+ if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) {
+ ap->sas_last_tag = tag;
+ return tag;
+ }
+ }
+ return -1;
+}
+
+void ata_sas_free_tag(unsigned int tag, struct ata_port *ap)
+{
+ clear_bit(tag, &ap->sas_tag_allocated);
+}
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 82ebe26..f840ca1 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -63,7 +63,7 @@ extern struct ata_link *ata_dev_phys_link(struct ata_device *dev);
extern void ata_force_cbl(struct ata_port *ap);
extern u64 ata_tf_to_lba(const struct ata_taskfile *tf);
extern u64 ata_tf_to_lba48(const struct ata_taskfile *tf);
-extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev);
+extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag);
extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev,
u64 block, u32 n_block, unsigned int tf_flags,
unsigned int tag);
@@ -144,6 +144,8 @@ extern void ata_scsi_dev_rescan(struct work_struct *work);
extern int ata_bus_probe(struct ata_port *ap);
extern int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
unsigned int id, u64 lun);
+int ata_sas_allocate_tag(struct ata_port *ap);
+void ata_sas_free_tag(unsigned int tag, struct ata_port *ap);
/* libata-eh.c */
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index ea65594..ba2667f 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -388,6 +388,7 @@ static struct scsi_host_template sil24_sht = {
.can_queue = SIL24_MAX_CMDS,
.sg_tablesize = SIL24_MAX_SGE,
.dma_boundary = ATA_DMA_BOUNDARY,
+ .tag_alloc_policy = BLK_TAG_ALLOC_FIFO,
};
static struct ata_port_operations sil24_ops = {
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 3598110..c01b921 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -370,25 +370,25 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
}
#ifdef CONFIG_BLK_DEV_XIP
-static int brd_direct_access(struct block_device *bdev, sector_t sector,
- void **kaddr, unsigned long *pfn)
+static long brd_direct_access(struct block_device *bdev, sector_t sector,
+ void **kaddr, unsigned long *pfn, long size)
{
struct brd_device *brd = bdev->bd_disk->private_data;
struct page *page;
if (!brd)
return -ENODEV;
- if (sector & (PAGE_SECTORS-1))
- return -EINVAL;
- if (sector + PAGE_SECTORS > get_capacity(bdev->bd_disk))
- return -ERANGE;
page = brd_insert_page(brd, sector);
if (!page)
return -ENOSPC;
*kaddr = page_address(page);
*pfn = page_to_pfn(page);
- return 0;
+ /*
+ * TODO: If size > PAGE_SIZE, we could look to see if the next page in
+ * the file happens to be mapped to the next page of physical RAM.
+ */
+ return PAGE_SIZE;
}
#endif
@@ -438,19 +438,18 @@ static const struct block_device_operations brd_fops = {
/*
* And now the modules code and kernel interface.
*/
-static int rd_nr;
-int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
-static int max_part;
-static int part_shift;
-static int part_show = 0;
+static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, S_IRUGO);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
+
+int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, int, S_IRUGO);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
+
+static int max_part = 1;
module_param(max_part, int, S_IRUGO);
-MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk");
-module_param(part_show, int, S_IRUGO);
-MODULE_PARM_DESC(part_show, "Control RAM disk visibility in /proc/partitions");
+MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
+
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");
@@ -487,25 +486,33 @@ static struct brd_device *brd_alloc(int i)
brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
if (!brd->brd_queue)
goto out_free_dev;
+
blk_queue_make_request(brd->brd_queue, brd_make_request);
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
+ /* This is so fdisk will align partitions on 4k, because of
+ * direct_access API needing 4k alignment, returning a PFN
+ * (This is only a problem on very small devices <= 4M,
+ * otherwise fdisk will align on 1M. Regardless this call
+ * is harmless)
+ */
+ blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
+
brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
brd->brd_queue->limits.discard_zeroes_data = 1;
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
- disk = brd->brd_disk = alloc_disk(1 << part_shift);
+ disk = brd->brd_disk = alloc_disk(max_part);
if (!disk)
goto out_free_queue;
disk->major = RAMDISK_MAJOR;
- disk->first_minor = i << part_shift;
+ disk->first_minor = i * max_part;
disk->fops = &brd_fops;
disk->private_data = brd;
disk->queue = brd->brd_queue;
- if (!part_show)
- disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
+ disk->flags = GENHD_FL_EXT_DEVT;
sprintf(disk->disk_name, "ram%d", i);
set_capacity(disk, rd_size * 2);
@@ -527,10 +534,11 @@ static void brd_free(struct brd_device *brd)
kfree(brd);
}
-static struct brd_device *brd_init_one(int i)
+static struct brd_device *brd_init_one(int i, bool *new)
{
struct brd_device *brd;
+ *new = false;
list_for_each_entry(brd, &brd_devices, brd_list) {
if (brd->brd_number == i)
goto out;
@@ -541,6 +549,7 @@ static struct brd_device *brd_init_one(int i)
add_disk(brd->brd_disk);
list_add_tail(&brd->brd_list, &brd_devices);
}
+ *new = true;
out:
return brd;
}
@@ -556,70 +565,46 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data)
{
struct brd_device *brd;
struct kobject *kobj;
+ bool new;
mutex_lock(&brd_devices_mutex);
- brd = brd_init_one(MINOR(dev) >> part_shift);
+ brd = brd_init_one(MINOR(dev) / max_part, &new);
kobj = brd ? get_disk(brd->brd_disk) : NULL;
mutex_unlock(&brd_devices_mutex);
- *part = 0;
+ if (new)
+ *part = 0;
+
return kobj;
}
static int __init brd_init(void)
{
- int i, nr;
- unsigned long range;
struct brd_device *brd, *next;
+ int i;
/*
* brd module now has a feature to instantiate underlying device
* structure on-demand, provided that there is an access dev node.
- * However, this will not work well with user space tool that doesn't
- * know about such "feature". In order to not break any existing
- * tool, we do the following:
*
- * (1) if rd_nr is specified, create that many upfront, and this
- * also becomes a hard limit.
- * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT
- * (default 16) rd device on module load, user can further
- * extend brd device by create dev node themselves and have
- * kernel automatically instantiate actual device on-demand.
+ * (1) if rd_nr is specified, create that many upfront. else
+ * it defaults to CONFIG_BLK_DEV_RAM_COUNT
+ * (2) User can further extend brd devices by create dev node themselves
+ * and have kernel automatically instantiate actual device
+ * on-demand. Example:
+ * mknod /path/devnod_name b 1 X # 1 is the rd major
+ * fdisk -l /path/devnod_name
+ * If (X / max_part) was not already created it will be created
+ * dynamically.
*/
- part_shift = 0;
- if (max_part > 0) {
- part_shift = fls(max_part);
-
- /*
- * Adjust max_part according to part_shift as it is exported
- * to user space so that user can decide correct minor number
- * if [s]he want to create more devices.
- *
- * Note that -1 is required because partition 0 is reserved
- * for the whole disk.
- */
- max_part = (1UL << part_shift) - 1;
- }
-
- if ((1UL << part_shift) > DISK_MAX_PARTS)
- return -EINVAL;
-
- if (rd_nr > 1UL << (MINORBITS - part_shift))
- return -EINVAL;
-
- if (rd_nr) {
- nr = rd_nr;
- range = rd_nr << part_shift;
- } else {
- nr = CONFIG_BLK_DEV_RAM_COUNT;
- range = 1UL << MINORBITS;
- }
-
if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
return -EIO;
- for (i = 0; i < nr; i++) {
+ if (unlikely(!max_part))
+ max_part = 1;
+
+ for (i = 0; i < rd_nr; i++) {
brd = brd_alloc(i);
if (!brd)
goto out_free;
@@ -631,10 +616,10 @@ static int __init brd_init(void)
list_for_each_entry(brd, &brd_devices, brd_list)
add_disk(brd->brd_disk);
- blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
+ blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
THIS_MODULE, brd_probe, NULL, NULL);
- printk(KERN_INFO "brd: module loaded\n");
+ pr_info("brd: module loaded\n");
return 0;
out_free:
@@ -644,21 +629,21 @@ out_free:
}
unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+ pr_info("brd: module NOT loaded !!!\n");
return -ENOMEM;
}
static void __exit brd_exit(void)
{
- unsigned long range;
struct brd_device *brd, *next;
- range = rd_nr ? rd_nr << part_shift : 1UL << MINORBITS;
-
list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
brd_del_one(brd);
- blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
+ blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+
+ pr_info("brd: module unloaded\n");
}
module_init(brd_init);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index d169b4a..cee2035 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1388,7 +1388,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
list_add_tail(&peer_req->w.list, &device->active_ee);
spin_unlock_irq(&device->resource->req_lock);
if (blkdev_issue_zeroout(device->ldev->backing_bdev,
- sector, data_size >> 9, GFP_NOIO))
+ sector, data_size >> 9, GFP_NOIO, false))
peer_req->flags |= EE_WAS_ERROR;
drbd_endio_write_sec_final(peer_req);
return 0;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 56d46ff..a08cda9 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4112,6 +4112,13 @@ static ssize_t floppy_cmos_show(struct device *dev,
static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
+static struct attribute *floppy_dev_attrs[] = {
+ &dev_attr_cmos.attr,
+ NULL
+};
+
+ATTRIBUTE_GROUPS(floppy_dev);
+
static void floppy_device_release(struct device *dev)
{
}
@@ -4324,16 +4331,12 @@ static int __init do_floppy_init(void)
floppy_device[drive].name = floppy_device_name;
floppy_device[drive].id = drive;
floppy_device[drive].dev.release = floppy_device_release;
+ floppy_device[drive].dev.groups = floppy_dev_groups;
err = platform_device_register(&floppy_device[drive]);
if (err)
goto out_remove_drives;
- err = device_create_file(&floppy_device[drive].dev,
- &dev_attr_cmos);
- if (err)
- goto out_unreg_platform_dev;
-
/* to be cleaned up... */
disks[drive]->private_data = (void *)(long)drive;
disks[drive]->flags |= GENHD_FL_REMOVABLE;
@@ -4343,13 +4346,10 @@ static int __init do_floppy_init(void)
return 0;
-out_unreg_platform_dev:
- platform_device_unregister(&floppy_device[drive]);
out_remove_drives:
while (drive--) {
if (floppy_available(drive)) {
del_gendisk(disks[drive]);
- device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
platform_device_unregister(&floppy_device[drive]);
}
}
@@ -4594,7 +4594,6 @@ static void __exit floppy_module_exit(void)
if (floppy_available(drive)) {
del_gendisk(disks[drive]);
- device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
platform_device_unregister(&floppy_device[drive]);
}
blk_cleanup_queue(disks[drive]->queue);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6cb1beb..d1f168b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -85,6 +85,8 @@ static DEFINE_MUTEX(loop_index_mutex);
static int max_part;
static int part_shift;
+static struct workqueue_struct *loop_wq;
+
/*
* Transfer functions
*/
@@ -284,12 +286,12 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
return ret;
}
-static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
+static int lo_send(struct loop_device *lo, struct request *rq, loff_t pos)
{
int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t,
struct page *page);
struct bio_vec bvec;
- struct bvec_iter iter;
+ struct req_iterator iter;
struct page *page = NULL;
int ret = 0;
@@ -303,7 +305,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
do_lo_send = do_lo_send_direct_write;
}
- bio_for_each_segment(bvec, bio, iter) {
+ rq_for_each_segment(bvec, rq, iter) {
ret = do_lo_send(lo, &bvec, pos, page);
if (ret < 0)
break;
@@ -391,19 +393,22 @@ do_lo_receive(struct loop_device *lo,
}
static int
-lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
+lo_receive(struct loop_device *lo, struct request *rq, int bsize, loff_t pos)
{
struct bio_vec bvec;
- struct bvec_iter iter;
+ struct req_iterator iter;
ssize_t s;
- bio_for_each_segment(bvec, bio, iter) {
+ rq_for_each_segment(bvec, rq, iter) {
s = do_lo_receive(lo, &bvec, bsize, pos);
if (s < 0)
return s;
if (s != bvec.bv_len) {
- zero_fill_bio(bio);
+ struct bio *bio;
+
+ __rq_for_each_bio(bio, rq)
+ zero_fill_bio(bio);
break;
}
pos += bvec.bv_len;
@@ -411,106 +416,58 @@ lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
return 0;
}
-static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
+static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
{
- loff_t pos;
+ /*
+ * We use punch hole to reclaim the free space used by the
+ * image a.k.a. discard. However we do not support discard if
+ * encryption is enabled, because it may give an attacker
+ * useful information.
+ */
+ struct file *file = lo->lo_backing_file;
+ int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
int ret;
- pos = ((loff_t) bio->bi_iter.bi_sector << 9) + lo->lo_offset;
-
- if (bio_rw(bio) == WRITE) {
- struct file *file = lo->lo_backing_file;
-
- if (bio->bi_rw & REQ_FLUSH) {
- ret = vfs_fsync(file, 0);
- if (unlikely(ret && ret != -EINVAL)) {
- ret = -EIO;
- goto out;
- }
- }
-
- /*
- * We use punch hole to reclaim the free space used by the
- * image a.k.a. discard. However we do not support discard if
- * encryption is enabled, because it may give an attacker
- * useful information.
- */
- if (bio->bi_rw & REQ_DISCARD) {
- struct file *file = lo->lo_backing_file;
- int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
-
- if ((!file->f_op->fallocate) ||
- lo->lo_encrypt_key_size) {
- ret = -EOPNOTSUPP;
- goto out;
- }
- ret = file->f_op->fallocate(file, mode, pos,
- bio->bi_iter.bi_size);
- if (unlikely(ret && ret != -EINVAL &&
- ret != -EOPNOTSUPP))
- ret = -EIO;
- goto out;
- }
-
- ret = lo_send(lo, bio, pos);
-
- if ((bio->bi_rw & REQ_FUA) && !ret) {
- ret = vfs_fsync(file, 0);
- if (unlikely(ret && ret != -EINVAL))
- ret = -EIO;
- }
- } else
- ret = lo_receive(lo, bio, lo->lo_blocksize, pos);
+ if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
-out:
+ ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq));
+ if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
+ ret = -EIO;
+ out:
return ret;
}
-/*
- * Add bio to back of pending list
- */
-static void loop_add_bio(struct loop_device *lo, struct bio *bio)
+static int lo_req_flush(struct loop_device *lo, struct request *rq)
{
- lo->lo_bio_count++;
- bio_list_add(&lo->lo_bio_list, bio);
-}
+ struct file *file = lo->lo_backing_file;
+ int ret = vfs_fsync(file, 0);
+ if (unlikely(ret && ret != -EINVAL))
+ ret = -EIO;
-/*
- * Grab first pending buffer
- */
-static struct bio *loop_get_bio(struct loop_device *lo)
-{
- lo->lo_bio_count--;
- return bio_list_pop(&lo->lo_bio_list);
+ return ret;
}
-static void loop_make_request(struct request_queue *q, struct bio *old_bio)
+static int do_req_filebacked(struct loop_device *lo, struct request *rq)
{
- struct loop_device *lo = q->queuedata;
- int rw = bio_rw(old_bio);
-
- if (rw == READA)
- rw = READ;
+ loff_t pos;
+ int ret;
- BUG_ON(!lo || (rw != READ && rw != WRITE));
+ pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
- spin_lock_irq(&lo->lo_lock);
- if (lo->lo_state != Lo_bound)
- goto out;
- if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
- goto out;
- if (lo->lo_bio_count >= q->nr_congestion_on)
- wait_event_lock_irq(lo->lo_req_wait,
- lo->lo_bio_count < q->nr_congestion_off,
- lo->lo_lock);
- loop_add_bio(lo, old_bio);
- wake_up(&lo->lo_event);
- spin_unlock_irq(&lo->lo_lock);
- return;
+ if (rq->cmd_flags & REQ_WRITE) {
+ if (rq->cmd_flags & REQ_FLUSH)
+ ret = lo_req_flush(lo, rq);
+ else if (rq->cmd_flags & REQ_DISCARD)
+ ret = lo_discard(lo, rq, pos);
+ else
+ ret = lo_send(lo, rq, pos);
+ } else
+ ret = lo_receive(lo, rq, lo->lo_blocksize, pos);
-out:
- spin_unlock_irq(&lo->lo_lock);
- bio_io_error(old_bio);
+ return ret;
}
struct switch_request {
@@ -518,57 +475,26 @@ struct switch_request {
struct completion wait;
};
-static void do_loop_switch(struct loop_device *, struct switch_request *);
-
-static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
-{
- if (unlikely(!bio->bi_bdev)) {
- do_loop_switch(lo, bio->bi_private);
- bio_put(bio);
- } else {
- int ret = do_bio_filebacked(lo, bio);
- bio_endio(bio, ret);
- }
-}
-
/*
- * worker thread that handles reads/writes to file backed loop devices,
- * to avoid blocking in our make_request_fn. it also does loop decrypting
- * on reads for block backed loop, as that is too heavy to do from
- * b_end_io context where irqs may be disabled.
- *
- * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before
- * calling kthread_stop(). Therefore once kthread_should_stop() is
- * true, make_request will not place any more requests. Therefore
- * once kthread_should_stop() is true and lo_bio is NULL, we are
- * done with the loop.
+ * Do the actual switch; called from the BIO completion routine
*/
-static int loop_thread(void *data)
+static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
{
- struct loop_device *lo = data;
- struct bio *bio;
-
- set_user_nice(current, MIN_NICE);
-
- while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
-
- wait_event_interruptible(lo->lo_event,
- !bio_list_empty(&lo->lo_bio_list) ||
- kthread_should_stop());
-
- if (bio_list_empty(&lo->lo_bio_list))
- continue;
- spin_lock_irq(&lo->lo_lock);
- bio = loop_get_bio(lo);
- if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off)
- wake_up(&lo->lo_req_wait);
- spin_unlock_irq(&lo->lo_lock);
+ struct file *file = p->file;
+ struct file *old_file = lo->lo_backing_file;
+ struct address_space *mapping;
- BUG_ON(!bio);
- loop_handle_bio(lo, bio);
- }
+ /* if no new file, only flush of queued bios requested */
+ if (!file)
+ return;
- return 0;
+ mapping = file->f_mapping;
+ mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
+ lo->lo_backing_file = file;
+ lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
+ mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
+ lo->old_gfp_mask = mapping_gfp_mask(mapping);
+ mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
}
/*
@@ -579,15 +505,18 @@ static int loop_thread(void *data)
static int loop_switch(struct loop_device *lo, struct file *file)
{
struct switch_request w;
- struct bio *bio = bio_alloc(GFP_KERNEL, 0);
- if (!bio)
- return -ENOMEM;
- init_completion(&w.wait);
+
w.file = file;
- bio->bi_private = &w;
- bio->bi_bdev = NULL;
- loop_make_request(lo->lo_queue, bio);
- wait_for_completion(&w.wait);
+
+ /* freeze queue and wait for completion of scheduled requests */
+ blk_mq_freeze_queue(lo->lo_queue);
+
+ /* do the switch action */
+ do_loop_switch(lo, &w);
+
+ /* unfreeze */
+ blk_mq_unfreeze_queue(lo->lo_queue);
+
return 0;
}
@@ -596,39 +525,10 @@ static int loop_switch(struct loop_device *lo, struct file *file)
*/
static int loop_flush(struct loop_device *lo)
{
- /* loop not yet configured, no running thread, nothing to flush */
- if (!lo->lo_thread)
- return 0;
-
return loop_switch(lo, NULL);
}
/*
- * Do the actual switch; called from the BIO completion routine
- */
-static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
-{
- struct file *file = p->file;
- struct file *old_file = lo->lo_backing_file;
- struct address_space *mapping;
-
- /* if no new file, only flush of queued bios requested */
- if (!file)
- goto out;
-
- mapping = file->f_mapping;
- mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
- lo->lo_backing_file = file;
- lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
- mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
- lo->old_gfp_mask = mapping_gfp_mask(mapping);
- mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
-out:
- complete(&p->wait);
-}
-
-
-/*
* loop_change_fd switched the backing store of a loopback device to
* a new file. This is useful for operating system installers to free up
* the original file and in High Availability environments to switch to
@@ -889,12 +789,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
lo->transfer = transfer_none;
lo->ioctl = NULL;
lo->lo_sizelimit = 0;
- lo->lo_bio_count = 0;
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
- bio_list_init(&lo->lo_bio_list);
-
if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
blk_queue_flush(lo->lo_queue, REQ_FLUSH);
@@ -906,14 +803,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
set_blocksize(bdev, lo_blocksize);
- lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
- lo->lo_number);
- if (IS_ERR(lo->lo_thread)) {
- error = PTR_ERR(lo->lo_thread);
- goto out_clr;
- }
lo->lo_state = Lo_bound;
- wake_up_process(lo->lo_thread);
if (part_shift)
lo->lo_flags |= LO_FLAGS_PARTSCAN;
if (lo->lo_flags & LO_FLAGS_PARTSCAN)
@@ -925,18 +815,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
bdgrab(bdev);
return 0;
-out_clr:
- loop_sysfs_exit(lo);
- lo->lo_thread = NULL;
- lo->lo_device = NULL;
- lo->lo_backing_file = NULL;
- lo->lo_flags = 0;
- set_capacity(lo->lo_disk, 0);
- invalidate_bdev(bdev);
- bd_set_size(bdev, 0);
- kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
- mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
- lo->lo_state = Lo_unbound;
out_putf:
fput(file);
out:
@@ -1012,11 +890,6 @@ static int loop_clr_fd(struct loop_device *lo)
spin_lock_irq(&lo->lo_lock);
lo->lo_state = Lo_rundown;
- spin_unlock_irq(&lo->lo_lock);
-
- kthread_stop(lo->lo_thread);
-
- spin_lock_irq(&lo->lo_lock);
lo->lo_backing_file = NULL;
spin_unlock_irq(&lo->lo_lock);
@@ -1028,7 +901,6 @@ static int loop_clr_fd(struct loop_device *lo)
lo->lo_offset = 0;
lo->lo_sizelimit = 0;
lo->lo_encrypt_key_size = 0;
- lo->lo_thread = NULL;
memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
@@ -1601,6 +1473,105 @@ int loop_unregister_transfer(int number)
EXPORT_SYMBOL(loop_register_transfer);
EXPORT_SYMBOL(loop_unregister_transfer);
+static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+
+ blk_mq_start_request(bd->rq);
+
+ if (cmd->rq->cmd_flags & REQ_WRITE) {
+ struct loop_device *lo = cmd->rq->q->queuedata;
+ bool need_sched = true;
+
+ spin_lock_irq(&lo->lo_lock);
+ if (lo->write_started)
+ need_sched = false;
+ else
+ lo->write_started = true;
+ list_add_tail(&cmd->list, &lo->write_cmd_head);
+ spin_unlock_irq(&lo->lo_lock);
+
+ if (need_sched)
+ queue_work(loop_wq, &lo->write_work);
+ } else {
+ queue_work(loop_wq, &cmd->read_work);
+ }
+
+ return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static void loop_handle_cmd(struct loop_cmd *cmd)
+{
+ const bool write = cmd->rq->cmd_flags & REQ_WRITE;
+ struct loop_device *lo = cmd->rq->q->queuedata;
+ int ret = -EIO;
+
+ if (lo->lo_state != Lo_bound)
+ goto failed;
+
+ if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
+ goto failed;
+
+ ret = do_req_filebacked(lo, cmd->rq);
+
+ failed:
+ if (ret)
+ cmd->rq->errors = -EIO;
+ blk_mq_complete_request(cmd->rq);
+}
+
+static void loop_queue_write_work(struct work_struct *work)
+{
+ struct loop_device *lo =
+ container_of(work, struct loop_device, write_work);
+ LIST_HEAD(cmd_list);
+
+ spin_lock_irq(&lo->lo_lock);
+ repeat:
+ list_splice_init(&lo->write_cmd_head, &cmd_list);
+ spin_unlock_irq(&lo->lo_lock);
+
+ while (!list_empty(&cmd_list)) {
+ struct loop_cmd *cmd = list_first_entry(&cmd_list,
+ struct loop_cmd, list);
+ list_del_init(&cmd->list);
+ loop_handle_cmd(cmd);
+ }
+
+ spin_lock_irq(&lo->lo_lock);
+ if (!list_empty(&lo->write_cmd_head))
+ goto repeat;
+ lo->write_started = false;
+ spin_unlock_irq(&lo->lo_lock);
+}
+
+static void loop_queue_read_work(struct work_struct *work)
+{
+ struct loop_cmd *cmd =
+ container_of(work, struct loop_cmd, read_work);
+
+ loop_handle_cmd(cmd);
+}
+
+static int loop_init_request(void *data, struct request *rq,
+ unsigned int hctx_idx, unsigned int request_idx,
+ unsigned int numa_node)
+{
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+ cmd->rq = rq;
+ INIT_WORK(&cmd->read_work, loop_queue_read_work);
+
+ return 0;
+}
+
+static struct blk_mq_ops loop_mq_ops = {
+ .queue_rq = loop_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .init_request = loop_init_request,
+};
+
static int loop_add(struct loop_device **l, int i)
{
struct loop_device *lo;
@@ -1627,16 +1598,28 @@ static int loop_add(struct loop_device **l, int i)
i = err;
err = -ENOMEM;
- lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
- if (!lo->lo_queue)
+ lo->tag_set.ops = &loop_mq_ops;
+ lo->tag_set.nr_hw_queues = 1;
+ lo->tag_set.queue_depth = 128;
+ lo->tag_set.numa_node = NUMA_NO_NODE;
+ lo->tag_set.cmd_size = sizeof(struct loop_cmd);
+ lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ lo->tag_set.driver_data = lo;
+
+ err = blk_mq_alloc_tag_set(&lo->tag_set);
+ if (err)
goto out_free_idr;
- /*
- * set queue make_request_fn
- */
- blk_queue_make_request(lo->lo_queue, loop_make_request);
+ lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
+ if (IS_ERR_OR_NULL(lo->lo_queue)) {
+ err = PTR_ERR(lo->lo_queue);
+ goto out_cleanup_tags;
+ }
lo->lo_queue->queuedata = lo;
+ INIT_LIST_HEAD(&lo->write_cmd_head);
+ INIT_WORK(&lo->write_work, loop_queue_write_work);
+
disk = lo->lo_disk = alloc_disk(1 << part_shift);
if (!disk)
goto out_free_queue;
@@ -1664,9 +1647,6 @@ static int loop_add(struct loop_device **l, int i)
disk->flags |= GENHD_FL_EXT_DEVT;
mutex_init(&lo->lo_ctl_mutex);
lo->lo_number = i;
- lo->lo_thread = NULL;
- init_waitqueue_head(&lo->lo_event);
- init_waitqueue_head(&lo->lo_req_wait);
spin_lock_init(&lo->lo_lock);
disk->major = LOOP_MAJOR;
disk->first_minor = i << part_shift;
@@ -1680,6 +1660,8 @@ static int loop_add(struct loop_device **l, int i)
out_free_queue:
blk_cleanup_queue(lo->lo_queue);
+out_cleanup_tags:
+ blk_mq_free_tag_set(&lo->tag_set);
out_free_idr:
idr_remove(&loop_index_idr, i);
out_free_dev:
@@ -1692,6 +1674,7 @@ static void loop_remove(struct loop_device *lo)
{
del_gendisk(lo->lo_disk);
blk_cleanup_queue(lo->lo_queue);
+ blk_mq_free_tag_set(&lo->tag_set);
put_disk(lo->lo_disk);
kfree(lo);
}
@@ -1875,6 +1858,13 @@ static int __init loop_init(void)
goto misc_out;
}
+ loop_wq = alloc_workqueue("kloopd",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0);
+ if (!loop_wq) {
+ err = -ENOMEM;
+ goto misc_out;
+ }
+
blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
THIS_MODULE, loop_probe, NULL, NULL);
@@ -1912,6 +1902,8 @@ static void __exit loop_exit(void)
blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
unregister_blkdev(LOOP_MAJOR, "loop");
+ destroy_workqueue(loop_wq);
+
misc_deregister(&loop_misc);
}
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 90df5d6..301c27f 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -11,8 +11,10 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
+#include <linux/workqueue.h>
#include <uapi/linux/loop.h>
/* Possible states of device */
@@ -52,19 +54,23 @@ struct loop_device {
gfp_t old_gfp_mask;
spinlock_t lo_lock;
- struct bio_list lo_bio_list;
- unsigned int lo_bio_count;
+ struct list_head write_cmd_head;
+ struct work_struct write_work;
+ bool write_started;
int lo_state;
struct mutex lo_ctl_mutex;
- struct task_struct *lo_thread;
- wait_queue_head_t lo_event;
- /* wait queue for incoming requests */
- wait_queue_head_t lo_req_wait;
struct request_queue *lo_queue;
+ struct blk_mq_tag_set tag_set;
struct gendisk *lo_disk;
};
+struct loop_cmd {
+ struct work_struct read_work;
+ struct request *rq;
+ struct list_head list;
+};
+
/* Support for loadable transfer modules */
struct loop_func_table {
int number; /* filter type */
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index aa2224a..65cd61a 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -579,7 +579,7 @@ static int null_add_dev(void)
sector_div(size, bs);
set_capacity(disk, size);
- disk->flags |= GENHD_FL_EXT_DEVT;
+ disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->major = null_major;
disk->first_minor = nullb->index;
disk->fops = &null_fops;
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index d826bf3..cbdfbbf 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -144,8 +144,37 @@ struct nvme_cmd_info {
void *ctx;
int aborted;
struct nvme_queue *nvmeq;
+ struct nvme_iod iod[0];
};
+/*
+ * Max size of iod being embedded in the request payload
+ */
+#define NVME_INT_PAGES 2
+#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size)
+
+/*
+ * Number of PRP list pages needed to map @size bytes, counted in units of
+ * dev->page_size to match nvme_free_iod(). Will slightly overestimate; this
+ * only wastes a small amount of memory for the lifetime of the I/O.
+ */
+static int nvme_npages(unsigned size, struct nvme_dev *dev)
+{
+ unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
+ return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
+}
+
+static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+{
+ unsigned int ret = sizeof(struct nvme_cmd_info);
+
+ ret += sizeof(struct nvme_iod);
+ ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
+ ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
+
+ return ret;
+}
+
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
@@ -218,6 +247,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
}
+static void *iod_get_private(struct nvme_iod *iod)
+{
+ return (void *) (iod->private & ~0x1UL);
+}
+
+/*
+ * If bit 0 is set, the iod is embedded in the request payload.
+ */
+static bool iod_should_kfree(struct nvme_iod *iod)
+{
+ return (iod->private & 0x01) == 0;
+}
+
/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
@@ -361,35 +403,53 @@ static __le64 **iod_list(struct nvme_iod *iod)
return ((void *)iod) + iod->offset;
}
-/*
- * Will slightly overestimate the number of pages needed. This is OK
- * as it only leads to a small amount of wasted memory for the lifetime of
- * the I/O.
- */
-static int nvme_npages(unsigned size, struct nvme_dev *dev)
+static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
+ unsigned nseg, unsigned long private)
{
- unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
- return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
+ iod->private = private;
+ iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+ iod->npages = -1;
+ iod->length = nbytes;
+ iod->nents = 0;
}
static struct nvme_iod *
-nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
+__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
+ unsigned long priv, gfp_t gfp)
{
struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
- sizeof(__le64 *) * nvme_npages(nbytes, dev) +
+ sizeof(__le64 *) * nvme_npages(bytes, dev) +
sizeof(struct scatterlist) * nseg, gfp);
- if (iod) {
- iod->offset = offsetof(struct nvme_iod, sg[nseg]);
- iod->npages = -1;
- iod->length = nbytes;
- iod->nents = 0;
- iod->first_dma = 0ULL;
- }
+ if (iod)
+ iod_init(iod, bytes, nseg, priv);
return iod;
}
+static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
+ gfp_t gfp)
+{
+ unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
+ sizeof(struct nvme_dsm_range);
+ struct nvme_iod *iod;
+
+ /* Small requests use the iod embedded in the request payload. */
+ if (rq->nr_phys_segments <= NVME_INT_PAGES &&
+ size <= NVME_INT_BYTES(dev)) {
+ struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
+
+ iod = cmd->iod;
+ /* Bit 0 of ->private marks the iod as embedded (not kfree'd). */
+ iod_init(iod, size, rq->nr_phys_segments,
+ (unsigned long) rq | 0x01);
+ return iod;
+ }
+
+ return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
+ (unsigned long) rq, gfp);
+}
+
void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
const int last_prp = dev->page_size / 8 - 1;
@@ -405,7 +465,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
prp_dma = next_prp_dma;
}
- kfree(iod);
+
+ if (iod_should_kfree(iod))
+ kfree(iod);
}
static int nvme_error_status(u16 status)
@@ -424,7 +486,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_completion *cqe)
{
struct nvme_iod *iod = ctx;
- struct request *req = iod->private;
+ struct request *req = iod_get_private(iod);
struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
u16 status = le16_to_cpup(&cqe->status) >> 1;
@@ -585,7 +647,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
struct nvme_ns *ns)
{
- struct request *req = iod->private;
+ struct request *req = iod_get_private(iod);
struct nvme_command *cmnd;
u16 control = 0;
u32 dsmgmt = 0;
@@ -626,17 +688,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
struct request *req = bd->rq;
struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
struct nvme_iod *iod;
- int psegs = req->nr_phys_segments;
enum dma_data_direction dma_dir;
- unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
- sizeof(struct nvme_dsm_range);
- iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
+ iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
if (!iod)
return BLK_MQ_RQ_QUEUE_BUSY;
- iod->private = req;
-
if (req->cmd_flags & REQ_DISCARD) {
void *range;
/*
@@ -651,10 +708,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
goto retry_cmd;
iod_list(iod)[0] = (__le64 *)range;
iod->npages = 0;
- } else if (psegs) {
+ } else if (req->nr_phys_segments) {
dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
- sg_init_table(iod->sg, psegs);
+ sg_init_table(iod->sg, req->nr_phys_segments);
iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
if (!iod->nents)
goto error_cmd;
@@ -1137,21 +1194,14 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
- LLIST_HEAD(q_list);
- struct nvme_queue *nvmeq, *next;
- struct llist_node *entry;
int i;
for (i = dev->queue_count - 1; i >= lowest; i--) {
struct nvme_queue *nvmeq = dev->queues[i];
- llist_add(&nvmeq->node, &q_list);
dev->queue_count--;
dev->queues[i] = NULL;
- }
- synchronize_rcu();
- entry = llist_del_all(&q_list);
- llist_for_each_entry_safe(nvmeq, next, entry, node)
nvme_free_queue(nvmeq);
+ }
}
/**
@@ -1408,7 +1458,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
dev->admin_tagset.timeout = ADMIN_TIMEOUT;
dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
- dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
+ dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
dev->admin_tagset.driver_data = dev;
if (blk_mq_alloc_tag_set(&dev->admin_tagset))
@@ -1522,7 +1572,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
}
err = -ENOMEM;
- iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL);
+ iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
if (!iod)
goto put_pages;
@@ -2148,7 +2198,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
dev->tagset.queue_depth =
min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
- dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
+ dev->tagset.cmd_size = nvme_cmd_size(dev);
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 79aa179..e229425 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -423,7 +423,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev)
}
/* switch queue to TCQ mode; allocate tag map */
- rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL);
+ rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL, BLK_TAG_ALLOC_FIFO);
if (rc) {
blk_cleanup_queue(q);
put_disk(disk);
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index cc90a84..375d288 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -214,6 +214,15 @@ enum blkif_protocol {
BLKIF_PROTOCOL_X86_64 = 3,
};
+/*
+ * Default protocol if the frontend doesn't specify one.
+ */
+#ifdef CONFIG_X86
+# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32
+#else
+# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE
+#endif
+
struct xen_vbd {
/* What the domain refers to this vbd as. */
blkif_vdev_t handle;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 630a489..e3afe97 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -868,11 +868,11 @@ static int connect_ring(struct backend_info *be)
return err;
}
- be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
"%63s", protocol, NULL);
if (err)
- strcpy(protocol, "unspecified, assuming native");
+ strcpy(protocol, "unspecified, assuming default");
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index d2cae5f..37779e4 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1511,7 +1511,7 @@ static int blkif_recover(struct blkfront_info *info)
merge_bio.tail = copy[i].request->biotail;
bio_list_merge(&bio_list, &merge_bio);
copy[i].request->bio = NULL;
- blk_put_request(copy[i].request);
+ blk_end_request_all(copy[i].request, 0);
}
kfree(copy);
@@ -1534,7 +1534,7 @@ static int blkif_recover(struct blkfront_info *info)
req->bio = NULL;
if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
pr_alert("diskcache flush request found!\n");
- __blk_put_request(info->rq, req);
+ __blk_end_request_all(req, 0);
}
spin_unlock_irq(&info->io_lock);
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 4c58333..9a6b637 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -287,13 +287,24 @@ static unsigned long get_unmapped_area_mem(struct file *file,
return pgoff << PAGE_SHIFT;
}
+/* permit direct mmap, for read, write or exec */
+static unsigned memory_mmap_capabilities(struct file *file)
+{
+ return NOMMU_MAP_DIRECT |
+ NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC;
+}
+
+static unsigned zero_mmap_capabilities(struct file *file)
+{
+ return NOMMU_MAP_COPY;
+}
+
/* can't do an in-place private mapping if there's no MMU */
static inline int private_mapping_ok(struct vm_area_struct *vma)
{
return vma->vm_flags & VM_MAYSHARE;
}
#else
-#define get_unmapped_area_mem NULL
static inline int private_mapping_ok(struct vm_area_struct *vma)
{
@@ -721,7 +732,10 @@ static const struct file_operations mem_fops = {
.write = write_mem,
.mmap = mmap_mem,
.open = open_mem,
+#ifndef CONFIG_MMU
.get_unmapped_area = get_unmapped_area_mem,
+ .mmap_capabilities = memory_mmap_capabilities,
+#endif
};
#ifdef CONFIG_DEVKMEM
@@ -731,7 +745,10 @@ static const struct file_operations kmem_fops = {
.write = write_kmem,
.mmap = mmap_kmem,
.open = open_kmem,
+#ifndef CONFIG_MMU
.get_unmapped_area = get_unmapped_area_mem,
+ .mmap_capabilities = memory_mmap_capabilities,
+#endif
};
#endif
@@ -760,16 +777,9 @@ static const struct file_operations zero_fops = {
.read_iter = read_iter_zero,
.aio_write = aio_write_zero,
.mmap = mmap_zero,
-};
-
-/*
- * capabilities for /dev/zero
- * - permits private mappings, "copies" are taken of the source of zeros
- * - no writeback happens
- */
-static struct backing_dev_info zero_bdi = {
- .name = "char/mem",
- .capabilities = BDI_CAP_MAP_COPY | BDI_CAP_NO_ACCT_AND_WRITEBACK,
+#ifndef CONFIG_MMU
+ .mmap_capabilities = zero_mmap_capabilities,
+#endif
};
static const struct file_operations full_fops = {
@@ -783,22 +793,22 @@ static const struct memdev {
const char *name;
umode_t mode;
const struct file_operations *fops;
- struct backing_dev_info *dev_info;
+ fmode_t fmode;
} devlist[] = {
- [1] = { "mem", 0, &mem_fops, &directly_mappable_cdev_bdi },
+ [1] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET },
#ifdef CONFIG_DEVKMEM
- [2] = { "kmem", 0, &kmem_fops, &directly_mappable_cdev_bdi },
+ [2] = { "kmem", 0, &kmem_fops, FMODE_UNSIGNED_OFFSET },
#endif
- [3] = { "null", 0666, &null_fops, NULL },
+ [3] = { "null", 0666, &null_fops, 0 },
#ifdef CONFIG_DEVPORT
- [4] = { "port", 0, &port_fops, NULL },
+ [4] = { "port", 0, &port_fops, 0 },
#endif
- [5] = { "zero", 0666, &zero_fops, &zero_bdi },
- [7] = { "full", 0666, &full_fops, NULL },
- [8] = { "random", 0666, &random_fops, NULL },
- [9] = { "urandom", 0666, &urandom_fops, NULL },
+ [5] = { "zero", 0666, &zero_fops, 0 },
+ [7] = { "full", 0666, &full_fops, 0 },
+ [8] = { "random", 0666, &random_fops, 0 },
+ [9] = { "urandom", 0666, &urandom_fops, 0 },
#ifdef CONFIG_PRINTK
- [11] = { "kmsg", 0644, &kmsg_fops, NULL },
+ [11] = { "kmsg", 0644, &kmsg_fops, 0 },
#endif
};
@@ -816,12 +826,7 @@ static int memory_open(struct inode *inode, struct file *filp)
return -ENXIO;
filp->f_op = dev->fops;
- if (dev->dev_info)
- filp->f_mapping->backing_dev_info = dev->dev_info;
-
- /* Is /dev/mem or /dev/kmem ? */
- if (dev->dev_info == &directly_mappable_cdev_bdi)
- filp->f_mode |= FMODE_UNSIGNED_OFFSET;
+ filp->f_mode |= dev->fmode;
if (dev->fops->open)
return dev->fops->open(inode, filp);
@@ -846,11 +851,6 @@ static struct class *mem_class;
static int __init chr_dev_init(void)
{
int minor;
- int err;
-
- err = bdi_init(&zero_bdi);
- if (err)
- return err;
if (register_chrdev(MEM_MAJOR, "mem", &memory_fops))
printk("unable to get major %d for memory devs\n", MEM_MAJOR);
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index a24891b..6e29bf2 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -104,11 +104,9 @@ static int raw_release(struct inode *inode, struct file *filp)
mutex_lock(&raw_mutex);
bdev = raw_devices[minor].binding;
- if (--raw_devices[minor].inuse == 0) {
+ if (--raw_devices[minor].inuse == 0)
/* Here inode->i_mapping == bdev->bd_inode->i_mapping */
inode->i_mapping = &inode->i_data;
- inode->i_mapping->backing_dev_info = &default_backing_dev_info;
- }
mutex_unlock(&raw_mutex);
blkdev_put(bdev, filp->f_mode | FMODE_EXCL);
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index c355a22..c396444 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -231,9 +231,8 @@ config DM_CRYPT
transparently encrypts the data on it. You'll need to activate
the ciphers you're going to use in the cryptoapi configuration.
- Information on how to use dm-crypt can be found on
-
- <http://www.saout.de/misc/dm-crypt/>
+ For further information on dm-crypt and userspace tools see:
+ <http://code.google.com/p/cryptsetup/wiki/DMCrypt>
To compile this code as a module, choose M here: the module will
be called dm-crypt.
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 1695ee5..3a57679 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1619,7 +1619,9 @@ void bitmap_destroy(struct mddev *mddev)
return;
mutex_lock(&mddev->bitmap_info.mutex);
+ spin_lock(&mddev->lock);
mddev->bitmap = NULL; /* disconnect from the md device */
+ spin_unlock(&mddev->lock);
mutex_unlock(&mddev->bitmap_info.mutex);
if (mddev->thread)
mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
@@ -2209,11 +2211,13 @@ __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
static ssize_t can_clear_show(struct mddev *mddev, char *page)
{
int len;
+ spin_lock(&mddev->lock);
if (mddev->bitmap)
len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ?
"false" : "true"));
else
len = sprintf(page, "\n");
+ spin_unlock(&mddev->lock);
return len;
}
@@ -2238,10 +2242,15 @@ __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
static ssize_t
behind_writes_used_show(struct mddev *mddev, char *page)
{
+ ssize_t ret;
+ spin_lock(&mddev->lock);
if (mddev->bitmap == NULL)
- return sprintf(page, "0\n");
- return sprintf(page, "%lu\n",
- mddev->bitmap->behind_writes_used);
+ ret = sprintf(page, "0\n");
+ else
+ ret = sprintf(page, "%lu\n",
+ mddev->bitmap->behind_writes_used);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static ssize_t
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index c33b497..86dbbc7 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -11,6 +11,7 @@
#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
+#include <linux/jiffies.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
@@ -1739,7 +1740,7 @@ static unsigned get_max_age_hz(void)
static bool older_than(struct dm_buffer *b, unsigned long age_hz)
{
- return (jiffies - b->last_accessed) >= age_hz;
+ return time_after_eq(jiffies, b->last_accessed + age_hz);
}
static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index e165053..7755af3 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -11,6 +11,7 @@
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
+#include <linux/jiffies.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
@@ -1562,8 +1563,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
static int need_commit_due_to_time(struct cache *cache)
{
- return jiffies < cache->last_commit_jiffies ||
- jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+ return !time_in_range(jiffies, cache->last_commit_jiffies,
+ cache->last_commit_jiffies + COMMIT_PERIOD);
}
static int commit_if_needed(struct cache *cache)
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 73f791b..c8a18e4 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -639,8 +639,8 @@ static int check_name(const char *name)
/*
* On successful return, the caller must not attempt to acquire
- * _hash_lock without first calling dm_table_put, because dm_table_destroy
- * waits for this dm_table_put and could be called under this lock.
+ * _hash_lock without first calling dm_put_live_table, because dm_table_destroy
+ * waits for this dm_put_live_table and could be called under this lock.
*/
static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx)
{
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index b953db6..03177ca 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -6,6 +6,7 @@
#include <linux/bio.h>
#include <linux/slab.h>
+#include <linux/jiffies.h>
#include <linux/dm-dirty-log.h>
#include <linux/device-mapper.h>
#include <linux/dm-log-userspace.h>
@@ -829,7 +830,7 @@ static int userspace_is_remote_recovering(struct dm_dirty_log *log,
int r;
uint64_t region64 = region;
struct log_c *lc = log->context;
- static unsigned long long limit;
+ static unsigned long limit;
struct {
int64_t is_recovering;
uint64_t in_sync_hint;
@@ -845,7 +846,7 @@ static int userspace_is_remote_recovering(struct dm_dirty_log *log,
*/
if (region < lc->in_sync_hint)
return 0;
- else if (jiffies < limit)
+ else if (time_after(limit, jiffies))
return 1;
limit = jiffies + (HZ / 4);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 7b6b0f0..d376dc8 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -11,6 +11,7 @@
#include "dm-path-selector.h"
#include "dm-uevent.h"
+#include <linux/blkdev.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/mempool.h>
@@ -378,18 +379,18 @@ static int __must_push_back(struct multipath *m)
/*
* Map cloned requests
*/
-static int multipath_map(struct dm_target *ti, struct request *clone,
- union map_info *map_context)
+static int __multipath_map(struct dm_target *ti, struct request *clone,
+ union map_info *map_context,
+ struct request *rq, struct request **__clone)
{
struct multipath *m = (struct multipath *) ti->private;
int r = DM_MAPIO_REQUEUE;
- size_t nr_bytes = blk_rq_bytes(clone);
- unsigned long flags;
+ size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
struct pgpath *pgpath;
struct block_device *bdev;
struct dm_mpath_io *mpio;
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
/* Do we need to select a new pgpath? */
if (!m->current_pgpath ||
@@ -411,25 +412,61 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
/* ENOMEM, requeue */
goto out_unlock;
- bdev = pgpath->path.dev->bdev;
- clone->q = bdev_get_queue(bdev);
- clone->rq_disk = bdev->bd_disk;
- clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
mpio = map_context->ptr;
mpio->pgpath = pgpath;
mpio->nr_bytes = nr_bytes;
+
+ bdev = pgpath->path.dev->bdev;
+
+ spin_unlock_irq(&m->lock);
+
+ if (clone) {
+ /* Old request-based interface: allocated clone is passed in */
+ clone->q = bdev_get_queue(bdev);
+ clone->rq_disk = bdev->bd_disk;
+ clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+ } else {
+ /* blk-mq request-based interface */
+ *__clone = blk_get_request(bdev_get_queue(bdev),
+ rq_data_dir(rq), GFP_KERNEL);
+ if (IS_ERR(*__clone))
+ /* ENOMEM, requeue */
+ return r;
+ (*__clone)->bio = (*__clone)->biotail = NULL;
+ (*__clone)->rq_disk = bdev->bd_disk;
+ (*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+ }
+
if (pgpath->pg->ps.type->start_io)
pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
&pgpath->path,
nr_bytes);
- r = DM_MAPIO_REMAPPED;
+ return DM_MAPIO_REMAPPED;
out_unlock:
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
return r;
}
+static int multipath_map(struct dm_target *ti, struct request *clone,
+ union map_info *map_context)
+{
+ return __multipath_map(ti, clone, map_context, NULL, NULL);
+}
+
+static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
+ union map_info *map_context,
+ struct request **clone)
+{
+ return __multipath_map(ti, NULL, map_context, rq, clone);
+}
+
+static void multipath_release_clone(struct request *clone)
+{
+ blk_put_request(clone);
+}
+
/*
* If we run out of usable paths, should we queue I/O or error it?
*/
@@ -1666,11 +1703,13 @@ out:
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 7, 0},
+ .version = {1, 8, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
.map_rq = multipath_map,
+ .clone_and_map_rq = multipath_clone_and_map,
+ .release_clone_rq = multipath_release_clone,
.rq_end_io = multipath_end_io,
.presuspend = multipath_presuspend,
.postsuspend = multipath_postsuspend,
@@ -1694,16 +1733,15 @@ static int __init dm_multipath_init(void)
r = dm_register_target(&multipath_target);
if (r < 0) {
DMERR("register failed %d", r);
- kmem_cache_destroy(_mpio_cache);
- return -EINVAL;
+ r = -EINVAL;
+ goto bad_register_target;
}
kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
if (!kmultipathd) {
DMERR("failed to create workqueue kmpathd");
- dm_unregister_target(&multipath_target);
- kmem_cache_destroy(_mpio_cache);
- return -ENOMEM;
+ r = -ENOMEM;
+ goto bad_alloc_kmultipathd;
}
/*
@@ -1716,16 +1754,23 @@ static int __init dm_multipath_init(void)
WQ_MEM_RECLAIM);
if (!kmpath_handlerd) {
DMERR("failed to create workqueue kmpath_handlerd");
- destroy_workqueue(kmultipathd);
- dm_unregister_target(&multipath_target);
- kmem_cache_destroy(_mpio_cache);
- return -ENOMEM;
+ r = -ENOMEM;
+ goto bad_alloc_kmpath_handlerd;
}
DMINFO("version %u.%u.%u loaded",
multipath_target.version[0], multipath_target.version[1],
multipath_target.version[2]);
+ return 0;
+
+bad_alloc_kmpath_handlerd:
+ destroy_workqueue(kmultipathd);
+bad_alloc_kmultipathd:
+ dm_unregister_target(&multipath_target);
+bad_register_target:
+ kmem_cache_destroy(_mpio_cache);
+
return r;
}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 07c0fa0..88e4c7f 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -746,13 +746,7 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
{
struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
- if (rs->raid_type->level == 1)
- return md_raid1_congested(&rs->md, bits);
-
- if (rs->raid_type->level == 10)
- return md_raid10_congested(&rs->md, bits);
-
- return md_raid5_congested(&rs->md, bits);
+ return mddev_congested(&rs->md, bits);
}
/*
@@ -1243,7 +1237,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
argv++;
/* Skip over RAID params for now and find out # of devices */
- if (num_raid_params + 1 > argc) {
+ if (num_raid_params >= argc) {
ti->error = "Arguments do not agree with counts given";
return -EINVAL;
}
@@ -1254,6 +1248,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
return -EINVAL;
}
+ argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
+ if (argc != (num_raid_devs * 2)) {
+ ti->error = "Supplied RAID devices does not match the count given";
+ return -EINVAL;
+ }
+
rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
if (IS_ERR(rs))
return PTR_ERR(rs);
@@ -1262,16 +1262,8 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (ret)
goto bad;
- ret = -EINVAL;
-
- argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
argv += num_raid_params + 1;
- if (argc != (num_raid_devs * 2)) {
- ti->error = "Supplied RAID devices does not match the count given";
- goto bad;
- }
-
ret = dev_parms(rs, argv);
if (ret)
goto bad;
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index d6e8817..808b841 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -200,16 +200,11 @@ err_area:
static void free_area(struct pstore *ps)
{
- if (ps->area)
- vfree(ps->area);
+ vfree(ps->area);
ps->area = NULL;
-
- if (ps->zero_area)
- vfree(ps->zero_area);
+ vfree(ps->zero_area);
ps->zero_area = NULL;
-
- if (ps->header_area)
- vfree(ps->header_area);
+ vfree(ps->header_area);
ps->header_area = NULL;
}
@@ -605,8 +600,7 @@ static void persistent_dtr(struct dm_exception_store *store)
free_area(ps);
/* Allocated in persistent_read_metadata */
- if (ps->callbacks)
- vfree(ps->callbacks);
+ vfree(ps->callbacks);
kfree(ps);
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3afae9e..6554d91 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -827,10 +827,11 @@ static int dm_table_set_type(struct dm_table *t)
{
unsigned i;
unsigned bio_based = 0, request_based = 0, hybrid = 0;
+ bool use_blk_mq = false;
struct dm_target *tgt;
struct dm_dev_internal *dd;
struct list_head *devices;
- unsigned live_md_type;
+ unsigned live_md_type = dm_get_md_type(t->md);
for (i = 0; i < t->num_targets; i++) {
tgt = t->targets + i;
@@ -854,8 +855,8 @@ static int dm_table_set_type(struct dm_table *t)
* Determine the type from the live device.
* Default to bio-based if device is new.
*/
- live_md_type = dm_get_md_type(t->md);
- if (live_md_type == DM_TYPE_REQUEST_BASED)
+ if (live_md_type == DM_TYPE_REQUEST_BASED ||
+ live_md_type == DM_TYPE_MQ_REQUEST_BASED)
request_based = 1;
else
bio_based = 1;
@@ -869,16 +870,6 @@ static int dm_table_set_type(struct dm_table *t)
BUG_ON(!request_based); /* No targets in this table */
- /* Non-request-stackable devices can't be used for request-based dm */
- devices = dm_table_get_devices(t);
- list_for_each_entry(dd, devices, list) {
- if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev->bdev))) {
- DMWARN("table load rejected: including"
- " non-request-stackable devices");
- return -EINVAL;
- }
- }
-
/*
* Request-based dm supports only tables that have a single target now.
* To support multiple targets, request splitting support is needed,
@@ -890,7 +881,37 @@ static int dm_table_set_type(struct dm_table *t)
return -EINVAL;
}
- t->type = DM_TYPE_REQUEST_BASED;
+ /* Non-request-stackable devices can't be used for request-based dm */
+ devices = dm_table_get_devices(t);
+ list_for_each_entry(dd, devices, list) {
+ struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
+
+ if (!blk_queue_stackable(q)) {
+ DMERR("table load rejected: including"
+ " non-request-stackable devices");
+ return -EINVAL;
+ }
+
+ if (q->mq_ops)
+ use_blk_mq = true;
+ }
+
+ if (use_blk_mq) {
+ /* verify _all_ devices in the table are blk-mq devices */
+ list_for_each_entry(dd, devices, list)
+ if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) {
+ DMERR("table load rejected: not all devices"
+ " are blk-mq request-stackable");
+ return -EINVAL;
+ }
+ t->type = DM_TYPE_MQ_REQUEST_BASED;
+
+ } else if (hybrid && list_empty(devices) && live_md_type != DM_TYPE_NONE) {
+ /* inherit live MD type */
+ t->type = live_md_type;
+
+ } else
+ t->type = DM_TYPE_REQUEST_BASED;
return 0;
}
@@ -907,7 +928,15 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
bool dm_table_request_based(struct dm_table *t)
{
- return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
+ unsigned table_type = dm_table_get_type(t);
+
+ return (table_type == DM_TYPE_REQUEST_BASED ||
+ table_type == DM_TYPE_MQ_REQUEST_BASED);
+}
+
+bool dm_table_mq_request_based(struct dm_table *t)
+{
+ return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
}
static int dm_table_alloc_md_mempools(struct dm_table *t)
@@ -1360,6 +1389,14 @@ static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
}
+static int queue_supports_sg_gaps(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && !test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags);
+}
+
static bool dm_table_all_devices_attribute(struct dm_table *t,
iterate_devices_callout_fn func)
{
@@ -1480,6 +1517,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
else
queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+ if (dm_table_all_devices_attribute(t, queue_supports_sg_gaps))
+ queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, q);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, q);
+
dm_table_set_integrity(t);
/*
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 242e3ce..925ec1b 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -137,13 +137,26 @@ static int io_err_map_rq(struct dm_target *ti, struct request *clone,
return -EIO;
}
+static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq,
+ union map_info *map_context,
+ struct request **clone)
+{
+ return -EIO;
+}
+
+static void io_err_release_clone_rq(struct request *clone)
+{
+}
+
static struct target_type error_target = {
.name = "error",
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.ctr = io_err_ctr,
.dtr = io_err_dtr,
.map = io_err_map,
.map_rq = io_err_map_rq,
+ .clone_and_map_rq = io_err_clone_and_map_rq,
+ .release_clone_rq = io_err_release_clone_rq,
};
int __init dm_target_init(void)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 43adbb8..79f6941 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1635,15 +1635,6 @@ int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
return r;
}
-int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result)
-{
- down_read(&pmd->root_lock);
- *result = pmd->data_block_size;
- up_read(&pmd->root_lock);
-
- return 0;
-}
-
int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
{
int r = -EINVAL;
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 921d15e..fac01a9 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -182,8 +182,6 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
dm_block_t *result);
-int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result);
-
int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 07705ee..654773c 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -11,6 +11,7 @@
#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
+#include <linux/jiffies.h>
#include <linux/log2.h>
#include <linux/list.h>
#include <linux/rculist.h>
@@ -1700,8 +1701,8 @@ static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell
*/
static int need_commit_due_to_time(struct pool *pool)
{
- return jiffies < pool->last_commit_jiffies ||
- jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
+ return !time_in_range(jiffies, pool->last_commit_jiffies,
+ pool->last_commit_jiffies + COMMIT_PERIOD);
}
#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2caf5b3..ec1444f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -20,6 +20,7 @@
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
+#include <linux/kthread.h>
#include <trace/events/block.h>
@@ -78,7 +79,8 @@ struct dm_io {
struct dm_rq_target_io {
struct mapped_device *md;
struct dm_target *ti;
- struct request *orig, clone;
+ struct request *orig, *clone;
+ struct kthread_work work;
int error;
union map_info info;
};
@@ -179,6 +181,7 @@ struct mapped_device {
* io objects are allocated from here.
*/
mempool_t *io_pool;
+ mempool_t *rq_pool;
struct bio_set *bs;
@@ -210,6 +213,9 @@ struct mapped_device {
unsigned internal_suspend_count;
struct dm_stats stats;
+
+ struct kthread_worker kworker;
+ struct task_struct *kworker_task;
};
/*
@@ -217,6 +223,7 @@ struct mapped_device {
*/
struct dm_md_mempools {
mempool_t *io_pool;
+ mempool_t *rq_pool;
struct bio_set *bs;
};
@@ -231,6 +238,7 @@ struct table_device {
#define RESERVED_MAX_IOS 1024
static struct kmem_cache *_io_cache;
static struct kmem_cache *_rq_tio_cache;
+static struct kmem_cache *_rq_cache;
/*
* Bio-based DM's mempools' reserved IOs set by the user.
@@ -288,9 +296,14 @@ static int __init local_init(void)
if (!_rq_tio_cache)
goto out_free_io_cache;
+ _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request),
+ __alignof__(struct request), 0, NULL);
+ if (!_rq_cache)
+ goto out_free_rq_tio_cache;
+
r = dm_uevent_init();
if (r)
- goto out_free_rq_tio_cache;
+ goto out_free_rq_cache;
deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
if (!deferred_remove_workqueue) {
@@ -312,6 +325,8 @@ out_free_workqueue:
destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
dm_uevent_exit();
+out_free_rq_cache:
+ kmem_cache_destroy(_rq_cache);
out_free_rq_tio_cache:
kmem_cache_destroy(_rq_tio_cache);
out_free_io_cache:
@@ -325,6 +340,7 @@ static void local_exit(void)
flush_scheduled_work();
destroy_workqueue(deferred_remove_workqueue);
+ kmem_cache_destroy(_rq_cache);
kmem_cache_destroy(_rq_tio_cache);
kmem_cache_destroy(_io_cache);
unregister_blkdev(_major, _name);
@@ -577,6 +593,17 @@ static void free_rq_tio(struct dm_rq_target_io *tio)
mempool_free(tio, tio->md->io_pool);
}
+static struct request *alloc_clone_request(struct mapped_device *md,
+ gfp_t gfp_mask)
+{
+ return mempool_alloc(md->rq_pool, gfp_mask);
+}
+
+static void free_clone_request(struct mapped_device *md, struct request *rq)
+{
+ mempool_free(rq, md->rq_pool);
+}
+
static int md_in_flight(struct mapped_device *md)
{
return atomic_read(&md->pending[READ]) +
@@ -992,7 +1019,7 @@ static void end_clone_bio(struct bio *clone, int error)
* the md may be freed in dm_put() at the end of this function.
* Or do dm_get() before calling this function and dm_put() later.
*/
-static void rq_completed(struct mapped_device *md, int rw, int run_queue)
+static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
{
atomic_dec(&md->pending[rw]);
@@ -1020,12 +1047,17 @@ static void free_rq_clone(struct request *clone)
struct dm_rq_target_io *tio = clone->end_io_data;
blk_rq_unprep_clone(clone);
+ if (clone->q && clone->q->mq_ops)
+ tio->ti->type->release_clone_rq(clone);
+ else
+ free_clone_request(tio->md, clone);
free_rq_tio(tio);
}
/*
* Complete the clone and the original request.
- * Must be called without queue lock.
+ * Must be called without clone's queue lock held,
+ * see end_clone_request() for more details.
*/
static void dm_end_request(struct request *clone, int error)
{
@@ -1054,23 +1086,23 @@ static void dm_end_request(struct request *clone, int error)
static void dm_unprep_request(struct request *rq)
{
- struct request *clone = rq->special;
+ struct dm_rq_target_io *tio = rq->special;
+ struct request *clone = tio->clone;
rq->special = NULL;
rq->cmd_flags &= ~REQ_DONTPREP;
- free_rq_clone(clone);
+ if (clone)
+ free_rq_clone(clone);
}
/*
* Requeue the original request of a clone.
*/
-void dm_requeue_unmapped_request(struct request *clone)
+static void dm_requeue_unmapped_original_request(struct mapped_device *md,
+ struct request *rq)
{
- int rw = rq_data_dir(clone);
- struct dm_rq_target_io *tio = clone->end_io_data;
- struct mapped_device *md = tio->md;
- struct request *rq = tio->orig;
+ int rw = rq_data_dir(rq);
struct request_queue *q = rq->q;
unsigned long flags;
@@ -1080,9 +1112,15 @@ void dm_requeue_unmapped_request(struct request *clone)
blk_requeue_request(q, rq);
spin_unlock_irqrestore(q->queue_lock, flags);
- rq_completed(md, rw, 0);
+ rq_completed(md, rw, false);
+}
+
+static void dm_requeue_unmapped_request(struct request *clone)
+{
+ struct dm_rq_target_io *tio = clone->end_io_data;
+
+ dm_requeue_unmapped_original_request(tio->md, tio->orig);
}
-EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
static void __stop_queue(struct request_queue *q)
{
@@ -1151,8 +1189,15 @@ static void dm_done(struct request *clone, int error, bool mapped)
static void dm_softirq_done(struct request *rq)
{
bool mapped = true;
- struct request *clone = rq->completion_data;
- struct dm_rq_target_io *tio = clone->end_io_data;
+ struct dm_rq_target_io *tio = rq->special;
+ struct request *clone = tio->clone;
+
+ if (!clone) {
+ blk_end_request_all(rq, tio->error);
+ rq_completed(tio->md, rq_data_dir(rq), false);
+ free_rq_tio(tio);
+ return;
+ }
if (rq->cmd_flags & REQ_FAILED)
mapped = false;
@@ -1164,13 +1209,11 @@ static void dm_softirq_done(struct request *rq)
* Complete the clone and the original request with the error status
* through softirq context.
*/
-static void dm_complete_request(struct request *clone, int error)
+static void dm_complete_request(struct request *rq, int error)
{
- struct dm_rq_target_io *tio = clone->end_io_data;
- struct request *rq = tio->orig;
+ struct dm_rq_target_io *tio = rq->special;
tio->error = error;
- rq->completion_data = clone;
blk_complete_request(rq);
}
@@ -1178,40 +1221,40 @@ static void dm_complete_request(struct request *clone, int error)
* Complete the not-mapped clone and the original request with the error status
* through softirq context.
* Target's rq_end_io() function isn't called.
- * This may be used when the target's map_rq() function fails.
+ * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
*/
-void dm_kill_unmapped_request(struct request *clone, int error)
+static void dm_kill_unmapped_request(struct request *rq, int error)
{
- struct dm_rq_target_io *tio = clone->end_io_data;
- struct request *rq = tio->orig;
-
rq->cmd_flags |= REQ_FAILED;
- dm_complete_request(clone, error);
+ dm_complete_request(rq, error);
}
-EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
/*
- * Called with the queue lock held
+ * Called with the clone's queue lock held
*/
static void end_clone_request(struct request *clone, int error)
{
- /*
- * For just cleaning up the information of the queue in which
- * the clone was dispatched.
- * The clone is *NOT* freed actually here because it is alloced from
- * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
- */
- __blk_put_request(clone->q, clone);
+ struct dm_rq_target_io *tio = clone->end_io_data;
+
+ if (!clone->q->mq_ops) {
+ /*
+ * For just cleaning up the information of the queue in which
+ * the clone was dispatched.
+ * The clone is *NOT* freed actually here because it is alloced
+ * from dm own mempool (REQ_ALLOCED isn't set).
+ */
+ __blk_put_request(clone->q, clone);
+ }
/*
* Actual request completion is done in a softirq context which doesn't
- * hold the queue lock. Otherwise, deadlock could occur because:
+ * hold the clone's queue lock. Otherwise, deadlock could occur because:
* - another request may be submitted by the upper level driver
* of the stacking during the completion
* - the submission which requires queue lock may be done
- * against this queue
+ * against this clone's queue
*/
- dm_complete_request(clone, error);
+ dm_complete_request(tio->orig, error);
}
/*
@@ -1689,19 +1732,19 @@ static void dm_request(struct request_queue *q, struct bio *bio)
_dm_request(q, bio);
}
-void dm_dispatch_request(struct request *rq)
+static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
{
int r;
- if (blk_queue_io_stat(rq->q))
- rq->cmd_flags |= REQ_IO_STAT;
+ if (blk_queue_io_stat(clone->q))
+ clone->cmd_flags |= REQ_IO_STAT;
- rq->start_time = jiffies;
- r = blk_insert_cloned_request(rq->q, rq);
+ clone->start_time = jiffies;
+ r = blk_insert_cloned_request(clone->q, clone);
if (r)
+ /* must complete clone in terms of original request */
dm_complete_request(rq, r);
}
-EXPORT_SYMBOL_GPL(dm_dispatch_request);
static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
void *data)
@@ -1718,11 +1761,11 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
}
static int setup_clone(struct request *clone, struct request *rq,
- struct dm_rq_target_io *tio)
+ struct dm_rq_target_io *tio, gfp_t gfp_mask)
{
int r;
- r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+ r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
dm_rq_bio_constructor, tio);
if (r)
return r;
@@ -1733,14 +1776,37 @@ static int setup_clone(struct request *clone, struct request *rq,
clone->end_io = end_clone_request;
clone->end_io_data = tio;
+ tio->clone = clone;
+
return 0;
}
static struct request *clone_rq(struct request *rq, struct mapped_device *md,
- gfp_t gfp_mask)
+ struct dm_rq_target_io *tio, gfp_t gfp_mask)
+{
+ struct request *clone = alloc_clone_request(md, gfp_mask);
+
+ if (!clone)
+ return NULL;
+
+ blk_rq_init(NULL, clone);
+ if (setup_clone(clone, rq, tio, gfp_mask)) {
+ /* -ENOMEM */
+ free_clone_request(md, clone);
+ return NULL;
+ }
+
+ return clone;
+}
+
+static void map_tio_request(struct kthread_work *work);
+
+static struct dm_rq_target_io *prep_tio(struct request *rq,
+ struct mapped_device *md, gfp_t gfp_mask)
{
- struct request *clone;
struct dm_rq_target_io *tio;
+ int srcu_idx;
+ struct dm_table *table;
tio = alloc_rq_tio(md, gfp_mask);
if (!tio)
@@ -1748,18 +1814,23 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
tio->md = md;
tio->ti = NULL;
+ tio->clone = NULL;
tio->orig = rq;
tio->error = 0;
memset(&tio->info, 0, sizeof(tio->info));
-
- clone = &tio->clone;
- if (setup_clone(clone, rq, tio)) {
- /* -ENOMEM */
- free_rq_tio(tio);
- return NULL;
+ init_kthread_work(&tio->work, map_tio_request);
+
+ table = dm_get_live_table(md, &srcu_idx);
+ if (!dm_table_mq_request_based(table)) {
+ if (!clone_rq(rq, md, tio, gfp_mask)) {
+ dm_put_live_table(md, srcu_idx);
+ free_rq_tio(tio);
+ return NULL;
+ }
}
+ dm_put_live_table(md, srcu_idx);
- return clone;
+ return tio;
}
/*
@@ -1768,18 +1839,18 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
static int dm_prep_fn(struct request_queue *q, struct request *rq)
{
struct mapped_device *md = q->queuedata;
- struct request *clone;
+ struct dm_rq_target_io *tio;
if (unlikely(rq->special)) {
DMWARN("Already has something in rq->special.");
return BLKPREP_KILL;
}
- clone = clone_rq(rq, md, GFP_ATOMIC);
- if (!clone)
+ tio = prep_tio(rq, md, GFP_ATOMIC);
+ if (!tio)
return BLKPREP_DEFER;
- rq->special = clone;
+ rq->special = tio;
rq->cmd_flags |= REQ_DONTPREP;
return BLKPREP_OK;
@@ -1787,17 +1858,36 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
/*
* Returns:
- * 0 : the request has been processed (not requeued)
- * !0 : the request has been requeued
+ * 0 : the request has been processed
+ * DM_MAPIO_REQUEUE : the original request needs to be requeued
+ * < 0 : the request was completed due to failure
*/
-static int map_request(struct dm_target *ti, struct request *clone,
+static int map_request(struct dm_target *ti, struct request *rq,
struct mapped_device *md)
{
- int r, requeued = 0;
- struct dm_rq_target_io *tio = clone->end_io_data;
+ int r;
+ struct dm_rq_target_io *tio = rq->special;
+ struct request *clone = NULL;
+
+ if (tio->clone) {
+ clone = tio->clone;
+ r = ti->type->map_rq(ti, clone, &tio->info);
+ } else {
+ r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
+ if (r < 0) {
+ /* The target wants to complete the I/O */
+ dm_kill_unmapped_request(rq, r);
+ return r;
+ }
+ if (IS_ERR(clone))
+ return DM_MAPIO_REQUEUE;
+ if (setup_clone(clone, rq, tio, GFP_KERNEL)) {
+ /* -ENOMEM */
+ ti->type->release_clone_rq(clone);
+ return DM_MAPIO_REQUEUE;
+ }
+ }
- tio->ti = ti;
- r = ti->type->map_rq(ti, clone, &tio->info);
switch (r) {
case DM_MAPIO_SUBMITTED:
/* The target has taken the I/O to submit by itself later */
@@ -1805,13 +1895,12 @@ static int map_request(struct dm_target *ti, struct request *clone,
case DM_MAPIO_REMAPPED:
/* The target has remapped the I/O so dispatch it */
trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
- blk_rq_pos(tio->orig));
- dm_dispatch_request(clone);
+ blk_rq_pos(rq));
+ dm_dispatch_clone_request(clone, rq);
break;
case DM_MAPIO_REQUEUE:
/* The target wants to requeue the I/O */
dm_requeue_unmapped_request(clone);
- requeued = 1;
break;
default:
if (r > 0) {
@@ -1820,20 +1909,27 @@ static int map_request(struct dm_target *ti, struct request *clone,
}
/* The target wants to complete the I/O */
- dm_kill_unmapped_request(clone, r);
- break;
+ dm_kill_unmapped_request(rq, r);
+ return r;
}
- return requeued;
+ return 0;
}
-static struct request *dm_start_request(struct mapped_device *md, struct request *orig)
+static void map_tio_request(struct kthread_work *work)
{
- struct request *clone;
+ struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
+ struct request *rq = tio->orig;
+ struct mapped_device *md = tio->md;
+ if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE)
+ dm_requeue_unmapped_original_request(md, rq);
+}
+
+static void dm_start_request(struct mapped_device *md, struct request *orig)
+{
blk_start_request(orig);
- clone = orig->special;
- atomic_inc(&md->pending[rq_data_dir(clone)]);
+ atomic_inc(&md->pending[rq_data_dir(orig)]);
/*
* Hold the md reference here for the in-flight I/O.
@@ -1843,8 +1939,6 @@ static struct request *dm_start_request(struct mapped_device *md, struct request
* See the comment in rq_completed() too.
*/
dm_get(md);
-
- return clone;
}
/*
@@ -1857,7 +1951,8 @@ static void dm_request_fn(struct request_queue *q)
int srcu_idx;
struct dm_table *map = dm_get_live_table(md, &srcu_idx);
struct dm_target *ti;
- struct request *rq, *clone;
+ struct request *rq;
+ struct dm_rq_target_io *tio;
sector_t pos;
/*
@@ -1879,34 +1974,29 @@ static void dm_request_fn(struct request_queue *q)
ti = dm_table_find_target(map, pos);
if (!dm_target_is_valid(ti)) {
/*
- * Must perform setup, that dm_done() requires,
+ * Must perform setup, that rq_completed() requires,
* before calling dm_kill_unmapped_request
*/
DMERR_LIMIT("request attempted access beyond the end of device");
- clone = dm_start_request(md, rq);
- dm_kill_unmapped_request(clone, -EIO);
+ dm_start_request(md, rq);
+ dm_kill_unmapped_request(rq, -EIO);
continue;
}
if (ti->type->busy && ti->type->busy(ti))
goto delay_and_out;
- clone = dm_start_request(md, rq);
-
- spin_unlock(q->queue_lock);
- if (map_request(ti, clone, md))
- goto requeued;
+ dm_start_request(md, rq);
+ tio = rq->special;
+ /* Establish tio->ti before queuing work (map_tio_request) */
+ tio->ti = ti;
+ queue_kthread_work(&md->kworker, &tio->work);
BUG_ON(!irqs_disabled());
- spin_lock(q->queue_lock);
}
goto out;
-requeued:
- BUG_ON(!irqs_disabled());
- spin_lock(q->queue_lock);
-
delay_and_out:
blk_delay_queue(q, HZ / 10);
out:
@@ -2092,6 +2182,7 @@ static struct mapped_device *alloc_dev(int minor)
INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
init_completion(&md->kobj_holder.completion);
+ md->kworker_task = NULL;
md->disk->major = _major;
md->disk->first_minor = minor;
@@ -2152,8 +2243,13 @@ static void free_dev(struct mapped_device *md)
unlock_fs(md);
bdput(md->bdev);
destroy_workqueue(md->wq);
+
+ if (md->kworker_task)
+ kthread_stop(md->kworker_task);
if (md->io_pool)
mempool_destroy(md->io_pool);
+ if (md->rq_pool)
+ mempool_destroy(md->rq_pool);
if (md->bs)
bioset_free(md->bs);
blk_integrity_unregister(md->disk);
@@ -2187,23 +2283,24 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
bioset_free(md->bs);
md->bs = p->bs;
p->bs = NULL;
- } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) {
- /*
- * There's no need to reload with request-based dm
- * because the size of front_pad doesn't change.
- * Note for future: If you are to reload bioset,
- * prep-ed requests in the queue may refer
- * to bio from the old bioset, so you must walk
- * through the queue to unprep.
- */
}
+ /*
+ * There's no need to reload with request-based dm
+ * because the size of front_pad doesn't change.
+ * Note for future: If you are to reload bioset,
+ * prep-ed requests in the queue may refer
+ * to bio from the old bioset, so you must walk
+ * through the queue to unprep.
+ */
goto out;
}
- BUG_ON(!p || md->io_pool || md->bs);
+ BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
md->io_pool = p->io_pool;
p->io_pool = NULL;
+ md->rq_pool = p->rq_pool;
+ p->rq_pool = NULL;
md->bs = p->bs;
p->bs = NULL;
@@ -2406,6 +2503,14 @@ unsigned dm_get_md_type(struct mapped_device *md)
return md->type;
}
+static bool dm_md_type_request_based(struct mapped_device *md)
+{
+ unsigned table_type = dm_get_md_type(md);
+
+ return (table_type == DM_TYPE_REQUEST_BASED ||
+ table_type == DM_TYPE_MQ_REQUEST_BASED);
+}
+
struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
return md->immutable_target_type;
@@ -2443,6 +2548,11 @@ static int dm_init_request_based_queue(struct mapped_device *md)
blk_queue_prep_rq(md->queue, dm_prep_fn);
blk_queue_lld_busy(md->queue, dm_lld_busy);
+ /* Also initialize the request-based DM worker thread */
+ init_kthread_worker(&md->kworker);
+ md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
+ "kdmwork-%s", dm_device_name(md));
+
elv_register_queue(md->queue);
return 1;
@@ -2453,8 +2563,7 @@ static int dm_init_request_based_queue(struct mapped_device *md)
*/
int dm_setup_md_queue(struct mapped_device *md)
{
- if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
- !dm_init_request_based_queue(md)) {
+ if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) {
DMWARN("Cannot initialize queue for request-based mapped device");
return -EINVAL;
}
@@ -2533,6 +2642,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
set_bit(DMF_FREEING, &md->flags);
spin_unlock(&_minor_lock);
+ if (dm_request_based(md))
+ flush_kthread_worker(&md->kworker);
+
if (!dm_suspended_md(md)) {
dm_table_presuspend_targets(map);
dm_table_postsuspend_targets(map);
@@ -2776,8 +2888,10 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
* Stop md->queue before flushing md->wq in case request-based
* dm defers requests to md->wq from md->queue.
*/
- if (dm_request_based(md))
+ if (dm_request_based(md)) {
stop_queue(md->queue);
+ flush_kthread_worker(&md->kworker);
+ }
flush_workqueue(md->wq);
@@ -3123,24 +3237,35 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
{
struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
struct kmem_cache *cachep;
- unsigned int pool_size;
+ unsigned int pool_size = 0;
unsigned int front_pad;
if (!pools)
return NULL;
- if (type == DM_TYPE_BIO_BASED) {
+ switch (type) {
+ case DM_TYPE_BIO_BASED:
cachep = _io_cache;
pool_size = dm_get_reserved_bio_based_ios();
front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
- } else if (type == DM_TYPE_REQUEST_BASED) {
- cachep = _rq_tio_cache;
+ break;
+ case DM_TYPE_REQUEST_BASED:
pool_size = dm_get_reserved_rq_based_ios();
+ pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+ if (!pools->rq_pool)
+ goto out;
+ /* fall through to setup remaining rq-based pools */
+ case DM_TYPE_MQ_REQUEST_BASED:
+ cachep = _rq_tio_cache;
+ if (!pool_size)
+ pool_size = dm_get_reserved_rq_based_ios();
front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
/* per_bio_data_size is not used. See __bind_mempools(). */
WARN_ON(per_bio_data_size != 0);
- } else
+ break;
+ default:
goto out;
+ }
pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
if (!pools->io_pool)
@@ -3169,6 +3294,9 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
if (pools->io_pool)
mempool_destroy(pools->io_pool);
+ if (pools->rq_pool)
+ mempool_destroy(pools->rq_pool);
+
if (pools->bs)
bioset_free(pools->bs);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 84b0f9e4..59f53e7 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -34,9 +34,10 @@
/*
* Type of table and mapped_device's mempool
*/
-#define DM_TYPE_NONE 0
-#define DM_TYPE_BIO_BASED 1
-#define DM_TYPE_REQUEST_BASED 2
+#define DM_TYPE_NONE 0
+#define DM_TYPE_BIO_BASED 1
+#define DM_TYPE_REQUEST_BASED 2
+#define DM_TYPE_MQ_REQUEST_BASED 3
/*
* List of devices that a metadevice uses and should open/close.
@@ -73,6 +74,7 @@ int dm_table_any_busy_target(struct dm_table *t);
unsigned dm_table_get_type(struct dm_table *t);
struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
+bool dm_table_mq_request_based(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
@@ -99,7 +101,8 @@ int dm_setup_md_queue(struct mapped_device *md);
/*
* To check whether the target type is request-based or not (bio-based).
*/
-#define dm_target_request_based(t) ((t)->type->map_rq != NULL)
+#define dm_target_request_based(t) (((t)->type->map_rq != NULL) || \
+ ((t)->type->clone_and_map_rq != NULL))
/*
* To check whether the target type is a hybrid (capable of being
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index e8b4574..1277eb2 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -332,13 +332,11 @@ static int run(struct mddev *mddev)
return 0;
}
-static int stop(struct mddev *mddev)
+static void faulty_free(struct mddev *mddev, void *priv)
{
- struct faulty_conf *conf = mddev->private;
+ struct faulty_conf *conf = priv;
kfree(conf);
- mddev->private = NULL;
- return 0;
}
static struct md_personality faulty_personality =
@@ -348,7 +346,7 @@ static struct md_personality faulty_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = faulty_free,
.status = status,
.check_reshape = reshape,
.size = faulty_size,
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 64713b7..fa7d577 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -34,7 +34,7 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
lo = 0;
hi = mddev->raid_disks - 1;
- conf = rcu_dereference(mddev->private);
+ conf = mddev->private;
/*
* Binary Search
@@ -60,18 +60,16 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
*
* Return amount of bytes we can take at this offset
*/
-static int linear_mergeable_bvec(struct request_queue *q,
+static int linear_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
struct dev_info *dev0;
unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int maxbytes = biovec->bv_len;
struct request_queue *subq;
- rcu_read_lock();
dev0 = which_dev(mddev, sector);
maxsectors = dev0->end_sector - sector;
subq = bdev_get_queue(dev0->rdev->bdev);
@@ -81,7 +79,6 @@ static int linear_mergeable_bvec(struct request_queue *q,
maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
biovec));
}
- rcu_read_unlock();
if (maxsectors < bio_sectors)
maxsectors = 0;
@@ -97,24 +94,18 @@ static int linear_mergeable_bvec(struct request_queue *q,
return maxsectors << 9;
}
-static int linear_congested(void *data, int bits)
+static int linear_congested(struct mddev *mddev, int bits)
{
- struct mddev *mddev = data;
struct linear_conf *conf;
int i, ret = 0;
- if (mddev_congested(mddev, bits))
- return 1;
-
- rcu_read_lock();
- conf = rcu_dereference(mddev->private);
+ conf = mddev->private;
for (i = 0; i < mddev->raid_disks && !ret ; i++) {
struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
ret |= bdi_congested(&q->backing_dev_info, bits);
}
- rcu_read_unlock();
return ret;
}
@@ -123,12 +114,10 @@ static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disk
struct linear_conf *conf;
sector_t array_sectors;
- rcu_read_lock();
- conf = rcu_dereference(mddev->private);
+ conf = mddev->private;
WARN_ONCE(sectors || raid_disks,
"%s does not support generic reshape\n", __func__);
array_sectors = conf->array_sectors;
- rcu_read_unlock();
return array_sectors;
}
@@ -217,10 +206,6 @@ static int linear_run (struct mddev *mddev)
mddev->private = conf;
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
- blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
- mddev->queue->backing_dev_info.congested_fn = linear_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
-
ret = md_integrity_register(mddev);
if (ret) {
kfree(conf);
@@ -252,38 +237,23 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
if (!newconf)
return -ENOMEM;
- oldconf = rcu_dereference_protected(mddev->private,
- lockdep_is_held(
- &mddev->reconfig_mutex));
+ mddev_suspend(mddev);
+ oldconf = mddev->private;
mddev->raid_disks++;
- rcu_assign_pointer(mddev->private, newconf);
+ mddev->private = newconf;
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
+ mddev_resume(mddev);
revalidate_disk(mddev->gendisk);
- kfree_rcu(oldconf, rcu);
+ kfree(oldconf);
return 0;
}
-static int linear_stop (struct mddev *mddev)
+static void linear_free(struct mddev *mddev, void *priv)
{
- struct linear_conf *conf =
- rcu_dereference_protected(mddev->private,
- lockdep_is_held(
- &mddev->reconfig_mutex));
+ struct linear_conf *conf = priv;
- /*
- * We do not require rcu protection here since
- * we hold reconfig_mutex for both linear_add and
- * linear_stop, so they cannot race.
- * We should make sure any old 'conf's are properly
- * freed though.
- */
- rcu_barrier();
- blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
kfree(conf);
- mddev->private = NULL;
-
- return 0;
}
static void linear_make_request(struct mddev *mddev, struct bio *bio)
@@ -299,16 +269,12 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
}
do {
- rcu_read_lock();
-
tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector);
start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
end_sector = tmp_dev->end_sector;
data_offset = tmp_dev->rdev->data_offset;
bio->bi_bdev = tmp_dev->rdev->bdev;
- rcu_read_unlock();
-
if (unlikely(bio->bi_iter.bi_sector >= end_sector ||
bio->bi_iter.bi_sector < start_sector))
goto out_of_bounds;
@@ -355,6 +321,10 @@ static void linear_status (struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
}
+static void linear_quiesce(struct mddev *mddev, int state)
+{
+}
+
static struct md_personality linear_personality =
{
.name = "linear",
@@ -362,10 +332,13 @@ static struct md_personality linear_personality =
.owner = THIS_MODULE,
.make_request = linear_make_request,
.run = linear_run,
- .stop = linear_stop,
+ .free = linear_free,
.status = linear_status,
.hot_add_disk = linear_add,
.size = linear_size,
+ .quiesce = linear_quiesce,
+ .congested = linear_congested,
+ .mergeable_bvec = linear_mergeable_bvec,
};
static int __init linear_init (void)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 709755f..c8d2bac 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -72,6 +72,7 @@ static struct workqueue_struct *md_misc_wq;
static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
+static void mddev_detach(struct mddev *mddev);
/*
* Default number of read corrections we'll attempt on an rdev
@@ -292,8 +293,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
/* mddev_suspend makes sure no new requests are submitted
* to the device, and that any requests that have been submitted
* are completely handled.
- * Once ->stop is called and completes, the module will be completely
- * unused.
+ * Once mddev_detach() is called and completes, the module will be
+ * completely unused.
*/
void mddev_suspend(struct mddev *mddev)
{
@@ -321,10 +322,47 @@ EXPORT_SYMBOL_GPL(mddev_resume);
int mddev_congested(struct mddev *mddev, int bits)
{
- return mddev->suspended;
+ struct md_personality *pers = mddev->pers;
+ int ret = 0;
+
+ rcu_read_lock();
+ if (mddev->suspended)
+ ret = 1;
+ else if (pers && pers->congested)
+ ret = pers->congested(mddev, bits);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mddev_congested);
+static int md_congested(void *data, int bits)
+{
+ struct mddev *mddev = data;
+ return mddev_congested(mddev, bits);
}
-EXPORT_SYMBOL(mddev_congested);
+static int md_mergeable_bvec(struct request_queue *q,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ struct mddev *mddev = q->queuedata;
+ int ret;
+ rcu_read_lock();
+ if (mddev->suspended) {
+ /* Must always allow one vec */
+ if (bvm->bi_size == 0)
+ ret = biovec->bv_len;
+ else
+ ret = 0;
+ } else {
+ struct md_personality *pers = mddev->pers;
+ if (pers && pers->mergeable_bvec)
+ ret = pers->mergeable_bvec(mddev, bvm, biovec);
+ else
+ ret = biovec->bv_len;
+ }
+ rcu_read_unlock();
+ return ret;
+}
/*
* Generic flush handling for md
*/
@@ -397,12 +435,12 @@ static void md_submit_flush_data(struct work_struct *ws)
void md_flush_request(struct mddev *mddev, struct bio *bio)
{
- spin_lock_irq(&mddev->write_lock);
+ spin_lock_irq(&mddev->lock);
wait_event_lock_irq(mddev->sb_wait,
!mddev->flush_bio,
- mddev->write_lock);
+ mddev->lock);
mddev->flush_bio = bio;
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock_irq(&mddev->lock);
INIT_WORK(&mddev->flush_work, submit_flushes);
queue_work(md_wq, &mddev->flush_work);
@@ -465,7 +503,7 @@ void mddev_init(struct mddev *mddev)
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
- spin_lock_init(&mddev->write_lock);
+ spin_lock_init(&mddev->lock);
atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait);
@@ -552,32 +590,9 @@ static struct mddev *mddev_find(dev_t unit)
goto retry;
}
-static inline int __must_check mddev_lock(struct mddev *mddev)
-{
- return mutex_lock_interruptible(&mddev->reconfig_mutex);
-}
-
-/* Sometimes we need to take the lock in a situation where
- * failure due to interrupts is not acceptable.
- */
-static inline void mddev_lock_nointr(struct mddev *mddev)
-{
- mutex_lock(&mddev->reconfig_mutex);
-}
-
-static inline int mddev_is_locked(struct mddev *mddev)
-{
- return mutex_is_locked(&mddev->reconfig_mutex);
-}
-
-static inline int mddev_trylock(struct mddev *mddev)
-{
- return mutex_trylock(&mddev->reconfig_mutex);
-}
-
static struct attribute_group md_redundancy_group;
-static void mddev_unlock(struct mddev *mddev)
+void mddev_unlock(struct mddev *mddev)
{
if (mddev->to_remove) {
/* These cannot be removed under reconfig_mutex as
@@ -619,6 +634,7 @@ static void mddev_unlock(struct mddev *mddev)
md_wakeup_thread(mddev->thread);
spin_unlock(&pers_lock);
}
+EXPORT_SYMBOL_GPL(mddev_unlock);
static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
@@ -2230,7 +2246,7 @@ repeat:
return;
}
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
mddev->utime = get_seconds();
@@ -2287,7 +2303,7 @@ repeat:
}
sync_sbs(mddev, nospares);
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
mdname(mddev), mddev->in_sync);
@@ -2326,15 +2342,15 @@ repeat:
md_super_wait(mddev);
/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (mddev->in_sync != sync_req ||
test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
/* have to write it out again */
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
goto repeat;
}
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
wake_up(&mddev->sb_wait);
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
@@ -2381,40 +2397,41 @@ state_show(struct md_rdev *rdev, char *page)
{
char *sep = "";
size_t len = 0;
+ unsigned long flags = ACCESS_ONCE(rdev->flags);
- if (test_bit(Faulty, &rdev->flags) ||
+ if (test_bit(Faulty, &flags) ||
rdev->badblocks.unacked_exist) {
len+= sprintf(page+len, "%sfaulty",sep);
sep = ",";
}
- if (test_bit(In_sync, &rdev->flags)) {
+ if (test_bit(In_sync, &flags)) {
len += sprintf(page+len, "%sin_sync",sep);
sep = ",";
}
- if (test_bit(WriteMostly, &rdev->flags)) {
+ if (test_bit(WriteMostly, &flags)) {
len += sprintf(page+len, "%swrite_mostly",sep);
sep = ",";
}
- if (test_bit(Blocked, &rdev->flags) ||
+ if (test_bit(Blocked, &flags) ||
(rdev->badblocks.unacked_exist
- && !test_bit(Faulty, &rdev->flags))) {
+ && !test_bit(Faulty, &flags))) {
len += sprintf(page+len, "%sblocked", sep);
sep = ",";
}
- if (!test_bit(Faulty, &rdev->flags) &&
- !test_bit(In_sync, &rdev->flags)) {
+ if (!test_bit(Faulty, &flags) &&
+ !test_bit(In_sync, &flags)) {
len += sprintf(page+len, "%sspare", sep);
sep = ",";
}
- if (test_bit(WriteErrorSeen, &rdev->flags)) {
+ if (test_bit(WriteErrorSeen, &flags)) {
len += sprintf(page+len, "%swrite_error", sep);
sep = ",";
}
- if (test_bit(WantReplacement, &rdev->flags)) {
+ if (test_bit(WantReplacement, &flags)) {
len += sprintf(page+len, "%swant_replacement", sep);
sep = ",";
}
- if (test_bit(Replacement, &rdev->flags)) {
+ if (test_bit(Replacement, &flags)) {
len += sprintf(page+len, "%sreplacement", sep);
sep = ",";
}
@@ -2927,21 +2944,12 @@ rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
- struct mddev *mddev = rdev->mddev;
- ssize_t rv;
if (!entry->show)
return -EIO;
-
- rv = mddev ? mddev_lock(mddev) : -EBUSY;
- if (!rv) {
- if (rdev->mddev == NULL)
- rv = -EBUSY;
- else
- rv = entry->show(rdev, page);
- mddev_unlock(mddev);
- }
- return rv;
+ if (!rdev->mddev)
+ return -EBUSY;
+ return entry->show(rdev, page);
}
static ssize_t
@@ -3212,11 +3220,13 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
mddev->safemode_delay = 0;
else {
unsigned long old_delay = mddev->safemode_delay;
- mddev->safemode_delay = (msec*HZ)/1000;
- if (mddev->safemode_delay == 0)
- mddev->safemode_delay = 1;
- if (mddev->safemode_delay < old_delay || old_delay == 0)
- md_safemode_timeout((unsigned long)mddev);
+ unsigned long new_delay = (msec*HZ)/1000;
+
+ if (new_delay == 0)
+ new_delay = 1;
+ mddev->safemode_delay = new_delay;
+ if (new_delay < old_delay || old_delay == 0)
+ mod_timer(&mddev->safemode_timer, jiffies+1);
}
return len;
}
@@ -3226,41 +3236,52 @@ __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
static ssize_t
level_show(struct mddev *mddev, char *page)
{
- struct md_personality *p = mddev->pers;
+ struct md_personality *p;
+ int ret;
+ spin_lock(&mddev->lock);
+ p = mddev->pers;
if (p)
- return sprintf(page, "%s\n", p->name);
+ ret = sprintf(page, "%s\n", p->name);
else if (mddev->clevel[0])
- return sprintf(page, "%s\n", mddev->clevel);
+ ret = sprintf(page, "%s\n", mddev->clevel);
else if (mddev->level != LEVEL_NONE)
- return sprintf(page, "%d\n", mddev->level);
+ ret = sprintf(page, "%d\n", mddev->level);
else
- return 0;
+ ret = 0;
+ spin_unlock(&mddev->lock);
+ return ret;
}
static ssize_t
level_store(struct mddev *mddev, const char *buf, size_t len)
{
char clevel[16];
- ssize_t rv = len;
- struct md_personality *pers;
+ ssize_t rv;
+ size_t slen = len;
+ struct md_personality *pers, *oldpers;
long level;
- void *priv;
+ void *priv, *oldpriv;
struct md_rdev *rdev;
+ if (slen == 0 || slen >= sizeof(clevel))
+ return -EINVAL;
+
+ rv = mddev_lock(mddev);
+ if (rv)
+ return rv;
+
if (mddev->pers == NULL) {
- if (len == 0)
- return 0;
- if (len >= sizeof(mddev->clevel))
- return -ENOSPC;
- strncpy(mddev->clevel, buf, len);
- if (mddev->clevel[len-1] == '\n')
- len--;
- mddev->clevel[len] = 0;
+ strncpy(mddev->clevel, buf, slen);
+ if (mddev->clevel[slen-1] == '\n')
+ slen--;
+ mddev->clevel[slen] = 0;
mddev->level = LEVEL_NONE;
- return rv;
+ rv = len;
+ goto out_unlock;
}
+ rv = -EROFS;
if (mddev->ro)
- return -EROFS;
+ goto out_unlock;
/* request to change the personality. Need to ensure:
* - array is not engaged in resync/recovery/reshape
@@ -3268,25 +3289,25 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
* - new personality will access other array.
*/
+ rv = -EBUSY;
if (mddev->sync_thread ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
mddev->reshape_position != MaxSector ||
mddev->sysfs_active)
- return -EBUSY;
+ goto out_unlock;
+ rv = -EINVAL;
if (!mddev->pers->quiesce) {
printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
mdname(mddev), mddev->pers->name);
- return -EINVAL;
+ goto out_unlock;
}
/* Now find the new personality */
- if (len == 0 || len >= sizeof(clevel))
- return -EINVAL;
- strncpy(clevel, buf, len);
- if (clevel[len-1] == '\n')
- len--;
- clevel[len] = 0;
+ strncpy(clevel, buf, slen);
+ if (clevel[slen-1] == '\n')
+ slen--;
+ clevel[slen] = 0;
if (kstrtol(clevel, 10, &level))
level = LEVEL_NONE;
@@ -3297,20 +3318,23 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
if (!pers || !try_module_get(pers->owner)) {
spin_unlock(&pers_lock);
printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
- return -EINVAL;
+ rv = -EINVAL;
+ goto out_unlock;
}
spin_unlock(&pers_lock);
if (pers == mddev->pers) {
/* Nothing to do! */
module_put(pers->owner);
- return rv;
+ rv = len;
+ goto out_unlock;
}
if (!pers->takeover) {
module_put(pers->owner);
printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
mdname(mddev), clevel);
- return -EINVAL;
+ rv = -EINVAL;
+ goto out_unlock;
}
rdev_for_each(rdev, mddev)
@@ -3330,30 +3354,29 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
module_put(pers->owner);
printk(KERN_WARNING "md: %s: %s would not accept array\n",
mdname(mddev), clevel);
- return PTR_ERR(priv);
+ rv = PTR_ERR(priv);
+ goto out_unlock;
}
/* Looks like we have a winner */
mddev_suspend(mddev);
- mddev->pers->stop(mddev);
+ mddev_detach(mddev);
- if (mddev->pers->sync_request == NULL &&
- pers->sync_request != NULL) {
- /* need to add the md_redundancy_group */
- if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
- printk(KERN_WARNING
- "md: cannot register extra attributes for %s\n",
- mdname(mddev));
- mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
- }
- if (mddev->pers->sync_request != NULL &&
- pers->sync_request == NULL) {
- /* need to remove the md_redundancy_group */
- if (mddev->to_remove == NULL)
- mddev->to_remove = &md_redundancy_group;
- }
+ spin_lock(&mddev->lock);
+ oldpers = mddev->pers;
+ oldpriv = mddev->private;
+ mddev->pers = pers;
+ mddev->private = priv;
+ strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ mddev->level = mddev->new_level;
+ mddev->layout = mddev->new_layout;
+ mddev->chunk_sectors = mddev->new_chunk_sectors;
+ mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
+ mddev->degraded = 0;
+ spin_unlock(&mddev->lock);
- if (mddev->pers->sync_request == NULL &&
+ if (oldpers->sync_request == NULL &&
mddev->external) {
/* We are converting from a no-redundancy array
* to a redundancy array and metadata is managed
@@ -3367,6 +3390,24 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->safemode = 0;
}
+ oldpers->free(mddev, oldpriv);
+
+ if (oldpers->sync_request == NULL &&
+ pers->sync_request != NULL) {
+ /* need to add the md_redundancy_group */
+ if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
+ printk(KERN_WARNING
+ "md: cannot register extra attributes for %s\n",
+ mdname(mddev));
+ mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
+ }
+ if (oldpers->sync_request != NULL &&
+ pers->sync_request == NULL) {
+ /* need to remove the md_redundancy_group */
+ if (mddev->to_remove == NULL)
+ mddev->to_remove = &md_redundancy_group;
+ }
+
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0)
continue;
@@ -3392,17 +3433,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
}
}
- module_put(mddev->pers->owner);
- mddev->pers = pers;
- mddev->private = priv;
- strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
- mddev->level = mddev->new_level;
- mddev->layout = mddev->new_layout;
- mddev->chunk_sectors = mddev->new_chunk_sectors;
- mddev->delta_disks = 0;
- mddev->reshape_backwards = 0;
- mddev->degraded = 0;
- if (mddev->pers->sync_request == NULL) {
+ if (pers->sync_request == NULL) {
/* this is now an array without redundancy, so
* it must always be in_sync
*/
@@ -3417,6 +3448,9 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
md_update_sb(mddev, 1);
sysfs_notify(&mddev->kobj, NULL, "level");
md_new_event(mddev);
+ rv = len;
+out_unlock:
+ mddev_unlock(mddev);
return rv;
}
@@ -3439,28 +3473,32 @@ layout_store(struct mddev *mddev, const char *buf, size_t len)
{
char *e;
unsigned long n = simple_strtoul(buf, &e, 10);
+ int err;
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers) {
- int err;
if (mddev->pers->check_reshape == NULL)
- return -EBUSY;
- if (mddev->ro)
- return -EROFS;
- mddev->new_layout = n;
- err = mddev->pers->check_reshape(mddev);
- if (err) {
- mddev->new_layout = mddev->layout;
- return err;
+ err = -EBUSY;
+ else if (mddev->ro)
+ err = -EROFS;
+ else {
+ mddev->new_layout = n;
+ err = mddev->pers->check_reshape(mddev);
+ if (err)
+ mddev->new_layout = mddev->layout;
}
} else {
mddev->new_layout = n;
if (mddev->reshape_position == MaxSector)
mddev->layout = n;
}
- return len;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_layout =
__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
@@ -3483,32 +3521,39 @@ static ssize_t
raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
{
char *e;
- int rv = 0;
+ int err;
unsigned long n = simple_strtoul(buf, &e, 10);
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers)
- rv = update_raid_disks(mddev, n);
+ err = update_raid_disks(mddev, n);
else if (mddev->reshape_position != MaxSector) {
struct md_rdev *rdev;
int olddisks = mddev->raid_disks - mddev->delta_disks;
+ err = -EINVAL;
rdev_for_each(rdev, mddev) {
if (olddisks < n &&
rdev->data_offset < rdev->new_data_offset)
- return -EINVAL;
+ goto out_unlock;
if (olddisks > n &&
rdev->data_offset > rdev->new_data_offset)
- return -EINVAL;
+ goto out_unlock;
}
+ err = 0;
mddev->delta_disks = n - olddisks;
mddev->raid_disks = n;
mddev->reshape_backwards = (mddev->delta_disks < 0);
} else
mddev->raid_disks = n;
- return rv ? rv : len;
+out_unlock:
+ mddev_unlock(mddev);
+ return err ? err : len;
}
static struct md_sysfs_entry md_raid_disks =
__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
@@ -3527,30 +3572,34 @@ chunk_size_show(struct mddev *mddev, char *page)
static ssize_t
chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
{
+ int err;
char *e;
unsigned long n = simple_strtoul(buf, &e, 10);
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers) {
- int err;
if (mddev->pers->check_reshape == NULL)
- return -EBUSY;
- if (mddev->ro)
- return -EROFS;
- mddev->new_chunk_sectors = n >> 9;
- err = mddev->pers->check_reshape(mddev);
- if (err) {
- mddev->new_chunk_sectors = mddev->chunk_sectors;
- return err;
+ err = -EBUSY;
+ else if (mddev->ro)
+ err = -EROFS;
+ else {
+ mddev->new_chunk_sectors = n >> 9;
+ err = mddev->pers->check_reshape(mddev);
+ if (err)
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
}
} else {
mddev->new_chunk_sectors = n >> 9;
if (mddev->reshape_position == MaxSector)
mddev->chunk_sectors = n >> 9;
}
- return len;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_chunk_size =
__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
@@ -3566,20 +3615,27 @@ resync_start_show(struct mddev *mddev, char *page)
static ssize_t
resync_start_store(struct mddev *mddev, const char *buf, size_t len)
{
+ int err;
char *e;
unsigned long long n = simple_strtoull(buf, &e, 10);
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
- return -EBUSY;
- if (cmd_match(buf, "none"))
+ err = -EBUSY;
+ else if (cmd_match(buf, "none"))
n = MaxSector;
else if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = -EINVAL;
- mddev->recovery_cp = n;
- if (mddev->pers)
- set_bit(MD_CHANGE_CLEAN, &mddev->flags);
- return len;
+ if (!err) {
+ mddev->recovery_cp = n;
+ if (mddev->pers)
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ }
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_resync_start =
__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
@@ -3677,8 +3733,39 @@ static int restart_array(struct mddev *mddev);
static ssize_t
array_state_store(struct mddev *mddev, const char *buf, size_t len)
{
- int err = -EINVAL;
+ int err;
enum array_state st = match_word(buf, array_states);
+
+ if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
+ /* don't take reconfig_mutex when toggling between
+ * clean and active; a store must return len or -errno, never 0
+ */
+ spin_lock(&mddev->lock);
+ if (st == active) {
+ restart_array(mddev);
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ wake_up(&mddev->sb_wait);
+ err = 0;
+ } else /* st == clean */ {
+ restart_array(mddev);
+ if (atomic_read(&mddev->writes_pending) == 0) {
+ if (mddev->in_sync == 0) {
+ mddev->in_sync = 1;
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ }
+ err = 0;
+ } else
+ err = -EBUSY;
+ }
+ spin_unlock(&mddev->lock);
+ return err ?: len;
+ }
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EINVAL;
switch(st) {
case bad_word:
break;
@@ -3722,7 +3809,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
case clean:
if (mddev->pers) {
restart_array(mddev);
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (atomic_read(&mddev->writes_pending) == 0) {
if (mddev->in_sync == 0) {
mddev->in_sync = 1;
@@ -3733,7 +3820,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
err = 0;
} else
err = -EBUSY;
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
} else
err = -EINVAL;
break;
@@ -3754,14 +3841,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
/* these cannot be set */
break;
}
- if (err)
- return err;
- else {
+
+ if (!err) {
if (mddev->hold_active == UNTIL_IOCTL)
mddev->hold_active = 0;
sysfs_notify_dirent_safe(mddev->sysfs_state);
- return len;
}
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_array_state =
__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
@@ -3822,6 +3909,11 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
minor != MINOR(dev))
return -EOVERFLOW;
+ flush_workqueue(md_misc_wq);
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->persistent) {
rdev = md_import_device(dev, mddev->major_version,
mddev->minor_version);
@@ -3845,6 +3937,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
out:
if (err)
export_rdev(rdev);
+ mddev_unlock(mddev);
return err ? err : len;
}
@@ -3856,7 +3949,11 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
{
char *end;
unsigned long chunk, end_chunk;
+ int err;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (!mddev->bitmap)
goto out;
/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
@@ -3874,6 +3971,7 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
}
bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
out:
+ mddev_unlock(mddev);
return len;
}
@@ -3901,6 +3999,9 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
if (err < 0)
return err;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers) {
err = update_size(mddev, sectors);
md_update_sb(mddev, 1);
@@ -3911,6 +4012,7 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
else
err = -ENOSPC;
}
+ mddev_unlock(mddev);
return err ? err : len;
}
@@ -3940,21 +4042,28 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len)
{
int major, minor;
char *e;
+ int err;
/* Changing the details of 'external' metadata is
* always permitted. Otherwise there must be
* no devices attached to the array.
*/
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EBUSY;
if (mddev->external && strncmp(buf, "external:", 9) == 0)
;
else if (!list_empty(&mddev->disks))
- return -EBUSY;
+ goto out_unlock;
+ err = 0;
if (cmd_match(buf, "none")) {
mddev->persistent = 0;
mddev->external = 0;
mddev->major_version = 0;
mddev->minor_version = 90;
- return len;
+ goto out_unlock;
}
if (strncmp(buf, "external:", 9) == 0) {
size_t namelen = len-9;
@@ -3968,22 +4077,27 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len)
mddev->external = 1;
mddev->major_version = 0;
mddev->minor_version = 90;
- return len;
+ goto out_unlock;
}
major = simple_strtoul(buf, &e, 10);
+ err = -EINVAL;
if (e==buf || *e != '.')
- return -EINVAL;
+ goto out_unlock;
buf = e+1;
minor = simple_strtoul(buf, &e, 10);
if (e==buf || (*e && *e != '\n') )
- return -EINVAL;
+ goto out_unlock;
+ err = -ENOENT;
if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
- return -ENOENT;
+ goto out_unlock;
mddev->major_version = major;
mddev->minor_version = minor;
mddev->persistent = 1;
mddev->external = 0;
- return len;
+ err = 0;
+out_unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_metadata =
@@ -3993,20 +4107,21 @@ static ssize_t
action_show(struct mddev *mddev, char *page)
{
char *type = "idle";
- if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+ unsigned long recovery = mddev->recovery;
+ if (test_bit(MD_RECOVERY_FROZEN, &recovery))
type = "frozen";
- else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
- (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
- if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
+ (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
+ if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
type = "reshape";
- else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
- if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
+ if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
type = "resync";
- else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+ else if (test_bit(MD_RECOVERY_CHECK, &recovery))
type = "check";
else
type = "repair";
- } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
+ } else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
type = "recover";
}
return sprintf(page, "%s\n", type);
@@ -4027,7 +4142,10 @@ action_store(struct mddev *mddev, const char *page, size_t len)
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_reap_sync_thread(mddev);
+ if (mddev_lock(mddev) == 0) {
+ md_reap_sync_thread(mddev);
+ mddev_unlock(mddev);
+ }
}
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -4041,7 +4159,11 @@ action_store(struct mddev *mddev, const char *page, size_t len)
int err;
if (mddev->pers->start_reshape == NULL)
return -EINVAL;
- err = mddev->pers->start_reshape(mddev);
+ err = mddev_lock(mddev);
+ if (!err) {
+ err = mddev->pers->start_reshape(mddev);
+ mddev_unlock(mddev);
+ }
if (err)
return err;
sysfs_notify(&mddev->kobj, NULL, "degraded");
@@ -4225,22 +4347,36 @@ static ssize_t
min_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
unsigned long long min;
+ int err;
+ int chunk;
+
if (kstrtoull(buf, 10, &min))
return -EINVAL;
+
+ spin_lock(&mddev->lock);
+ err = -EINVAL;
if (min > mddev->resync_max)
- return -EINVAL;
+ goto out_unlock;
+
+ err = -EBUSY;
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- return -EBUSY;
+ goto out_unlock;
/* Must be a multiple of chunk_size */
- if (mddev->chunk_sectors) {
+ chunk = mddev->chunk_sectors;
+ if (chunk) {
sector_t temp = min;
- if (sector_div(temp, mddev->chunk_sectors))
- return -EINVAL;
+
+ err = -EINVAL;
+ if (sector_div(temp, chunk))
+ goto out_unlock;
}
mddev->resync_min = min;
+ err = 0;
- return len;
+out_unlock:
+ spin_unlock(&mddev->lock);
+ return err ?: len;
}
static struct md_sysfs_entry md_min_sync =
@@ -4258,29 +4394,42 @@ max_sync_show(struct mddev *mddev, char *page)
static ssize_t
max_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
+ int err;
+ spin_lock(&mddev->lock);
if (strncmp(buf, "max", 3) == 0)
mddev->resync_max = MaxSector;
else {
unsigned long long max;
+ int chunk;
+
+ err = -EINVAL;
if (kstrtoull(buf, 10, &max))
- return -EINVAL;
+ goto out_unlock;
if (max < mddev->resync_min)
- return -EINVAL;
+ goto out_unlock;
+
+ err = -EBUSY;
if (max < mddev->resync_max &&
mddev->ro == 0 &&
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- return -EBUSY;
+ goto out_unlock;
/* Must be a multiple of chunk_size */
- if (mddev->chunk_sectors) {
+ chunk = mddev->chunk_sectors;
+ if (chunk) {
sector_t temp = max;
- if (sector_div(temp, mddev->chunk_sectors))
- return -EINVAL;
+
+ err = -EINVAL;
+ if (sector_div(temp, chunk))
+ goto out_unlock;
}
mddev->resync_max = max;
}
wake_up(&mddev->recovery_wait);
- return len;
+ err = 0;
+out_unlock:
+ spin_unlock(&mddev->lock);
+ return err ?: len;
}
static struct md_sysfs_entry md_max_sync =
@@ -4297,14 +4446,20 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
{
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old = mddev->suspend_lo;
+ unsigned long long old;
+ int err;
- if (mddev->pers == NULL ||
- mddev->pers->quiesce == NULL)
- return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EINVAL;
+ if (mddev->pers == NULL ||
+ mddev->pers->quiesce == NULL)
+ goto unlock;
+ old = mddev->suspend_lo;
mddev->suspend_lo = new;
if (new >= old)
/* Shrinking suspended region */
@@ -4314,7 +4469,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
}
- return len;
+ err = 0;
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -4330,14 +4488,20 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
{
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old = mddev->suspend_hi;
+ unsigned long long old;
+ int err;
- if (mddev->pers == NULL ||
- mddev->pers->quiesce == NULL)
- return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EINVAL;
+ if (mddev->pers == NULL ||
+ mddev->pers->quiesce == NULL)
+ goto unlock;
+ old = mddev->suspend_hi;
mddev->suspend_hi = new;
if (new <= old)
/* Shrinking suspended region */
@@ -4347,7 +4511,10 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
}
- return len;
+ err = 0;
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@ -4367,11 +4534,17 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{
struct md_rdev *rdev;
char *e;
+ int err;
unsigned long long new = simple_strtoull(buf, &e, 10);
- if (mddev->pers)
- return -EBUSY;
+
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EBUSY;
+ if (mddev->pers)
+ goto unlock;
mddev->reshape_position = new;
mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
@@ -4380,7 +4553,10 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
mddev->new_chunk_sectors = mddev->chunk_sectors;
rdev_for_each(rdev, mddev)
rdev->new_data_offset = rdev->data_offset;
- return len;
+ err = 0;
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_reshape_position =
@@ -4398,6 +4574,8 @@ static ssize_t
reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
{
int backwards = 0;
+ int err;
+
if (cmd_match(buf, "forwards"))
backwards = 0;
else if (cmd_match(buf, "backwards"))
@@ -4407,16 +4585,19 @@ reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->reshape_backwards == backwards)
return len;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
/* check if we are allowed to change */
if (mddev->delta_disks)
- return -EBUSY;
-
- if (mddev->persistent &&
+ err = -EBUSY;
+ else if (mddev->persistent &&
mddev->major_version == 0)
- return -EINVAL;
-
- mddev->reshape_backwards = backwards;
- return len;
+ err = -EINVAL;
+ else
+ mddev->reshape_backwards = backwards;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_reshape_direction =
@@ -4437,6 +4618,11 @@ static ssize_t
array_size_store(struct mddev *mddev, const char *buf, size_t len)
{
sector_t sectors;
+ int err;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (strncmp(buf, "default", 7) == 0) {
if (mddev->pers)
@@ -4447,19 +4633,22 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len)
mddev->external_size = 0;
} else {
if (strict_blocks_to_sectors(buf, &sectors) < 0)
- return -EINVAL;
- if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
- return -E2BIG;
-
- mddev->external_size = 1;
+ err = -EINVAL;
+ else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
+ err = -E2BIG;
+ else
+ mddev->external_size = 1;
}
- mddev->array_sectors = sectors;
- if (mddev->pers) {
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ if (!err) {
+ mddev->array_sectors = sectors;
+ if (mddev->pers) {
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ }
}
- return len;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_array_size =
@@ -4523,11 +4712,7 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
mddev_get(mddev);
spin_unlock(&all_mddevs_lock);
- rv = mddev_lock(mddev);
- if (!rv) {
- rv = entry->show(mddev, page);
- mddev_unlock(mddev);
- }
+ rv = entry->show(mddev, page);
mddev_put(mddev);
return rv;
}
@@ -4551,13 +4736,7 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
}
mddev_get(mddev);
spin_unlock(&all_mddevs_lock);
- if (entry->store == new_dev_store)
- flush_workqueue(md_misc_wq);
- rv = mddev_lock(mddev);
- if (!rv) {
- rv = entry->store(mddev, page, length);
- mddev_unlock(mddev);
- }
+ rv = entry->store(mddev, page, length);
mddev_put(mddev);
return rv;
}
@@ -4825,7 +5004,6 @@ int md_run(struct mddev *mddev)
mddev->clevel);
return -EINVAL;
}
- mddev->pers = pers;
spin_unlock(&pers_lock);
if (mddev->level != pers->level) {
mddev->level = pers->level;
@@ -4836,7 +5014,6 @@ int md_run(struct mddev *mddev)
if (mddev->reshape_position != MaxSector &&
pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */
- mddev->pers = NULL;
module_put(pers->owner);
return -EINVAL;
}
@@ -4880,35 +5057,38 @@ int md_run(struct mddev *mddev)
if (start_readonly && mddev->ro == 0)
mddev->ro = 2; /* read-only, but switch on first write */
- err = mddev->pers->run(mddev);
+ err = pers->run(mddev);
if (err)
printk(KERN_ERR "md: pers->run() failed ...\n");
- else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
+ else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
WARN_ONCE(!mddev->external_size, "%s: default size too small,"
" but 'external_size' not in effect?\n", __func__);
printk(KERN_ERR
"md: invalid array_size %llu > default size %llu\n",
(unsigned long long)mddev->array_sectors / 2,
- (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
+ (unsigned long long)pers->size(mddev, 0, 0) / 2);
err = -EINVAL;
- mddev->pers->stop(mddev);
}
- if (err == 0 && mddev->pers->sync_request &&
+ if (err == 0 && pers->sync_request &&
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
err = bitmap_create(mddev);
- if (err) {
+ if (err)
printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
mdname(mddev), err);
- mddev->pers->stop(mddev);
- }
}
if (err) {
- module_put(mddev->pers->owner);
- mddev->pers = NULL;
+ mddev_detach(mddev);
+ pers->free(mddev, mddev->private);
+ module_put(pers->owner);
bitmap_destroy(mddev);
return err;
}
- if (mddev->pers->sync_request) {
+ if (mddev->queue) {
+ mddev->queue->backing_dev_info.congested_data = mddev;
+ mddev->queue->backing_dev_info.congested_fn = md_congested;
+ blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec);
+ }
+ if (pers->sync_request) {
if (mddev->kobj.sd &&
sysfs_create_group(&mddev->kobj, &md_redundancy_group))
printk(KERN_WARNING
@@ -4927,7 +5107,10 @@ int md_run(struct mddev *mddev)
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
smp_wmb();
+ spin_lock(&mddev->lock);
+ mddev->pers = pers;
mddev->ready = 1;
+ spin_unlock(&mddev->lock);
rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
if (sysfs_link_rdev(mddev, rdev))
@@ -5070,14 +5253,38 @@ void md_stop_writes(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(md_stop_writes);
+static void mddev_detach(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+ /* wait for behind writes to complete */
+ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+ printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n",
+ mdname(mddev));
+ /* need to kick something here to make sure I/O goes? */
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
+ }
+ if (mddev->pers && mddev->pers->quiesce) {
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
+ md_unregister_thread(&mddev->thread);
+ if (mddev->queue)
+ blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
+}
+
static void __md_stop(struct mddev *mddev)
{
+ struct md_personality *pers = mddev->pers;
+ mddev_detach(mddev);
+ spin_lock(&mddev->lock);
mddev->ready = 0;
- mddev->pers->stop(mddev);
- if (mddev->pers->sync_request && mddev->to_remove == NULL)
- mddev->to_remove = &md_redundancy_group;
- module_put(mddev->pers->owner);
mddev->pers = NULL;
+ spin_unlock(&mddev->lock);
+ pers->free(mddev, mddev->private);
+ if (pers->sync_request && mddev->to_remove == NULL)
+ mddev->to_remove = &md_redundancy_group;
+ module_put(pers->owner);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
@@ -5226,8 +5433,11 @@ static int do_md_stop(struct mddev *mddev, int mode,
bitmap_destroy(mddev);
if (mddev->bitmap_info.file) {
- fput(mddev->bitmap_info.file);
+ struct file *f = mddev->bitmap_info.file;
+ spin_lock(&mddev->lock);
mddev->bitmap_info.file = NULL;
+ spin_unlock(&mddev->lock);
+ fput(f);
}
mddev->bitmap_info.offset = 0;
@@ -5436,37 +5646,31 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
static int get_bitmap_file(struct mddev *mddev, void __user * arg)
{
mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
- char *ptr, *buf = NULL;
- int err = -ENOMEM;
+ char *ptr;
+ int err;
file = kmalloc(sizeof(*file), GFP_NOIO);
-
if (!file)
- goto out;
+ return -ENOMEM;
+ err = 0;
+ spin_lock(&mddev->lock);
/* bitmap disabled, zero the first byte and copy out */
- if (!mddev->bitmap || !mddev->bitmap->storage.file) {
+ if (!mddev->bitmap_info.file)
file->pathname[0] = '\0';
- goto copy_out;
- }
-
- buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
- if (!buf)
- goto out;
-
- ptr = d_path(&mddev->bitmap->storage.file->f_path,
- buf, sizeof(file->pathname));
- if (IS_ERR(ptr))
- goto out;
-
- strcpy(file->pathname, ptr);
+ else if ((ptr = d_path(&mddev->bitmap_info.file->f_path,
+ file->pathname, sizeof(file->pathname))),
+ IS_ERR(ptr))
+ err = PTR_ERR(ptr);
+ else
+ memmove(file->pathname, ptr,
+ sizeof(file->pathname)-(ptr-file->pathname));
+ spin_unlock(&mddev->lock);
-copy_out:
- err = 0;
- if (copy_to_user(arg, file, sizeof(*file)))
+ if (err == 0 &&
+ copy_to_user(arg, file, sizeof(*file)))
err = -EFAULT;
-out:
- kfree(buf);
+
kfree(file);
return err;
}
@@ -5789,22 +5993,24 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
if (fd >= 0) {
struct inode *inode;
- if (mddev->bitmap)
+ struct file *f;
+
+ if (mddev->bitmap || mddev->bitmap_info.file)
return -EEXIST; /* cannot add when bitmap is present */
- mddev->bitmap_info.file = fget(fd);
+ f = fget(fd);
- if (mddev->bitmap_info.file == NULL) {
+ if (f == NULL) {
printk(KERN_ERR "%s: error: failed to get bitmap file\n",
mdname(mddev));
return -EBADF;
}
- inode = mddev->bitmap_info.file->f_mapping->host;
+ inode = f->f_mapping->host;
if (!S_ISREG(inode->i_mode)) {
printk(KERN_ERR "%s: error: bitmap file must be a regular file\n",
mdname(mddev));
err = -EBADF;
- } else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) {
+ } else if (!(f->f_mode & FMODE_WRITE)) {
printk(KERN_ERR "%s: error: bitmap file must open for write\n",
mdname(mddev));
err = -EBADF;
@@ -5814,10 +6020,10 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
err = -EBUSY;
}
if (err) {
- fput(mddev->bitmap_info.file);
- mddev->bitmap_info.file = NULL;
+ fput(f);
return err;
}
+ mddev->bitmap_info.file = f;
mddev->bitmap_info.offset = 0; /* file overrides offset */
} else if (mddev->bitmap == NULL)
return -ENOENT; /* cannot remove what isn't there */
@@ -5836,9 +6042,13 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
mddev->pers->quiesce(mddev, 0);
}
if (fd < 0) {
- if (mddev->bitmap_info.file)
- fput(mddev->bitmap_info.file);
- mddev->bitmap_info.file = NULL;
+ struct file *f = mddev->bitmap_info.file;
+ if (f) {
+ spin_lock(&mddev->lock);
+ mddev->bitmap_info.file = NULL;
+ spin_unlock(&mddev->lock);
+ fput(f);
+ }
}
return err;
@@ -6251,6 +6461,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
case SET_DISK_FAULTY:
err = set_disk_faulty(mddev, new_decode_dev(arg));
goto out;
+
+ case GET_BITMAP_FILE:
+ err = get_bitmap_file(mddev, argp);
+ goto out;
+
}
if (cmd == ADD_NEW_DISK)
@@ -6342,10 +6557,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
* Commands even a read-only array can execute:
*/
switch (cmd) {
- case GET_BITMAP_FILE:
- err = get_bitmap_file(mddev, argp);
- goto unlock;
-
case RESTART_ARRAY_RW:
err = restart_array(mddev);
goto unlock;
@@ -6873,9 +7084,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
return 0;
}
- if (mddev_lock(mddev) < 0)
- return -EINTR;
-
+ spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : %sactive", mdname(mddev),
mddev->pers ? "" : "in");
@@ -6888,7 +7097,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
}
sectors = 0;
- rdev_for_each(rdev, mddev) {
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
char b[BDEVNAME_SIZE];
seq_printf(seq, " %s[%d]",
bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -6904,6 +7114,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "(R)");
sectors += rdev->sectors;
}
+ rcu_read_unlock();
if (!list_empty(&mddev->disks)) {
if (mddev->pers)
@@ -6946,7 +7157,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
- mddev_unlock(mddev);
+ spin_unlock(&mddev->lock);
return 0;
}
@@ -7102,7 +7313,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
if (mddev->safemode == 1)
mddev->safemode = 0;
if (mddev->in_sync) {
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (mddev->in_sync) {
mddev->in_sync = 0;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
@@ -7110,7 +7321,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
md_wakeup_thread(mddev->thread);
did_change = 1;
}
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
}
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
@@ -7148,7 +7359,7 @@ int md_allow_write(struct mddev *mddev)
if (!mddev->pers->sync_request)
return 0;
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (mddev->in_sync) {
mddev->in_sync = 0;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
@@ -7156,11 +7367,11 @@ int md_allow_write(struct mddev *mddev)
if (mddev->safemode_delay &&
mddev->safemode == 0)
mddev->safemode = 1;
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
md_update_sb(mddev, 0);
sysfs_notify_dirent_safe(mddev->sysfs_state);
} else
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
return -EAGAIN;
@@ -7513,6 +7724,7 @@ void md_do_sync(struct md_thread *thread)
skip:
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
@@ -7521,6 +7733,8 @@ void md_do_sync(struct md_thread *thread)
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
mddev->curr_resync = 0;
+ spin_unlock(&mddev->lock);
+
wake_up(&resync_wait);
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -7688,7 +7902,7 @@ void md_check_recovery(struct mddev *mddev)
if (!mddev->external) {
int did_change = 0;
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (mddev->safemode &&
!atomic_read(&mddev->writes_pending) &&
!mddev->in_sync &&
@@ -7699,7 +7913,7 @@ void md_check_recovery(struct mddev *mddev)
}
if (mddev->safemode == 1)
mddev->safemode = 0;
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
@@ -7721,7 +7935,9 @@ void md_check_recovery(struct mddev *mddev)
* any transients in the value of "sync_action".
*/
mddev->curr_resync_completed = 0;
+ spin_lock(&mddev->lock);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ spin_unlock(&mddev->lock);
/* Clear some bits that don't mean anything, but
* might be left set
*/
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 03cec5b..318ca8f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -386,7 +386,18 @@ struct mddev {
struct work_struct del_work; /* used for delayed sysfs removal */
- spinlock_t write_lock;
+ /* "lock" protects:
+ * flush_bio transition from NULL to !NULL
+ * rdev superblocks, events
+ * clearing MD_CHANGE_*
+ * in_sync - and related safemode and MD_CHANGE changes
+ * pers (also protected by reconfig_mutex and pending IO).
+ * clearing ->bitmap
+ * clearing ->bitmap_info.file
+ * changing ->resync_{min,max}
+ * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max})
+ */
+ spinlock_t lock;
wait_queue_head_t sb_wait; /* for waiting on superblock updates */
atomic_t pending_writes; /* number of active superblock writes */
@@ -439,13 +450,30 @@ struct mddev {
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
};
-static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
+static inline int __must_check mddev_lock(struct mddev *mddev)
{
- int faulty = test_bit(Faulty, &rdev->flags);
- if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ return mutex_lock_interruptible(&mddev->reconfig_mutex);
+}
+
+/* Sometimes we need to take the lock in a situation where
+ * failure due to interrupts is not acceptable.
+ */
+static inline void mddev_lock_nointr(struct mddev *mddev)
+{
+ mutex_lock(&mddev->reconfig_mutex);
+}
+
+static inline int mddev_is_locked(struct mddev *mddev)
+{
+ return mutex_is_locked(&mddev->reconfig_mutex);
}
+static inline int mddev_trylock(struct mddev *mddev)
+{
+ return mutex_trylock(&mddev->reconfig_mutex);
+}
+extern void mddev_unlock(struct mddev *mddev);
+
static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
@@ -459,7 +487,7 @@ struct md_personality
struct module *owner;
void (*make_request)(struct mddev *mddev, struct bio *bio);
int (*run)(struct mddev *mddev);
- int (*stop)(struct mddev *mddev);
+ void (*free)(struct mddev *mddev, void *priv);
void (*status)(struct seq_file *seq, struct mddev *mddev);
/* error_handler must set ->faulty and clear ->in_sync
* if appropriate, and should abort recovery if needed
@@ -490,6 +518,13 @@ struct md_personality
* array.
*/
void *(*takeover) (struct mddev *mddev);
+ /* congested implements bdi.congested_fn().
+ * Will not be called while array is 'suspended' */
+ int (*congested)(struct mddev *mddev, int bits);
+ /* mergeable_bvec is used to implement ->merge_bvec_fn */
+ int (*mergeable_bvec)(struct mddev *mddev,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec);
};
struct md_sysfs_entry {
@@ -624,4 +659,14 @@ static inline int mddev_check_plugged(struct mddev *mddev)
return !!blk_check_plugged(md_unplug, mddev,
sizeof(struct blk_plug_cb));
}
+
+static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
+{
+ int faulty = test_bit(Faulty, &rdev->flags);
+ if (atomic_dec_and_test(&rdev->nr_pending) && faulty) {
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
+}
+
#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 399272f..ac3ede2 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -153,15 +153,11 @@ static void multipath_status (struct seq_file *seq, struct mddev *mddev)
seq_printf (seq, "]");
}
-static int multipath_congested(void *data, int bits)
+static int multipath_congested(struct mddev *mddev, int bits)
{
- struct mddev *mddev = data;
struct mpconf *conf = mddev->private;
int i, ret = 0;
- if (mddev_congested(mddev, bits))
- return 1;
-
rcu_read_lock();
for (i = 0; i < mddev->raid_disks ; i++) {
struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
@@ -403,7 +399,7 @@ static int multipath_run (struct mddev *mddev)
/*
* copy the already verified devices into our private MULTIPATH
* bookkeeping area. [whatever we allocate in multipath_run(),
- * should be freed in multipath_stop()]
+ * should be freed in multipath_free()]
*/
conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
@@ -489,9 +485,6 @@ static int multipath_run (struct mddev *mddev)
*/
md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
- mddev->queue->backing_dev_info.congested_fn = multipath_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
-
if (md_integrity_register(mddev))
goto out_free_conf;
@@ -507,17 +500,13 @@ out:
return -EIO;
}
-static int multipath_stop (struct mddev *mddev)
+static void multipath_free(struct mddev *mddev, void *priv)
{
- struct mpconf *conf = mddev->private;
+ struct mpconf *conf = priv;
- md_unregister_thread(&mddev->thread);
- blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
mempool_destroy(conf->pool);
kfree(conf->multipaths);
kfree(conf);
- mddev->private = NULL;
- return 0;
}
static struct md_personality multipath_personality =
@@ -527,12 +516,13 @@ static struct md_personality multipath_personality =
.owner = THIS_MODULE,
.make_request = multipath_make_request,
.run = multipath_run,
- .stop = multipath_stop,
+ .free = multipath_free,
.status = multipath_status,
.error_handler = multipath_error,
.hot_add_disk = multipath_add_disk,
.hot_remove_disk= multipath_remove_disk,
.size = multipath_size,
+ .congested = multipath_congested,
};
static int __init multipath_init (void)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ba6b85d..a13f738 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -25,17 +25,13 @@
#include "raid0.h"
#include "raid5.h"
-static int raid0_congested(void *data, int bits)
+static int raid0_congested(struct mddev *mddev, int bits)
{
- struct mddev *mddev = data;
struct r0conf *conf = mddev->private;
struct md_rdev **devlist = conf->devlist;
int raid_disks = conf->strip_zone[0].nb_dev;
int i, ret = 0;
- if (mddev_congested(mddev, bits))
- return 1;
-
for (i = 0; i < raid_disks && !ret ; i++) {
struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
@@ -263,8 +259,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
mdname(mddev),
(unsigned long long)smallest->sectors);
}
- mddev->queue->backing_dev_info.congested_fn = raid0_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
/*
* now since we have the hard sector sizes, we can make sure
@@ -356,17 +350,16 @@ static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
/**
* raid0_mergeable_bvec -- tell bio layer if two requests can be merged
- * @q: request queue
+ * @mddev: the md device
* @bvm: properties of new bio
* @biovec: the request that could be merged to it.
*
* Return amount of bytes we can accept at this offset
*/
-static int raid0_mergeable_bvec(struct request_queue *q,
+static int raid0_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
struct r0conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
sector_t sector_offset = sector;
@@ -422,7 +415,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
return array_sectors;
}
-static int raid0_stop(struct mddev *mddev);
+static void raid0_free(struct mddev *mddev, void *priv);
static int raid0_run(struct mddev *mddev)
{
@@ -471,26 +464,22 @@ static int raid0_run(struct mddev *mddev)
mddev->queue->backing_dev_info.ra_pages = 2* stripe;
}
- blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
dump_zones(mddev);
ret = md_integrity_register(mddev);
if (ret)
- raid0_stop(mddev);
+ raid0_free(mddev, conf);
return ret;
}
-static int raid0_stop(struct mddev *mddev)
+static void raid0_free(struct mddev *mddev, void *priv)
{
- struct r0conf *conf = mddev->private;
+ struct r0conf *conf = priv;
- blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
kfree(conf->strip_zone);
kfree(conf->devlist);
kfree(conf);
- mddev->private = NULL;
- return 0;
}
/*
@@ -724,11 +713,13 @@ static struct md_personality raid0_personality=
.owner = THIS_MODULE,
.make_request = raid0_make_request,
.run = raid0_run,
- .stop = raid0_stop,
+ .free = raid0_free,
.status = raid0_status,
.size = raid0_size,
.takeover = raid0_takeover,
.quiesce = raid0_quiesce,
+ .congested = raid0_congested,
+ .mergeable_bvec = raid0_mergeable_bvec,
};
static int __init raid0_init (void)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 40b35be..5dd0c2e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -701,11 +701,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk;
}
-static int raid1_mergeable_bvec(struct request_queue *q,
+static int raid1_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
struct r1conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max = biovec->bv_len;
@@ -734,7 +733,7 @@ static int raid1_mergeable_bvec(struct request_queue *q,
}
-int md_raid1_congested(struct mddev *mddev, int bits)
+static int raid1_congested(struct mddev *mddev, int bits)
{
struct r1conf *conf = mddev->private;
int i, ret = 0;
@@ -763,15 +762,6 @@ int md_raid1_congested(struct mddev *mddev, int bits)
rcu_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(md_raid1_congested);
-
-static int raid1_congested(void *data, int bits)
-{
- struct mddev *mddev = data;
-
- return mddev_congested(mddev, bits) ||
- md_raid1_congested(mddev, bits);
-}
static void flush_pending_writes(struct r1conf *conf)
{
@@ -2882,7 +2872,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
return ERR_PTR(err);
}
-static int stop(struct mddev *mddev);
+static void raid1_free(struct mddev *mddev, void *priv);
static int run(struct mddev *mddev)
{
struct r1conf *conf;
@@ -2904,7 +2894,7 @@ static int run(struct mddev *mddev)
/*
* copy the already verified devices into our private RAID1
* bookkeeping area. [whatever we allocate in run(),
- * should be freed in stop()]
+ * should be freed in raid1_free()]
*/
if (mddev->private == NULL)
conf = setup_conf(mddev);
@@ -2955,10 +2945,6 @@ static int run(struct mddev *mddev)
md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
if (mddev->queue) {
- mddev->queue->backing_dev_info.congested_fn = raid1_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
- blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
-
if (discard_supported)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
mddev->queue);
@@ -2968,37 +2954,23 @@ static int run(struct mddev *mddev)
}
ret = md_integrity_register(mddev);
- if (ret)
- stop(mddev);
+ if (ret) {
+ md_unregister_thread(&mddev->thread);
+ raid1_free(mddev, conf);
+ }
return ret;
}
-static int stop(struct mddev *mddev)
+static void raid1_free(struct mddev *mddev, void *priv)
{
- struct r1conf *conf = mddev->private;
- struct bitmap *bitmap = mddev->bitmap;
+ struct r1conf *conf = priv;
- /* wait for behind writes to complete */
- if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
- printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n",
- mdname(mddev));
- /* need to kick something here to make sure I/O goes? */
- wait_event(bitmap->behind_wait,
- atomic_read(&bitmap->behind_writes) == 0);
- }
-
- freeze_array(conf, 0);
- unfreeze_array(conf);
-
- md_unregister_thread(&mddev->thread);
if (conf->r1bio_pool)
mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
kfree(conf->poolinfo);
kfree(conf);
- mddev->private = NULL;
- return 0;
}
static int raid1_resize(struct mddev *mddev, sector_t sectors)
@@ -3181,7 +3153,7 @@ static struct md_personality raid1_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid1_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid1_add_disk,
@@ -3193,6 +3165,8 @@ static struct md_personality raid1_personality =
.check_reshape = raid1_reshape,
.quiesce = raid1_quiesce,
.takeover = raid1_takeover,
+ .congested = raid1_congested,
+ .mergeable_bvec = raid1_mergeable_bvec,
};
static int __init raid_init(void)
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 33bda55..14ebb28 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -170,7 +170,4 @@ struct r1bio {
*/
#define R1BIO_MadeGood 7
#define R1BIO_WriteError 8
-
-extern int md_raid1_congested(struct mddev *mddev, int bits);
-
#endif
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 32e282f..b8d76b1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -674,7 +674,7 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
/**
* raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
- * @q: request queue
+ * @mddev: the md device
* @bvm: properties of new bio
* @biovec: the request that could be merged to it.
*
@@ -682,11 +682,10 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
* This requires checking for end-of-chunk if near_copies != raid_disks,
* and for subordinate merge_bvec_fns if merge_check_needed.
*/
-static int raid10_mergeable_bvec(struct request_queue *q,
+static int raid10_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
struct r10conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
@@ -910,7 +909,7 @@ retry:
return rdev;
}
-int md_raid10_congested(struct mddev *mddev, int bits)
+static int raid10_congested(struct mddev *mddev, int bits)
{
struct r10conf *conf = mddev->private;
int i, ret = 0;
@@ -934,15 +933,6 @@ int md_raid10_congested(struct mddev *mddev, int bits)
rcu_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(md_raid10_congested);
-
-static int raid10_congested(void *data, int bits)
-{
- struct mddev *mddev = data;
-
- return mddev_congested(mddev, bits) ||
- md_raid10_congested(mddev, bits);
-}
static void flush_pending_writes(struct r10conf *conf)
{
@@ -3757,8 +3747,6 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
int stripe = conf->geo.raid_disks *
((mddev->chunk_sectors << 9) / PAGE_SIZE);
- mddev->queue->backing_dev_info.congested_fn = raid10_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
/* Calculate max read-ahead size.
* We need to readahead at least twice a whole stripe....
@@ -3767,7 +3755,6 @@ static int run(struct mddev *mddev)
stripe /= conf->geo.near_copies;
if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
- blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
}
if (md_integrity_register(mddev))
@@ -3811,17 +3798,9 @@ out:
return -EIO;
}
-static int stop(struct mddev *mddev)
+static void raid10_free(struct mddev *mddev, void *priv)
{
- struct r10conf *conf = mddev->private;
-
- raise_barrier(conf, 0);
- lower_barrier(conf);
-
- md_unregister_thread(&mddev->thread);
- if (mddev->queue)
- /* the unplug fn references 'conf'*/
- blk_sync_queue(mddev->queue);
+ struct r10conf *conf = priv;
if (conf->r10bio_pool)
mempool_destroy(conf->r10bio_pool);
@@ -3830,8 +3809,6 @@ static int stop(struct mddev *mddev)
kfree(conf->mirrors_old);
kfree(conf->mirrors_new);
kfree(conf);
- mddev->private = NULL;
- return 0;
}
static void raid10_quiesce(struct mddev *mddev, int state)
@@ -3895,7 +3872,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
return 0;
}
-static void *raid10_takeover_raid0(struct mddev *mddev)
+static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
{
struct md_rdev *rdev;
struct r10conf *conf;
@@ -3905,6 +3882,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
mdname(mddev));
return ERR_PTR(-EINVAL);
}
+ sector_div(size, devs);
/* Set new parameters */
mddev->new_level = 10;
@@ -3915,12 +3893,15 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
mddev->raid_disks *= 2;
/* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector;
+ mddev->dev_sectors = size;
conf = setup_conf(mddev);
if (!IS_ERR(conf)) {
rdev_for_each(rdev, mddev)
- if (rdev->raid_disk >= 0)
+ if (rdev->raid_disk >= 0) {
rdev->new_raid_disk = rdev->raid_disk * 2;
+ rdev->sectors = size;
+ }
conf->barrier = 1;
}
@@ -3943,7 +3924,9 @@ static void *raid10_takeover(struct mddev *mddev)
mdname(mddev));
return ERR_PTR(-EINVAL);
}
- return raid10_takeover_raid0(mddev);
+ return raid10_takeover_raid0(mddev,
+ raid0_conf->strip_zone->zone_end,
+ raid0_conf->strip_zone->nb_dev);
}
return ERR_PTR(-EINVAL);
}
@@ -4713,7 +4696,7 @@ static struct md_personality raid10_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid10_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid10_add_disk,
@@ -4727,6 +4710,8 @@ static struct md_personality raid10_personality =
.check_reshape = raid10_check_reshape,
.start_reshape = raid10_start_reshape,
.finish_reshape = raid10_finish_reshape,
+ .congested = raid10_congested,
+ .mergeable_bvec = raid10_mergeable_bvec,
};
static int __init raid_init(void)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 157d69e..5ee6473 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -150,7 +150,4 @@ enum r10bio_state {
*/
R10BIO_Previous,
};
-
-extern int md_raid10_congested(struct mddev *mddev, int bits);
-
#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b98765f..aa76865 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -296,12 +296,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(atomic_read(&conf->active_stripes)==0);
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state) &&
- !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
list_add_tail(&sh->lru, &conf->delayed_list);
- if (atomic_read(&conf->preread_active_stripes)
- < IO_THRESHOLD)
- md_wakeup_thread(conf->mddev->thread);
- } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
sh->bm_seq - conf->seq_write > 0)
list_add_tail(&sh->lru, &conf->bitmap_list);
else {
@@ -2898,31 +2895,102 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
* Returns 1 when no more member devices need to be checked, otherwise returns
* 0 to tell the loop in handle_stripe_fill to continue
*/
-static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
- int disk_idx, int disks)
+
+static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
+ int disk_idx, int disks)
{
struct r5dev *dev = &sh->dev[disk_idx];
struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
&sh->dev[s->failed_num[1]] };
+ int i;
+
+
+ if (test_bit(R5_LOCKED, &dev->flags) ||
+ test_bit(R5_UPTODATE, &dev->flags))
+ /* No point reading this as we already have it or have
+ * decided to get it.
+ */
+ return 0;
+
+ if (dev->toread ||
+ (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
+ /* We need this block to directly satisfy a request */
+ return 1;
+
+ if (s->syncing || s->expanding ||
+ (s->replacing && want_replace(sh, disk_idx)))
+ /* When syncing, or expanding we read everything.
+ * When replacing, we need the replaced block.
+ */
+ return 1;
+
+ if ((s->failed >= 1 && fdev[0]->toread) ||
+ (s->failed >= 2 && fdev[1]->toread))
+ /* If we want to read from a failed device, then
+ * we need to actually read every other device.
+ */
+ return 1;
+
+ /* Sometimes neither read-modify-write nor reconstruct-write
+ * cycles can work. In those cases we read every block we
+ * can. Then the parity-update is certain to have enough to
+ * work with.
+ * This can only be a problem when we need to write something,
+ * and some device has failed. If either of those tests
+ * fails we need look no further.
+ */
+ if (!s->failed || !s->to_write)
+ return 0;
+
+ if (test_bit(R5_Insync, &dev->flags) &&
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ /* Pre-reads are not permitted until after a short delay
+ * to gather multiple requests. However if this
+ * device is not Insync, the block could only be computed
+ * and there is no need to delay that.
+ */
+ return 0;
+
+ for (i = 0; i < s->failed; i++) {
+ if (fdev[i]->towrite &&
+ !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+ !test_bit(R5_OVERWRITE, &fdev[i]->flags))
+ /* If we have a partial write to a failed
+ * device, then we will need to reconstruct
+ * the content of that device, so all other
+ * devices must be read.
+ */
+ return 1;
+ }
+
+ /* If we are forced to do a reconstruct-write, either because
+ * the current RAID6 implementation only supports that, or
+ * because parity cannot be trusted and we are currently
+ * recovering it, there is extra need to be careful.
+ * If one of the devices that we would need to read, because
+ * it is not being overwritten (and maybe not written at all)
+ * is missing/faulty, then we need to read everything we can.
+ */
+ if (sh->raid_conf->level != 6 &&
+ sh->sector < sh->raid_conf->mddev->recovery_cp)
+ /* reconstruct-write isn't being forced */
+ return 0;
+ for (i = 0; i < s->failed; i++) {
+ if (!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+ !test_bit(R5_OVERWRITE, &fdev[i]->flags))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
+ int disk_idx, int disks)
+{
+ struct r5dev *dev = &sh->dev[disk_idx];
/* is the data in this block needed, and can we get it? */
- if (!test_bit(R5_LOCKED, &dev->flags) &&
- !test_bit(R5_UPTODATE, &dev->flags) &&
- (dev->toread ||
- (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
- s->syncing || s->expanding ||
- (s->replacing && want_replace(sh, disk_idx)) ||
- (s->failed >= 1 && fdev[0]->toread) ||
- (s->failed >= 2 && fdev[1]->toread) ||
- (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
- (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
- !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
- ((sh->raid_conf->level == 6 ||
- sh->sector >= sh->raid_conf->mddev->recovery_cp)
- && s->failed && s->to_write &&
- (s->to_write - s->non_overwrite <
- sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) &&
- (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
+ if (need_this_block(sh, s, disk_idx, disks)) {
/* we would like to get this block, possibly by computing it,
* otherwise read it if the backing disk is insync
*/
@@ -4081,7 +4149,7 @@ static void activate_bit_delay(struct r5conf *conf,
}
}
-int md_raid5_congested(struct mddev *mddev, int bits)
+static int raid5_congested(struct mddev *mddev, int bits)
{
struct r5conf *conf = mddev->private;
@@ -4098,24 +4166,14 @@ int md_raid5_congested(struct mddev *mddev, int bits)
return 0;
}
-EXPORT_SYMBOL_GPL(md_raid5_congested);
-
-static int raid5_congested(void *data, int bits)
-{
- struct mddev *mddev = data;
-
- return mddev_congested(mddev, bits) ||
- md_raid5_congested(mddev, bits);
-}
/* We want read requests to align with chunks where possible,
* but write requests don't need to.
*/
-static int raid5_mergeable_bvec(struct request_queue *q,
+static int raid5_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
unsigned int chunk_sectors = mddev->chunk_sectors;
@@ -5296,11 +5354,14 @@ static void raid5d(struct md_thread *thread)
static ssize_t
raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->max_nr_stripes);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+ spin_unlock(&mddev->lock);
+ return ret;
}
int
@@ -5339,21 +5400,25 @@ EXPORT_SYMBOL(raid5_set_cache_size);
static ssize_t
raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
int err;
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
- err = raid5_set_cache_size(mddev, new);
+ err = mddev_lock(mddev);
if (err)
return err;
- return len;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else
+ err = raid5_set_cache_size(mddev, new);
+ mddev_unlock(mddev);
+
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -5364,29 +5429,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
static ssize_t
raid5_show_preread_threshold(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->bypass_threshold);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->bypass_threshold);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static ssize_t
raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
+ int err;
+
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
- if (new > conf->max_nr_stripes)
- return -EINVAL;
- conf->bypass_threshold = new;
- return len;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new > conf->max_nr_stripes)
+ err = -EINVAL;
+ else
+ conf->bypass_threshold = new;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -5398,39 +5474,48 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
static ssize_t
raid5_show_skip_copy(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->skip_copy);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->skip_copy);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static ssize_t
raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
+ int err;
+
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
new = !!new;
- if (new == conf->skip_copy)
- return len;
- mddev_suspend(mddev);
- conf->skip_copy = new;
- if (new)
- mddev->queue->backing_dev_info.capabilities |=
- BDI_CAP_STABLE_WRITES;
- else
- mddev->queue->backing_dev_info.capabilities &=
- ~BDI_CAP_STABLE_WRITES;
- mddev_resume(mddev);
- return len;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new != conf->skip_copy) {
+ mddev_suspend(mddev);
+ conf->skip_copy = new;
+ if (new)
+ mddev->queue->backing_dev_info.capabilities |=
+ BDI_CAP_STABLE_WRITES;
+ else
+ mddev->queue->backing_dev_info.capabilities &=
+ ~BDI_CAP_STABLE_WRITES;
+ mddev_resume(mddev);
+ }
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -5454,11 +5539,14 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
static ssize_t
raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->worker_cnt_per_group);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static int alloc_thread_groups(struct r5conf *conf, int cnt,
@@ -5468,7 +5556,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
static ssize_t
raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
int err;
struct r5worker_group *new_groups, *old_groups;
@@ -5476,41 +5564,41 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
- if (new == conf->worker_cnt_per_group)
- return len;
-
- mddev_suspend(mddev);
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new != conf->worker_cnt_per_group) {
+ mddev_suspend(mddev);
- old_groups = conf->worker_groups;
- if (old_groups)
- flush_workqueue(raid5_wq);
+ old_groups = conf->worker_groups;
+ if (old_groups)
+ flush_workqueue(raid5_wq);
- err = alloc_thread_groups(conf, new,
- &group_cnt, &worker_cnt_per_group,
- &new_groups);
- if (!err) {
- spin_lock_irq(&conf->device_lock);
- conf->group_cnt = group_cnt;
- conf->worker_cnt_per_group = worker_cnt_per_group;
- conf->worker_groups = new_groups;
- spin_unlock_irq(&conf->device_lock);
+ err = alloc_thread_groups(conf, new,
+ &group_cnt, &worker_cnt_per_group,
+ &new_groups);
+ if (!err) {
+ spin_lock_irq(&conf->device_lock);
+ conf->group_cnt = group_cnt;
+ conf->worker_cnt_per_group = worker_cnt_per_group;
+ conf->worker_groups = new_groups;
+ spin_unlock_irq(&conf->device_lock);
- if (old_groups)
- kfree(old_groups[0].workers);
- kfree(old_groups);
+ if (old_groups)
+ kfree(old_groups[0].workers);
+ kfree(old_groups);
+ }
+ mddev_resume(mddev);
}
+ mddev_unlock(mddev);
- mddev_resume(mddev);
-
- if (err)
- return err;
- return len;
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -6178,11 +6266,6 @@ static int run(struct mddev *mddev)
if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
- blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
-
- mddev->queue->backing_dev_info.congested_data = mddev;
- mddev->queue->backing_dev_info.congested_fn = raid5_congested;
-
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
blk_queue_io_opt(mddev->queue, chunk_size *
@@ -6260,17 +6343,12 @@ abort:
return -EIO;
}
-static int stop(struct mddev *mddev)
+static void raid5_free(struct mddev *mddev, void *priv)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf = priv;
- md_unregister_thread(&mddev->thread);
- if (mddev->queue)
- mddev->queue->backing_dev_info.congested_fn = NULL;
free_conf(conf);
- mddev->private = NULL;
mddev->to_remove = &raid5_attrs_group;
- return 0;
}
static void status(struct seq_file *seq, struct mddev *mddev)
@@ -7044,7 +7122,7 @@ static struct md_personality raid6_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid5_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid5_add_disk,
@@ -7058,6 +7136,8 @@ static struct md_personality raid6_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
};
static struct md_personality raid5_personality =
{
@@ -7066,7 +7146,7 @@ static struct md_personality raid5_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid5_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid5_add_disk,
@@ -7080,6 +7160,8 @@ static struct md_personality raid5_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
};
static struct md_personality raid4_personality =
@@ -7089,7 +7171,7 @@ static struct md_personality raid4_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid5_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid5_add_disk,
@@ -7103,6 +7185,8 @@ static struct md_personality raid4_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
};
static int __init raid5_init(void)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index d59f5ca..983e18a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -558,7 +558,6 @@ static inline int algorithm_is_DDF(int layout)
return layout >= 8 && layout <= 10;
}
-extern int md_raid5_congested(struct mddev *mddev, int bits);
extern void md_raid5_kick_device(struct r5conf *conf);
extern int raid5_set_cache_size(struct mddev *mddev, int size);
#endif
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 5356395..55fa27e 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -49,7 +49,6 @@ static DEFINE_MUTEX(mtd_mutex);
*/
struct mtd_file_info {
struct mtd_info *mtd;
- struct inode *ino;
enum mtd_file_modes mode;
};
@@ -59,10 +58,6 @@ static loff_t mtdchar_lseek(struct file *file, loff_t offset, int orig)
return fixed_size_llseek(file, offset, orig, mfi->mtd->size);
}
-static int count;
-static struct vfsmount *mnt;
-static struct file_system_type mtd_inodefs_type;
-
static int mtdchar_open(struct inode *inode, struct file *file)
{
int minor = iminor(inode);
@@ -70,7 +65,6 @@ static int mtdchar_open(struct inode *inode, struct file *file)
int ret = 0;
struct mtd_info *mtd;
struct mtd_file_info *mfi;
- struct inode *mtd_ino;
pr_debug("MTD_open\n");
@@ -78,10 +72,6 @@ static int mtdchar_open(struct inode *inode, struct file *file)
if ((file->f_mode & FMODE_WRITE) && (minor & 1))
return -EACCES;
- ret = simple_pin_fs(&mtd_inodefs_type, &mnt, &count);
- if (ret)
- return ret;
-
mutex_lock(&mtd_mutex);
mtd = get_mtd_device(NULL, devnum);
@@ -95,43 +85,26 @@ static int mtdchar_open(struct inode *inode, struct file *file)
goto out1;
}
- mtd_ino = iget_locked(mnt->mnt_sb, devnum);
- if (!mtd_ino) {
- ret = -ENOMEM;
- goto out1;
- }
- if (mtd_ino->i_state & I_NEW) {
- mtd_ino->i_private = mtd;
- mtd_ino->i_mode = S_IFCHR;
- mtd_ino->i_data.backing_dev_info = mtd->backing_dev_info;
- unlock_new_inode(mtd_ino);
- }
- file->f_mapping = mtd_ino->i_mapping;
-
/* You can't open it RW if it's not a writeable device */
if ((file->f_mode & FMODE_WRITE) && !(mtd->flags & MTD_WRITEABLE)) {
ret = -EACCES;
- goto out2;
+ goto out1;
}
mfi = kzalloc(sizeof(*mfi), GFP_KERNEL);
if (!mfi) {
ret = -ENOMEM;
- goto out2;
+ goto out1;
}
- mfi->ino = mtd_ino;
mfi->mtd = mtd;
file->private_data = mfi;
mutex_unlock(&mtd_mutex);
return 0;
-out2:
- iput(mtd_ino);
out1:
put_mtd_device(mtd);
out:
mutex_unlock(&mtd_mutex);
- simple_release_fs(&mnt, &count);
return ret;
} /* mtdchar_open */
@@ -148,12 +121,9 @@ static int mtdchar_close(struct inode *inode, struct file *file)
if ((file->f_mode & FMODE_WRITE))
mtd_sync(mtd);
- iput(mfi->ino);
-
put_mtd_device(mtd);
file->private_data = NULL;
kfree(mfi);
- simple_release_fs(&mnt, &count);
return 0;
} /* mtdchar_close */
@@ -1117,6 +1087,13 @@ static unsigned long mtdchar_get_unmapped_area(struct file *file,
ret = mtd_get_unmapped_area(mtd, len, offset, flags);
return ret == -EOPNOTSUPP ? -ENODEV : ret;
}
+
+static unsigned mtdchar_mmap_capabilities(struct file *file)
+{
+ struct mtd_file_info *mfi = file->private_data;
+
+ return mtd_mmap_capabilities(mfi->mtd);
+}
#endif
/*
@@ -1160,27 +1137,10 @@ static const struct file_operations mtd_fops = {
.mmap = mtdchar_mmap,
#ifndef CONFIG_MMU
.get_unmapped_area = mtdchar_get_unmapped_area,
+ .mmap_capabilities = mtdchar_mmap_capabilities,
#endif
};
-static const struct super_operations mtd_ops = {
- .drop_inode = generic_delete_inode,
- .statfs = simple_statfs,
-};
-
-static struct dentry *mtd_inodefs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_pseudo(fs_type, "mtd_inode:", &mtd_ops, NULL, MTD_INODE_FS_MAGIC);
-}
-
-static struct file_system_type mtd_inodefs_type = {
- .name = "mtd_inodefs",
- .mount = mtd_inodefs_mount,
- .kill_sb = kill_anon_super,
-};
-MODULE_ALIAS_FS("mtd_inodefs");
-
int __init init_mtdchar(void)
{
int ret;
@@ -1193,23 +1153,11 @@ int __init init_mtdchar(void)
return ret;
}
- ret = register_filesystem(&mtd_inodefs_type);
- if (ret) {
- pr_err("Can't register mtd_inodefs filesystem, error %d\n",
- ret);
- goto err_unregister_chdev;
- }
-
- return ret;
-
-err_unregister_chdev:
- __unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd");
return ret;
}
void __exit cleanup_mtdchar(void)
{
- unregister_filesystem(&mtd_inodefs_type);
__unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd");
}
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index b900056..eacc3aa 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -732,8 +732,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c
concat->mtd.ecc_stats.badblocks = subdev[0]->ecc_stats.badblocks;
- concat->mtd.backing_dev_info = subdev[0]->backing_dev_info;
-
concat->subdev[0] = subdev[0];
for (i = 1; i < num_devs; i++) {
@@ -761,14 +759,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c
subdev[i]->flags & MTD_WRITEABLE;
}
- /* only permit direct mapping if the BDIs are all the same
- * - copy-mapping is still permitted
- */
- if (concat->mtd.backing_dev_info !=
- subdev[i]->backing_dev_info)
- concat->mtd.backing_dev_info =
- &default_backing_dev_info;
-
concat->mtd.size += subdev[i]->size;
concat->mtd.ecc_stats.badblocks +=
subdev[i]->ecc_stats.badblocks;
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 4c61187..0ec4d6e 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -43,33 +43,7 @@
#include "mtdcore.h"
-/*
- * backing device capabilities for non-mappable devices (such as NAND flash)
- * - permits private mappings, copies are taken of the data
- */
-static struct backing_dev_info mtd_bdi_unmappable = {
- .capabilities = BDI_CAP_MAP_COPY,
-};
-
-/*
- * backing device capabilities for R/O mappable devices (such as ROM)
- * - permits private mappings, copies are taken of the data
- * - permits non-writable shared mappings
- */
-static struct backing_dev_info mtd_bdi_ro_mappable = {
- .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT |
- BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP),
-};
-
-/*
- * backing device capabilities for writable mappable devices (such as RAM)
- * - permits private mappings, copies are taken of the data
- * - permits non-writable shared mappings
- */
-static struct backing_dev_info mtd_bdi_rw_mappable = {
- .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT |
- BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP |
- BDI_CAP_WRITE_MAP),
+static struct backing_dev_info mtd_bdi = {
};
static int mtd_cls_suspend(struct device *dev, pm_message_t state);
@@ -365,6 +339,23 @@ static struct device_type mtd_devtype = {
.release = mtd_release,
};
+#ifndef CONFIG_MMU
+unsigned mtd_mmap_capabilities(struct mtd_info *mtd)
+{
+ switch (mtd->type) {
+ case MTD_RAM:
+ return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC |
+ NOMMU_MAP_READ | NOMMU_MAP_WRITE;
+ case MTD_ROM:
+ return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC |
+ NOMMU_MAP_READ;
+ default:
+ return NOMMU_MAP_COPY;
+ }
+}
+EXPORT_SYMBOL_GPL(mtd_mmap_capabilities);
+#endif
+
/**
* add_mtd_device - register an MTD device
* @mtd: pointer to new MTD device info structure
@@ -380,19 +371,7 @@ int add_mtd_device(struct mtd_info *mtd)
struct mtd_notifier *not;
int i, error;
- if (!mtd->backing_dev_info) {
- switch (mtd->type) {
- case MTD_RAM:
- mtd->backing_dev_info = &mtd_bdi_rw_mappable;
- break;
- case MTD_ROM:
- mtd->backing_dev_info = &mtd_bdi_ro_mappable;
- break;
- default:
- mtd->backing_dev_info = &mtd_bdi_unmappable;
- break;
- }
- }
+ mtd->backing_dev_info = &mtd_bdi;
BUG_ON(mtd->writesize == 0);
mutex_lock(&mtd_table_mutex);
@@ -1237,17 +1216,9 @@ static int __init init_mtd(void)
if (ret)
goto err_reg;
- ret = mtd_bdi_init(&mtd_bdi_unmappable, "mtd-unmap");
- if (ret)
- goto err_bdi1;
-
- ret = mtd_bdi_init(&mtd_bdi_ro_mappable, "mtd-romap");
- if (ret)
- goto err_bdi2;
-
- ret = mtd_bdi_init(&mtd_bdi_rw_mappable, "mtd-rwmap");
+ ret = mtd_bdi_init(&mtd_bdi, "mtd");
if (ret)
- goto err_bdi3;
+ goto err_bdi;
proc_mtd = proc_create("mtd", 0, NULL, &mtd_proc_ops);
@@ -1260,11 +1231,7 @@ static int __init init_mtd(void)
out_procfs:
if (proc_mtd)
remove_proc_entry("mtd", NULL);
-err_bdi3:
- bdi_destroy(&mtd_bdi_ro_mappable);
-err_bdi2:
- bdi_destroy(&mtd_bdi_unmappable);
-err_bdi1:
+err_bdi:
class_unregister(&mtd_class);
err_reg:
pr_err("Error registering mtd class or bdi: %d\n", ret);
@@ -1277,9 +1244,7 @@ static void __exit cleanup_mtd(void)
if (proc_mtd)
remove_proc_entry("mtd", NULL);
class_unregister(&mtd_class);
- bdi_destroy(&mtd_bdi_unmappable);
- bdi_destroy(&mtd_bdi_ro_mappable);
- bdi_destroy(&mtd_bdi_rw_mappable);
+ bdi_destroy(&mtd_bdi);
}
module_init(init_mtd);
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index a3e3a7d..e779de3 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -378,7 +378,6 @@ static struct mtd_part *allocate_partition(struct mtd_info *master,
slave->mtd.name = name;
slave->mtd.owner = master->owner;
- slave->mtd.backing_dev_info = master->backing_dev_info;
/* NOTE: we don't arrange MTDs as a tree; it'd be error-prone
* to have the same data be in two different partitions.
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 7f90022..96128cb 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -28,8 +28,8 @@
static int dcssblk_open(struct block_device *bdev, fmode_t mode);
static void dcssblk_release(struct gendisk *disk, fmode_t mode);
static void dcssblk_make_request(struct request_queue *q, struct bio *bio);
-static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
- void **kaddr, unsigned long *pfn);
+static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
+ void **kaddr, unsigned long *pfn, long size);
static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
@@ -877,25 +877,22 @@ fail:
bio_io_error(bio);
}
-static int
+static long
dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
- void **kaddr, unsigned long *pfn)
+ void **kaddr, unsigned long *pfn, long size)
{
struct dcssblk_dev_info *dev_info;
- unsigned long pgoff;
+ unsigned long offset, dev_sz;
dev_info = bdev->bd_disk->private_data;
if (!dev_info)
return -ENODEV;
- if (secnum % (PAGE_SIZE/512))
- return -EINVAL;
- pgoff = secnum / (PAGE_SIZE / 512);
- if ((pgoff+1)*PAGE_SIZE-1 > dev_info->end - dev_info->start)
- return -ERANGE;
- *kaddr = (void *) (dev_info->start+pgoff*PAGE_SIZE);
+ dev_sz = dev_info->end - dev_info->start;
+ offset = secnum * 512;
+ *kaddr = (void *) (dev_info->start + offset);
*pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT;
- return 0;
+ return dev_sz - offset;
}
static void
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 17bb541..54d7a6c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2197,6 +2197,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
shost->tag_set.cmd_size = cmd_size;
shost->tag_set.numa_node = NUMA_NO_NODE;
shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ shost->tag_set.flags |=
+ BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
shost->tag_set.driver_data = shost;
return blk_mq_alloc_tag_set(&shost->tag_set);
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 0deb385..9c0a520 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -277,7 +277,8 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
if (!shost_use_blk_mq(sdev->host) &&
(shost->bqt || shost->hostt->use_blk_tags)) {
blk_queue_init_tags(sdev->request_queue,
- sdev->host->cmd_per_lun, shost->bqt);
+ sdev->host->cmd_per_lun, shost->bqt,
+ shost->hostt->tag_alloc_policy);
}
scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun);
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index a668c88..0cbc1fb 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1719,22 +1719,19 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
}
if (iov_count) {
- int len, size = sizeof(struct sg_iovec) * iov_count;
+ int size = sizeof(struct iovec) * iov_count;
struct iovec *iov;
+ struct iov_iter i;
iov = memdup_user(hp->dxferp, size);
if (IS_ERR(iov))
return PTR_ERR(iov);
- len = iov_length(iov, iov_count);
- if (hp->dxfer_len < len) {
- iov_count = iov_shorten(iov, iov_count, hp->dxfer_len);
- len = hp->dxfer_len;
- }
+ iov_iter_init(&i, rw, iov, iov_count,
+ min_t(size_t, hp->dxfer_len,
+ iov_length(iov, iov_count)));
- res = blk_rq_map_user_iov(q, rq, md, (struct sg_iovec *)iov,
- iov_count,
- len, GFP_ATOMIC);
+ res = blk_rq_map_user_iov(q, rq, md, &i, GFP_ATOMIC);
kfree(iov);
} else
res = blk_rq_map_user(q, rq, md, hp->dxferp,
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
index a3367bf..45aaa1c 100644
--- a/drivers/staging/lustre/lustre/llite/llite_lib.c
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -987,7 +987,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
if (err)
goto out_free;
lsi->lsi_flags |= LSI_BDI_INITIALIZED;
- lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY;
+ lsi->lsi_bdi.capabilities = 0;
err = ll_bdi_register(&lsi->lsi_bdi);
if (err)
goto out_free;
@@ -1812,10 +1812,6 @@ void ll_read_inode2(struct inode *inode, void *opaque)
/* OIDEBUG(inode); */
- /* initializing backing dev info. */
- inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi;
-
-
if (S_ISREG(inode->i_mode)) {
struct ll_sb_info *sbi = ll_i2sbi(inode);
OpenPOWER on IntegriCloud