summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/dm-bio-prison.c25
-rw-r--r--drivers/md/dm-bio-prison.h1
-rw-r--r--drivers/md/dm-crypt.c5
-rw-r--r--drivers/md/dm-delay.c5
-rw-r--r--drivers/md/dm-flakey.c21
-rw-r--r--drivers/md/dm-io.c23
-rw-r--r--drivers/md/dm-ioctl.c64
-rw-r--r--drivers/md/dm-kcopyd.c18
-rw-r--r--drivers/md/dm-linear.c6
-rw-r--r--drivers/md/dm-raid.c107
-rw-r--r--drivers/md/dm-raid1.c75
-rw-r--r--drivers/md/dm-snap.c90
-rw-r--r--drivers/md/dm-stripe.c20
-rw-r--r--drivers/md/dm-table.c41
-rw-r--r--drivers/md/dm-target.c5
-rw-r--r--drivers/md/dm-thin-metadata.c2
-rw-r--r--drivers/md/dm-thin.c234
-rw-r--r--drivers/md/dm-verity.c25
-rw-r--r--drivers/md/dm-zero.c5
-rw-r--r--drivers/md/dm.c92
-rw-r--r--drivers/md/dm.h2
-rw-r--r--drivers/md/faulty.c5
-rw-r--r--drivers/md/md.c285
-rw-r--r--drivers/md/md.h28
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c12
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h16
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c50
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c20
-rw-r--r--drivers/md/persistent-data/dm-btree.c31
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.c16
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c2
-rw-r--r--drivers/md/raid1.c19
-rw-r--r--drivers/md/raid10.c163
-rw-r--r--drivers/md/raid5.c134
34 files changed, 855 insertions, 792 deletions
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index e4e8415..aefb78e 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -208,31 +208,6 @@ void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
EXPORT_SYMBOL_GPL(dm_cell_release);
/*
- * There are a couple of places where we put a bio into a cell briefly
- * before taking it out again. In these situations we know that no other
- * bio may be in the cell. This function releases the cell, and also does
- * a sanity check.
- */
-static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
-{
- BUG_ON(cell->holder != bio);
- BUG_ON(!bio_list_empty(&cell->bios));
-
- __cell_release(cell, NULL);
-}
-
-void dm_cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
-{
- unsigned long flags;
- struct dm_bio_prison *prison = cell->prison;
-
- spin_lock_irqsave(&prison->lock, flags);
- __cell_release_singleton(cell, bio);
- spin_unlock_irqrestore(&prison->lock, flags);
-}
-EXPORT_SYMBOL_GPL(dm_cell_release_singleton);
-
-/*
* Sometimes we don't want the holder, just the additional bios.
*/
static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 4e0ac376..53d1a7a 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -44,7 +44,6 @@ int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
struct bio *inmate, struct dm_bio_prison_cell **ref);
void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios);
-void dm_cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio); // FIXME: bio arg not needed
void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates);
void dm_cell_error(struct dm_bio_prison_cell *cell);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index bbf459b..f7369f9 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1689,8 +1689,7 @@ bad:
return ret;
}
-static int crypt_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static int crypt_map(struct dm_target *ti, struct bio *bio)
{
struct dm_crypt_io *io;
struct crypt_config *cc = ti->private;
@@ -1846,7 +1845,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 11, 0},
+ .version = {1, 12, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index f53846f..cc1bd04 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -274,8 +274,7 @@ static void delay_resume(struct dm_target *ti)
atomic_set(&dc->may_delay, 1);
}
-static int delay_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static int delay_map(struct dm_target *ti, struct bio *bio)
{
struct delay_c *dc = ti->private;
@@ -338,7 +337,7 @@ out:
static struct target_type delay_target = {
.name = "delay",
- .version = {1, 1, 0},
+ .version = {1, 2, 0},
.module = THIS_MODULE,
.ctr = delay_ctr,
.dtr = delay_dtr,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index cc15543..9721f2f 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -39,6 +39,10 @@ enum feature_flag_bits {
DROP_WRITES
};
+struct per_bio_data {
+ bool bio_submitted;
+};
+
static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
struct dm_target *ti)
{
@@ -214,6 +218,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_requests = 1;
ti->num_discard_requests = 1;
+ ti->per_bio_data_size = sizeof(struct per_bio_data);
ti->private = fc;
return 0;
@@ -265,11 +270,12 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
}
}
-static int flakey_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static int flakey_map(struct dm_target *ti, struct bio *bio)
{
struct flakey_c *fc = ti->private;
unsigned elapsed;
+ struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+ pb->bio_submitted = false;
/* Are we alive ? */
elapsed = (jiffies - fc->start_time) / HZ;
@@ -277,7 +283,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio,
/*
* Flag this bio as submitted while down.
*/
- map_context->ll = 1;
+ pb->bio_submitted = true;
/*
* Map reads as normal.
@@ -314,17 +320,16 @@ map_bio:
return DM_MAPIO_REMAPPED;
}
-static int flakey_end_io(struct dm_target *ti, struct bio *bio,
- int error, union map_info *map_context)
+static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error)
{
struct flakey_c *fc = ti->private;
- unsigned bio_submitted_while_down = map_context->ll;
+ struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
/*
* Corrupt successful READs while in down state.
* If flags were specified, only corrupt those that match.
*/
- if (fc->corrupt_bio_byte && !error && bio_submitted_while_down &&
+ if (fc->corrupt_bio_byte && !error && pb->bio_submitted &&
(bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
all_corrupt_bio_flags_match(bio, fc))
corrupt_bio_data(bio, fc);
@@ -406,7 +411,7 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
static struct target_type flakey_target = {
.name = "flakey",
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.module = THIS_MODULE,
.ctr = flakey_ctr,
.dtr = flakey_dtr,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 1c46f97..ea49834 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -287,7 +287,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
unsigned num_bvecs;
sector_t remaining = where->count;
struct request_queue *q = bdev_get_queue(where->bdev);
- sector_t discard_sectors;
+ unsigned short logical_block_size = queue_logical_block_size(q);
+ sector_t num_sectors;
/*
* where->count may be zero if rw holds a flush and we need to
@@ -297,7 +298,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
/*
* Allocate a suitably sized-bio.
*/
- if (rw & REQ_DISCARD)
+ if ((rw & REQ_DISCARD) || (rw & REQ_WRITE_SAME))
num_bvecs = 1;
else
num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev),
@@ -310,9 +311,21 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
store_io_and_region_in_bio(bio, io, region);
if (rw & REQ_DISCARD) {
- discard_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining);
- bio->bi_size = discard_sectors << SECTOR_SHIFT;
- remaining -= discard_sectors;
+ num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining);
+ bio->bi_size = num_sectors << SECTOR_SHIFT;
+ remaining -= num_sectors;
+ } else if (rw & REQ_WRITE_SAME) {
+ /*
+ * WRITE SAME only uses a single page.
+ */
+ dp->get_page(dp, &page, &len, &offset);
+ bio_add_page(bio, page, logical_block_size, offset);
+ num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining);
+ bio->bi_size = num_sectors << SECTOR_SHIFT;
+
+ offset = 0;
+ remaining -= num_sectors;
+ dp->next_page(dp);
} else while (remaining) {
/*
* Try and add as many pages as possible.
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index afd9598..0666b5d 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1543,7 +1543,21 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
return r;
}
-static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
+#define DM_PARAMS_VMALLOC 0x0001 /* Params alloced with vmalloc not kmalloc */
+#define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */
+
+static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags)
+{
+ if (param_flags & DM_WIPE_BUFFER)
+ memset(param, 0, param_size);
+
+ if (param_flags & DM_PARAMS_VMALLOC)
+ vfree(param);
+ else
+ kfree(param);
+}
+
+static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param, int *param_flags)
{
struct dm_ioctl tmp, *dmi;
int secure_data;
@@ -1556,7 +1570,21 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
secure_data = tmp.flags & DM_SECURE_DATA_FLAG;
- dmi = vmalloc(tmp.data_size);
+ *param_flags = secure_data ? DM_WIPE_BUFFER : 0;
+
+ /*
+ * Try to avoid low memory issues when a device is suspended.
+ * Use kmalloc() rather than vmalloc() when we can.
+ */
+ dmi = NULL;
+ if (tmp.data_size <= KMALLOC_MAX_SIZE)
+ dmi = kmalloc(tmp.data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+
+ if (!dmi) {
+ dmi = __vmalloc(tmp.data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL);
+ *param_flags |= DM_PARAMS_VMALLOC;
+ }
+
if (!dmi) {
if (secure_data && clear_user(user, tmp.data_size))
return -EFAULT;
@@ -1566,6 +1594,14 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
if (copy_from_user(dmi, user, tmp.data_size))
goto bad;
+ /*
+ * Abort if something changed the ioctl data while it was being copied.
+ */
+ if (dmi->data_size != tmp.data_size) {
+ DMERR("rejecting ioctl: data size modified while processing parameters");
+ goto bad;
+ }
+
/* Wipe the user buffer so we do not return it to userspace */
if (secure_data && clear_user(user, tmp.data_size))
goto bad;
@@ -1574,9 +1610,8 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
return 0;
bad:
- if (secure_data)
- memset(dmi, 0, tmp.data_size);
- vfree(dmi);
+ free_params(dmi, tmp.data_size, *param_flags);
+
return -EFAULT;
}
@@ -1613,7 +1648,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
{
int r = 0;
- int wipe_buffer;
+ int param_flags;
unsigned int cmd;
struct dm_ioctl *uninitialized_var(param);
ioctl_fn fn = NULL;
@@ -1649,24 +1684,14 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
}
/*
- * Trying to avoid low memory issues when a device is
- * suspended.
- */
- current->flags |= PF_MEMALLOC;
-
- /*
* Copy the parameters into kernel space.
*/
- r = copy_params(user, &param);
-
- current->flags &= ~PF_MEMALLOC;
+ r = copy_params(user, &param, &param_flags);
if (r)
return r;
input_param_size = param->data_size;
- wipe_buffer = param->flags & DM_SECURE_DATA_FLAG;
-
r = validate_params(cmd, param);
if (r)
goto out;
@@ -1681,10 +1706,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
r = -EFAULT;
out:
- if (wipe_buffer)
- memset(param, 0, input_param_size);
-
- vfree(param);
+ free_params(param, input_param_size, param_flags);
return r;
}
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index bed444c..68c0267 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -349,7 +349,7 @@ static void complete_io(unsigned long error, void *context)
struct dm_kcopyd_client *kc = job->kc;
if (error) {
- if (job->rw == WRITE)
+ if (job->rw & WRITE)
job->write_err |= error;
else
job->read_err = 1;
@@ -361,7 +361,7 @@ static void complete_io(unsigned long error, void *context)
}
}
- if (job->rw == WRITE)
+ if (job->rw & WRITE)
push(&kc->complete_jobs, job);
else {
@@ -432,7 +432,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
if (r < 0) {
/* error this rogue job */
- if (job->rw == WRITE)
+ if (job->rw & WRITE)
job->write_err = (unsigned long) -1L;
else
job->read_err = 1;
@@ -585,6 +585,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
{
struct kcopyd_job *job;
+ int i;
/*
* Allocate an array of jobs consisting of one master job
@@ -611,7 +612,16 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
memset(&job->source, 0, sizeof job->source);
job->source.count = job->dests[0].count;
job->pages = &zero_page_list;
- job->rw = WRITE;
+
+ /*
+ * Use WRITE SAME to optimize zeroing if all dests support it.
+ */
+ job->rw = WRITE | REQ_WRITE_SAME;
+ for (i = 0; i < job->num_dests; i++)
+ if (!bdev_write_same(job->dests[i].bdev)) {
+ job->rw = WRITE;
+ break;
+ }
}
job->fn = fn;
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 1bf19a9..328cad5 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -55,6 +55,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_requests = 1;
ti->num_discard_requests = 1;
+ ti->num_write_same_requests = 1;
ti->private = lc;
return 0;
@@ -87,8 +88,7 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
}
-static int linear_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static int linear_map(struct dm_target *ti, struct bio *bio)
{
linear_map_bio(ti, bio);
@@ -155,7 +155,7 @@ static int linear_iterate_devices(struct dm_target *ti,
static struct target_type linear_target = {
.name = "linear",
- .version = {1, 1, 0},
+ .version = {1, 2, 0},
.module = THIS_MODULE,
.ctr = linear_ctr,
.dtr = linear_dtr,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 45d94a7..9e58dbd 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -295,9 +295,11 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
* Choose a reasonable default. All figures in sectors.
*/
if (min_region_size > (1 << 13)) {
+ /* If not a power of 2, make it the next power of 2 */
+ if (min_region_size & (min_region_size - 1))
+ region_size = 1 << fls(region_size);
DMINFO("Choosing default region size of %lu sectors",
region_size);
- region_size = min_region_size;
} else {
DMINFO("Choosing default region size of 4MiB");
region_size = 1 << 13; /* sectors */
@@ -338,24 +340,22 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
}
/*
- * validate_rebuild_devices
+ * validate_raid_redundancy
* @rs
*
- * Determine if the devices specified for rebuild can result in a valid
- * usable array that is capable of rebuilding the given devices.
+ * Determine if there are enough devices in the array that haven't
+ * failed (or are being rebuilt) to form a usable array.
*
* Returns: 0 on success, -EINVAL on failure.
*/
-static int validate_rebuild_devices(struct raid_set *rs)
+static int validate_raid_redundancy(struct raid_set *rs)
{
unsigned i, rebuild_cnt = 0;
unsigned rebuilds_per_group, copies, d;
- if (!(rs->print_flags & DMPF_REBUILD))
- return 0;
-
for (i = 0; i < rs->md.raid_disks; i++)
- if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+ if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
+ !rs->dev[i].rdev.sb_page)
rebuild_cnt++;
switch (rs->raid_type->level) {
@@ -391,27 +391,24 @@ static int validate_rebuild_devices(struct raid_set *rs)
* A A B B C
* C D D E E
*/
- rebuilds_per_group = 0;
for (i = 0; i < rs->md.raid_disks * copies; i++) {
+ if (!(i % copies))
+ rebuilds_per_group = 0;
d = i % rs->md.raid_disks;
- if (!test_bit(In_sync, &rs->dev[d].rdev.flags) &&
+ if ((!rs->dev[d].rdev.sb_page ||
+ !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
(++rebuilds_per_group >= copies))
goto too_many;
- if (!((i + 1) % copies))
- rebuilds_per_group = 0;
}
break;
default:
- DMERR("The rebuild parameter is not supported for %s",
- rs->raid_type->name);
- rs->ti->error = "Rebuild not supported for this RAID type";
- return -EINVAL;
+ if (rebuild_cnt)
+ return -EINVAL;
}
return 0;
too_many:
- rs->ti->error = "Too many rebuild devices specified";
return -EINVAL;
}
@@ -662,9 +659,6 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
}
rs->md.dev_sectors = sectors_per_dev;
- if (validate_rebuild_devices(rs))
- return -EINVAL;
-
/* Assume there are no metadata devices until the drives are parsed */
rs->md.persistent = 0;
rs->md.external = 1;
@@ -993,28 +987,10 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{
int ret;
- unsigned redundancy = 0;
struct raid_dev *dev;
struct md_rdev *rdev, *tmp, *freshest;
struct mddev *mddev = &rs->md;
- switch (rs->raid_type->level) {
- case 1:
- redundancy = rs->md.raid_disks - 1;
- break;
- case 4:
- case 5:
- case 6:
- redundancy = rs->raid_type->parity_devs;
- break;
- case 10:
- redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
- break;
- default:
- ti->error = "Unknown RAID type";
- return -EINVAL;
- }
-
freshest = NULL;
rdev_for_each_safe(rdev, tmp, mddev) {
/*
@@ -1043,44 +1019,43 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
break;
default:
dev = container_of(rdev, struct raid_dev, rdev);
- if (redundancy--) {
- if (dev->meta_dev)
- dm_put_device(ti, dev->meta_dev);
-
- dev->meta_dev = NULL;
- rdev->meta_bdev = NULL;
+ if (dev->meta_dev)
+ dm_put_device(ti, dev->meta_dev);
- if (rdev->sb_page)
- put_page(rdev->sb_page);
+ dev->meta_dev = NULL;
+ rdev->meta_bdev = NULL;
- rdev->sb_page = NULL;
+ if (rdev->sb_page)
+ put_page(rdev->sb_page);
- rdev->sb_loaded = 0;
+ rdev->sb_page = NULL;
- /*
- * We might be able to salvage the data device
- * even though the meta device has failed. For
- * now, we behave as though '- -' had been
- * set for this device in the table.
- */
- if (dev->data_dev)
- dm_put_device(ti, dev->data_dev);
+ rdev->sb_loaded = 0;
- dev->data_dev = NULL;
- rdev->bdev = NULL;
+ /*
+ * We might be able to salvage the data device
+ * even though the meta device has failed. For
+ * now, we behave as though '- -' had been
+ * set for this device in the table.
+ */
+ if (dev->data_dev)
+ dm_put_device(ti, dev->data_dev);
- list_del(&rdev->same_set);
+ dev->data_dev = NULL;
+ rdev->bdev = NULL;
- continue;
- }
- ti->error = "Failed to load superblock";
- return ret;
+ list_del(&rdev->same_set);
}
}
if (!freshest)
return 0;
+ if (validate_raid_redundancy(rs)) {
+ rs->ti->error = "Insufficient redundancy to activate array";
+ return -EINVAL;
+ }
+
/*
* Validation of the freshest device provides the source of
* validation for the remaining devices.
@@ -1216,7 +1191,7 @@ static void raid_dtr(struct dm_target *ti)
context_free(rs);
}
-static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
+static int raid_map(struct dm_target *ti, struct bio *bio)
{
struct raid_set *rs = ti->private;
struct mddev *mddev = &rs->md;
@@ -1430,7 +1405,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 3, 1},
+ .version = {1, 4, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index fd61f98..fa51918 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -61,7 +61,6 @@ struct mirror_set {
struct dm_region_hash *rh;
struct dm_kcopyd_client *kcopyd_client;
struct dm_io_client *io_client;
- mempool_t *read_record_pool;
/* recovery */
region_t nr_regions;
@@ -139,14 +138,13 @@ static void dispatch_bios(void *context, struct bio_list *bio_list)
queue_bio(ms, bio, WRITE);
}
-#define MIN_READ_RECORDS 20
-struct dm_raid1_read_record {
+struct dm_raid1_bio_record {
struct mirror *m;
+ /* if details->bi_bdev == NULL, details were not saved */
struct dm_bio_details details;
+ region_t write_region;
};
-static struct kmem_cache *_dm_raid1_read_record_cache;
-
/*
* Every mirror should look like this one.
*/
@@ -876,19 +874,9 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
atomic_set(&ms->suspend, 0);
atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
- ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS,
- _dm_raid1_read_record_cache);
-
- if (!ms->read_record_pool) {
- ti->error = "Error creating mirror read_record_pool";
- kfree(ms);
- return NULL;
- }
-
ms->io_client = dm_io_client_create();
if (IS_ERR(ms->io_client)) {
ti->error = "Error creating dm_io client";
- mempool_destroy(ms->read_record_pool);
kfree(ms);
return NULL;
}
@@ -900,7 +888,6 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
if (IS_ERR(ms->rh)) {
ti->error = "Error creating dirty region hash";
dm_io_client_destroy(ms->io_client);
- mempool_destroy(ms->read_record_pool);
kfree(ms);
return NULL;
}
@@ -916,7 +903,6 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
dm_io_client_destroy(ms->io_client);
dm_region_hash_destroy(ms->rh);
- mempool_destroy(ms->read_record_pool);
kfree(ms);
}
@@ -1088,6 +1074,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_requests = 1;
ti->num_discard_requests = 1;
+ ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record);
ti->discard_zeroes_data_unsupported = true;
ms->kmirrord_wq = alloc_workqueue("kmirrord",
@@ -1155,18 +1142,20 @@ static void mirror_dtr(struct dm_target *ti)
/*
* Mirror mapping function
*/
-static int mirror_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static int mirror_map(struct dm_target *ti, struct bio *bio)
{
int r, rw = bio_rw(bio);
struct mirror *m;
struct mirror_set *ms = ti->private;
- struct dm_raid1_read_record *read_record = NULL;
struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+ struct dm_raid1_bio_record *bio_record =
+ dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
+
+ bio_record->details.bi_bdev = NULL;
if (rw == WRITE) {
/* Save region for mirror_end_io() handler */
- map_context->ll = dm_rh_bio_to_region(ms->rh, bio);
+ bio_record->write_region = dm_rh_bio_to_region(ms->rh, bio);
queue_bio(ms, bio, rw);
return DM_MAPIO_SUBMITTED;
}
@@ -1194,33 +1183,29 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
if (unlikely(!m))
return -EIO;
- read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
- if (likely(read_record)) {
- dm_bio_record(&read_record->details, bio);
- map_context->ptr = read_record;
- read_record->m = m;
- }
+ dm_bio_record(&bio_record->details, bio);
+ bio_record->m = m;
map_bio(m, bio);
return DM_MAPIO_REMAPPED;
}
-static int mirror_end_io(struct dm_target *ti, struct bio *bio,
- int error, union map_info *map_context)
+static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
{
int rw = bio_rw(bio);
struct mirror_set *ms = (struct mirror_set *) ti->private;
struct mirror *m = NULL;
struct dm_bio_details *bd = NULL;
- struct dm_raid1_read_record *read_record = map_context->ptr;
+ struct dm_raid1_bio_record *bio_record =
+ dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
/*
* We need to dec pending if this was a write.
*/
if (rw == WRITE) {
if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)))
- dm_rh_dec(ms->rh, map_context->ll);
+ dm_rh_dec(ms->rh, bio_record->write_region);
return error;
}
@@ -1231,7 +1216,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
goto out;
if (unlikely(error)) {
- if (!read_record) {
+ if (!bio_record->details.bi_bdev) {
/*
* There wasn't enough memory to record necessary
* information for a retry or there was no other
@@ -1241,7 +1226,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
return -EIO;
}
- m = read_record->m;
+ m = bio_record->m;
DMERR("Mirror read failed from %s. Trying alternative device.",
m->dev->name);
@@ -1253,22 +1238,18 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
* mirror.
*/
if (default_ok(m) || mirror_available(ms, bio)) {
- bd = &read_record->details;
+ bd = &bio_record->details;
dm_bio_restore(bd, bio);
- mempool_free(read_record, ms->read_record_pool);
- map_context->ptr = NULL;
+ bio_record->details.bi_bdev = NULL;
queue_bio(ms, bio, rw);
- return 1;
+ return DM_ENDIO_INCOMPLETE;
}
DMERR("All replicated volumes dead, failing I/O");
}
out:
- if (read_record) {
- mempool_free(read_record, ms->read_record_pool);
- map_context->ptr = NULL;
- }
+ bio_record->details.bi_bdev = NULL;
return error;
}
@@ -1422,7 +1403,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
static struct target_type mirror_target = {
.name = "mirror",
- .version = {1, 12, 1},
+ .version = {1, 13, 1},
.module = THIS_MODULE,
.ctr = mirror_ctr,
.dtr = mirror_dtr,
@@ -1439,13 +1420,6 @@ static int __init dm_mirror_init(void)
{
int r;
- _dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0);
- if (!_dm_raid1_read_record_cache) {
- DMERR("Can't allocate dm_raid1_read_record cache");
- r = -ENOMEM;
- goto bad_cache;
- }
-
r = dm_register_target(&mirror_target);
if (r < 0) {
DMERR("Failed to register mirror target");
@@ -1455,15 +1429,12 @@ static int __init dm_mirror_init(void)
return 0;
bad_target:
- kmem_cache_destroy(_dm_raid1_read_record_cache);
-bad_cache:
return r;
}
static void __exit dm_mirror_exit(void)
{
dm_unregister_target(&mirror_target);
- kmem_cache_destroy(_dm_raid1_read_record_cache);
}
/* Module hooks */
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index a143921..59fc18a 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -79,7 +79,6 @@ struct dm_snapshot {
/* Chunks with outstanding reads */
spinlock_t tracked_chunk_lock;
- mempool_t *tracked_chunk_pool;
struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
/* The on disk metadata handler */
@@ -191,35 +190,38 @@ struct dm_snap_tracked_chunk {
chunk_t chunk;
};
-static struct kmem_cache *tracked_chunk_cache;
+static void init_tracked_chunk(struct bio *bio)
+{
+ struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
+ INIT_HLIST_NODE(&c->node);
+}
-static struct dm_snap_tracked_chunk *track_chunk(struct dm_snapshot *s,
- chunk_t chunk)
+static bool is_bio_tracked(struct bio *bio)
{
- struct dm_snap_tracked_chunk *c = mempool_alloc(s->tracked_chunk_pool,
- GFP_NOIO);
- unsigned long flags;
+ struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
+ return !hlist_unhashed(&c->node);
+}
+
+static void track_chunk(struct dm_snapshot *s, struct bio *bio, chunk_t chunk)
+{
+ struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
c->chunk = chunk;
- spin_lock_irqsave(&s->tracked_chunk_lock, flags);
+ spin_lock_irq(&s->tracked_chunk_lock);
hlist_add_head(&c->node,
&s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
- spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
-
- return c;
+ spin_unlock_irq(&s->tracked_chunk_lock);
}
-static void stop_tracking_chunk(struct dm_snapshot *s,
- struct dm_snap_tracked_chunk *c)
+static void stop_tracking_chunk(struct dm_snapshot *s, struct bio *bio)
{
+ struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
unsigned long flags;
spin_lock_irqsave(&s->tracked_chunk_lock, flags);
hlist_del(&c->node);
spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
-
- mempool_free(c, s->tracked_chunk_pool);
}
static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
@@ -1120,14 +1122,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_pending_pool;
}
- s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS,
- tracked_chunk_cache);
- if (!s->tracked_chunk_pool) {
- ti->error = "Could not allocate tracked_chunk mempool for "
- "tracking reads";
- goto bad_tracked_chunk_pool;
- }
-
for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
@@ -1135,6 +1129,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->private = s;
ti->num_flush_requests = num_flush_requests;
+ ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk);
/* Add snapshot to the list of snapshots for this origin */
/* Exceptions aren't triggered till snapshot_resume() is called */
@@ -1183,9 +1178,6 @@ bad_read_metadata:
unregister_snapshot(s);
bad_load_and_register:
- mempool_destroy(s->tracked_chunk_pool);
-
-bad_tracked_chunk_pool:
mempool_destroy(s->pending_pool);
bad_pending_pool:
@@ -1290,8 +1282,6 @@ static void snapshot_dtr(struct dm_target *ti)
BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
#endif
- mempool_destroy(s->tracked_chunk_pool);
-
__free_exceptions(s);
mempool_destroy(s->pending_pool);
@@ -1577,8 +1567,7 @@ static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
s->store->chunk_mask);
}
-static int snapshot_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static int snapshot_map(struct dm_target *ti, struct bio *bio)
{
struct dm_exception *e;
struct dm_snapshot *s = ti->private;
@@ -1586,6 +1575,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
chunk_t chunk;
struct dm_snap_pending_exception *pe = NULL;
+ init_tracked_chunk(bio);
+
if (bio->bi_rw & REQ_FLUSH) {
bio->bi_bdev = s->cow->bdev;
return DM_MAPIO_REMAPPED;
@@ -1670,7 +1661,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
}
} else {
bio->bi_bdev = s->origin->bdev;
- map_context->ptr = track_chunk(s, chunk);
+ track_chunk(s, bio, chunk);
}
out_unlock:
@@ -1691,20 +1682,20 @@ out:
* If merging is currently taking place on the chunk in question, the
* I/O is deferred by adding it to s->bios_queued_during_merge.
*/
-static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
{
struct dm_exception *e;
struct dm_snapshot *s = ti->private;
int r = DM_MAPIO_REMAPPED;
chunk_t chunk;
+ init_tracked_chunk(bio);
+
if (bio->bi_rw & REQ_FLUSH) {
- if (!map_context->target_request_nr)
+ if (!dm_bio_get_target_request_nr(bio))
bio->bi_bdev = s->origin->bdev;
else
bio->bi_bdev = s->cow->bdev;
- map_context->ptr = NULL;
return DM_MAPIO_REMAPPED;
}
@@ -1733,7 +1724,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
remap_exception(s, e, bio, chunk);
if (bio_rw(bio) == WRITE)
- map_context->ptr = track_chunk(s, chunk);
+ track_chunk(s, bio, chunk);
goto out_unlock;
}
@@ -1751,14 +1742,12 @@ out_unlock:
return r;
}
-static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
- int error, union map_info *map_context)
+static int snapshot_end_io(struct dm_target *ti, struct bio *bio, int error)
{
struct dm_snapshot *s = ti->private;
- struct dm_snap_tracked_chunk *c = map_context->ptr;
- if (c)
- stop_tracking_chunk(s, c);
+ if (is_bio_tracked(bio))
+ stop_tracking_chunk(s, bio);
return 0;
}
@@ -2127,8 +2116,7 @@ static void origin_dtr(struct dm_target *ti)
dm_put_device(ti, dev);
}
-static int origin_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static int origin_map(struct dm_target *ti, struct bio *bio)
{
struct dm_dev *dev = ti->private;
bio->bi_bdev = dev->bdev;
@@ -2193,7 +2181,7 @@ static int origin_iterate_devices(struct dm_target *ti,
static struct target_type origin_target = {
.name = "snapshot-origin",
- .version = {1, 7, 1},
+ .version = {1, 8, 0},
.module = THIS_MODULE,
.ctr = origin_ctr,
.dtr = origin_dtr,
@@ -2206,7 +2194,7 @@ static struct target_type origin_target = {
static struct target_type snapshot_target = {
.name = "snapshot",
- .version = {1, 10, 0},
+ .version = {1, 11, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
@@ -2220,7 +2208,7 @@ static struct target_type snapshot_target = {
static struct target_type merge_target = {
.name = dm_snapshot_merge_target_name,
- .version = {1, 1, 0},
+ .version = {1, 2, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
@@ -2281,17 +2269,8 @@ static int __init dm_snapshot_init(void)
goto bad_pending_cache;
}
- tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0);
- if (!tracked_chunk_cache) {
- DMERR("Couldn't create cache to track chunks in use.");
- r = -ENOMEM;
- goto bad_tracked_chunk_cache;
- }
-
return 0;
-bad_tracked_chunk_cache:
- kmem_cache_destroy(pending_cache);
bad_pending_cache:
kmem_cache_destroy(exception_cache);
bad_exception_cache:
@@ -2317,7 +2296,6 @@ static void __exit dm_snapshot_exit(void)
exit_origin_hash();
kmem_cache_destroy(pending_cache);
kmem_cache_destroy(exception_cache);
- kmem_cache_destroy(tracked_chunk_cache);
dm_exception_store_exit();
}
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index e2f87653..c89cde8 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -162,6 +162,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_requests = stripes;
ti->num_discard_requests = stripes;
+ ti->num_write_same_requests = stripes;
sc->chunk_size = chunk_size;
if (chunk_size & (chunk_size - 1))
@@ -251,8 +252,8 @@ static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector,
*result += sc->chunk_size; /* next chunk */
}
-static int stripe_map_discard(struct stripe_c *sc, struct bio *bio,
- uint32_t target_stripe)
+static int stripe_map_range(struct stripe_c *sc, struct bio *bio,
+ uint32_t target_stripe)
{
sector_t begin, end;
@@ -271,23 +272,23 @@ static int stripe_map_discard(struct stripe_c *sc, struct bio *bio,
}
}
-static int stripe_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static int stripe_map(struct dm_target *ti, struct bio *bio)
{
struct stripe_c *sc = ti->private;
uint32_t stripe;
unsigned target_request_nr;
if (bio->bi_rw & REQ_FLUSH) {
- target_request_nr = map_context->target_request_nr;
+ target_request_nr = dm_bio_get_target_request_nr(bio);
BUG_ON(target_request_nr >= sc->stripes);
bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
return DM_MAPIO_REMAPPED;
}
- if (unlikely(bio->bi_rw & REQ_DISCARD)) {
- target_request_nr = map_context->target_request_nr;
+ if (unlikely(bio->bi_rw & REQ_DISCARD) ||
+ unlikely(bio->bi_rw & REQ_WRITE_SAME)) {
+ target_request_nr = dm_bio_get_target_request_nr(bio);
BUG_ON(target_request_nr >= sc->stripes);
- return stripe_map_discard(sc, bio, target_request_nr);
+ return stripe_map_range(sc, bio, target_request_nr);
}
stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector);
@@ -342,8 +343,7 @@ static int stripe_status(struct dm_target *ti, status_type_t type,
return 0;
}
-static int stripe_end_io(struct dm_target *ti, struct bio *bio,
- int error, union map_info *map_context)
+static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error)
{
unsigned i;
char major_minor[16];
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 100368e..daf25d0 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -967,13 +967,22 @@ bool dm_table_request_based(struct dm_table *t)
int dm_table_alloc_md_mempools(struct dm_table *t)
{
unsigned type = dm_table_get_type(t);
+ unsigned per_bio_data_size = 0;
+ struct dm_target *tgt;
+ unsigned i;
if (unlikely(type == DM_TYPE_NONE)) {
DMWARN("no table type is set, can't allocate mempools");
return -EINVAL;
}
- t->mempools = dm_alloc_md_mempools(type, t->integrity_supported);
+ if (type == DM_TYPE_BIO_BASED)
+ for (i = 0; i < t->num_targets; i++) {
+ tgt = t->targets + i;
+ per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
+ }
+
+ t->mempools = dm_alloc_md_mempools(type, t->integrity_supported, per_bio_data_size);
if (!t->mempools)
return -ENOMEM;
@@ -1414,6 +1423,33 @@ static bool dm_table_all_devices_attribute(struct dm_table *t,
return 1;
}
+static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && !q->limits.max_write_same_sectors;
+}
+
+static bool dm_table_supports_write_same(struct dm_table *t)
+{
+ struct dm_target *ti;
+ unsigned i = 0;
+
+ while (i < dm_table_get_num_targets(t)) {
+ ti = dm_table_get_target(t, i++);
+
+ if (!ti->num_write_same_requests)
+ return false;
+
+ if (!ti->type->iterate_devices ||
+ !ti->type->iterate_devices(ti, device_not_write_same_capable, NULL))
+ return false;
+ }
+
+ return true;
+}
+
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
{
@@ -1445,6 +1481,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
else
queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q);
+ if (!dm_table_supports_write_same(t))
+ q->limits.max_write_same_sectors = 0;
+
dm_table_set_integrity(t);
/*
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 8da366c..617d21a 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -126,15 +126,14 @@ static void io_err_dtr(struct dm_target *tt)
/* empty */
}
-static int io_err_map(struct dm_target *tt, struct bio *bio,
- union map_info *map_context)
+static int io_err_map(struct dm_target *tt, struct bio *bio)
{
return -EIO;
}
static struct target_type error_target = {
.name = "error",
- .version = {1, 0, 1},
+ .version = {1, 1, 0},
.ctr = io_err_ctr,
.dtr = io_err_dtr,
.map = io_err_map,
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 693e149..4d6e853 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -408,7 +408,7 @@ static void __setup_btree_details(struct dm_pool_metadata *pmd)
pmd->tl_info.tm = pmd->tm;
pmd->tl_info.levels = 1;
- pmd->tl_info.value_type.context = &pmd->info;
+ pmd->tl_info.value_type.context = &pmd->bl_info;
pmd->tl_info.value_type.size = sizeof(__le64);
pmd->tl_info.value_type.inc = subtree_inc;
pmd->tl_info.value_type.dec = subtree_dec;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 058acf3..675ae52 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -186,7 +186,6 @@ struct pool {
struct dm_thin_new_mapping *next_mapping;
mempool_t *mapping_pool;
- mempool_t *endio_hook_pool;
process_bio_fn process_bio;
process_bio_fn process_discard;
@@ -304,7 +303,7 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
bio_list_init(master);
while ((bio = bio_list_pop(&bios))) {
- struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
if (h->tc == tc)
bio_endio(bio, DM_ENDIO_REQUEUE);
@@ -368,6 +367,17 @@ static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
dm_thin_changed_this_transaction(tc->td);
}
+static void inc_all_io_entry(struct pool *pool, struct bio *bio)
+{
+ struct dm_thin_endio_hook *h;
+
+ if (bio->bi_rw & REQ_DISCARD)
+ return;
+
+ h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+ h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
+}
+
static void issue(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
@@ -474,7 +484,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
static void overwrite_endio(struct bio *bio, int err)
{
unsigned long flags;
- struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct dm_thin_new_mapping *m = h->overwrite_mapping;
struct pool *pool = m->tc->pool;
@@ -499,8 +509,7 @@ static void overwrite_endio(struct bio *bio, int err)
/*
* This sends the bios in the cell back to the deferred_bios list.
*/
-static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell,
- dm_block_t data_block)
+static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
struct pool *pool = tc->pool;
unsigned long flags;
@@ -513,17 +522,13 @@ static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell,
}
/*
- * Same as cell_defer above, except it omits one particular detainee,
- * a write bio that covers the block and has already been processed.
+ * Same as cell_defer except it omits the original holder of the cell.
*/
-static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
- struct bio_list bios;
struct pool *pool = tc->pool;
unsigned long flags;
- bio_list_init(&bios);
-
spin_lock_irqsave(&pool->lock, flags);
dm_cell_release_no_holder(cell, &pool->deferred_bios);
spin_unlock_irqrestore(&pool->lock, flags);
@@ -561,7 +566,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
*/
r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
if (r) {
- DMERR("dm_thin_insert_block() failed");
+ DMERR_LIMIT("dm_thin_insert_block() failed");
dm_cell_error(m->cell);
goto out;
}
@@ -573,10 +578,10 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
* the bios in the cell.
*/
if (bio) {
- cell_defer_except(tc, m->cell);
+ cell_defer_no_holder(tc, m->cell);
bio_endio(bio, 0);
} else
- cell_defer(tc, m->cell, m->data_block);
+ cell_defer(tc, m->cell);
out:
list_del(&m->list);
@@ -588,8 +593,8 @@ static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
struct thin_c *tc = m->tc;
bio_io_error(m->bio);
- cell_defer_except(tc, m->cell);
- cell_defer_except(tc, m->cell2);
+ cell_defer_no_holder(tc, m->cell);
+ cell_defer_no_holder(tc, m->cell2);
mempool_free(m, tc->pool->mapping_pool);
}
@@ -597,13 +602,15 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
+ inc_all_io_entry(tc->pool, m->bio);
+ cell_defer_no_holder(tc, m->cell);
+ cell_defer_no_holder(tc, m->cell2);
+
if (m->pass_discard)
remap_and_issue(tc, m->bio, m->data_block);
else
bio_endio(m->bio, 0);
- cell_defer_except(tc, m->cell);
- cell_defer_except(tc, m->cell2);
mempool_free(m, tc->pool->mapping_pool);
}
@@ -614,7 +621,7 @@ static void process_prepared_discard(struct dm_thin_new_mapping *m)
r = dm_thin_remove_block(tc->td, m->virt_block);
if (r)
- DMERR("dm_thin_remove_block() failed");
+ DMERR_LIMIT("dm_thin_remove_block() failed");
process_prepared_discard_passdown(m);
}
@@ -706,11 +713,12 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
* bio immediately. Otherwise we use kcopyd to clone the data first.
*/
if (io_overwrites_block(pool, bio)) {
- struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
h->overwrite_mapping = m;
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
+ inc_all_io_entry(pool, bio);
remap_and_issue(tc, bio, data_dest);
} else {
struct dm_io_region from, to;
@@ -727,7 +735,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
0, copy_complete, m);
if (r < 0) {
mempool_free(m, pool->mapping_pool);
- DMERR("dm_kcopyd_copy() failed");
+ DMERR_LIMIT("dm_kcopyd_copy() failed");
dm_cell_error(cell);
}
}
@@ -775,11 +783,12 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
process_prepared_mapping(m);
else if (io_overwrites_block(pool, bio)) {
- struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
h->overwrite_mapping = m;
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
+ inc_all_io_entry(pool, bio);
remap_and_issue(tc, bio, data_block);
} else {
int r;
@@ -792,7 +801,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
if (r < 0) {
mempool_free(m, pool->mapping_pool);
- DMERR("dm_kcopyd_zero() failed");
+ DMERR_LIMIT("dm_kcopyd_zero() failed");
dm_cell_error(cell);
}
}
@@ -804,7 +813,7 @@ static int commit(struct pool *pool)
r = dm_pool_commit_metadata(pool->pmd);
if (r)
- DMERR("commit failed, error = %d", r);
+ DMERR_LIMIT("commit failed: error = %d", r);
return r;
}
@@ -889,7 +898,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
*/
static void retry_on_resume(struct bio *bio)
{
- struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct thin_c *tc = h->tc;
struct pool *pool = tc->pool;
unsigned long flags;
@@ -936,7 +945,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
*/
build_data_key(tc->td, lookup_result.block, &key2);
if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
- dm_cell_release_singleton(cell, bio);
+ cell_defer_no_holder(tc, cell);
break;
}
@@ -962,13 +971,15 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
wake_worker(pool);
}
} else {
+ inc_all_io_entry(pool, bio);
+ cell_defer_no_holder(tc, cell);
+ cell_defer_no_holder(tc, cell2);
+
/*
* The DM core makes sure that the discard doesn't span
* a block boundary. So we submit the discard of a
* partial block appropriately.
*/
- dm_cell_release_singleton(cell, bio);
- dm_cell_release_singleton(cell2, bio);
if ((!lookup_result.shared) && pool->pf.discard_passdown)
remap_and_issue(tc, bio, lookup_result.block);
else
@@ -980,13 +991,14 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
/*
* It isn't provisioned, just forget it.
*/
- dm_cell_release_singleton(cell, bio);
+ cell_defer_no_holder(tc, cell);
bio_endio(bio, 0);
break;
default:
- DMERR("discard: find block unexpectedly returned %d", r);
- dm_cell_release_singleton(cell, bio);
+ DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
+ __func__, r);
+ cell_defer_no_holder(tc, cell);
bio_io_error(bio);
break;
}
@@ -1012,7 +1024,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
break;
default:
- DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
+ DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
+ __func__, r);
dm_cell_error(cell);
break;
}
@@ -1037,11 +1050,12 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
if (bio_data_dir(bio) == WRITE && bio->bi_size)
break_sharing(tc, bio, block, &key, lookup_result, cell);
else {
- struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
+ inc_all_io_entry(pool, bio);
+ cell_defer_no_holder(tc, cell);
- dm_cell_release_singleton(cell, bio);
remap_and_issue(tc, bio, lookup_result->block);
}
}
@@ -1056,7 +1070,9 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
* Remap empty bios (flushes) immediately, without provisioning.
*/
if (!bio->bi_size) {
- dm_cell_release_singleton(cell, bio);
+ inc_all_io_entry(tc->pool, bio);
+ cell_defer_no_holder(tc, cell);
+
remap_and_issue(tc, bio, 0);
return;
}
@@ -1066,7 +1082,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
*/
if (bio_data_dir(bio) == READ) {
zero_fill_bio(bio);
- dm_cell_release_singleton(cell, bio);
+ cell_defer_no_holder(tc, cell);
bio_endio(bio, 0);
return;
}
@@ -1085,7 +1101,8 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
break;
default:
- DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
+ DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
+ __func__, r);
set_pool_mode(tc->pool, PM_READ_ONLY);
dm_cell_error(cell);
break;
@@ -1111,34 +1128,31 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
switch (r) {
case 0:
- /*
- * We can release this cell now. This thread is the only
- * one that puts bios into a cell, and we know there were
- * no preceding bios.
- */
- /*
- * TODO: this will probably have to change when discard goes
- * back in.
- */
- dm_cell_release_singleton(cell, bio);
-
- if (lookup_result.shared)
+ if (lookup_result.shared) {
process_shared_bio(tc, bio, block, &lookup_result);
- else
+ cell_defer_no_holder(tc, cell);
+ } else {
+ inc_all_io_entry(tc->pool, bio);
+ cell_defer_no_holder(tc, cell);
+
remap_and_issue(tc, bio, lookup_result.block);
+ }
break;
case -ENODATA:
if (bio_data_dir(bio) == READ && tc->origin_dev) {
- dm_cell_release_singleton(cell, bio);
+ inc_all_io_entry(tc->pool, bio);
+ cell_defer_no_holder(tc, cell);
+
remap_to_origin_and_issue(tc, bio);
} else
provision_block(tc, bio, block, cell);
break;
default:
- DMERR("dm_thin_find_block() failed, error = %d", r);
- dm_cell_release_singleton(cell, bio);
+ DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
+ __func__, r);
+ cell_defer_no_holder(tc, cell);
bio_io_error(bio);
break;
}
@@ -1156,8 +1170,10 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
case 0:
if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
bio_io_error(bio);
- else
+ else {
+ inc_all_io_entry(tc->pool, bio);
remap_and_issue(tc, bio, lookup_result.block);
+ }
break;
case -ENODATA:
@@ -1167,6 +1183,7 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
}
if (tc->origin_dev) {
+ inc_all_io_entry(tc->pool, bio);
remap_to_origin_and_issue(tc, bio);
break;
}
@@ -1176,7 +1193,8 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
break;
default:
- DMERR("dm_thin_find_block() failed, error = %d", r);
+ DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
+ __func__, r);
bio_io_error(bio);
break;
}
@@ -1207,7 +1225,7 @@ static void process_deferred_bios(struct pool *pool)
spin_unlock_irqrestore(&pool->lock, flags);
while ((bio = bio_list_pop(&bios))) {
- struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct thin_c *tc = h->tc;
/*
@@ -1340,32 +1358,30 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
wake_worker(pool);
}
-static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
+static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
{
- struct pool *pool = tc->pool;
- struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
h->tc = tc;
h->shared_read_entry = NULL;
- h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : dm_deferred_entry_inc(pool->all_io_ds);
+ h->all_io_entry = NULL;
h->overwrite_mapping = NULL;
-
- return h;
}
/*
* Non-blocking function called from the thin target's map function.
*/
-static int thin_bio_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static int thin_bio_map(struct dm_target *ti, struct bio *bio)
{
int r;
struct thin_c *tc = ti->private;
dm_block_t block = get_bio_block(tc, bio);
struct dm_thin_device *td = tc->td;
struct dm_thin_lookup_result result;
+ struct dm_bio_prison_cell *cell1, *cell2;
+ struct dm_cell_key key;
- map_context->ptr = thin_hook_bio(tc, bio);
+ thin_hook_bio(tc, bio);
if (get_pool_mode(tc->pool) == PM_FAIL) {
bio_io_error(bio);
@@ -1400,12 +1416,25 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
* shared flag will be set in their case.
*/
thin_defer_bio(tc, bio);
- r = DM_MAPIO_SUBMITTED;
- } else {
- remap(tc, bio, result.block);
- r = DM_MAPIO_REMAPPED;
+ return DM_MAPIO_SUBMITTED;
}
- break;
+
+ build_virtual_key(tc->td, block, &key);
+ if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1))
+ return DM_MAPIO_SUBMITTED;
+
+ build_data_key(tc->td, result.block, &key);
+ if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2)) {
+ cell_defer_no_holder(tc, cell1);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ inc_all_io_entry(tc->pool, bio);
+ cell_defer_no_holder(tc, cell2);
+ cell_defer_no_holder(tc, cell1);
+
+ remap(tc, bio, result.block);
+ return DM_MAPIO_REMAPPED;
case -ENODATA:
if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
@@ -1414,8 +1443,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
* of doing so. Just error it.
*/
bio_io_error(bio);
- r = DM_MAPIO_SUBMITTED;
- break;
+ return DM_MAPIO_SUBMITTED;
}
/* fall through */
@@ -1425,8 +1453,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
* provide the hint to load the metadata into cache.
*/
thin_defer_bio(tc, bio);
- r = DM_MAPIO_SUBMITTED;
- break;
+ return DM_MAPIO_SUBMITTED;
default:
/*
@@ -1435,11 +1462,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
* pool is switched to fail-io mode.
*/
bio_io_error(bio);
- r = DM_MAPIO_SUBMITTED;
- break;
+ return DM_MAPIO_SUBMITTED;
}
-
- return r;
}
static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
@@ -1566,14 +1590,12 @@ static void __pool_destroy(struct pool *pool)
if (pool->next_mapping)
mempool_free(pool->next_mapping, pool->mapping_pool);
mempool_destroy(pool->mapping_pool);
- mempool_destroy(pool->endio_hook_pool);
dm_deferred_set_destroy(pool->shared_read_ds);
dm_deferred_set_destroy(pool->all_io_ds);
kfree(pool);
}
static struct kmem_cache *_new_mapping_cache;
-static struct kmem_cache *_endio_hook_cache;
static struct pool *pool_create(struct mapped_device *pool_md,
struct block_device *metadata_dev,
@@ -1667,13 +1689,6 @@ static struct pool *pool_create(struct mapped_device *pool_md,
goto bad_mapping_pool;
}
- pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE,
- _endio_hook_cache);
- if (!pool->endio_hook_pool) {
- *error = "Error creating pool's endio_hook mempool";
- err_p = ERR_PTR(-ENOMEM);
- goto bad_endio_hook_pool;
- }
pool->ref_count = 1;
pool->last_commit_jiffies = jiffies;
pool->pool_md = pool_md;
@@ -1682,8 +1697,6 @@ static struct pool *pool_create(struct mapped_device *pool_md,
return pool;
-bad_endio_hook_pool:
- mempool_destroy(pool->mapping_pool);
bad_mapping_pool:
dm_deferred_set_destroy(pool->all_io_ds);
bad_all_io_ds:
@@ -1966,8 +1979,7 @@ out_unlock:
return r;
}
-static int pool_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static int pool_map(struct dm_target *ti, struct bio *bio)
{
int r;
struct pool_c *pt = ti->private;
@@ -2358,7 +2370,9 @@ static int pool_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("rw ");
- if (pool->pf.discard_enabled && pool->pf.discard_passdown)
+ if (!pool->pf.discard_enabled)
+ DMEMIT("ignore_discard");
+ else if (pool->pf.discard_passdown)
DMEMIT("discard_passdown");
else
DMEMIT("no_discard_passdown");
@@ -2454,7 +2468,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 5, 0},
+ .version = {1, 6, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -2576,6 +2590,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->num_flush_requests = 1;
ti->flush_supported = true;
+ ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
/* In case the pool supports discards, pass them on. */
if (tc->pool->pf.discard_enabled) {
@@ -2609,20 +2624,17 @@ out_unlock:
return r;
}
-static int thin_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static int thin_map(struct dm_target *ti, struct bio *bio)
{
bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
- return thin_bio_map(ti, bio, map_context);
+ return thin_bio_map(ti, bio);
}
-static int thin_endio(struct dm_target *ti,
- struct bio *bio, int err,
- union map_info *map_context)
+static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
{
unsigned long flags;
- struct dm_thin_endio_hook *h = map_context->ptr;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct list_head work;
struct dm_thin_new_mapping *m, *tmp;
struct pool *pool = h->tc->pool;
@@ -2643,14 +2655,15 @@ static int thin_endio(struct dm_target *ti,
if (h->all_io_entry) {
INIT_LIST_HEAD(&work);
dm_deferred_entry_dec(h->all_io_entry, &work);
- spin_lock_irqsave(&pool->lock, flags);
- list_for_each_entry_safe(m, tmp, &work, list)
- list_add(&m->list, &pool->prepared_discards);
- spin_unlock_irqrestore(&pool->lock, flags);
+ if (!list_empty(&work)) {
+ spin_lock_irqsave(&pool->lock, flags);
+ list_for_each_entry_safe(m, tmp, &work, list)
+ list_add(&m->list, &pool->prepared_discards);
+ spin_unlock_irqrestore(&pool->lock, flags);
+ wake_worker(pool);
+ }
}
- mempool_free(h, pool->endio_hook_pool);
-
return 0;
}
@@ -2745,7 +2758,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 5, 0},
+ .version = {1, 6, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
@@ -2779,14 +2792,8 @@ static int __init dm_thin_init(void)
if (!_new_mapping_cache)
goto bad_new_mapping_cache;
- _endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0);
- if (!_endio_hook_cache)
- goto bad_endio_hook_cache;
-
return 0;
-bad_endio_hook_cache:
- kmem_cache_destroy(_new_mapping_cache);
bad_new_mapping_cache:
dm_unregister_target(&pool_target);
bad_pool_target:
@@ -2801,7 +2808,6 @@ static void dm_thin_exit(void)
dm_unregister_target(&pool_target);
kmem_cache_destroy(_new_mapping_cache);
- kmem_cache_destroy(_endio_hook_cache);
}
module_init(dm_thin_init);
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 9e7328b..52cde982 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -55,7 +55,6 @@ struct dm_verity {
unsigned shash_descsize;/* the size of temporary space for crypto */
int hash_failed; /* set to 1 if hash of any block failed */
- mempool_t *io_mempool; /* mempool of struct dm_verity_io */
mempool_t *vec_mempool; /* mempool of bio vector */
struct workqueue_struct *verify_wq;
@@ -66,7 +65,6 @@ struct dm_verity {
struct dm_verity_io {
struct dm_verity *v;
- struct bio *bio;
/* original values of bio->bi_end_io and bio->bi_private */
bio_end_io_t *orig_bi_end_io;
@@ -389,8 +387,8 @@ test_block_hash:
*/
static void verity_finish_io(struct dm_verity_io *io, int error)
{
- struct bio *bio = io->bio;
struct dm_verity *v = io->v;
+ struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size);
bio->bi_end_io = io->orig_bi_end_io;
bio->bi_private = io->orig_bi_private;
@@ -398,8 +396,6 @@ static void verity_finish_io(struct dm_verity_io *io, int error)
if (io->io_vec != io->io_vec_inline)
mempool_free(io->io_vec, v->vec_mempool);
- mempool_free(io, v->io_mempool);
-
bio_endio(bio, error);
}
@@ -462,8 +458,7 @@ no_prefetch_cluster:
* Bio map function. It allocates dm_verity_io structure and bio vector and
* fills them. Then it issues prefetches and the I/O.
*/
-static int verity_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static int verity_map(struct dm_target *ti, struct bio *bio)
{
struct dm_verity *v = ti->private;
struct dm_verity_io *io;
@@ -486,9 +481,8 @@ static int verity_map(struct dm_target *ti, struct bio *bio,
if (bio_data_dir(bio) == WRITE)
return -EIO;
- io = mempool_alloc(v->io_mempool, GFP_NOIO);
+ io = dm_per_bio_data(bio, ti->per_bio_data_size);
io->v = v;
- io->bio = bio;
io->orig_bi_end_io = bio->bi_end_io;
io->orig_bi_private = bio->bi_private;
io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
@@ -610,9 +604,6 @@ static void verity_dtr(struct dm_target *ti)
if (v->vec_mempool)
mempool_destroy(v->vec_mempool);
- if (v->io_mempool)
- mempool_destroy(v->io_mempool);
-
if (v->bufio)
dm_bufio_client_destroy(v->bufio);
@@ -841,13 +832,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
- v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
- sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2);
- if (!v->io_mempool) {
- ti->error = "Cannot allocate io mempool";
- r = -ENOMEM;
- goto bad;
- }
+ ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io));
v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
BIO_MAX_PAGES * sizeof(struct bio_vec));
@@ -875,7 +860,7 @@ bad:
static struct target_type verity_target = {
.name = "verity",
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index cc2b3cb..69a5c3b 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -33,8 +33,7 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
/*
* Return zeros only on reads
*/
-static int zero_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static int zero_map(struct dm_target *ti, struct bio *bio)
{
switch(bio_rw(bio)) {
case READ:
@@ -56,7 +55,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio,
static struct target_type zero_target = {
.name = "zero",
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = zero_ctr,
.map = zero_map,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 02db9183..c72e4d5 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -63,18 +63,6 @@ struct dm_io {
};
/*
- * For bio-based dm.
- * One of these is allocated per target within a bio. Hopefully
- * this will be simplified out one day.
- */
-struct dm_target_io {
- struct dm_io *io;
- struct dm_target *ti;
- union map_info info;
- struct bio clone;
-};
-
-/*
* For request-based dm.
* One of these is allocated per request.
*/
@@ -657,7 +645,7 @@ static void clone_endio(struct bio *bio, int error)
error = -EIO;
if (endio) {
- r = endio(tio->ti, bio, error, &tio->info);
+ r = endio(tio->ti, bio, error);
if (r < 0 || r == DM_ENDIO_REQUEUE)
/*
* error and requeue request are handled
@@ -740,8 +728,14 @@ static void rq_completed(struct mapped_device *md, int rw, int run_queue)
if (!md_in_flight(md))
wake_up(&md->wait);
+ /*
+ * Run this off this callpath, as drivers could invoke end_io while
+ * inside their request_fn (and holding the queue lock). Calling
+ * back into ->request_fn() could deadlock attempting to grab the
+ * queue lock again.
+ */
if (run_queue)
- blk_run_queue(md->queue);
+ blk_run_queue_async(md->queue);
/*
* dm_put() must be at the end of this function. See the comment above
@@ -1010,7 +1004,7 @@ static void __map_bio(struct dm_target *ti, struct dm_target_io *tio)
*/
atomic_inc(&tio->io->io_count);
sector = clone->bi_sector;
- r = ti->type->map(ti, clone, &tio->info);
+ r = ti->type->map(ti, clone);
if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
@@ -1105,6 +1099,7 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci,
tio->io = ci->io;
tio->ti = ti;
memset(&tio->info, 0, sizeof(tio->info));
+ tio->target_request_nr = 0;
return tio;
}
@@ -1115,7 +1110,7 @@ static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs);
struct bio *clone = &tio->clone;
- tio->info.target_request_nr = request_nr;
+ tio->target_request_nr = request_nr;
/*
* Discard requests require the bio's inline iovecs be initialized.
@@ -1168,7 +1163,28 @@ static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
ci->sector_count = 0;
}
-static int __clone_and_map_discard(struct clone_info *ci)
+typedef unsigned (*get_num_requests_fn)(struct dm_target *ti);
+
+static unsigned get_num_discard_requests(struct dm_target *ti)
+{
+ return ti->num_discard_requests;
+}
+
+static unsigned get_num_write_same_requests(struct dm_target *ti)
+{
+ return ti->num_write_same_requests;
+}
+
+typedef bool (*is_split_required_fn)(struct dm_target *ti);
+
+static bool is_split_required_for_discard(struct dm_target *ti)
+{
+ return ti->split_discard_requests;
+}
+
+static int __clone_and_map_changing_extent_only(struct clone_info *ci,
+ get_num_requests_fn get_num_requests,
+ is_split_required_fn is_split_required)
{
struct dm_target *ti;
sector_t len;
@@ -1179,15 +1195,15 @@ static int __clone_and_map_discard(struct clone_info *ci)
return -EIO;
/*
- * Even though the device advertised discard support,
- * that does not mean every target supports it, and
+ * Even though the device advertised support for this type of
+ * request, that does not mean every target supports it, and
* reconfiguration might also have changed that since the
* check was performed.
*/
- if (!ti->num_discard_requests)
+ if (!get_num_requests || !get_num_requests(ti))
return -EOPNOTSUPP;
- if (!ti->split_discard_requests)
+ if (is_split_required && !is_split_required(ti))
len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
else
len = min(ci->sector_count, max_io_len(ci->sector, ti));
@@ -1200,6 +1216,17 @@ static int __clone_and_map_discard(struct clone_info *ci)
return 0;
}
+static int __clone_and_map_discard(struct clone_info *ci)
+{
+ return __clone_and_map_changing_extent_only(ci, get_num_discard_requests,
+ is_split_required_for_discard);
+}
+
+static int __clone_and_map_write_same(struct clone_info *ci)
+{
+ return __clone_and_map_changing_extent_only(ci, get_num_write_same_requests, NULL);
+}
+
static int __clone_and_map(struct clone_info *ci)
{
struct bio *bio = ci->bio;
@@ -1209,6 +1236,8 @@ static int __clone_and_map(struct clone_info *ci)
if (unlikely(bio->bi_rw & REQ_DISCARD))
return __clone_and_map_discard(ci);
+ else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
+ return __clone_and_map_write_same(ci);
ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
@@ -1940,13 +1969,20 @@ static void free_dev(struct mapped_device *md)
static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
- struct dm_md_mempools *p;
+ struct dm_md_mempools *p = dm_table_get_md_mempools(t);
- if (md->io_pool && (md->tio_pool || dm_table_get_type(t) == DM_TYPE_BIO_BASED) && md->bs)
- /* the md already has necessary mempools */
+ if (md->io_pool && (md->tio_pool || dm_table_get_type(t) == DM_TYPE_BIO_BASED) && md->bs) {
+ /*
+ * The md already has necessary mempools. Reload just the
+ * bioset because front_pad may have changed because
+ * a different table was loaded.
+ */
+ bioset_free(md->bs);
+ md->bs = p->bs;
+ p->bs = NULL;
goto out;
+ }
- p = dm_table_get_md_mempools(t);
BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
md->io_pool = p->io_pool;
@@ -2705,7 +2741,7 @@ int dm_noflush_suspending(struct dm_target *ti)
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
{
struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
@@ -2713,6 +2749,8 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
if (!pools)
return NULL;
+ per_bio_data_size = roundup(per_bio_data_size, __alignof__(struct dm_target_io));
+
pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
mempool_create_slab_pool(MIN_IOS, _io_cache) :
mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
@@ -2728,7 +2766,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
pools->bs = (type == DM_TYPE_BIO_BASED) ?
bioset_create(pool_size,
- offsetof(struct dm_target_io, clone)) :
+ per_bio_data_size + offsetof(struct dm_target_io, clone)) :
bioset_create(pool_size,
offsetof(struct dm_rq_clone_bio_info, clone));
if (!pools->bs)
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 6a99fef..45b97da 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -159,7 +159,7 @@ void dm_kcopyd_exit(void);
/*
* Mempool operations
*/
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity);
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size);
void dm_free_md_mempools(struct dm_md_mempools *pools);
#endif
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 45135f6..5e7dc77 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -315,8 +315,11 @@ static int run(struct mddev *mddev)
}
conf->nfaults = 0;
- rdev_for_each(rdev, mddev)
+ rdev_for_each(rdev, mddev) {
conf->rdev = rdev;
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ }
md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
mddev->private = conf;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7511ce3..3db3d1b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -452,7 +452,7 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
spin_lock_irq(&mddev->write_lock);
wait_event_lock_irq(mddev->sb_wait,
!mddev->flush_bio,
- mddev->write_lock, /*nothing*/);
+ mddev->write_lock);
mddev->flush_bio = bio;
spin_unlock_irq(&mddev->write_lock);
@@ -1414,12 +1414,11 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
unsigned long long newcsum;
int size = 256 + le32_to_cpu(sb->max_dev)*2;
__le32 *isuper = (__le32*)sb;
- int i;
disk_csum = sb->sb_csum;
sb->sb_csum = 0;
newcsum = 0;
- for (i=0; size>=4; size -= 4 )
+ for (; size >= 4; size -= 4)
newcsum += le32_to_cpu(*isuper++);
if (size == 2)
@@ -1817,10 +1816,10 @@ retry:
memset(bbp, 0xff, PAGE_SIZE);
for (i = 0 ; i < bb->count ; i++) {
- u64 internal_bb = *p++;
+ u64 internal_bb = p[i];
u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
| BB_LEN(internal_bb));
- *bbp++ = cpu_to_le64(store_bb);
+ bbp[i] = cpu_to_le64(store_bb);
}
bb->changed = 0;
if (read_seqretry(&bb->lock, seq))
@@ -4753,6 +4752,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
}
mddev_get(mddev);
spin_unlock(&all_mddevs_lock);
+ if (entry->store == new_dev_store)
+ flush_workqueue(md_misc_wq);
rv = mddev_lock(mddev);
if (!rv) {
rv = entry->store(mddev, page, length);
@@ -5294,7 +5295,7 @@ void md_stop_writes(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(md_stop_writes);
-void md_stop(struct mddev *mddev)
+static void __md_stop(struct mddev *mddev)
{
mddev->ready = 0;
mddev->pers->stop(mddev);
@@ -5304,6 +5305,18 @@ void md_stop(struct mddev *mddev)
mddev->pers = NULL;
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
+
+void md_stop(struct mddev *mddev)
+{
+ /* stop the array and free an attached data structures.
+ * This is called from dm-raid
+ */
+ __md_stop(mddev);
+ bitmap_destroy(mddev);
+ if (mddev->bio_set)
+ bioset_free(mddev->bio_set);
+}
+
EXPORT_SYMBOL_GPL(md_stop);
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
@@ -5364,7 +5377,7 @@ static int do_md_stop(struct mddev * mddev, int mode,
set_disk_ro(disk, 0);
__md_stop_writes(mddev);
- md_stop(mddev);
+ __md_stop(mddev);
mddev->queue->merge_bvec_fn = NULL;
mddev->queue->backing_dev_info.congested_fn = NULL;
@@ -6334,24 +6347,23 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
* Commands dealing with the RAID driver but not any
* particular array:
*/
- switch (cmd)
- {
- case RAID_VERSION:
- err = get_version(argp);
- goto done;
+ switch (cmd) {
+ case RAID_VERSION:
+ err = get_version(argp);
+ goto done;
- case PRINT_RAID_DEBUG:
- err = 0;
- md_print_devices();
- goto done;
+ case PRINT_RAID_DEBUG:
+ err = 0;
+ md_print_devices();
+ goto done;
#ifndef MODULE
- case RAID_AUTORUN:
- err = 0;
- autostart_arrays(arg);
- goto done;
+ case RAID_AUTORUN:
+ err = 0;
+ autostart_arrays(arg);
+ goto done;
#endif
- default:;
+ default:;
}
/*
@@ -6386,6 +6398,10 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
goto abort;
}
+ if (cmd == ADD_NEW_DISK)
+ /* need to ensure md_delayed_delete() has completed */
+ flush_workqueue(md_misc_wq);
+
err = mddev_lock(mddev);
if (err) {
printk(KERN_INFO
@@ -6394,50 +6410,44 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
goto abort;
}
- switch (cmd)
- {
- case SET_ARRAY_INFO:
- {
- mdu_array_info_t info;
- if (!arg)
- memset(&info, 0, sizeof(info));
- else if (copy_from_user(&info, argp, sizeof(info))) {
- err = -EFAULT;
- goto abort_unlock;
- }
- if (mddev->pers) {
- err = update_array_info(mddev, &info);
- if (err) {
- printk(KERN_WARNING "md: couldn't update"
- " array info. %d\n", err);
- goto abort_unlock;
- }
- goto done_unlock;
- }
- if (!list_empty(&mddev->disks)) {
- printk(KERN_WARNING
- "md: array %s already has disks!\n",
- mdname(mddev));
- err = -EBUSY;
- goto abort_unlock;
- }
- if (mddev->raid_disks) {
- printk(KERN_WARNING
- "md: array %s already initialised!\n",
- mdname(mddev));
- err = -EBUSY;
- goto abort_unlock;
- }
- err = set_array_info(mddev, &info);
- if (err) {
- printk(KERN_WARNING "md: couldn't set"
- " array info. %d\n", err);
- goto abort_unlock;
- }
+ if (cmd == SET_ARRAY_INFO) {
+ mdu_array_info_t info;
+ if (!arg)
+ memset(&info, 0, sizeof(info));
+ else if (copy_from_user(&info, argp, sizeof(info))) {
+ err = -EFAULT;
+ goto abort_unlock;
+ }
+ if (mddev->pers) {
+ err = update_array_info(mddev, &info);
+ if (err) {
+ printk(KERN_WARNING "md: couldn't update"
+ " array info. %d\n", err);
+ goto abort_unlock;
}
goto done_unlock;
-
- default:;
+ }
+ if (!list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: array %s already has disks!\n",
+ mdname(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (mddev->raid_disks) {
+ printk(KERN_WARNING
+ "md: array %s already initialised!\n",
+ mdname(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ err = set_array_info(mddev, &info);
+ if (err) {
+ printk(KERN_WARNING "md: couldn't set"
+ " array info. %d\n", err);
+ goto abort_unlock;
+ }
+ goto done_unlock;
}
/*
@@ -6456,52 +6466,51 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
/*
* Commands even a read-only array can execute:
*/
- switch (cmd)
- {
- case GET_BITMAP_FILE:
- err = get_bitmap_file(mddev, argp);
- goto done_unlock;
+ switch (cmd) {
+ case GET_BITMAP_FILE:
+ err = get_bitmap_file(mddev, argp);
+ goto done_unlock;
- case RESTART_ARRAY_RW:
- err = restart_array(mddev);
- goto done_unlock;
+ case RESTART_ARRAY_RW:
+ err = restart_array(mddev);
+ goto done_unlock;
- case STOP_ARRAY:
- err = do_md_stop(mddev, 0, bdev);
- goto done_unlock;
+ case STOP_ARRAY:
+ err = do_md_stop(mddev, 0, bdev);
+ goto done_unlock;
- case STOP_ARRAY_RO:
- err = md_set_readonly(mddev, bdev);
- goto done_unlock;
+ case STOP_ARRAY_RO:
+ err = md_set_readonly(mddev, bdev);
+ goto done_unlock;
- case BLKROSET:
- if (get_user(ro, (int __user *)(arg))) {
- err = -EFAULT;
- goto done_unlock;
- }
- err = -EINVAL;
+ case BLKROSET:
+ if (get_user(ro, (int __user *)(arg))) {
+ err = -EFAULT;
+ goto done_unlock;
+ }
+ err = -EINVAL;
- /* if the bdev is going readonly the value of mddev->ro
- * does not matter, no writes are coming
- */
- if (ro)
- goto done_unlock;
+ /* if the bdev is going readonly the value of mddev->ro
+ * does not matter, no writes are coming
+ */
+ if (ro)
+ goto done_unlock;
- /* are we are already prepared for writes? */
- if (mddev->ro != 1)
- goto done_unlock;
+ /* are we are already prepared for writes? */
+ if (mddev->ro != 1)
+ goto done_unlock;
- /* transitioning to readauto need only happen for
- * arrays that call md_write_start
- */
- if (mddev->pers) {
- err = restart_array(mddev);
- if (err == 0) {
- mddev->ro = 2;
- set_disk_ro(mddev->gendisk, 0);
- }
+ /* transitioning to readauto need only happen for
+ * arrays that call md_write_start
+ */
+ if (mddev->pers) {
+ err = restart_array(mddev);
+ if (err == 0) {
+ mddev->ro = 2;
+ set_disk_ro(mddev->gendisk, 0);
}
- goto done_unlock;
+ }
+ goto done_unlock;
}
/*
@@ -6523,37 +6532,36 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
}
}
- switch (cmd)
+ switch (cmd) {
+ case ADD_NEW_DISK:
{
- case ADD_NEW_DISK:
- {
- mdu_disk_info_t info;
- if (copy_from_user(&info, argp, sizeof(info)))
- err = -EFAULT;
- else
- err = add_new_disk(mddev, &info);
- goto done_unlock;
- }
+ mdu_disk_info_t info;
+ if (copy_from_user(&info, argp, sizeof(info)))
+ err = -EFAULT;
+ else
+ err = add_new_disk(mddev, &info);
+ goto done_unlock;
+ }
- case HOT_REMOVE_DISK:
- err = hot_remove_disk(mddev, new_decode_dev(arg));
- goto done_unlock;
+ case HOT_REMOVE_DISK:
+ err = hot_remove_disk(mddev, new_decode_dev(arg));
+ goto done_unlock;
- case HOT_ADD_DISK:
- err = hot_add_disk(mddev, new_decode_dev(arg));
- goto done_unlock;
+ case HOT_ADD_DISK:
+ err = hot_add_disk(mddev, new_decode_dev(arg));
+ goto done_unlock;
- case RUN_ARRAY:
- err = do_md_run(mddev);
- goto done_unlock;
+ case RUN_ARRAY:
+ err = do_md_run(mddev);
+ goto done_unlock;
- case SET_BITMAP_FILE:
- err = set_bitmap_file(mddev, (int)arg);
- goto done_unlock;
+ case SET_BITMAP_FILE:
+ err = set_bitmap_file(mddev, (int)arg);
+ goto done_unlock;
- default:
- err = -EINVAL;
- goto abort_unlock;
+ default:
+ err = -EINVAL;
+ goto abort_unlock;
}
done_unlock:
@@ -7172,6 +7180,7 @@ void md_done_sync(struct mddev *mddev, int blocks, int ok)
wake_up(&mddev->recovery_wait);
if (!ok) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
md_wakeup_thread(mddev->thread);
// stop recovery, signal do_sync ....
}
@@ -7269,6 +7278,7 @@ EXPORT_SYMBOL_GPL(md_allow_write);
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
+#define UPDATE_FREQUENCY (5*60*HZ)
void md_do_sync(struct md_thread *thread)
{
struct mddev *mddev = thread->mddev;
@@ -7277,6 +7287,7 @@ void md_do_sync(struct md_thread *thread)
window;
sector_t max_sectors,j, io_sectors;
unsigned long mark[SYNC_MARKS];
+ unsigned long update_time;
sector_t mark_cnt[SYNC_MARKS];
int last_mark,m;
struct list_head *tmp;
@@ -7436,6 +7447,7 @@ void md_do_sync(struct md_thread *thread)
mddev->curr_resync_completed = j;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
md_new_event(mddev);
+ update_time = jiffies;
blk_start_plug(&plug);
while (j < max_sectors) {
@@ -7447,6 +7459,7 @@ void md_do_sync(struct md_thread *thread)
((mddev->curr_resync > mddev->curr_resync_completed &&
(mddev->curr_resync - mddev->curr_resync_completed)
> (max_sectors >> 4)) ||
+ time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
(j - mddev->curr_resync_completed)*2
>= mddev->resync_max - mddev->curr_resync_completed
)) {
@@ -7454,6 +7467,10 @@ void md_do_sync(struct md_thread *thread)
wait_event(mddev->recovery_wait,
atomic_read(&mddev->recovery_active) == 0);
mddev->curr_resync_completed = j;
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+ j > mddev->recovery_cp)
+ mddev->recovery_cp = j;
+ update_time = jiffies;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
@@ -7558,8 +7575,13 @@ void md_do_sync(struct md_thread *thread)
printk(KERN_INFO
"md: checkpointing %s of %s.\n",
desc, mdname(mddev));
- mddev->recovery_cp =
- mddev->curr_resync_completed;
+ if (test_bit(MD_RECOVERY_ERROR,
+ &mddev->recovery))
+ mddev->recovery_cp =
+ mddev->curr_resync_completed;
+ else
+ mddev->recovery_cp =
+ mddev->curr_resync;
}
} else
mddev->recovery_cp = MaxSector;
@@ -7936,9 +7958,9 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
sector_t *first_bad, int *bad_sectors)
{
int hi;
- int lo = 0;
+ int lo;
u64 *p = bb->page;
- int rv = 0;
+ int rv;
sector_t target = s + sectors;
unsigned seq;
@@ -7953,7 +7975,8 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
retry:
seq = read_seqbegin(&bb->lock);
-
+ lo = 0;
+ rv = 0;
hi = bb->count;
/* Binary search between lo and hi for 'target'
diff --git a/drivers/md/md.h b/drivers/md/md.h
index af443ab..eca59c3 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -307,6 +307,7 @@ struct mddev {
* REQUEST: user-space has requested a sync (used with SYNC)
* CHECK: user-space request for check-only, no repair
* RESHAPE: A reshape is happening
+ * ERROR: sync-action interrupted because io-error
*
* If neither SYNC or RESHAPE are set, then it is a recovery.
*/
@@ -320,6 +321,7 @@ struct mddev {
#define MD_RECOVERY_CHECK 7
#define MD_RECOVERY_RESHAPE 8
#define MD_RECOVERY_FROZEN 9
+#define MD_RECOVERY_ERROR 10
unsigned long recovery;
/* If a RAID personality determines that recovery (of a particular
@@ -551,32 +553,6 @@ struct md_thread {
#define THREAD_WAKEUP 0
-#define __wait_event_lock_irq(wq, condition, lock, cmd) \
-do { \
- wait_queue_t __wait; \
- init_waitqueue_entry(&__wait, current); \
- \
- add_wait_queue(&wq, &__wait); \
- for (;;) { \
- set_current_state(TASK_UNINTERRUPTIBLE); \
- if (condition) \
- break; \
- spin_unlock_irq(&lock); \
- cmd; \
- schedule(); \
- spin_lock_irq(&lock); \
- } \
- current->state = TASK_RUNNING; \
- remove_wait_queue(&wq, &__wait); \
-} while (0)
-
-#define wait_event_lock_irq(wq, condition, lock, cmd) \
-do { \
- if (condition) \
- break; \
- __wait_event_lock_irq(wq, condition, lock, cmd); \
-} while (0)
-
static inline void safe_put_page(struct page *p)
{
if (p) put_page(p);
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index a3ae091..28c3ed0 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -428,15 +428,17 @@ static int dm_bm_validate_buffer(struct dm_block_manager *bm,
if (!v)
return 0;
r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio));
- if (unlikely(r))
+ if (unlikely(r)) {
+ DMERR_LIMIT("%s validator check failed for block %llu", v->name,
+ (unsigned long long) dm_bufio_get_block_number(buf));
return r;
+ }
aux->validator = v;
} else {
if (unlikely(aux->validator != v)) {
- DMERR("validator mismatch (old=%s vs new=%s) for block %llu",
- aux->validator->name, v ? v->name : "NULL",
- (unsigned long long)
- dm_bufio_get_block_number(buf));
+ DMERR_LIMIT("validator mismatch (old=%s vs new=%s) for block %llu",
+ aux->validator->name, v ? v->name : "NULL",
+ (unsigned long long) dm_bufio_get_block_number(buf));
return -EINVAL;
}
}
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index 5709bfe..accbb05 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -36,13 +36,13 @@ struct node_header {
__le32 padding;
} __packed;
-struct node {
+struct btree_node {
struct node_header header;
__le64 keys[0];
} __packed;
-void inc_children(struct dm_transaction_manager *tm, struct node *n,
+void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
struct dm_btree_value_type *vt);
int new_block(struct dm_btree_info *info, struct dm_block **result);
@@ -64,7 +64,7 @@ struct ro_spine {
void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info);
int exit_ro_spine(struct ro_spine *s);
int ro_step(struct ro_spine *s, dm_block_t new_child);
-struct node *ro_node(struct ro_spine *s);
+struct btree_node *ro_node(struct ro_spine *s);
struct shadow_spine {
struct dm_btree_info *info;
@@ -98,17 +98,17 @@ int shadow_root(struct shadow_spine *s);
/*
* Some inlines.
*/
-static inline __le64 *key_ptr(struct node *n, uint32_t index)
+static inline __le64 *key_ptr(struct btree_node *n, uint32_t index)
{
return n->keys + index;
}
-static inline void *value_base(struct node *n)
+static inline void *value_base(struct btree_node *n)
{
return &n->keys[le32_to_cpu(n->header.max_entries)];
}
-static inline void *value_ptr(struct node *n, uint32_t index)
+static inline void *value_ptr(struct btree_node *n, uint32_t index)
{
uint32_t value_size = le32_to_cpu(n->header.value_size);
return value_base(n) + (value_size * index);
@@ -117,7 +117,7 @@ static inline void *value_ptr(struct node *n, uint32_t index)
/*
* Assumes the values are suitably-aligned and converts to core format.
*/
-static inline uint64_t value64(struct node *n, uint32_t index)
+static inline uint64_t value64(struct btree_node *n, uint32_t index)
{
__le64 *values_le = value_base(n);
@@ -127,7 +127,7 @@ static inline uint64_t value64(struct node *n, uint32_t index)
/*
* Searching for a key within a single node.
*/
-int lower_bound(struct node *n, uint64_t key);
+int lower_bound(struct btree_node *n, uint64_t key);
extern struct dm_block_validator btree_node_validator;
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index aa71e23..c4f2813 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -53,7 +53,7 @@
/*
* Some little utilities for moving node data around.
*/
-static void node_shift(struct node *n, int shift)
+static void node_shift(struct btree_node *n, int shift)
{
uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
uint32_t value_size = le32_to_cpu(n->header.value_size);
@@ -79,7 +79,7 @@ static void node_shift(struct node *n, int shift)
}
}
-static void node_copy(struct node *left, struct node *right, int shift)
+static void node_copy(struct btree_node *left, struct btree_node *right, int shift)
{
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t value_size = le32_to_cpu(left->header.value_size);
@@ -108,7 +108,7 @@ static void node_copy(struct node *left, struct node *right, int shift)
/*
* Delete a specific entry from a leaf node.
*/
-static void delete_at(struct node *n, unsigned index)
+static void delete_at(struct btree_node *n, unsigned index)
{
unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
unsigned nr_to_copy = nr_entries - (index + 1);
@@ -128,7 +128,7 @@ static void delete_at(struct node *n, unsigned index)
n->header.nr_entries = cpu_to_le32(nr_entries - 1);
}
-static unsigned merge_threshold(struct node *n)
+static unsigned merge_threshold(struct btree_node *n)
{
return le32_to_cpu(n->header.max_entries) / 3;
}
@@ -136,7 +136,7 @@ static unsigned merge_threshold(struct node *n)
struct child {
unsigned index;
struct dm_block *block;
- struct node *n;
+ struct btree_node *n;
};
static struct dm_btree_value_type le64_type = {
@@ -147,7 +147,7 @@ static struct dm_btree_value_type le64_type = {
.equal = NULL
};
-static int init_child(struct dm_btree_info *info, struct node *parent,
+static int init_child(struct dm_btree_info *info, struct btree_node *parent,
unsigned index, struct child *result)
{
int r, inc;
@@ -177,7 +177,7 @@ static int exit_child(struct dm_btree_info *info, struct child *c)
return dm_tm_unlock(info->tm, c->block);
}
-static void shift(struct node *left, struct node *right, int count)
+static void shift(struct btree_node *left, struct btree_node *right, int count)
{
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
@@ -203,11 +203,11 @@ static void shift(struct node *left, struct node *right, int count)
right->header.nr_entries = cpu_to_le32(nr_right + count);
}
-static void __rebalance2(struct dm_btree_info *info, struct node *parent,
+static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
struct child *l, struct child *r)
{
- struct node *left = l->n;
- struct node *right = r->n;
+ struct btree_node *left = l->n;
+ struct btree_node *right = r->n;
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
unsigned threshold = 2 * merge_threshold(left) + 1;
@@ -239,7 +239,7 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
unsigned left_index)
{
int r;
- struct node *parent;
+ struct btree_node *parent;
struct child left, right;
parent = dm_block_data(shadow_current(s));
@@ -270,9 +270,9 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
* in right, then rebalance2. This wastes some cpu, but I want something
* simple atm.
*/
-static void delete_center_node(struct dm_btree_info *info, struct node *parent,
+static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
struct child *l, struct child *c, struct child *r,
- struct node *left, struct node *center, struct node *right,
+ struct btree_node *left, struct btree_node *center, struct btree_node *right,
uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
uint32_t max_entries = le32_to_cpu(left->header.max_entries);
@@ -301,9 +301,9 @@ static void delete_center_node(struct dm_btree_info *info, struct node *parent,
/*
* Redistributes entries among 3 sibling nodes.
*/
-static void redistribute3(struct dm_btree_info *info, struct node *parent,
+static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
struct child *l, struct child *c, struct child *r,
- struct node *left, struct node *center, struct node *right,
+ struct btree_node *left, struct btree_node *center, struct btree_node *right,
uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
int s;
@@ -343,12 +343,12 @@ static void redistribute3(struct dm_btree_info *info, struct node *parent,
*key_ptr(parent, r->index) = right->keys[0];
}
-static void __rebalance3(struct dm_btree_info *info, struct node *parent,
+static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
struct child *l, struct child *c, struct child *r)
{
- struct node *left = l->n;
- struct node *center = c->n;
- struct node *right = r->n;
+ struct btree_node *left = l->n;
+ struct btree_node *center = c->n;
+ struct btree_node *right = r->n;
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
@@ -371,7 +371,7 @@ static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
unsigned left_index)
{
int r;
- struct node *parent = dm_block_data(shadow_current(s));
+ struct btree_node *parent = dm_block_data(shadow_current(s));
struct child left, center, right;
/*
@@ -421,7 +421,7 @@ static int get_nr_entries(struct dm_transaction_manager *tm,
{
int r;
struct dm_block *block;
- struct node *n;
+ struct btree_node *n;
r = dm_tm_read_lock(tm, b, &btree_node_validator, &block);
if (r)
@@ -438,7 +438,7 @@ static int rebalance_children(struct shadow_spine *s,
{
int i, r, has_left_sibling, has_right_sibling;
uint32_t child_entries;
- struct node *n;
+ struct btree_node *n;
n = dm_block_data(shadow_current(s));
@@ -483,7 +483,7 @@ static int rebalance_children(struct shadow_spine *s,
return r;
}
-static int do_leaf(struct node *n, uint64_t key, unsigned *index)
+static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index)
{
int i = lower_bound(n, key);
@@ -506,7 +506,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
uint64_t key, unsigned *index)
{
int i = *index, r;
- struct node *n;
+ struct btree_node *n;
for (;;) {
r = shadow_step(s, root, vt);
@@ -556,7 +556,7 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
unsigned level, last_level = info->levels - 1;
int index = 0, r = 0;
struct shadow_spine spine;
- struct node *n;
+ struct btree_node *n;
init_shadow_spine(&spine, info);
for (level = 0; level < info->levels; level++) {
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index d9a7912..f199a0c 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -23,7 +23,7 @@ static void node_prepare_for_write(struct dm_block_validator *v,
struct dm_block *b,
size_t block_size)
{
- struct node *n = dm_block_data(b);
+ struct btree_node *n = dm_block_data(b);
struct node_header *h = &n->header;
h->blocknr = cpu_to_le64(dm_block_location(b));
@@ -38,15 +38,15 @@ static int node_check(struct dm_block_validator *v,
struct dm_block *b,
size_t block_size)
{
- struct node *n = dm_block_data(b);
+ struct btree_node *n = dm_block_data(b);
struct node_header *h = &n->header;
size_t value_size;
__le32 csum_disk;
uint32_t flags;
if (dm_block_location(b) != le64_to_cpu(h->blocknr)) {
- DMERR("node_check failed blocknr %llu wanted %llu",
- le64_to_cpu(h->blocknr), dm_block_location(b));
+ DMERR_LIMIT("node_check failed: blocknr %llu != wanted %llu",
+ le64_to_cpu(h->blocknr), dm_block_location(b));
return -ENOTBLK;
}
@@ -54,8 +54,8 @@ static int node_check(struct dm_block_validator *v,
block_size - sizeof(__le32),
BTREE_CSUM_XOR));
if (csum_disk != h->csum) {
- DMERR("node_check failed csum %u wanted %u",
- le32_to_cpu(csum_disk), le32_to_cpu(h->csum));
+ DMERR_LIMIT("node_check failed: csum %u != wanted %u",
+ le32_to_cpu(csum_disk), le32_to_cpu(h->csum));
return -EILSEQ;
}
@@ -63,12 +63,12 @@ static int node_check(struct dm_block_validator *v,
if (sizeof(struct node_header) +
(sizeof(__le64) + value_size) * le32_to_cpu(h->max_entries) > block_size) {
- DMERR("node_check failed: max_entries too large");
+ DMERR_LIMIT("node_check failed: max_entries too large");
return -EILSEQ;
}
if (le32_to_cpu(h->nr_entries) > le32_to_cpu(h->max_entries)) {
- DMERR("node_check failed, too many entries");
+ DMERR_LIMIT("node_check failed: too many entries");
return -EILSEQ;
}
@@ -77,7 +77,7 @@ static int node_check(struct dm_block_validator *v,
*/
flags = le32_to_cpu(h->flags);
if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) {
- DMERR("node_check failed, node is neither INTERNAL or LEAF");
+ DMERR_LIMIT("node_check failed: node is neither INTERNAL or LEAF");
return -EILSEQ;
}
@@ -164,7 +164,7 @@ int ro_step(struct ro_spine *s, dm_block_t new_child)
return r;
}
-struct node *ro_node(struct ro_spine *s)
+struct btree_node *ro_node(struct ro_spine *s)
{
struct dm_block *block;
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index d12b2cc..4caf669 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -38,7 +38,7 @@ static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
/*----------------------------------------------------------------*/
/* makes the assumption that no two keys are the same. */
-static int bsearch(struct node *n, uint64_t key, int want_hi)
+static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
@@ -58,12 +58,12 @@ static int bsearch(struct node *n, uint64_t key, int want_hi)
return want_hi ? hi : lo;
}
-int lower_bound(struct node *n, uint64_t key)
+int lower_bound(struct btree_node *n, uint64_t key)
{
return bsearch(n, key, 0);
}
-void inc_children(struct dm_transaction_manager *tm, struct node *n,
+void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
struct dm_btree_value_type *vt)
{
unsigned i;
@@ -77,7 +77,7 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n,
vt->inc(vt->context, value_ptr(n, i));
}
-static int insert_at(size_t value_size, struct node *node, unsigned index,
+static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
uint64_t key, void *value)
__dm_written_to_disk(value)
{
@@ -122,7 +122,7 @@ int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
int r;
struct dm_block *b;
- struct node *n;
+ struct btree_node *n;
size_t block_size;
uint32_t max_entries;
@@ -154,7 +154,7 @@ EXPORT_SYMBOL_GPL(dm_btree_empty);
#define MAX_SPINE_DEPTH 64
struct frame {
struct dm_block *b;
- struct node *n;
+ struct btree_node *n;
unsigned level;
unsigned nr_children;
unsigned current_child;
@@ -230,6 +230,11 @@ static void pop_frame(struct del_stack *s)
dm_tm_unlock(s->tm, f->b);
}
+static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
+{
+ return f->level < (info->levels - 1);
+}
+
int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
int r;
@@ -241,7 +246,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
s->tm = info->tm;
s->top = -1;
- r = push_frame(s, root, 1);
+ r = push_frame(s, root, 0);
if (r)
goto out;
@@ -267,7 +272,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
if (r)
goto out;
- } else if (f->level != (info->levels - 1)) {
+ } else if (is_internal_level(info, f)) {
b = value64(f->n, f->current_child);
f->current_child++;
r = push_frame(s, b, f->level + 1);
@@ -295,7 +300,7 @@ EXPORT_SYMBOL_GPL(dm_btree_del);
/*----------------------------------------------------------------*/
static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
- int (*search_fn)(struct node *, uint64_t),
+ int (*search_fn)(struct btree_node *, uint64_t),
uint64_t *result_key, void *v, size_t value_size)
{
int i, r;
@@ -406,7 +411,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
size_t size;
unsigned nr_left, nr_right;
struct dm_block *left, *right, *parent;
- struct node *ln, *rn, *pn;
+ struct btree_node *ln, *rn, *pn;
__le64 location;
left = shadow_current(s);
@@ -491,7 +496,7 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
size_t size;
unsigned nr_left, nr_right;
struct dm_block *left, *right, *new_parent;
- struct node *pn, *ln, *rn;
+ struct btree_node *pn, *ln, *rn;
__le64 val;
new_parent = shadow_current(s);
@@ -576,7 +581,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
uint64_t key, unsigned *index)
{
int r, i = *index, top = 1;
- struct node *node;
+ struct btree_node *node;
for (;;) {
r = shadow_step(s, root, vt);
@@ -643,7 +648,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
unsigned level, index = -1, last_level = info->levels - 1;
dm_block_t block = root;
struct shadow_spine spine;
- struct node *n;
+ struct btree_node *n;
struct dm_btree_value_type le64_type;
le64_type.context = NULL;
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index f3a9af8..3e7a88d 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -39,8 +39,8 @@ static int index_check(struct dm_block_validator *v,
__le32 csum_disk;
if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) {
- DMERR("index_check failed blocknr %llu wanted %llu",
- le64_to_cpu(mi_le->blocknr), dm_block_location(b));
+ DMERR_LIMIT("index_check failed: blocknr %llu != wanted %llu",
+ le64_to_cpu(mi_le->blocknr), dm_block_location(b));
return -ENOTBLK;
}
@@ -48,8 +48,8 @@ static int index_check(struct dm_block_validator *v,
block_size - sizeof(__le32),
INDEX_CSUM_XOR));
if (csum_disk != mi_le->csum) {
- DMERR("index_check failed csum %u wanted %u",
- le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum));
+ DMERR_LIMIT("index_check failed: csum %u != wanted %u",
+ le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum));
return -EILSEQ;
}
@@ -89,8 +89,8 @@ static int bitmap_check(struct dm_block_validator *v,
__le32 csum_disk;
if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) {
- DMERR("bitmap check failed blocknr %llu wanted %llu",
- le64_to_cpu(disk_header->blocknr), dm_block_location(b));
+ DMERR_LIMIT("bitmap check failed: blocknr %llu != wanted %llu",
+ le64_to_cpu(disk_header->blocknr), dm_block_location(b));
return -ENOTBLK;
}
@@ -98,8 +98,8 @@ static int bitmap_check(struct dm_block_validator *v,
block_size - sizeof(__le32),
BITMAP_CSUM_XOR));
if (csum_disk != disk_header->csum) {
- DMERR("bitmap check failed csum %u wanted %u",
- le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum));
+ DMERR_LIMIT("bitmap check failed: csum %u != wanted %u",
+ le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum));
return -EILSEQ;
}
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index e89ae5e..906cf3d 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -337,7 +337,7 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
{
int r = sm_metadata_new_block_(sm, b);
if (r)
- DMERR("out of metadata space");
+ DMERR("unable to allocate new metadata block");
return r;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8034fbd..d5bddfc 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -822,7 +822,7 @@ static void raise_barrier(struct r1conf *conf)
/* Wait until no block IO is waiting */
wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
- conf->resync_lock, );
+ conf->resync_lock);
/* block any new IO from starting */
conf->barrier++;
@@ -830,7 +830,7 @@ static void raise_barrier(struct r1conf *conf)
/* Now wait for all pending IO to complete */
wait_event_lock_irq(conf->wait_barrier,
!conf->nr_pending && conf->barrier < RESYNC_DEPTH,
- conf->resync_lock, );
+ conf->resync_lock);
spin_unlock_irq(&conf->resync_lock);
}
@@ -864,8 +864,7 @@ static void wait_barrier(struct r1conf *conf)
(conf->nr_pending &&
current->bio_list &&
!bio_list_empty(current->bio_list)),
- conf->resync_lock,
- );
+ conf->resync_lock);
conf->nr_waiting--;
}
conf->nr_pending++;
@@ -898,10 +897,10 @@ static void freeze_array(struct r1conf *conf)
spin_lock_irq(&conf->resync_lock);
conf->barrier++;
conf->nr_waiting++;
- wait_event_lock_irq(conf->wait_barrier,
- conf->nr_pending == conf->nr_queued+1,
- conf->resync_lock,
- flush_pending_writes(conf));
+ wait_event_lock_irq_cmd(conf->wait_barrier,
+ conf->nr_pending == conf->nr_queued+1,
+ conf->resync_lock,
+ flush_pending_writes(conf));
spin_unlock_irq(&conf->resync_lock);
}
static void unfreeze_array(struct r1conf *conf)
@@ -963,7 +962,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
struct r1conf *conf = mddev->private;
struct bio *bio;
- if (from_schedule) {
+ if (from_schedule || current->bio_list) {
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->pending_bio_list, &plug->pending);
conf->pending_count += plug->pending_cnt;
@@ -2710,7 +2709,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
|| disk_idx < 0)
continue;
if (test_bit(Replacement, &rdev->flags))
- disk = conf->mirrors + conf->raid_disks + disk_idx;
+ disk = conf->mirrors + mddev->raid_disks + disk_idx;
else
disk = conf->mirrors + disk_idx;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 906ccbd..64d4824 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -499,7 +499,7 @@ static void raid10_end_write_request(struct bio *bio, int error)
*/
one_write_done(r10_bio);
if (dec_rdev)
- rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
+ rdev_dec_pending(rdev, conf->mddev);
}
/*
@@ -952,7 +952,7 @@ static void raise_barrier(struct r10conf *conf, int force)
/* Wait until no block IO is waiting (unless 'force') */
wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
- conf->resync_lock, );
+ conf->resync_lock);
/* block any new IO from starting */
conf->barrier++;
@@ -960,7 +960,7 @@ static void raise_barrier(struct r10conf *conf, int force)
/* Now wait for all pending IO to complete */
wait_event_lock_irq(conf->wait_barrier,
!conf->nr_pending && conf->barrier < RESYNC_DEPTH,
- conf->resync_lock, );
+ conf->resync_lock);
spin_unlock_irq(&conf->resync_lock);
}
@@ -993,8 +993,7 @@ static void wait_barrier(struct r10conf *conf)
(conf->nr_pending &&
current->bio_list &&
!bio_list_empty(current->bio_list)),
- conf->resync_lock,
- );
+ conf->resync_lock);
conf->nr_waiting--;
}
conf->nr_pending++;
@@ -1027,10 +1026,10 @@ static void freeze_array(struct r10conf *conf)
spin_lock_irq(&conf->resync_lock);
conf->barrier++;
conf->nr_waiting++;
- wait_event_lock_irq(conf->wait_barrier,
- conf->nr_pending == conf->nr_queued+1,
- conf->resync_lock,
- flush_pending_writes(conf));
+ wait_event_lock_irq_cmd(conf->wait_barrier,
+ conf->nr_pending == conf->nr_queued+1,
+ conf->resync_lock,
+ flush_pending_writes(conf));
spin_unlock_irq(&conf->resync_lock);
}
@@ -1069,7 +1068,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
struct r10conf *conf = mddev->private;
struct bio *bio;
- if (from_schedule) {
+ if (from_schedule || current->bio_list) {
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->pending_bio_list, &plug->pending);
conf->pending_count += plug->pending_cnt;
@@ -1334,18 +1333,21 @@ retry_write:
blocked_rdev = rrdev;
break;
}
+ if (rdev && (test_bit(Faulty, &rdev->flags)
+ || test_bit(Unmerged, &rdev->flags)))
+ rdev = NULL;
if (rrdev && (test_bit(Faulty, &rrdev->flags)
|| test_bit(Unmerged, &rrdev->flags)))
rrdev = NULL;
r10_bio->devs[i].bio = NULL;
r10_bio->devs[i].repl_bio = NULL;
- if (!rdev || test_bit(Faulty, &rdev->flags) ||
- test_bit(Unmerged, &rdev->flags)) {
+
+ if (!rdev && !rrdev) {
set_bit(R10BIO_Degraded, &r10_bio->state);
continue;
}
- if (test_bit(WriteErrorSeen, &rdev->flags)) {
+ if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr;
int bad_sectors;
@@ -1387,8 +1389,10 @@ retry_write:
max_sectors = good_sectors;
}
}
- r10_bio->devs[i].bio = bio;
- atomic_inc(&rdev->nr_pending);
+ if (rdev) {
+ r10_bio->devs[i].bio = bio;
+ atomic_inc(&rdev->nr_pending);
+ }
if (rrdev) {
r10_bio->devs[i].repl_bio = bio;
atomic_inc(&rrdev->nr_pending);
@@ -1444,69 +1448,71 @@ retry_write:
for (i = 0; i < conf->copies; i++) {
struct bio *mbio;
int d = r10_bio->devs[i].devnum;
- if (!r10_bio->devs[i].bio)
- continue;
+ if (r10_bio->devs[i].bio) {
+ struct md_rdev *rdev = conf->mirrors[d].rdev;
+ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+ max_sectors);
+ r10_bio->devs[i].bio = mbio;
+
+ mbio->bi_sector = (r10_bio->devs[i].addr+
+ choose_data_offset(r10_bio,
+ rdev));
+ mbio->bi_bdev = rdev->bdev;
+ mbio->bi_end_io = raid10_end_write_request;
+ mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+ mbio->bi_private = r10_bio;
- mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
- max_sectors);
- r10_bio->devs[i].bio = mbio;
+ atomic_inc(&r10_bio->remaining);
- mbio->bi_sector = (r10_bio->devs[i].addr+
- choose_data_offset(r10_bio,
- conf->mirrors[d].rdev));
- mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
- mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
- mbio->bi_private = r10_bio;
+ cb = blk_check_plugged(raid10_unplug, mddev,
+ sizeof(*plug));
+ if (cb)
+ plug = container_of(cb, struct raid10_plug_cb,
+ cb);
+ else
+ plug = NULL;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (plug) {
+ bio_list_add(&plug->pending, mbio);
+ plug->pending_cnt++;
+ } else {
+ bio_list_add(&conf->pending_bio_list, mbio);
+ conf->pending_count++;
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ if (!plug)
+ md_wakeup_thread(mddev->thread);
+ }
- atomic_inc(&r10_bio->remaining);
+ if (r10_bio->devs[i].repl_bio) {
+ struct md_rdev *rdev = conf->mirrors[d].replacement;
+ if (rdev == NULL) {
+ /* Replacement just got moved to main 'rdev' */
+ smp_mb();
+ rdev = conf->mirrors[d].rdev;
+ }
+ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+ max_sectors);
+ r10_bio->devs[i].repl_bio = mbio;
+
+ mbio->bi_sector = (r10_bio->devs[i].addr +
+ choose_data_offset(
+ r10_bio, rdev));
+ mbio->bi_bdev = rdev->bdev;
+ mbio->bi_end_io = raid10_end_write_request;
+ mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+ mbio->bi_private = r10_bio;
- cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
- if (cb)
- plug = container_of(cb, struct raid10_plug_cb, cb);
- else
- plug = NULL;
- spin_lock_irqsave(&conf->device_lock, flags);
- if (plug) {
- bio_list_add(&plug->pending, mbio);
- plug->pending_cnt++;
- } else {
+ atomic_inc(&r10_bio->remaining);
+ spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ if (!mddev_check_plugged(mddev))
+ md_wakeup_thread(mddev->thread);
}
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (!plug)
- md_wakeup_thread(mddev->thread);
-
- if (!r10_bio->devs[i].repl_bio)
- continue;
-
- mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
- max_sectors);
- r10_bio->devs[i].repl_bio = mbio;
-
- /* We are actively writing to the original device
- * so it cannot disappear, so the replacement cannot
- * become NULL here
- */
- mbio->bi_sector = (r10_bio->devs[i].addr +
- choose_data_offset(
- r10_bio,
- conf->mirrors[d].replacement));
- mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
- mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
- mbio->bi_private = r10_bio;
-
- atomic_inc(&r10_bio->remaining);
- spin_lock_irqsave(&conf->device_lock, flags);
- bio_list_add(&conf->pending_bio_list, mbio);
- conf->pending_count++;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (!mddev_check_plugged(mddev))
- md_wakeup_thread(mddev->thread);
}
/* Don't remove the bias on 'remaining' (one_write_done) until
@@ -1783,7 +1789,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
clear_bit(Unmerged, &rdev->flags);
}
md_integrity_add_rdev(rdev, mddev);
- if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
print_conf(conf);
@@ -3613,11 +3619,14 @@ static int run(struct mddev *mddev)
discard_supported = true;
}
- if (discard_supported)
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
- else
- queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
-
+ if (mddev->queue) {
+ if (discard_supported)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
+ }
/* need to check that every block has at least one working mirror */
if (!enough(conf, -1)) {
printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1fe7c22..19d77a0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -53,6 +53,8 @@
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
+#include <trace/events/block.h>
+
#include "md.h"
#include "raid5.h"
#include "raid0.h"
@@ -182,6 +184,8 @@ static void return_io(struct bio *return_bi)
return_bi = bi->bi_next;
bi->bi_next = NULL;
bi->bi_size = 0;
+ trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+ bi, 0);
bio_endio(bi, 0);
bi = return_bi;
}
@@ -466,7 +470,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
do {
wait_event_lock_irq(conf->wait_for_stripe,
conf->quiesce == 0 || noquiesce,
- conf->device_lock, /* nothing */);
+ conf->device_lock);
sh = __find_stripe(conf, sector, conf->generation - previous);
if (!sh) {
if (!conf->inactive_blocked)
@@ -480,8 +484,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
(atomic_read(&conf->active_stripes)
< (conf->max_nr_stripes *3/4)
|| !conf->inactive_blocked),
- conf->device_lock,
- );
+ conf->device_lock);
conf->inactive_blocked = 0;
} else
init_stripe(sh, sector, previous);
@@ -671,6 +674,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
bi->bi_next = NULL;
if (rrdev)
set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
+ trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
+ bi, disk_devt(conf->mddev->gendisk),
+ sh->dev[i].sector);
generic_make_request(bi);
}
if (rrdev) {
@@ -698,6 +704,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_size = STRIPE_SIZE;
rbi->bi_next = NULL;
+ trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
+ rbi, disk_devt(conf->mddev->gendisk),
+ sh->dev[i].sector);
generic_make_request(rbi);
}
if (!rdev && !rrdev) {
@@ -1646,8 +1655,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
spin_lock_irq(&conf->device_lock);
wait_event_lock_irq(conf->wait_for_stripe,
!list_empty(&conf->inactive_list),
- conf->device_lock,
- );
+ conf->device_lock);
osh = get_free_stripe(conf);
spin_unlock_irq(&conf->device_lock);
atomic_set(&nsh->count, 1);
@@ -2774,10 +2782,12 @@ static void handle_stripe_clean_event(struct r5conf *conf,
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) &&
(test_bit(R5_UPTODATE, &dev->flags) ||
- test_and_clear_bit(R5_Discard, &dev->flags))) {
+ test_bit(R5_Discard, &dev->flags))) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
pr_debug("Return write for disc %d\n", i);
+ if (test_and_clear_bit(R5_Discard, &dev->flags))
+ clear_bit(R5_UPTODATE, &dev->flags);
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_sector <
@@ -2795,7 +2805,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
}
- }
+ } else if (test_bit(R5_Discard, &sh->dev[i].flags))
+ clear_bit(R5_Discard, &sh->dev[i].flags);
if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
if (atomic_dec_and_test(&conf->pending_full_writes))
@@ -2852,8 +2863,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
pr_debug("for sector %llu, rmw=%d rcw=%d\n",
(unsigned long long)sh->sector, rmw, rcw);
set_bit(STRIPE_HANDLE, &sh->state);
- if (rmw < rcw && rmw > 0)
+ if (rmw < rcw && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
+ blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d",
+ (unsigned long long)sh->sector, rmw);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if ((dev->towrite || i == sh->pd_idx) &&
@@ -2864,7 +2877,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
if (
test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
pr_debug("Read_old block "
- "%d for r-m-w\n", i);
+ "%d for r-m-w\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
s->locked++;
@@ -2874,8 +2887,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
}
}
}
+ }
if (rcw <= rmw && rcw > 0) {
/* want reconstruct write, but need to get some data */
+ int qread =0;
rcw = 0;
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
@@ -2894,12 +2909,17 @@ static void handle_stripe_dirtying(struct r5conf *conf,
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
s->locked++;
+ qread++;
} else {
set_bit(STRIPE_DELAYED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
}
}
}
+ if (rcw)
+ blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
+ (unsigned long long)sh->sector,
+ rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
}
/* now if nothing is locked, and if we have enough data,
* we can start a write request
@@ -3221,10 +3241,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
}
/* done submitting copies, wait for them to complete */
- if (tx) {
- async_tx_ack(tx);
- dma_wait_for_async_tx(tx);
- }
+ async_tx_quiesce(&tx);
}
/*
@@ -3490,40 +3507,6 @@ static void handle_stripe(struct stripe_head *sh)
handle_failed_sync(conf, sh, &s);
}
- /*
- * might be able to return some write requests if the parity blocks
- * are safe, or on a failed drive
- */
- pdev = &sh->dev[sh->pd_idx];
- s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
- || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
- qdev = &sh->dev[sh->qd_idx];
- s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
- || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
- || conf->level < 6;
-
- if (s.written &&
- (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
- && !test_bit(R5_LOCKED, &pdev->flags)
- && (test_bit(R5_UPTODATE, &pdev->flags) ||
- test_bit(R5_Discard, &pdev->flags))))) &&
- (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
- && !test_bit(R5_LOCKED, &qdev->flags)
- && (test_bit(R5_UPTODATE, &qdev->flags) ||
- test_bit(R5_Discard, &qdev->flags))))))
- handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
-
- /* Now we might consider reading some blocks, either to check/generate
- * parity, or to satisfy requests
- * or to load a block that is being partially written.
- */
- if (s.to_read || s.non_overwrite
- || (conf->level == 6 && s.to_write && s.failed)
- || (s.syncing && (s.uptodate + s.compute < disks))
- || s.replacing
- || s.expanding)
- handle_stripe_fill(sh, &s, disks);
-
/* Now we check to see if any write operations have recently
* completed
*/
@@ -3561,6 +3544,40 @@ static void handle_stripe(struct stripe_head *sh)
s.dec_preread_active = 1;
}
+ /*
+ * might be able to return some write requests if the parity blocks
+ * are safe, or on a failed drive
+ */
+ pdev = &sh->dev[sh->pd_idx];
+ s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
+ || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
+ qdev = &sh->dev[sh->qd_idx];
+ s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
+ || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
+ || conf->level < 6;
+
+ if (s.written &&
+ (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
+ && !test_bit(R5_LOCKED, &pdev->flags)
+ && (test_bit(R5_UPTODATE, &pdev->flags) ||
+ test_bit(R5_Discard, &pdev->flags))))) &&
+ (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
+ && !test_bit(R5_LOCKED, &qdev->flags)
+ && (test_bit(R5_UPTODATE, &qdev->flags) ||
+ test_bit(R5_Discard, &qdev->flags))))))
+ handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+
+ /* Now we might consider reading some blocks, either to check/generate
+ * parity, or to satisfy requests
+ * or to load a block that is being partially written.
+ */
+ if (s.to_read || s.non_overwrite
+ || (conf->level == 6 && s.to_write && s.failed)
+ || (s.syncing && (s.uptodate + s.compute < disks))
+ || s.replacing
+ || s.expanding)
+ handle_stripe_fill(sh, &s, disks);
+
/* Now to consider new write requests and what else, if anything
* should be read. We do not handle new writes when:
* 1/ A 'write' operation (copy+xor) is already in flight.
@@ -3900,6 +3917,8 @@ static void raid5_align_endio(struct bio *bi, int error)
rdev_dec_pending(rdev, conf->mddev);
if (!error && uptodate) {
+ trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
+ raid_bi, 0);
bio_endio(raid_bi, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_stripe);
@@ -4000,10 +4019,13 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
spin_lock_irq(&conf->device_lock);
wait_event_lock_irq(conf->wait_for_stripe,
conf->quiesce == 0,
- conf->device_lock, /* nothing */);
+ conf->device_lock);
atomic_inc(&conf->active_aligned_reads);
spin_unlock_irq(&conf->device_lock);
+ trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
+ align_bi, disk_devt(mddev->gendisk),
+ raid_bio->bi_sector);
generic_make_request(align_bi);
return 1;
} else {
@@ -4078,6 +4100,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
struct stripe_head *sh;
struct mddev *mddev = cb->cb.data;
struct r5conf *conf = mddev->private;
+ int cnt = 0;
if (cb->list.next && !list_empty(&cb->list)) {
spin_lock_irq(&conf->device_lock);
@@ -4092,9 +4115,11 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
smp_mb__before_clear_bit();
clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
__release_stripe(conf, sh);
+ cnt++;
}
spin_unlock_irq(&conf->device_lock);
}
+ trace_block_unplug(mddev->queue, cnt, !from_schedule);
kfree(cb);
}
@@ -4352,6 +4377,8 @@ static void make_request(struct mddev *mddev, struct bio * bi)
if ( rw == WRITE )
md_write_end(mddev);
+ trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+ bi, 0);
bio_endio(bi, 0);
}
}
@@ -4728,8 +4755,11 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
handled++;
}
remaining = raid5_dec_bi_active_stripes(raid_bio);
- if (remaining == 0)
+ if (remaining == 0) {
+ trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
+ raid_bio, 0);
bio_endio(raid_bio, 0);
+ }
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_stripe);
return handled;
@@ -5529,6 +5559,10 @@ static int run(struct mddev *mddev)
* discard data disk but write parity disk
*/
stripe = stripe * PAGE_SIZE;
+ /* Round up to power of 2, as discard handling
+ * currently assumes that */
+ while ((stripe-1) & stripe)
+ stripe = (stripe | (stripe-1)) + 1;
mddev->queue->limits.discard_alignment = stripe;
mddev->queue->limits.discard_granularity = stripe;
/*
@@ -6088,7 +6122,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
wait_event_lock_irq(conf->wait_for_stripe,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
- conf->device_lock, /* nothing */);
+ conf->device_lock);
conf->quiesce = 1;
spin_unlock_irq(&conf->device_lock);
/* allow reshape to continue */
OpenPOWER on IntegriCloud