From 6bbf79a14080a0c61212f53b4b87dc1a99fedf9c Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Thu, 12 Aug 2010 04:13:49 +0100 Subject: dm mpath: fix NULL pointer dereference when path parameters missing multipath_ctr() forgets to return an error after detecting missing path parameters. Fix this. Signed-off-by: Patrick LoPresti Cc: stable@kernel.org Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 826bce7..da2223a 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -706,6 +706,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as, if (as->argc < nr_params) { ti->error = "not enough path parameters"; + r = -EINVAL; goto bad; } -- cgit v1.1 From 1e5554c8428bc7209a83e2d07ca724be4d981ce3 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 12 Aug 2010 04:13:50 +0100 Subject: dm snapshot: iterate origin and cow devices Iterate both origin and snapshot devices iterate_devices method should call the callback for all the devices where the bio may be remapped. Thus, snapshot_iterate_devices should call the callback for both snapshot and origin underlying devices because it remaps some bios to the snapshot and some to the origin. snapshot_iterate_devices called the callback only for the origin device. This led to badly calculated device limits if snapshot and origin were placed on different types of disks. Cc: stable@kernel.org Signed-off-by: Mikulas Patocka Reviewed-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 5485377..a6ab989 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1899,8 +1899,14 @@ static int snapshot_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct dm_snapshot *snap = ti->private; + int r; + + r = fn(ti, snap->origin, 0, ti->len, data); - return fn(ti, snap->origin, 0, ti->len, data); + if (!r) + r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data); + + return r; } -- cgit v1.1 From c24110450650f17f7d3ba4fbe01f01ac5a115456 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 12 Aug 2010 04:13:51 +0100 Subject: dm snapshot: test chunk size against both origin and snapshot Validate chunk size against both origin and snapshot sector size Don't allow chunk size smaller than either origin or snapshot logical sector size. Reading or writing data not aligned to sector size is not allowed and causes immediate errors. This requires us to open the origin before initialising the exception store and to export dm_snap_origin. Cc: stable@kernel.org Signed-off-by: Mikulas Patocka Reviewed-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-exception-store.c | 4 +++- drivers/md/dm-exception-store.h | 3 ++- drivers/md/dm-snap.c | 36 +++++++++++++++++++++--------------- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 2b7907b..0bdb201 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -173,7 +173,9 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store, /* Validate the chunk size against the device block size */ if (chunk_size % - (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) { + (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9) || + chunk_size % + (bdev_logical_block_size(dm_snap_origin(store->snap)->bdev) >> 9)) { *error = "Chunk size is not a multiple of device blocksize"; return -EINVAL; } diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index e8dfa06..0b25362 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -126,8 +126,9 @@ struct dm_exception_store { }; /* - * Obtain the cow device used by a given snapshot. + * Obtain the origin or cow device used by a given snapshot. */ +struct dm_dev *dm_snap_origin(struct dm_snapshot *snap); struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); /* diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index a6ab989..a1f2ab5 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -148,6 +148,12 @@ struct dm_snapshot { #define RUNNING_MERGE 0 #define SHUTDOWN_MERGE 1 +struct dm_dev *dm_snap_origin(struct dm_snapshot *s) +{ + return s->origin; +} +EXPORT_SYMBOL(dm_snap_origin); + struct dm_dev *dm_snap_cow(struct dm_snapshot *s) { return s->cow; @@ -1065,10 +1071,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) origin_mode = FMODE_WRITE; } - origin_path = argv[0]; - argv++; - argc--; - s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) { ti->error = "Cannot allocate snapshot context private " @@ -1077,6 +1079,16 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + origin_path = argv[0]; + argv++; + argc--; + + r = dm_get_device(ti, origin_path, origin_mode, &s->origin); + if (r) { + ti->error = "Cannot get origin device"; + goto bad_origin; + } + cow_path = argv[0]; argv++; argc--; @@ -1097,12 +1109,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) argv += args_used; argc -= args_used; - r = dm_get_device(ti, origin_path, origin_mode, &s->origin); - if (r) { - ti->error = "Cannot get origin device"; - goto bad_origin; - } - s->ti = ti; s->valid = 1; s->active = 0; @@ -1212,15 +1218,15 @@ bad_kcopyd: dm_exception_table_exit(&s->complete, exception_cache); bad_hash_tables: - dm_put_device(ti, s->origin); - -bad_origin: dm_exception_store_destroy(s->store); bad_store: dm_put_device(ti, s->cow); bad_cow: + dm_put_device(ti, s->origin); + +bad_origin: kfree(s); bad: @@ -1314,12 +1320,12 @@ static void snapshot_dtr(struct dm_target *ti) mempool_destroy(s->pending_pool); - dm_put_device(ti, s->origin); - dm_exception_store_destroy(s->store); dm_put_device(ti, s->cow); + dm_put_device(ti, s->origin); + kfree(s); } -- cgit v1.1 From 6be544940109b4c45f560785fe5798ce3fdc1922 Mon Sep 17 00:00:00 2001 From: Peter Rajnoha Date: Thu, 12 Aug 2010 04:13:52 +0100 Subject: dm ioctl: remove __dev_status from geometry and target message Remove useless __dev_status call while processing an ioctl that sets up device geometry and target message. The data is not returned to userspace so there is no point collecting it and in the case of target_message it is collected before processing the message so if it did return it might be stale. Signed-off-by: Peter Rajnoha Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index d7500e1..70f8307 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -818,8 +818,6 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) geometry.start = indata[3]; r = dm_set_geometry(md, &geometry); - if (!r) - r = __dev_status(md, param); param->data_size = 0; @@ -1333,10 +1331,6 @@ static int target_message(struct dm_ioctl *param, size_t param_size) if (!md) return -ENXIO; - r = __dev_status(md, param); - if (r) - goto out; - if (tmsg < (struct dm_target_msg *) param->data || invalid_str(tmsg->message, (void *) param + param_size)) { DMWARN("Invalid target message parameters."); -- cgit v1.1 From 094ea9a071f68bd6f56c3f8cdeb5263727b68ce9 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Thu, 12 Aug 2010 04:13:52 +0100 Subject: dm ioctl: make __dev_status void __dev_status() cannot fail so make it void and simplify callers. Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 67 ++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 70f8307..4bc4c4f 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -573,7 +573,7 @@ static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, * Fills in a dm_ioctl structure, ready for sending back to * userland. */ -static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) +static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) { struct gendisk *disk = dm_disk(md); struct dm_table *table; @@ -617,8 +617,6 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) dm_table_put(table); } } - - return 0; } static int dev_create(struct dm_ioctl *param, size_t param_size) @@ -638,14 +636,14 @@ static int dev_create(struct dm_ioctl *param, size_t param_size) return r; r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); - if (r) { - dm_put(md); - return r; - } + if (r) + goto out; param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - r = __dev_status(md, param); + __dev_status(md, param); + +out: dm_put(md); return r; @@ -841,13 +839,17 @@ static int do_suspend(struct dm_ioctl *param) if (param->flags & DM_NOFLUSH_FLAG) suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; - if (!dm_suspended_md(md)) + if (!dm_suspended_md(md)) { r = dm_suspend(md, suspend_flags); + if (r) + goto out; + } - if (!r) - r = __dev_status(md, param); + __dev_status(md, param); +out: dm_put(md); + return r; } @@ -909,7 +911,7 @@ static int do_resume(struct dm_ioctl *param) dm_table_destroy(old_map); if (!r) - r = __dev_status(md, param); + __dev_status(md, param); dm_put(md); return r; @@ -933,16 +935,16 @@ static int dev_suspend(struct dm_ioctl *param, size_t param_size) */ static int dev_status(struct dm_ioctl *param, size_t param_size) { - int r; struct mapped_device *md; md = find_device(param); if (!md) return -ENXIO; - r = __dev_status(md, param); + __dev_status(md, param); dm_put(md); - return r; + + return 0; } /* @@ -1017,7 +1019,7 @@ static void retrieve_status(struct dm_table *table, */ static int dev_wait(struct dm_ioctl *param, size_t param_size) { - int r; + int r = 0; struct mapped_device *md; struct dm_table *table; @@ -1038,9 +1040,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) * changed to trigger the event, so we may as well tell * him and save an ioctl. */ - r = __dev_status(md, param); - if (r) - goto out; + __dev_status(md, param); table = dm_get_live_or_inactive_table(md, param); if (table) { @@ -1048,8 +1048,9 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) dm_table_put(table); } - out: +out: dm_put(md); + return r; } @@ -1184,7 +1185,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size) up_write(&_hash_lock); param->flags |= DM_INACTIVE_PRESENT_FLAG; - r = __dev_status(md, param); + __dev_status(md, param); out: dm_put(md); @@ -1194,7 +1195,6 @@ out: static int table_clear(struct dm_ioctl *param, size_t param_size) { - int r; struct hash_cell *hc; struct mapped_device *md; @@ -1214,11 +1214,12 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - r = __dev_status(hc->md, param); + __dev_status(hc->md, param); md = hc->md; up_write(&_hash_lock); dm_put(md); - return r; + + return 0; } /* @@ -1263,7 +1264,6 @@ static void retrieve_deps(struct dm_table *table, static int table_deps(struct dm_ioctl *param, size_t param_size) { - int r = 0; struct mapped_device *md; struct dm_table *table; @@ -1271,9 +1271,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) if (!md) return -ENXIO; - r = __dev_status(md, param); - if (r) - goto out; + __dev_status(md, param); table = dm_get_live_or_inactive_table(md, param); if (table) { @@ -1281,9 +1279,9 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) dm_table_put(table); } - out: dm_put(md); - return r; + + return 0; } /* @@ -1292,7 +1290,6 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) */ static int table_status(struct dm_ioctl *param, size_t param_size) { - int r; struct mapped_device *md; struct dm_table *table; @@ -1300,9 +1297,7 @@ static int table_status(struct dm_ioctl *param, size_t param_size) if (!md) return -ENXIO; - r = __dev_status(md, param); - if (r) - goto out; + __dev_status(md, param); table = dm_get_live_or_inactive_table(md, param); if (table) { @@ -1310,9 +1305,9 @@ static int table_status(struct dm_ioctl *param, size_t param_size) dm_table_put(table); } -out: dm_put(md); - return r; + + return 0; } /* -- cgit v1.1 From 856a6f1dbd8940e72755af145ebcd806408ecedd Mon Sep 17 00:00:00 2001 From: Peter Rajnoha Date: Thu, 12 Aug 2010 04:13:53 +0100 Subject: dm ioctl: return uevent flag after rename All the dm ioctls that generate uevents set the DM_UEVENT_GENERATED flag so that userspace knows whether or not to wait for a uevent to be processed before continuing, The dm rename ioctl sets this flag but was not structured to return it to userspace. This patch restructures the rename ioctl processing to behave like the other ioctls that return data and so fix this. Signed-off-by: Peter Rajnoha Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4bc4c4f..79ee5ba 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -285,19 +285,20 @@ retry: up_write(&_hash_lock); } -static int dm_hash_rename(uint32_t cookie, uint32_t *flags, const char *old, - const char *new) +static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, + const char *new) { char *new_name, *old_name; struct hash_cell *hc; struct dm_table *table; + struct mapped_device *md; /* * duplicate new. */ new_name = kstrdup(new, GFP_KERNEL); if (!new_name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); down_write(&_hash_lock); @@ -306,24 +307,24 @@ static int dm_hash_rename(uint32_t cookie, uint32_t *flags, const char *old, */ hc = __get_name_cell(new); if (hc) { - DMWARN("asked to rename to an already existing name %s -> %s", - old, new); + DMWARN("asked to rename to an already-existing name %s -> %s", + param->name, new); dm_put(hc->md); up_write(&_hash_lock); kfree(new_name); - return -EBUSY; + return ERR_PTR(-EBUSY); } /* * Is there such a device as 'old' ? */ - hc = __get_name_cell(old); + hc = __get_name_cell(param->name); if (!hc) { - DMWARN("asked to rename a non existent device %s -> %s", - old, new); + DMWARN("asked to rename a non-existent device %s -> %s", + param->name, new); up_write(&_hash_lock); kfree(new_name); - return -ENXIO; + return ERR_PTR(-ENXIO); } /* @@ -345,13 +346,14 @@ static int dm_hash_rename(uint32_t cookie, uint32_t *flags, const char *old, dm_table_put(table); } - if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie)) - *flags |= DM_UEVENT_GENERATED_FLAG; + if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr)) + param->flags |= DM_UEVENT_GENERATED_FLAG; - dm_put(hc->md); + md = hc->md; up_write(&_hash_lock); kfree(old_name); - return 0; + + return md; } /*----------------------------------------------------------------- @@ -760,6 +762,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) { int r; char *new_name = (char *) param + param->data_start; + struct mapped_device *md; if (new_name < param->data || invalid_str(new_name, (void *) param + param_size) || @@ -772,10 +775,14 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) if (r) return r; - param->data_size = 0; + md = dm_hash_rename(param, new_name); + if (IS_ERR(md)) + return PTR_ERR(md); - return dm_hash_rename(param->event_nr, ¶m->flags, param->name, - new_name); + __dev_status(md, param); + dm_put(md); + + return 0; } static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) -- cgit v1.1 From abdc568b0540bec6d3e0afebac496adef1189b77 Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 12 Aug 2010 04:13:54 +0100 Subject: dm: prevent access to md being deleted This patch prevents access to mapped_device which is being deleted. Currently, even after a mapped_device has been removed from the hash, it could be accessed through idr_find() using minor number. That could cause a race and NULL pointer reference below: CPU0 CPU1 ------------------------------------------------------------------ dev_remove(param) down_write(_hash_lock) dm_lock_for_deletion(md) spin_lock(_minor_lock) set_bit(DMF_DELETING) spin_unlock(_minor_lock) __hash_remove(hc) up_write(_hash_lock) dev_status(param) md = find_device(param) down_read(_hash_lock) __find_device_hash_cell(param) dm_get_md(param->dev) md = dm_find_md(dev) spin_lock(_minor_lock) md = idr_find(MINOR(dev)) spin_unlock(_minor_lock) dm_put(md) free_dev(md) dm_get(md) up_read(_hash_lock) __dev_status(md, param) dm_put(md) This patch fixes such problems. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Cc: stable@kernel.org Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a3f21dc..ba6934c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2136,6 +2136,7 @@ static struct mapped_device *dm_find_md(dev_t dev) md = idr_find(&_minor_idr, minor); if (md && (md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) || + dm_deleting_md(md) || test_bit(DMF_FREEING, &md->flags))) { md = NULL; goto out; -- cgit v1.1 From 98f332855effef02aeb738e4d62e9a5b903c52fd Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 12 Aug 2010 04:13:55 +0100 Subject: dm ioctl: release _hash_lock between devices in remove_all This patch changes dm_hash_remove_all() to release _hash_lock when removing a device. After removing the device, dm_hash_remove_all() takes _hash_lock and searches the hash from scratch again. This patch is a preparation for the next patch, which changes device deletion code to wait for md reference to be 0. Without this patch, the wait in the next patch may cause AB-BA deadlock: CPU0 CPU1 ----------------------------------------------------------------------- dm_hash_remove_all() down_write(_hash_lock) table_status() md = find_device() dm_get(md) holders> dm_get_live_or_inactive_table() dm_get_inactive_table() down_write(_hash_lock) holders to be 0> Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Cc: stable@kernel.org Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 79ee5ba..6a6d475 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -249,40 +249,46 @@ static void __hash_remove(struct hash_cell *hc) static void dm_hash_remove_all(int keep_open_devices) { - int i, dev_skipped, dev_removed; + int i, dev_skipped; struct hash_cell *hc; - struct list_head *tmp, *n; + struct mapped_device *md; + +retry: + dev_skipped = 0; down_write(&_hash_lock); -retry: - dev_skipped = dev_removed = 0; for (i = 0; i < NUM_BUCKETS; i++) { - list_for_each_safe (tmp, n, _name_buckets + i) { - hc = list_entry(tmp, struct hash_cell, name_list); + list_for_each_entry(hc, _name_buckets + i, name_list) { + md = hc->md; + dm_get(md); - if (keep_open_devices && - dm_lock_for_deletion(hc->md)) { + if (keep_open_devices && dm_lock_for_deletion(md)) { + dm_put(md); dev_skipped++; continue; } + __hash_remove(hc); - dev_removed = 1; - } - } - /* - * Some mapped devices may be using other mapped devices, so if any - * still exist, repeat until we make no further progress. - */ - if (dev_skipped) { - if (dev_removed) - goto retry; + up_write(&_hash_lock); - DMWARN("remove_all left %d open device(s)", dev_skipped); + dm_put(md); + + /* + * Some mapped devices may be using other mapped + * devices, so repeat until we make no further + * progress. If a new mapped device is created + * here it will also get removed. + */ + goto retry; + } } up_write(&_hash_lock); + + if (dev_skipped) + DMWARN("remove_all left %d open device(s)", dev_skipped); } static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, -- cgit v1.1 From 3f77316de0ec0fd208467fbee8d9edc70e2c73b2 Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 12 Aug 2010 04:13:56 +0100 Subject: dm: separate device deletion from dm_put This patch separates the device deletion code from dm_put() to make sure the deletion happens in the process context. By this patch, device deletion always occurs in an ioctl (process) context and dm_put() can be called in interrupt context. As a result, the request-based dm's bad dm_put() usage pointed out by Mikulas below disappears. http://marc.info/?l=dm-devel&m=126699981019735&w=2 Without this patch, I confirmed there is a case to crash the system: dm_put() => dm_table_destroy() => vfree() => BUG_ON(in_interrupt()) Some more backgrounds and details: In request-based dm, a device opener can remove a mapped_device while the last request is still completing, because bios in the last request complete first and then the device opener can close and remove the mapped_device before the last request completes: CPU0 CPU1 ================================================================= <> blk_end_request_all(clone_rq) blk_update_request(clone_rq) bio_endio(clone_bio) == end_clone_bio blk_update_request(orig_rq) bio_endio(orig_bio) <> dm_blk_close() dev_remove() dm_put(md) <> blk_finish_request(clone_rq) .... dm_end_request(clone_rq) free_rq_clone(clone_rq) blk_end_request_all(orig_rq) rq_completed(md) So request-based dm used dm_get()/dm_put() to hold md for each I/O until its request completion handling is fully done. However, the final dm_put() can call the device deletion code which must not be run in interrupt context and may cause kernel panic. To solve the problem, this patch moves the device deletion code, dm_destroy(), to predetermined places that is actually deleting the mapped_device in ioctl (process) context, and changes dm_put() just to decrement the reference count of the mapped_device. By this change, dm_put() can be used in any context and the symmetric model below is introduced: dm_create(): create a mapped_device dm_destroy(): destroy a mapped_device dm_get(): increment the reference count of a mapped_device dm_put(): decrement the reference count of a mapped_device dm_destroy() waits for all references of the mapped_device to disappear, then deletes the mapped_device. dm_destroy() uses active waiting with msleep(1), since deleting the mapped_device isn't performance-critical task. And since at this point, nobody opens the mapped_device and no new reference will be taken, the pending counts are just for racing completing activity and will eventually decrease to zero. For the unlikely case of the forced module unload, dm_destroy_immediate(), which doesn't wait and forcibly deletes the mapped_device, is also introduced and used in dm_hash_remove_all(). Otherwise, "rmmod -f" may be stuck and never return. And now, because the mapped_device is deleted at this point, subsequent accesses to the mapped_device may cause NULL pointer references. Cc: stable@kernel.org Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 15 +++++++++---- drivers/md/dm.c | 62 ++++++++++++++++++++++++++++++++++++++------------- drivers/md/dm.h | 5 +++++ 3 files changed, 62 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 6a6d475..feb64d6 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -274,6 +274,10 @@ retry: up_write(&_hash_lock); dm_put(md); + if (likely(keep_open_devices)) + dm_destroy(md); + else + dm_destroy_immediate(md); /* * Some mapped devices may be using other mapped @@ -644,17 +648,19 @@ static int dev_create(struct dm_ioctl *param, size_t param_size) return r; r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); - if (r) - goto out; + if (r) { + dm_put(md); + dm_destroy(md); + return r; + } param->flags &= ~DM_INACTIVE_PRESENT_FLAG; __dev_status(md, param); -out: dm_put(md); - return r; + return 0; } /* @@ -748,6 +754,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) param->flags |= DM_UEVENT_GENERATED_FLAG; dm_put(md); + dm_destroy(md); return 0; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ba6934c..a503b95 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -2171,6 +2172,7 @@ void dm_set_mdptr(struct mapped_device *md, void *ptr) void dm_get(struct mapped_device *md) { atomic_inc(&md->holders); + BUG_ON(test_bit(DMF_FREEING, &md->flags)); } const char *dm_device_name(struct mapped_device *md) @@ -2179,27 +2181,55 @@ const char *dm_device_name(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_device_name); -void dm_put(struct mapped_device *md) +static void __dm_destroy(struct mapped_device *md, bool wait) { struct dm_table *map; - BUG_ON(test_bit(DMF_FREEING, &md->flags)); + might_sleep(); - if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { - map = dm_get_live_table(md); - idr_replace(&_minor_idr, MINOR_ALLOCED, - MINOR(disk_devt(dm_disk(md)))); - set_bit(DMF_FREEING, &md->flags); - spin_unlock(&_minor_lock); - if (!dm_suspended_md(md)) { - dm_table_presuspend_targets(map); - dm_table_postsuspend_targets(map); - } - dm_sysfs_exit(md); - dm_table_put(map); - dm_table_destroy(__unbind(md)); - free_dev(md); + spin_lock(&_minor_lock); + map = dm_get_live_table(md); + idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); + set_bit(DMF_FREEING, &md->flags); + spin_unlock(&_minor_lock); + + if (!dm_suspended_md(md)) { + dm_table_presuspend_targets(map); + dm_table_postsuspend_targets(map); } + + /* + * Rare, but there may be I/O requests still going to complete, + * for example. Wait for all references to disappear. + * No one should increment the reference count of the mapped_device, + * after the mapped_device state becomes DMF_FREEING. + */ + if (wait) + while (atomic_read(&md->holders)) + msleep(1); + else if (atomic_read(&md->holders)) + DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", + dm_device_name(md), atomic_read(&md->holders)); + + dm_sysfs_exit(md); + dm_table_put(map); + dm_table_destroy(__unbind(md)); + free_dev(md); +} + +void dm_destroy(struct mapped_device *md) +{ + __dm_destroy(md, true); +} + +void dm_destroy_immediate(struct mapped_device *md) +{ + __dm_destroy(md, false); +} + +void dm_put(struct mapped_device *md) +{ + atomic_dec(&md->holders); } EXPORT_SYMBOL_GPL(dm_put); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index bad1724..8223671 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -122,6 +122,11 @@ void dm_linear_exit(void); int dm_stripe_init(void); void dm_stripe_exit(void); +/* + * mapped_device operations + */ +void dm_destroy(struct mapped_device *md); +void dm_destroy_immediate(struct mapped_device *md); int dm_open_count(struct mapped_device *md); int dm_lock_for_deletion(struct mapped_device *md); -- cgit v1.1 From 402ab352c2c00ba8f90c724565f8cf31210d99cf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 12 Aug 2010 04:13:57 +0100 Subject: dm ioctl: use nonseekable_open The dm control device does not implement read/write, so it has no use for seeking. Using no_llseek prevents falling back to default_llseek, which requires the BKL. Signed-off-by: Arnd Bergmann Signed-off-by: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index feb64d6..3fd8f0e 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1602,6 +1602,7 @@ static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u) #endif static const struct file_operations _ctl_fops = { + .open = nonseekable_open, .unlocked_ioctl = dm_ctl_ioctl, .compat_ioctl = dm_compat_ctl_ioctl, .owner = THIS_MODULE, -- cgit v1.1 From a9c88f2ebc1a5937915cb3b89c9f03894134f39a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 12 Aug 2010 04:13:58 +0100 Subject: dm crypt: use kstrdup Use kstrdup when the goal of an allocation is copy a string into the allocated region. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ expression from,to; expression flag,E1,E2; statement S; @@ - to = kmalloc(strlen(from) + 1,flag); + to = kstrdup(from, flag); ... when != \(from = E1 \| to = E1 \) if (to==NULL || ...) S ... when != \(from = E2 \| to = E2 \) - strcpy(to, from); // Signed-off-by: Julia Lawall Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3bdbb61..a8aab9c 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1168,12 +1168,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ivmode && cc->iv_gen_ops) { if (ivopts) *(ivopts - 1) = ':'; - cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL); + cc->iv_mode = kstrdup(ivmode, GFP_KERNEL); if (!cc->iv_mode) { ti->error = "Error kmallocing iv_mode string"; goto bad_ivmode_string; } - strcpy(cc->iv_mode, ivmode); } else cc->iv_mode = NULL; -- cgit v1.1 From 87c961cb747fa55b664b76abfcb9d44c14ae851f Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Thu, 12 Aug 2010 04:13:59 +0100 Subject: dm snapshot: persistent use define for disk header chunk size This patch fixes hard-coded value for the size of a chunk that includes disk header for persistent snapshot. It should be changed to existing macro NUM_SNAPSHOT_HDR_CHUNKS instead of using hard-coded value 1. Signed-off-by: Tomohiro Kusumi Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap-persistent.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index c097d8a..cc2bdb8 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -266,7 +266,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, */ static chunk_t area_location(struct pstore *ps, chunk_t area) { - return 1 + ((ps->exceptions_per_area + 1) * area); + return NUM_SNAPSHOT_HDR_CHUNKS + ((ps->exceptions_per_area + 1) * area); } /* @@ -780,8 +780,8 @@ static int persistent_commit_merge(struct dm_exception_store *store, * ps->current_area does not get reduced by prepare_merge() until * after commit_merge() has removed the nr_merged previous exceptions. */ - ps->next_free = (area_location(ps, ps->current_area) - 1) + - (ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS; + ps->next_free = area_location(ps, ps->current_area) + + ps->current_committed + 1; return 0; } -- cgit v1.1 From 708e929513502fb050c0a3c3ee267cab5b056ded Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 12 Aug 2010 04:14:00 +0100 Subject: dm: skip second flush on bio unsupported error When processing barriers, skip the second flush if processing the bio failed with -EOPNOTSUPP. This can happen with discard+barrier requests. If the device doesn't support discard, there would be two useless SYNCHRONIZE CACHE commands. The first dm_flush cannot be so easily optimized out, so we leave it there. Previously, -EOPNOTSUPP could be received in dec_pending only with empty barriers and we ignored that error, assuming the device not supporting cache flushes has cache always consistent. With the addition of discard barriers, this -EOPNOTSUPP can also be generated by discards and we must record it in md->barrier_error for process_barrier. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a503b95..f3cc5d9 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -639,8 +639,14 @@ static void dec_pending(struct dm_io *io, int error) * There can be just one barrier request so we use * a per-device variable for error reporting. * Note that you can't touch the bio after end_io_acct + * + * We ignore -EOPNOTSUPP for empty flush reported by + * underlying devices. We assume that if the device + * doesn't support empty barriers, it doesn't need + * cache flushing commands. */ - if (!md->barrier_error && io_error != -EOPNOTSUPP) + if (!md->barrier_error && + !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP)) md->barrier_error = io_error; end_io_acct(io); free_io(md, io); @@ -2284,7 +2290,12 @@ static void process_barrier(struct mapped_device *md, struct bio *bio) if (!bio_empty_barrier(bio)) { __split_and_process_bio(md, bio); - dm_flush(md); + /* + * If the request isn't supported, don't waste time with + * the second flush. + */ + if (md->barrier_error != -EOPNOTSUPP) + dm_flush(md); } if (md->barrier_error != DM_ENDIO_REQUEUE) -- cgit v1.1 From a5664dad7e1a278d2915c2bf79cf42250e12d7db Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Aug 2010 04:14:01 +0100 Subject: dm ioctl: make bio or request based device type immutable Determine whether a mapped device is bio-based or request-based when loading its first (inactive) table and don't allow that to be changed later. This patch performs different device initialisation in each of the two cases. (We don't think it's necessary to add code to support changing between the two types.) Allowed md->type transitions: DM_TYPE_NONE to DM_TYPE_BIO_BASED DM_TYPE_NONE to DM_TYPE_REQUEST_BASED We now prevent table_load from replacing the inactive table with a conflicting type of table even after an explicit table_clear. Introduce 'type_lock' into the struct mapped_device to protect md->type and to prepare for the next patch that will change the queue initialization and allocate memory while md->type_lock is held. Signed-off-by: Mike Snitzer Acked-by: Kiyoshi Ueda Signed-off-by: Alasdair G Kergon drivers/md/dm-ioctl.c | 15 +++++++++++++++ drivers/md/dm.c | 37 ++++++++++++++++++++++++++++++------- drivers/md/dm.h | 5 +++++ include/linux/dm-ioctl.h | 4 ++-- 4 files changed, 52 insertions(+), 9 deletions(-) --- drivers/md/dm-ioctl.c | 15 +++++++++++++++ drivers/md/dm.c | 37 ++++++++++++++++++++++++++++++------- drivers/md/dm.h | 5 +++++ include/linux/dm-ioctl.h | 4 ++-- 4 files changed, 52 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 3fd8f0e..4702f38 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1189,6 +1189,21 @@ static int table_load(struct dm_ioctl *param, size_t param_size) goto out; } + /* Protect md->type against concurrent table loads. */ + dm_lock_md_type(md); + if (dm_get_md_type(md) == DM_TYPE_NONE) + /* Initial table load: acquire type of table. */ + dm_set_md_type(md, dm_table_get_type(t)); + else if (dm_get_md_type(md) != dm_table_get_type(t)) { + DMWARN("can't change device type after initial table load."); + dm_table_destroy(t); + dm_unlock_md_type(md); + r = -EINVAL; + goto out; + } + dm_unlock_md_type(md); + + /* stage inactive table */ down_write(&_hash_lock); hc = dm_get_mdptr(md); if (!hc || hc->md != md) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f3cc5d9..345e94c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -125,6 +125,10 @@ struct mapped_device { unsigned long flags; struct request_queue *queue; + unsigned type; + /* Protect type against concurrent access. */ + struct mutex type_lock; + struct gendisk *disk; char name[16]; @@ -1877,8 +1881,10 @@ static struct mapped_device *alloc_dev(int minor) if (r < 0) goto bad_minor; + md->type = DM_TYPE_NONE; init_rwsem(&md->io_lock); mutex_init(&md->suspend_lock); + mutex_init(&md->type_lock); spin_lock_init(&md->deferred_lock); spin_lock_init(&md->barrier_error_lock); rwlock_init(&md->map_lock); @@ -2130,6 +2136,30 @@ int dm_create(int minor, struct mapped_device **result) return 0; } +/* + * Functions to manage md->type. + * All are required to hold md->type_lock. + */ +void dm_lock_md_type(struct mapped_device *md) +{ + mutex_lock(&md->type_lock); +} + +void dm_unlock_md_type(struct mapped_device *md) +{ + mutex_unlock(&md->type_lock); +} + +void dm_set_md_type(struct mapped_device *md, unsigned type) +{ + md->type = type; +} + +unsigned dm_get_md_type(struct mapped_device *md) +{ + return md->type; +} + static struct mapped_device *dm_find_md(dev_t dev) { struct mapped_device *md; @@ -2440,13 +2470,6 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) goto out; } - /* cannot change the device type, once a table is bound */ - if (md->map && - (dm_table_get_type(md->map) != dm_table_get_type(table))) { - DMWARN("can't change the device type after a table is bound"); - goto out; - } - map = __bind(md, table, &limits); out: diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 8223671..1db7825 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -66,6 +66,11 @@ int dm_table_alloc_md_mempools(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); +void dm_lock_md_type(struct mapped_device *md); +void dm_unlock_md_type(struct mapped_device *md); +void dm_set_md_type(struct mapped_device *md, unsigned type); +unsigned dm_get_md_type(struct mapped_device *md); + /* * To check the return value from dm_table_find_target(). */ diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 2c445e1..43b2de1 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -266,9 +266,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 17 +#define DM_VERSION_MINOR 18 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2010-03-05)" +#define DM_VERSION_EXTRA "-ioctl (2010-06-29)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.1 From 4a0b4ddf261fc89c050fe0a10ec57a61251d7ac0 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Aug 2010 04:14:02 +0100 Subject: dm: do not initialise full request queue when bio based Change bio-based mapped devices no longer to have a fully initialized request_queue (request_fn, elevator, etc). This means bio-based DM devices no longer register elevator sysfs attributes ('iosched/' tree or 'scheduler' other than "none"). In contrast, a request-based DM device will continue to have a full request_queue and will register elevator sysfs attributes. Therefore a user can determine a DM device's type by checking if elevator sysfs attributes exist. First allocate a minimalist request_queue structure for a DM device (needed for both bio and request-based DM). Initialization of a full request_queue is deferred until it is known that the DM device is request-based, at the end of the table load sequence. Factor DM device's request_queue initialization: - common to both request-based and bio-based into dm_init_md_queue(). - specific to request-based into dm_init_request_based_queue(). The md->type_lock mutex is used to protect md->queue, in addition to md->type, during table_load(). A DM device's first table_load will establish the immutable md->type. But md->queue initialization, based on md->type, may fail at that time (because blk_init_allocated_queue cannot allocate memory). Therefore any subsequent table_load must (re)try dm_setup_md_queue independently of establishing md->type. Signed-off-by: Mike Snitzer Acked-by: Kiyoshi Ueda Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 11 +++++- drivers/md/dm.c | 92 +++++++++++++++++++++++++++++++++++++-------------- drivers/md/dm.h | 2 ++ 3 files changed, 79 insertions(+), 26 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4702f38..ed85859 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1189,7 +1189,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size) goto out; } - /* Protect md->type against concurrent table loads. */ + /* Protect md->type and md->queue against concurrent table loads. */ dm_lock_md_type(md); if (dm_get_md_type(md) == DM_TYPE_NONE) /* Initial table load: acquire type of table. */ @@ -1201,6 +1201,15 @@ static int table_load(struct dm_ioctl *param, size_t param_size) r = -EINVAL; goto out; } + + /* setup md->queue to reflect md's type (may block) */ + r = dm_setup_md_queue(md); + if (r) { + DMWARN("unable to set up device queue for new table."); + dm_table_destroy(t); + dm_unlock_md_type(md); + goto out; + } dm_unlock_md_type(md); /* stage inactive table */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 345e94c..5ae0a05 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -126,7 +126,7 @@ struct mapped_device { struct request_queue *queue; unsigned type; - /* Protect type against concurrent access. */ + /* Protect queue and type against concurrent access. */ struct mutex type_lock; struct gendisk *disk; @@ -1856,6 +1856,28 @@ static const struct block_device_operations dm_blk_dops; static void dm_wq_work(struct work_struct *work); static void dm_rq_barrier_work(struct work_struct *work); +static void dm_init_md_queue(struct mapped_device *md) +{ + /* + * Request-based dm devices cannot be stacked on top of bio-based dm + * devices. The type of this dm device has not been decided yet. + * The type is decided at the first table loading time. + * To prevent problematic device stacking, clear the queue flag + * for request stacking support until then. + * + * This queue is new, so no concurrency on the queue_flags. + */ + queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); + + md->queue->queuedata = md; + md->queue->backing_dev_info.congested_fn = dm_any_congested; + md->queue->backing_dev_info.congested_data = md; + blk_queue_make_request(md->queue, dm_request); + blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); + md->queue->unplug_fn = dm_unplug_all; + blk_queue_merge_bvec(md->queue, dm_merge_bvec); +} + /* * Allocate and initialise a blank device with a given minor. */ @@ -1895,33 +1917,11 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->uevent_list); spin_lock_init(&md->uevent_lock); - md->queue = blk_init_queue(dm_request_fn, NULL); + md->queue = blk_alloc_queue(GFP_KERNEL); if (!md->queue) goto bad_queue; - /* - * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. The type of this dm device has not been decided yet, - * although we initialized the queue using blk_init_queue(). - * The type is decided at the first table loading time. - * To prevent problematic device stacking, clear the queue flag - * for request stacking support until then. - * - * This queue is new, so no concurrency on the queue_flags. - */ - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); - md->saved_make_request_fn = md->queue->make_request_fn; - md->queue->queuedata = md; - md->queue->backing_dev_info.congested_fn = dm_any_congested; - md->queue->backing_dev_info.congested_data = md; - blk_queue_make_request(md->queue, dm_request); - blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); - md->queue->unplug_fn = dm_unplug_all; - blk_queue_merge_bvec(md->queue, dm_merge_bvec); - blk_queue_softirq_done(md->queue, dm_softirq_done); - blk_queue_prep_rq(md->queue, dm_prep_fn); - blk_queue_lld_busy(md->queue, dm_lld_busy); - blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH); + dm_init_md_queue(md); md->disk = alloc_disk(1); if (!md->disk) @@ -2160,6 +2160,48 @@ unsigned dm_get_md_type(struct mapped_device *md) return md->type; } +/* + * Fully initialize a request-based queue (->elevator, ->request_fn, etc). + */ +static int dm_init_request_based_queue(struct mapped_device *md) +{ + struct request_queue *q = NULL; + + if (md->queue->elevator) + return 1; + + /* Fully initialize the queue */ + q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); + if (!q) + return 0; + + md->queue = q; + md->saved_make_request_fn = md->queue->make_request_fn; + dm_init_md_queue(md); + blk_queue_softirq_done(md->queue, dm_softirq_done); + blk_queue_prep_rq(md->queue, dm_prep_fn); + blk_queue_lld_busy(md->queue, dm_lld_busy); + blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH); + + elv_register_queue(md->queue); + + return 1; +} + +/* + * Setup the DM device's queue based on md's type + */ +int dm_setup_md_queue(struct mapped_device *md) +{ + if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && + !dm_init_request_based_queue(md)) { + DMWARN("Cannot initialize queue for request-based mapped device"); + return -EINVAL; + } + + return 0; +} + static struct mapped_device *dm_find_md(dev_t dev) { struct mapped_device *md; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 1db7825..450fbd9 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -71,6 +71,8 @@ void dm_unlock_md_type(struct mapped_device *md); void dm_set_md_type(struct mapped_device *md, unsigned type); unsigned dm_get_md_type(struct mapped_device *md); +int dm_setup_md_queue(struct mapped_device *md); + /* * To check the return value from dm_table_find_target(). */ -- cgit v1.1 From b1d5552838334c600b068c9c8cc18638e5a8cb47 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 12 Aug 2010 04:14:02 +0100 Subject: dm snapshot: implement merge Implement merge method for the snapshot origin to improve read performance. Without merge method, dm asks the upper layers to submit smallest possible bios --- one page. Submitting such small bios impacts performance negatively when reading or writing the origin device. Without this patch, CPU consumption when reading the origin on lvm on md-raid0 was 6 to 12%, with this patch, it drops to 1 to 4%. Note: in my testing, it actually degraded performance in some settings, I traced it to Maxtor disks having problems with > 512-sector requests. Reducing the number of sectors to /sys/block/sd*/queue/max_sectors_kb to 256 fixed the read performance. I think we don't have to care about weird disks that actually degrade performance because of large requests being sent to them. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index a1f2ab5..96feada 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -2171,6 +2171,21 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result, return 0; } +static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct dm_dev *dev = ti->private; + struct request_queue *q = bdev_get_queue(dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = dev->bdev; + bvm->bi_sector = bvm->bi_sector; + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + static int origin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { @@ -2188,6 +2203,7 @@ static struct target_type origin_target = { .map = origin_map, .resume = origin_resume, .status = origin_status, + .merge = origin_merge, .iterate_devices = origin_iterate_devices, }; -- cgit v1.1 From 26803b9f06d365122fae82e7554a66ef8278e0bb Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Thu, 12 Aug 2010 04:14:03 +0100 Subject: dm ioctl: refactor dm_table_complete This change unifies the various checks and finalization that occurs on a table prior to use. By doing so, it allows table construction without traversing the dm-ioctl interface. Signed-off-by: Will Drewry Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 34 ------------------------------- drivers/md/dm-table.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++--- drivers/md/dm.h | 1 - 3 files changed, 52 insertions(+), 38 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index ed85859..4d4ced8 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1131,28 +1131,9 @@ static int populate_table(struct dm_table *table, next = spec->next; } - r = dm_table_set_type(table); - if (r) { - DMWARN("unable to set table type"); - return r; - } - return dm_table_complete(table); } -static int table_prealloc_integrity(struct dm_table *t, - struct mapped_device *md) -{ - struct list_head *devices = dm_table_get_devices(t); - struct dm_dev_internal *dd; - - list_for_each_entry(dd, devices, list) - if (bdev_get_integrity(dd->dm_dev.bdev)) - return blk_integrity_register(dm_disk(md), NULL); - - return 0; -} - static int table_load(struct dm_ioctl *param, size_t param_size) { int r; @@ -1174,21 +1155,6 @@ static int table_load(struct dm_ioctl *param, size_t param_size) goto out; } - r = table_prealloc_integrity(t, md); - if (r) { - DMERR("%s: could not register integrity profile.", - dm_device_name(md)); - dm_table_destroy(t); - goto out; - } - - r = dm_table_alloc_md_mempools(t); - if (r) { - DMWARN("unable to allocate mempools for this table"); - dm_table_destroy(t); - goto out; - } - /* Protect md->type and md->queue against concurrent table loads. */ dm_lock_md_type(md); if (dm_get_md_type(md) == DM_TYPE_NONE) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 9924ea2..bc60ef7 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -245,7 +245,7 @@ void dm_table_destroy(struct dm_table *t) msleep(1); smp_mb(); - /* free the indexes (see dm_table_complete) */ + /* free the indexes */ if (t->depth >= 2) vfree(t->index[t->depth - 2]); @@ -778,7 +778,7 @@ int dm_table_add_target(struct dm_table *t, const char *type, return r; } -int dm_table_set_type(struct dm_table *t) +static int dm_table_set_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0; @@ -900,7 +900,7 @@ static int setup_indexes(struct dm_table *t) /* * Builds the btree to index the map. */ -int dm_table_complete(struct dm_table *t) +static int dm_table_build_index(struct dm_table *t) { int r = 0; unsigned int leaf_nodes; @@ -919,6 +919,55 @@ int dm_table_complete(struct dm_table *t) return r; } +/* + * Register the mapped device for blk_integrity support if + * the underlying devices support it. + */ +static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md) +{ + struct list_head *devices = dm_table_get_devices(t); + struct dm_dev_internal *dd; + + list_for_each_entry(dd, devices, list) + if (bdev_get_integrity(dd->dm_dev.bdev)) + return blk_integrity_register(dm_disk(md), NULL); + + return 0; +} + +/* + * Prepares the table for use by building the indices, + * setting the type, and allocating mempools. + */ +int dm_table_complete(struct dm_table *t) +{ + int r; + + r = dm_table_set_type(t); + if (r) { + DMERR("unable to set table type"); + return r; + } + + r = dm_table_build_index(t); + if (r) { + DMERR("unable to build btrees"); + return r; + } + + r = dm_table_prealloc_integrity(t, t->md); + if (r) { + DMERR("could not register integrity profile."); + return r; + } + + r = dm_table_alloc_md_mempools(t); + if (r) + DMERR("unable to allocate mempools"); + + return r; +} + static DEFINE_MUTEX(_event_lock); void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 450fbd9..0d7b374 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -59,7 +59,6 @@ void dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); int dm_table_any_busy_target(struct dm_table *t); -int dm_table_set_type(struct dm_table *t); unsigned dm_table_get_type(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); int dm_table_alloc_md_mempools(struct dm_table *t); -- cgit v1.1 From 57cba5d3658d9fdc019c6af14a2d80aefa651e56 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Aug 2010 04:14:04 +0100 Subject: dm: rename map_info flush_request to target_request_nr 'target_request_nr' is a more generic name that reflects the fact that it will be used for both flush and discard support. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 2 +- drivers/md/dm-stripe.c | 6 ++++-- drivers/md/dm.c | 18 +++++++++--------- include/linux/device-mapper.h | 4 ++-- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 96feada..5974d30 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1692,7 +1692,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, chunk_t chunk; if (unlikely(bio_empty_barrier(bio))) { - if (!map_context->flush_request) + if (!map_context->target_request_nr) bio->bi_bdev = s->origin->bdev; else bio->bi_bdev = s->cow->bdev; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index d6e28d7..22d5e2f 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -213,10 +213,12 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, struct stripe_c *sc = (struct stripe_c *) ti->private; sector_t offset, chunk; uint32_t stripe; + unsigned target_request_nr; if (unlikely(bio_empty_barrier(bio))) { - BUG_ON(map_context->flush_request >= sc->stripes); - bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev; + target_request_nr = map_context->target_request_nr; + BUG_ON(target_request_nr >= sc->stripes); + bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; return DM_MAPIO_REMAPPED; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5ae0a05..0d47101 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1183,12 +1183,12 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, } static void __flush_target(struct clone_info *ci, struct dm_target *ti, - unsigned flush_nr) + unsigned request_nr) { struct dm_target_io *tio = alloc_tio(ci, ti); struct bio *clone; - tio->info.flush_request = flush_nr; + tio->info.target_request_nr = request_nr; clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); __bio_clone(clone, ci->bio); @@ -1199,13 +1199,13 @@ static void __flush_target(struct clone_info *ci, struct dm_target *ti, static int __clone_and_map_empty_barrier(struct clone_info *ci) { - unsigned target_nr = 0, flush_nr; + unsigned target_nr = 0, request_nr; struct dm_target *ti; while ((ti = dm_table_get_target(ci->map, target_nr++))) - for (flush_nr = 0; flush_nr < ti->num_flush_requests; - flush_nr++) - __flush_target(ci, ti, flush_nr); + for (request_nr = 0; request_nr < ti->num_flush_requests; + request_nr++) + __flush_target(ci, ti, request_nr); ci->sector_count = 0; @@ -2424,11 +2424,11 @@ static void dm_queue_flush(struct mapped_device *md) queue_work(md->wq, &md->work); } -static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr) +static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr) { struct dm_rq_target_io *tio = clone->end_io_data; - tio->info.flush_request = flush_nr; + tio->info.target_request_nr = request_nr; } /* Issue barrier requests to targets and wait for their completion. */ @@ -2446,7 +2446,7 @@ static int dm_rq_barrier(struct mapped_device *md) ti = dm_table_get_target(map, i); for (j = 0; j < ti->num_flush_requests; j++) { clone = clone_rq(md->flush_request, md, GFP_NOIO); - dm_rq_set_flush_nr(clone, j); + dm_rq_set_target_request_nr(clone, j); atomic_inc(&md->pending[rq_data_dir(clone)]); map_request(ti, clone, md); } diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 1381cd9..531a6f2 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -22,7 +22,7 @@ typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; union map_info { void *ptr; unsigned long long ll; - unsigned flush_request; + unsigned target_request_nr; }; /* @@ -174,7 +174,7 @@ struct dm_target { * A number of zero-length barrier requests that will be submitted * to the target for the purpose of flushing cache. * - * The request number will be placed in union map_info->flush_request. + * The request number will be placed in union map_info->target_request_nr. * It is a responsibility of the target driver to remap these requests * to the real underlying devices. */ -- cgit v1.1 From 7e507eb6432afdd798d4c6dccf949b8c43ef151c Mon Sep 17 00:00:00 2001 From: Peter Rajnoha Date: Thu, 12 Aug 2010 04:14:05 +0100 Subject: dm: allow autoloading of dm mod Add devname:mapper/control and MAPPER_CTRL_MINOR module alias to support dm-mod module autoloading. Signed-off-by: Kay Sievers Signed-off-by: Peter Rajnoha Signed-off-by: Alasdair G Kergon --- Documentation/devices.txt | 1 + drivers/md/dm-ioctl.c | 7 +++++-- include/linux/dm-ioctl.h | 1 + include/linux/miscdevice.h | 1 + 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Documentation/devices.txt b/Documentation/devices.txt index f2da781..d0d1df6 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt @@ -445,6 +445,7 @@ Your cooperation is appreciated. 233 = /dev/kmview View-OS A process with a view 234 = /dev/btrfs-control Btrfs control device 235 = /dev/autofs Autofs control device + 236 = /dev/mapper/control Device-Mapper control device 240-254 Reserved for local use 255 Reserved for MISC_DYNAMIC_MINOR diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4d4ced8..3e39193 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1599,12 +1599,15 @@ static const struct file_operations _ctl_fops = { }; static struct miscdevice _dm_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = MAPPER_CTRL_MINOR, .name = DM_NAME, - .nodename = "mapper/control", + .nodename = DM_DIR "/" DM_CONTROL_NODE, .fops = &_ctl_fops }; +MODULE_ALIAS_MISCDEV(MAPPER_CTRL_MINOR); +MODULE_ALIAS("devname:" DM_DIR "/" DM_CONTROL_NODE); + /* * Create misc character device and link to DM_DIR/control. */ diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 43b2de1..49eab36 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -11,6 +11,7 @@ #include #define DM_DIR "mapper" /* Slashes not supported */ +#define DM_CONTROL_NODE "control" #define DM_MAX_TYPE_NAME 16 #define DM_NAME_LEN 128 #define DM_UUID_LEN 129 diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index f6c9b7d..bafffc7 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -38,6 +38,7 @@ #define KVM_MINOR 232 #define BTRFS_MINOR 234 #define AUTOFS_MINOR 235 +#define MAPPER_CTRL_MINOR 236 #define MISC_DYNAMIC_MINOR 255 struct device; -- cgit v1.1 From 28513fccf0ceefb8171ddc0cefa429b82e92a2c9 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 12 Aug 2010 04:14:06 +0100 Subject: dm crypt: simplify crypt_config destruction logic Use just one label and reuse common destructor for crypt target. Parse remaining argv arguments in logic order. Also do not ignore error values from IV init and set key functions. No functional change in this patch except changed return codes based on above. Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 180 ++++++++++++++++++++++++++------------------------ 1 file changed, 93 insertions(+), 87 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index a8aab9c..139bbe2 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -999,6 +999,45 @@ static int crypt_wipe_key(struct crypt_config *cc) return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); } +static void crypt_dtr(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + ti->private = NULL; + + if (!cc) + return; + + if (cc->io_queue) + destroy_workqueue(cc->io_queue); + if (cc->crypt_queue) + destroy_workqueue(cc->crypt_queue); + + if (cc->bs) + bioset_free(cc->bs); + + if (cc->page_pool) + mempool_destroy(cc->page_pool); + if (cc->req_pool) + mempool_destroy(cc->req_pool); + if (cc->io_pool) + mempool_destroy(cc->io_pool); + + if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) + cc->iv_gen_ops->dtr(cc); + + if (cc->tfm && !IS_ERR(cc->tfm)) + crypto_free_ablkcipher(cc->tfm); + + if (cc->dev) + dm_put_device(ti, cc->dev); + + kfree(cc->iv_mode); + + /* Must zero key material before freeing */ + kzfree(cc); +} + /* * Construct an encryption mapping: * @@ -1006,7 +1045,6 @@ static int crypt_wipe_key(struct crypt_config *cc) static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct crypt_config *cc; - struct crypto_ablkcipher *tfm; char *tmp; char *cipher; char *chainmode; @@ -1014,6 +1052,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) char *ivopts; unsigned int key_size; unsigned long long tmpll; + int ret = -EINVAL; if (argc != 5) { ti->error = "Not enough arguments"; @@ -1032,12 +1071,13 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) key_size = strlen(argv[1]) >> 1; cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); - if (cc == NULL) { - ti->error = - "Cannot allocate transparent encryption context"; + if (!cc) { + ti->error = "Cannot allocate transparent encryption context"; return -ENOMEM; } + ti->private = cc; + /* Compatibility mode for old dm-crypt cipher strings */ if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { chainmode = "cbc"; @@ -1046,35 +1086,36 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (strcmp(chainmode, "ecb") && !ivmode) { ti->error = "This chaining mode requires an IV mechanism"; - goto bad_cipher; + goto bad; } + ret = -ENOMEM; if (snprintf(cc->cipher, CRYPTO_MAX_ALG_NAME, "%s(%s)", chainmode, cipher) >= CRYPTO_MAX_ALG_NAME) { ti->error = "Chain mode + cipher name is too long"; - goto bad_cipher; + goto bad; } - tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0); - if (IS_ERR(tfm)) { + cc->tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0); + if (IS_ERR(cc->tfm)) { ti->error = "Error allocating crypto tfm"; - goto bad_cipher; + goto bad; } strcpy(cc->cipher, cipher); strcpy(cc->chainmode, chainmode); - cc->tfm = tfm; - if (crypt_set_key(cc, argv[1]) < 0) { + ret = crypt_set_key(cc, argv[1]); + if (ret < 0) { ti->error = "Error decoding and setting key"; - goto bad_ivmode; + goto bad; } /* * Choose ivmode. Valid modes: "plain", "essiv:", "benbi". * See comments at iv code */ - + ret = -EINVAL; if (ivmode == NULL) cc->iv_gen_ops = NULL; else if (strcmp(ivmode, "plain") == 0) @@ -1089,20 +1130,28 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->iv_gen_ops = &crypt_iv_null_ops; else { ti->error = "Invalid IV mode"; - goto bad_ivmode; + goto bad; } - if (cc->iv_gen_ops && cc->iv_gen_ops->ctr && - cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) - goto bad_ivmode; + /* Allocate IV */ + if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) { + ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); + if (ret < 0) { + ti->error = "Error creating IV"; + goto bad; + } + } - if (cc->iv_gen_ops && cc->iv_gen_ops->init && - cc->iv_gen_ops->init(cc) < 0) { - ti->error = "Error initialising IV"; - goto bad_slab_pool; + /* Initialize IV (set keys for ESSIV etc) */ + if (cc->iv_gen_ops && cc->iv_gen_ops->init) { + ret = cc->iv_gen_ops->init(cc); + if (ret < 0) { + ti->error = "Error initialising IV"; + goto bad; + } } - cc->iv_size = crypto_ablkcipher_ivsize(tfm); + cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); if (cc->iv_size) /* at least a 64 bit sector number should fit in our buffer */ cc->iv_size = max(cc->iv_size, @@ -1116,62 +1165,65 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } } + ret = -ENOMEM; cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool); if (!cc->io_pool) { ti->error = "Cannot allocate crypt io mempool"; - goto bad_slab_pool; + goto bad; } cc->dmreq_start = sizeof(struct ablkcipher_request); - cc->dmreq_start += crypto_ablkcipher_reqsize(tfm); + cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); - cc->dmreq_start += crypto_ablkcipher_alignmask(tfm) & + cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & ~(crypto_tfm_ctx_alignment() - 1); cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + sizeof(struct dm_crypt_request) + cc->iv_size); if (!cc->req_pool) { ti->error = "Cannot allocate crypt request mempool"; - goto bad_req_pool; + goto bad; } cc->req = NULL; cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { ti->error = "Cannot allocate page mempool"; - goto bad_page_pool; + goto bad; } cc->bs = bioset_create(MIN_IOS, 0); if (!cc->bs) { ti->error = "Cannot allocate crypt bioset"; - goto bad_bs; + goto bad; } + ret = -EINVAL; if (sscanf(argv[2], "%llu", &tmpll) != 1) { ti->error = "Invalid iv_offset sector"; - goto bad_device; + goto bad; } cc->iv_offset = tmpll; + if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev)) { + ti->error = "Device lookup failed"; + goto bad; + } + if (sscanf(argv[4], "%llu", &tmpll) != 1) { ti->error = "Invalid device sector"; - goto bad_device; + goto bad; } cc->start = tmpll; - if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev)) { - ti->error = "Device lookup failed"; - goto bad_device; - } - + ret = -ENOMEM; if (ivmode && cc->iv_gen_ops) { if (ivopts) *(ivopts - 1) = ':'; cc->iv_mode = kstrdup(ivmode, GFP_KERNEL); if (!cc->iv_mode) { ti->error = "Error kmallocing iv_mode string"; - goto bad_ivmode_string; + goto bad; } } else cc->iv_mode = NULL; @@ -1179,67 +1231,21 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->io_queue = create_singlethread_workqueue("kcryptd_io"); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; - goto bad_io_queue; + goto bad; } cc->crypt_queue = create_singlethread_workqueue("kcryptd"); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; - goto bad_crypt_queue; + goto bad; } ti->num_flush_requests = 1; - ti->private = cc; return 0; -bad_crypt_queue: - destroy_workqueue(cc->io_queue); -bad_io_queue: - kfree(cc->iv_mode); -bad_ivmode_string: - dm_put_device(ti, cc->dev); -bad_device: - bioset_free(cc->bs); -bad_bs: - mempool_destroy(cc->page_pool); -bad_page_pool: - mempool_destroy(cc->req_pool); -bad_req_pool: - mempool_destroy(cc->io_pool); -bad_slab_pool: - if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) - cc->iv_gen_ops->dtr(cc); -bad_ivmode: - crypto_free_ablkcipher(tfm); -bad_cipher: - /* Must zero key material before freeing */ - kzfree(cc); - return -EINVAL; -} - -static void crypt_dtr(struct dm_target *ti) -{ - struct crypt_config *cc = (struct crypt_config *) ti->private; - - destroy_workqueue(cc->io_queue); - destroy_workqueue(cc->crypt_queue); - - if (cc->req) - mempool_free(cc->req, cc->req_pool); - - bioset_free(cc->bs); - mempool_destroy(cc->page_pool); - mempool_destroy(cc->req_pool); - mempool_destroy(cc->io_pool); - - kfree(cc->iv_mode); - if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) - cc->iv_gen_ops->dtr(cc); - crypto_free_ablkcipher(cc->tfm); - dm_put_device(ti, cc->dev); - - /* Must zero key material before freeing */ - kzfree(cc); +bad: + crypt_dtr(ti); + return ret; } static int crypt_map(struct dm_target *ti, struct bio *bio, -- cgit v1.1 From 5ebaee6d290279d1df6ce45d6d54de8cfc473273 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 12 Aug 2010 04:14:07 +0100 Subject: dm crypt: simplify crypt_ctr Allocate cipher strings indpendently of struct crypt_config and move cipher parsing and allocation into a separate function to prepare for supporting the cryptoapi format e.g. "xts(aes)". No functional change in this patch. Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 185 ++++++++++++++++++++++++++++---------------------- 1 file changed, 104 insertions(+), 81 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 139bbe2..6401bfa 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -107,11 +107,10 @@ struct crypt_config { struct workqueue_struct *io_queue; struct workqueue_struct *crypt_queue; - /* - * crypto related data - */ + char *cipher; + char *cipher_mode; + struct crypt_iv_operations *iv_gen_ops; - char *iv_mode; union { struct iv_essiv_private essiv; struct iv_benbi_private benbi; @@ -135,8 +134,6 @@ struct crypt_config { unsigned int dmreq_start; struct ablkcipher_request *req; - char cipher[CRYPTO_MAX_ALG_NAME]; - char chainmode[CRYPTO_MAX_ALG_NAME]; struct crypto_ablkcipher *tfm; unsigned long flags; unsigned int key_size; @@ -1032,90 +1029,102 @@ static void crypt_dtr(struct dm_target *ti) if (cc->dev) dm_put_device(ti, cc->dev); - kfree(cc->iv_mode); + kzfree(cc->cipher); + kzfree(cc->cipher_mode); /* Must zero key material before freeing */ kzfree(cc); } -/* - * Construct an encryption mapping: - * - */ -static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +static int crypt_ctr_cipher(struct dm_target *ti, + char *cipher_in, char *key) { - struct crypt_config *cc; - char *tmp; - char *cipher; - char *chainmode; - char *ivmode; - char *ivopts; - unsigned int key_size; - unsigned long long tmpll; + struct crypt_config *cc = ti->private; + char *tmp, *cipher, *chainmode, *ivmode, *ivopts; + char *cipher_api = NULL; int ret = -EINVAL; - if (argc != 5) { - ti->error = "Not enough arguments"; + /* Convert to crypto api definition? */ + if (strchr(cipher_in, '(')) { + ti->error = "Bad cipher specification"; return -EINVAL; } - tmp = argv[0]; + /* + * Legacy dm-crypt cipher specification + * cipher-mode-iv:ivopts + */ + tmp = cipher_in; cipher = strsep(&tmp, "-"); + + cc->cipher = kstrdup(cipher, GFP_KERNEL); + if (!cc->cipher) + goto bad_mem; + + if (tmp) { + cc->cipher_mode = kstrdup(tmp, GFP_KERNEL); + if (!cc->cipher_mode) + goto bad_mem; + } + chainmode = strsep(&tmp, "-"); ivopts = strsep(&tmp, "-"); ivmode = strsep(&ivopts, ":"); if (tmp) - DMWARN("Unexpected additional cipher options"); + DMWARN("Ignoring unexpected additional cipher options"); - key_size = strlen(argv[1]) >> 1; - - cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); - if (!cc) { - ti->error = "Cannot allocate transparent encryption context"; - return -ENOMEM; - } - - ti->private = cc; - - /* Compatibility mode for old dm-crypt cipher strings */ - if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { + /* Compatibility mode for old dm-crypt mappings */ + if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { + kfree(cc->cipher_mode); + cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL); chainmode = "cbc"; ivmode = "plain"; } if (strcmp(chainmode, "ecb") && !ivmode) { - ti->error = "This chaining mode requires an IV mechanism"; - goto bad; + ti->error = "IV mechanism required"; + return -EINVAL; } - ret = -ENOMEM; - if (snprintf(cc->cipher, CRYPTO_MAX_ALG_NAME, "%s(%s)", - chainmode, cipher) >= CRYPTO_MAX_ALG_NAME) { - ti->error = "Chain mode + cipher name is too long"; - goto bad; + cipher_api = kmalloc(CRYPTO_MAX_ALG_NAME, GFP_KERNEL); + if (!cipher_api) + goto bad_mem; + + ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, + "%s(%s)", chainmode, cipher); + if (ret < 0) { + kfree(cipher_api); + goto bad_mem; } - cc->tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0); + /* Allocate cipher */ + cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); if (IS_ERR(cc->tfm)) { + ret = PTR_ERR(cc->tfm); ti->error = "Error allocating crypto tfm"; goto bad; } - strcpy(cc->cipher, cipher); - strcpy(cc->chainmode, chainmode); - - ret = crypt_set_key(cc, argv[1]); + /* Initialize and set key */ + ret = crypt_set_key(cc, key); if (ret < 0) { ti->error = "Error decoding and setting key"; goto bad; } - /* - * Choose ivmode. Valid modes: "plain", "essiv:", "benbi". - * See comments at iv code - */ - ret = -EINVAL; + /* Initialize IV */ + cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); + if (cc->iv_size) + /* at least a 64 bit sector number should fit in our buffer */ + cc->iv_size = max(cc->iv_size, + (unsigned int)(sizeof(u64) / sizeof(u8))); + else if (ivmode) { + DMWARN("Selected cipher does not support IVs"); + ivmode = NULL; + } + + /* Choose ivmode, see comments at iv code. */ if (ivmode == NULL) cc->iv_gen_ops = NULL; else if (strcmp(ivmode, "plain") == 0) @@ -1129,6 +1138,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) else if (strcmp(ivmode, "null") == 0) cc->iv_gen_ops = &crypt_iv_null_ops; else { + ret = -EINVAL; ti->error = "Invalid IV mode"; goto bad; } @@ -1151,20 +1161,45 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } } - cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); - if (cc->iv_size) - /* at least a 64 bit sector number should fit in our buffer */ - cc->iv_size = max(cc->iv_size, - (unsigned int)(sizeof(u64) / sizeof(u8))); - else { - if (cc->iv_gen_ops) { - DMWARN("Selected cipher does not support IVs"); - if (cc->iv_gen_ops->dtr) - cc->iv_gen_ops->dtr(cc); - cc->iv_gen_ops = NULL; - } + ret = 0; +bad: + kfree(cipher_api); + return ret; + +bad_mem: + ti->error = "Cannot allocate cipher strings"; + return -ENOMEM; +} + +/* + * Construct an encryption mapping: + * + */ +static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct crypt_config *cc; + unsigned int key_size; + unsigned long long tmpll; + int ret; + + if (argc != 5) { + ti->error = "Not enough arguments"; + return -EINVAL; } + key_size = strlen(argv[1]) >> 1; + + cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); + if (!cc) { + ti->error = "Cannot allocate encryption context"; + return -ENOMEM; + } + + ti->private = cc; + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); + if (ret < 0) + goto bad; + ret = -ENOMEM; cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool); if (!cc->io_pool) { @@ -1217,17 +1252,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->start = tmpll; ret = -ENOMEM; - if (ivmode && cc->iv_gen_ops) { - if (ivopts) - *(ivopts - 1) = ':'; - cc->iv_mode = kstrdup(ivmode, GFP_KERNEL); - if (!cc->iv_mode) { - ti->error = "Error kmallocing iv_mode string"; - goto bad; - } - } else - cc->iv_mode = NULL; - cc->io_queue = create_singlethread_workqueue("kcryptd_io"); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; @@ -1273,7 +1297,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, static int crypt_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { - struct crypt_config *cc = (struct crypt_config *) ti->private; + struct crypt_config *cc = ti->private; unsigned int sz = 0; switch (type) { @@ -1282,11 +1306,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - if (cc->iv_mode) - DMEMIT("%s-%s-%s ", cc->cipher, cc->chainmode, - cc->iv_mode); + if (cc->cipher_mode) + DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode); else - DMEMIT("%s-%s ", cc->cipher, cc->chainmode); + DMEMIT("%s ", cc->cipher); if (cc->key_size > 0) { if ((maxlen - sz) < ((cc->key_size << 1) + 1)) -- cgit v1.1 From 5ae89a8720c28caf35c4e53711d77df2856c404e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Aug 2010 04:14:08 +0100 Subject: dm: linear support discard Allow discards to be passed through to linear mappings if at least one underlying device supports it. Discards will be forwarded only to devices that support them. A target that supports discards should set num_discard_requests to indicate how many times each discard request must be submitted to it. Verify table's underlying devices support discards prior to setting the associated DM device as capable of discards (via QUEUE_FLAG_DISCARD). Signed-off-by: Mike Snitzer Signed-off-by: Mikulas Patocka Reviewed-by: Joe Thornber Signed-off-by: Alasdair G Kergon --- drivers/md/dm-linear.c | 1 + drivers/md/dm-table.c | 44 +++++++++++++++++++++++++++++ drivers/md/dm.c | 65 +++++++++++++++++++++++++++++++++++-------- drivers/md/dm.h | 1 + include/linux/device-mapper.h | 6 ++++ 5 files changed, 105 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 9200dbf..f043b5f 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ti->num_flush_requests = 1; + ti->num_discard_requests = 1; ti->private = lc; return 0; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index bc60ef7..f9fc07d 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -54,6 +54,8 @@ struct dm_table { sector_t *highs; struct dm_target *targets; + unsigned discards_supported:1; + /* * Indicates the rw permissions for the new logical * device. This should be a combination of FMODE_READ @@ -203,6 +205,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode, INIT_LIST_HEAD(&t->devices); atomic_set(&t->holders, 0); + t->discards_supported = 1; if (!num_targets) num_targets = KEYS_PER_NODE; @@ -770,6 +773,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; + if (!tgt->num_discard_requests) + t->discards_supported = 0; + return 0; bad: @@ -1135,6 +1141,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); + if (!dm_table_supports_discards(t)) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + dm_table_set_integrity(t); /* @@ -1281,6 +1292,39 @@ struct mapped_device *dm_table_get_md(struct dm_table *t) return t->md; } +static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && blk_queue_discard(q); +} + +bool dm_table_supports_discards(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i = 0; + + if (!t->discards_supported) + return 0; + + /* + * Ensure that at least one underlying device supports discards. + * t->devices includes internal dm devices such as mirror logs + * so we need to use iterate_devices here, which targets + * supporting discard must provide. + */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, device_discard_capable, NULL)) + return 1; + } + + return 0; +} + EXPORT_SYMBOL(dm_vcalloc); EXPORT_SYMBOL(dm_get_device); EXPORT_SYMBOL(dm_put_device); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0d47101..44aba29 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1212,6 +1212,53 @@ static int __clone_and_map_empty_barrier(struct clone_info *ci) return 0; } +/* + * Perform all io with a single clone. + */ +static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) +{ + struct bio *clone, *bio = ci->bio; + struct dm_target_io *tio; + + tio = alloc_tio(ci, ti); + clone = clone_bio(bio, ci->sector, ci->idx, + bio->bi_vcnt - ci->idx, ci->sector_count, + ci->md->bs); + __map_bio(ti, clone, tio); + ci->sector_count = 0; +} + +static int __clone_and_map_discard(struct clone_info *ci) +{ + struct dm_target *ti; + sector_t max; + + ti = dm_table_find_target(ci->map, ci->sector); + if (!dm_target_is_valid(ti)) + return -EIO; + + /* + * Even though the device advertised discard support, + * reconfiguration might have changed that since the + * check was performed. + */ + + if (!ti->num_discard_requests) + return -EOPNOTSUPP; + + max = max_io_len(ci->md, ci->sector, ti); + + if (ci->sector_count > max) + /* + * FIXME: Handle a discard that spans two or more targets. + */ + return -EOPNOTSUPP; + + __clone_and_map_simple(ci, ti); + + return 0; +} + static int __clone_and_map(struct clone_info *ci) { struct bio *clone, *bio = ci->bio; @@ -1222,27 +1269,21 @@ static int __clone_and_map(struct clone_info *ci) if (unlikely(bio_empty_barrier(bio))) return __clone_and_map_empty_barrier(ci); + if (unlikely(bio->bi_rw & REQ_DISCARD)) + return __clone_and_map_discard(ci); + ti = dm_table_find_target(ci->map, ci->sector); if (!dm_target_is_valid(ti)) return -EIO; max = max_io_len(ci->md, ci->sector, ti); - /* - * Allocate a target io object. - */ - tio = alloc_tio(ci, ti); - if (ci->sector_count <= max) { /* * Optimise for the simple case where we can do all of * the remaining io with a single clone. */ - clone = clone_bio(bio, ci->sector, ci->idx, - bio->bi_vcnt - ci->idx, ci->sector_count, - ci->md->bs); - __map_bio(ti, clone, tio); - ci->sector_count = 0; + __clone_and_map_simple(ci, ti); } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { /* @@ -1263,6 +1304,7 @@ static int __clone_and_map(struct clone_info *ci) len += bv_len; } + tio = alloc_tio(ci, ti); clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, ci->md->bs); __map_bio(ti, clone, tio); @@ -1286,12 +1328,11 @@ static int __clone_and_map(struct clone_info *ci) return -EIO; max = max_io_len(ci->md, ci->sector, ti); - - tio = alloc_tio(ci, ti); } len = min(remaining, max); + tio = alloc_tio(ci, ti); clone = split_bvec(bio, ci->sector, ci->idx, bv->bv_offset + offset, len, ci->md->bs); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 0d7b374..0c2dd5f 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -61,6 +61,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits); int dm_table_any_busy_target(struct dm_table *t); unsigned dm_table_get_type(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); +bool dm_table_supports_discards(struct dm_table *t); int dm_table_alloc_md_mempools(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 531a6f2..751ce21 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -180,6 +180,12 @@ struct dm_target { */ unsigned num_flush_requests; + /* + * The number of discard requests that will be submitted to the + * target. map_info->request_nr is used just like num_flush_requests. + */ + unsigned num_discard_requests; + /* target specific data */ void *private; -- cgit v1.1 From 06a426cee9b35505aeb7516a67bd26496ca7ed08 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Aug 2010 04:14:09 +0100 Subject: dm: use common __issue_target_request for flush and discard support Rename __flush_target to __issue_target_request now that it is used to issue both flush and discard requests. Introduce __issue_target_requests as a convenient wrapper to __issue_target_request 'num_flush_requests' or 'num_discard_requests' times per target. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 44aba29..3dd846e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1182,30 +1182,42 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, return tio; } -static void __flush_target(struct clone_info *ci, struct dm_target *ti, - unsigned request_nr) +static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, + unsigned request_nr) { struct dm_target_io *tio = alloc_tio(ci, ti); struct bio *clone; tio->info.target_request_nr = request_nr; - clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); + /* + * Discard requests require the bio's inline iovecs be initialized. + * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush + * and discard, so no need for concern about wasted bvec allocations. + */ + clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); __bio_clone(clone, ci->bio); clone->bi_destructor = dm_bio_destructor; __map_bio(ti, clone, tio); } +static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, + unsigned num_requests) +{ + unsigned request_nr; + + for (request_nr = 0; request_nr < num_requests; request_nr++) + __issue_target_request(ci, ti, request_nr); +} + static int __clone_and_map_empty_barrier(struct clone_info *ci) { - unsigned target_nr = 0, request_nr; + unsigned target_nr = 0; struct dm_target *ti; while ((ti = dm_table_get_target(ci->map, target_nr++))) - for (request_nr = 0; request_nr < ti->num_flush_requests; - request_nr++) - __flush_target(ci, ti, request_nr); + __issue_target_requests(ci, ti, ti->num_flush_requests); ci->sector_count = 0; @@ -1254,7 +1266,9 @@ static int __clone_and_map_discard(struct clone_info *ci) */ return -EOPNOTSUPP; - __clone_and_map_simple(ci, ti); + __issue_target_requests(ci, ti, ti->num_discard_requests); + + ci->sector_count = 0; return 0; } -- cgit v1.1 From 56a67df766039666f61fb15b079f713e44a735ae Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Aug 2010 04:14:10 +0100 Subject: dm: factor out max_io_len_target_boundary Split max_io_len_target_boundary out of max_io_len so that the discard support can make use of it without duplicating max_io_len code. Avoiding max_io_len's split_io logic enables DM's discard support to submit the entire discard request to a target. But discards must still be split on target boundaries. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 26 ++++++++++++++++++-------- include/linux/device-mapper.h | 6 ++++++ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3dd846e..561313a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1030,17 +1030,27 @@ static void end_clone_request(struct request *clone, int error) dm_complete_request(clone, error); } -static sector_t max_io_len(struct mapped_device *md, - sector_t sector, struct dm_target *ti) +/* + * Return maximum size of I/O possible at the supplied sector up to the current + * target boundary. + */ +static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) +{ + sector_t target_offset = dm_target_offset(ti, sector); + + return ti->len - target_offset; +} + +static sector_t max_io_len(sector_t sector, struct dm_target *ti) { - sector_t offset = sector - ti->begin; - sector_t len = ti->len - offset; + sector_t len = max_io_len_target_boundary(sector, ti); /* * Does the target need to split even further ? */ if (ti->split_io) { sector_t boundary; + sector_t offset = dm_target_offset(ti, sector); boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) - offset; if (len > boundary) @@ -1258,7 +1268,7 @@ static int __clone_and_map_discard(struct clone_info *ci) if (!ti->num_discard_requests) return -EOPNOTSUPP; - max = max_io_len(ci->md, ci->sector, ti); + max = max_io_len(ci->sector, ti); if (ci->sector_count > max) /* @@ -1290,7 +1300,7 @@ static int __clone_and_map(struct clone_info *ci) if (!dm_target_is_valid(ti)) return -EIO; - max = max_io_len(ci->md, ci->sector, ti); + max = max_io_len(ci->sector, ti); if (ci->sector_count <= max) { /* @@ -1341,7 +1351,7 @@ static int __clone_and_map(struct clone_info *ci) if (!dm_target_is_valid(ti)) return -EIO; - max = max_io_len(ci->md, ci->sector, ti); + max = max_io_len(ci->sector, ti); } len = min(remaining, max); @@ -1428,7 +1438,7 @@ static int dm_merge_bvec(struct request_queue *q, /* * Find maximum amount of I/O that won't need splitting */ - max_sectors = min(max_io_len(md, bvm->bi_sector, ti), + max_sectors = min(max_io_len(bvm->bi_sector, ti), (sector_t) BIO_MAX_SECTORS); max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; if (max_size < 0) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 751ce21..2970022 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -398,6 +398,12 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); #define dm_array_too_big(fixed, obj, num) \ ((num) > (UINT_MAX - (fixed)) / (obj)) +/* + * Sector offset taken relative to the start of the target instead of + * relative to the start of the device. + */ +#define dm_target_offset(ti, sector) ((sector) - (ti)->begin) + static inline sector_t to_sector(unsigned long n) { return (n >> SECTOR_SHIFT); -- cgit v1.1 From b441a262e7d1c56fbe21794c91d7a9c83809113f Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Thu, 12 Aug 2010 04:14:11 +0100 Subject: dm: use dm_target_offset macro Use new dm_target_offset() macro to avoid most references to ti->begin in dm targets. Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 4 ++-- drivers/md/dm-delay.c | 5 ++--- drivers/md/dm-linear.c | 2 +- drivers/md/dm-raid1.c | 2 +- drivers/md/dm-stripe.c | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 6401bfa..368e8e9 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1284,7 +1284,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, return DM_MAPIO_REMAPPED; } - io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); + io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); if (bio_data_dir(io->base_bio) == READ) kcryptd_queue_io(io); @@ -1406,7 +1406,7 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return max_size; bvm->bi_bdev = cc->dev->bdev; - bvm->bi_sector = cc->start + bvm->bi_sector - ti->begin; + bvm->bi_sector = cc->start + dm_target_offset(ti, bvm->bi_sector); return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 8520528..a91049e 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -281,14 +281,13 @@ static int delay_map(struct dm_target *ti, struct bio *bio, bio->bi_bdev = dc->dev_write->bdev; if (bio_sectors(bio)) bio->bi_sector = dc->start_write + - (bio->bi_sector - ti->begin); + dm_target_offset(ti, bio->bi_sector); return delay_bio(dc, dc->write_delay, bio); } bio->bi_bdev = dc->dev_read->bdev; - bio->bi_sector = dc->start_read + - (bio->bi_sector - ti->begin); + bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector); return delay_bio(dc, dc->read_delay, bio); } diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index f043b5f..3921e3b 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -74,7 +74,7 @@ static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) { struct linear_c *lc = ti->private; - return lc->start + (bi_sector - ti->begin); + return lc->start + dm_target_offset(ti, bi_sector); } static void linear_map_bio(struct dm_target *ti, struct bio *bio) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 7413626..7c081bc 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -445,7 +445,7 @@ static sector_t map_sector(struct mirror *m, struct bio *bio) { if (unlikely(!bio->bi_size)) return 0; - return m->offset + (bio->bi_sector - m->ms->ti->begin); + return m->offset + dm_target_offset(m->ms->ti, bio->bi_sector); } static void map_bio(struct mirror *m, struct bio *bio) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 22d5e2f..779f47a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -222,7 +222,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, return DM_MAPIO_REMAPPED; } - offset = bio->bi_sector - ti->begin; + offset = dm_target_offset(ti, bio->bi_sector); chunk = offset >> sc->chunk_shift; stripe = sector_div(chunk, sc->stripes); -- cgit v1.1 From f8facb61b5095488a4d78fa78116ef4f4b82bc4d Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Aug 2010 04:14:12 +0100 Subject: dm: zero silently drop discards Have the zero target silently drop a discard rather than fail the request with -EOPNOTSUPP. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-zero.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index bbc9703..cc2b3cb 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -22,6 +22,11 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } + /* + * Silently drop discards, avoiding -EOPNOTSUPP. + */ + ti->num_discard_requests = 1; + return 0; } -- cgit v1.1 From 3fd5d48027181168ce85e8094b926aeb9f34c556 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Aug 2010 04:14:13 +0100 Subject: dm delay: support discard Enable discard support for the delay target. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-delay.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index a91049e..baa1191 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -198,6 +198,7 @@ out: atomic_set(&dc->may_delay, 1); ti->num_flush_requests = 1; + ti->num_discard_requests = 1; ti->private = dc; return 0; -- cgit v1.1 From 38e1b257fd7b4f3eee667d29a5e44ec15e253c1c Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Aug 2010 04:14:14 +0100 Subject: dm: error return error for discards Have the error target respond to a discard request with a hard -EIO rather than fail the request with -EOPNOTSUPP. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-target.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 11dea11..8da366c 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -113,6 +113,11 @@ void dm_unregister_target(struct target_type *tt) */ static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) { + /* + * Return error for discards instead of -EOPNOTSUPP + */ + tt->num_discard_requests = 1; + return 0; } -- cgit v1.1 From 65988525abde0b0a5833c4e20f32967184a5dcf0 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 12 Aug 2010 04:14:14 +0100 Subject: dm stripe: move sector translation to a function Move sector to stripe translation into a function. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-stripe.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 779f47a..e2ad1ae 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -207,11 +207,20 @@ static void stripe_dtr(struct dm_target *ti) kfree(sc); } +static void stripe_map_sector(struct stripe_c *sc, sector_t sector, + uint32_t *stripe, sector_t *result) +{ + sector_t offset = dm_target_offset(sc->ti, sector); + sector_t chunk = offset >> sc->chunk_shift; + + *stripe = sector_div(chunk, sc->stripes); + *result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask); +} + static int stripe_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { - struct stripe_c *sc = (struct stripe_c *) ti->private; - sector_t offset, chunk; + struct stripe_c *sc = ti->private; uint32_t stripe; unsigned target_request_nr; @@ -222,13 +231,11 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, return DM_MAPIO_REMAPPED; } - offset = dm_target_offset(ti, bio->bi_sector); - chunk = offset >> sc->chunk_shift; - stripe = sector_div(chunk, sc->stripes); + stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); + bio->bi_sector += sc->stripe[stripe].physical_start; bio->bi_bdev = sc->stripe[stripe].dev->bdev; - bio->bi_sector = sc->stripe[stripe].physical_start + - (chunk << sc->chunk_shift) + (offset & sc->chunk_mask); + return DM_MAPIO_REMAPPED; } -- cgit v1.1 From c96053b767d494d7c30e2be68097ac9defa9403f Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 12 Aug 2010 04:14:21 +0100 Subject: dm stripe: optimize sector division Optimize sector division: If the number of stripes is a power of two, we can do shift and mask instead of division. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-stripe.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index e2ad1ae..1559fdf 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -25,6 +25,8 @@ struct stripe { struct stripe_c { uint32_t stripes; + int stripes_shift; + sector_t stripes_mask; /* The size of this target / num. stripes */ sector_t stripe_width; @@ -162,16 +164,21 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) /* Set pointer to dm target; used in trigger_event */ sc->ti = ti; - sc->stripes = stripes; sc->stripe_width = width; + + if (stripes & (stripes - 1)) + sc->stripes_shift = -1; + else { + sc->stripes_shift = ffs(stripes) - 1; + sc->stripes_mask = ((sector_t) stripes) - 1; + } + ti->split_io = chunk_size; ti->num_flush_requests = stripes; + sc->chunk_shift = ffs(chunk_size) - 1; sc->chunk_mask = ((sector_t) chunk_size) - 1; - for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) - chunk_size >>= 1; - sc->chunk_shift--; /* * Get the stripe destinations. @@ -213,7 +220,13 @@ static void stripe_map_sector(struct stripe_c *sc, sector_t sector, sector_t offset = dm_target_offset(sc->ti, sector); sector_t chunk = offset >> sc->chunk_shift; - *stripe = sector_div(chunk, sc->stripes); + if (sc->stripes_shift < 0) + *stripe = sector_div(chunk, sc->stripes); + else { + *stripe = chunk & sc->stripes_mask; + chunk >>= sc->stripes_shift; + } + *result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask); } -- cgit v1.1 From a79245b3e5669dc203fec63644d988c451fe55d5 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Aug 2010 04:14:24 +0100 Subject: dm: split discard requests on target boundaries Update __clone_and_map_discard to loop across all targets in a DM device's table when it processes a discard bio. If a discard crosses a target boundary it must be split accordingly. Update __issue_target_requests and __issue_target_request to allow a cloned discard bio to have a custom start sector and size. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 561313a..ac384b2 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1193,7 +1193,7 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, } static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, - unsigned request_nr) + unsigned request_nr, sector_t len) { struct dm_target_io *tio = alloc_tio(ci, ti); struct bio *clone; @@ -1208,17 +1208,21 @@ static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); __bio_clone(clone, ci->bio); clone->bi_destructor = dm_bio_destructor; + if (len) { + clone->bi_sector = ci->sector; + clone->bi_size = to_bytes(len); + } __map_bio(ti, clone, tio); } static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, - unsigned num_requests) + unsigned num_requests, sector_t len) { unsigned request_nr; for (request_nr = 0; request_nr < num_requests; request_nr++) - __issue_target_request(ci, ti, request_nr); + __issue_target_request(ci, ti, request_nr, len); } static int __clone_and_map_empty_barrier(struct clone_info *ci) @@ -1227,7 +1231,7 @@ static int __clone_and_map_empty_barrier(struct clone_info *ci) struct dm_target *ti; while ((ti = dm_table_get_target(ci->map, target_nr++))) - __issue_target_requests(ci, ti, ti->num_flush_requests); + __issue_target_requests(ci, ti, ti->num_flush_requests, 0); ci->sector_count = 0; @@ -1253,32 +1257,27 @@ static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) static int __clone_and_map_discard(struct clone_info *ci) { struct dm_target *ti; - sector_t max; + sector_t len; - ti = dm_table_find_target(ci->map, ci->sector); - if (!dm_target_is_valid(ti)) - return -EIO; - - /* - * Even though the device advertised discard support, - * reconfiguration might have changed that since the - * check was performed. - */ + do { + ti = dm_table_find_target(ci->map, ci->sector); + if (!dm_target_is_valid(ti)) + return -EIO; - if (!ti->num_discard_requests) - return -EOPNOTSUPP; - - max = max_io_len(ci->sector, ti); - - if (ci->sector_count > max) /* - * FIXME: Handle a discard that spans two or more targets. + * Even though the device advertised discard support, + * reconfiguration might have changed that since the + * check was performed. */ - return -EOPNOTSUPP; + if (!ti->num_discard_requests) + return -EOPNOTSUPP; - __issue_target_requests(ci, ti, ti->num_discard_requests); + len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); - ci->sector_count = 0; + __issue_target_requests(ci, ti, ti->num_discard_requests, len); + + ci->sector += len; + } while (ci->sector_count -= len); return 0; } -- cgit v1.1 From 7b76ec11fec40203836b488496d2df082d5b2022 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 12 Aug 2010 04:14:26 +0100 Subject: dm stripe: support discards The DM core will submit a discard bio to the stripe target for each stripe in a striped DM device. The stripe target will determine stripe-specific portions of the supplied bio to be remapped into individual (at most 'num_discard_requests' extents). If a given stripe-specific discard bio doesn't touch a particular stripe the bio will be dropped. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-stripe.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 1559fdf..c297f6d 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -176,6 +176,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->split_io = chunk_size; ti->num_flush_requests = stripes; + ti->num_discard_requests = stripes; sc->chunk_shift = ffs(chunk_size) - 1; sc->chunk_mask = ((sector_t) chunk_size) - 1; @@ -230,6 +231,39 @@ static void stripe_map_sector(struct stripe_c *sc, sector_t sector, *result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask); } +static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, + uint32_t target_stripe, sector_t *result) +{ + uint32_t stripe; + + stripe_map_sector(sc, sector, &stripe, result); + if (stripe == target_stripe) + return; + *result &= ~sc->chunk_mask; /* round down */ + if (target_stripe < stripe) + *result += sc->chunk_mask + 1; /* next chunk */ +} + +static int stripe_map_discard(struct stripe_c *sc, struct bio *bio, + uint32_t target_stripe) +{ + sector_t begin, end; + + stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin); + stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio), + target_stripe, &end); + if (begin < end) { + bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; + bio->bi_sector = begin + sc->stripe[target_stripe].physical_start; + bio->bi_size = to_bytes(end - begin); + return DM_MAPIO_REMAPPED; + } else { + /* The range doesn't map to the target stripe */ + bio_endio(bio, 0); + return DM_MAPIO_SUBMITTED; + } +} + static int stripe_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { @@ -243,6 +277,11 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; return DM_MAPIO_REMAPPED; } + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + target_request_nr = map_context->target_request_nr; + BUG_ON(target_request_nr >= sc->stripes); + return stripe_map_discard(sc, bio, target_request_nr); + } stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); -- cgit v1.1 From 959eb4e5592cc0b0b07db0ca30d2b1efd790020f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Aug 2010 04:14:32 +0100 Subject: dm mpath: support discard Enable discard support in the DM multipath target. This discard support depends on a few discard-specific fixes to the block layer's request stacking driver methods. Discard requests are optional so don't allow a failed discard to trigger path failures. If there is a real problem with a given path the barriers associated with the discard (either before or after the discard) will cause path failure. That said, unconditionally passing discard failures up the stack is not ideal. This must be fixed once DM has more information about the nature of the underlying storage failure. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon Cc: Kiyoshi Ueda --- drivers/md/dm-mpath.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index da2223a..487ecda 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -893,6 +893,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, } ti->num_flush_requests = 1; + ti->num_discard_requests = 1; return 0; @@ -1272,6 +1273,15 @@ static int do_end_io(struct multipath *m, struct request *clone, if (error == -EOPNOTSUPP) return error; + if (clone->cmd_flags & REQ_DISCARD) + /* + * Pass all discard request failures up. + * FIXME: only fail_path if the discard failed due to a + * transport problem. This requires precise understanding + * of the underlying failure (e.g. the SCSI sense). + */ + return error; + if (mpio->pgpath) fail_path(mpio->pgpath); -- cgit v1.1