diff options
author | Dan Williams <dan.j.williams@intel.com> | 2009-09-08 17:55:21 -0700 |
---|---|---|
committer | Dan Williams <dan.j.williams@intel.com> | 2009-09-08 17:55:21 -0700 |
commit | bbb20089a3275a19e475dbc21320c3742e3ca423 (patch) | |
tree | 216fdc1cbef450ca688135c5b8969169482d9a48 /drivers/md | |
parent | 3e48e656903e9fd8bc805c6a2c4264d7808d315b (diff) | |
parent | 657a77fa7284d8ae28dfa48f1dc5d919bf5b2843 (diff) | |
download | op-kernel-dev-bbb20089a3275a19e475dbc21320c3742e3ca423.zip op-kernel-dev-bbb20089a3275a19e475dbc21320c3742e3ca423.tar.gz |
Merge branch 'dmaengine' into async-tx-next
Conflicts:
crypto/async_tx/async_xor.c
drivers/dma/ioat/dma_v2.h
drivers/dma/ioat/pci.c
drivers/md/raid5.c
Diffstat (limited to 'drivers/md')
43 files changed, 3987 insertions, 1098 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 09c0c6e..2158377 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -257,6 +257,17 @@ config DM_MIRROR Allow volume managers to mirror logical volumes, also needed for live data migration tools such as 'pvmove'. +config DM_LOG_USERSPACE + tristate "Mirror userspace logging (EXPERIMENTAL)" + depends on DM_MIRROR && EXPERIMENTAL && NET + select CONNECTOR + ---help--- + The userspace logging module provides a mechanism for + relaying the dm-dirty-log API to userspace. Log designs + which are more suited to userspace implementation (e.g. + shared storage logs) or experimental logs can be implemented + by leveraging this framework. + config DM_ZERO tristate "Zero target" depends on BLK_DEV_DM @@ -275,6 +286,25 @@ config DM_MULTIPATH ---help--- Allow volume managers to support multipath hardware. +config DM_MULTIPATH_QL + tristate "I/O Path Selector based on the number of in-flight I/Os" + depends on DM_MULTIPATH + ---help--- + This path selector is a dynamic load balancer which selects + the path with the least number of in-flight I/Os. + + If unsure, say N. + +config DM_MULTIPATH_ST + tristate "I/O Path Selector based on the service time" + depends on DM_MULTIPATH + ---help--- + This path selector is a dynamic load balancer which selects + the path expected to complete the incoming I/O in the shortest + time. + + If unsure, say N. + config DM_DELAY tristate "I/O delaying target (EXPERIMENTAL)" depends on BLK_DEV_DM && EXPERIMENTAL diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 45cc595..1dc4185 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -8,6 +8,8 @@ dm-multipath-y += dm-path-selector.o dm-mpath.o dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-snap-persistent.o dm-mirror-y += dm-raid1.o +dm-log-userspace-y \ + += dm-log-userspace-base.o dm-log-userspace-transfer.o md-mod-y += md.o bitmap.o raid456-y += raid5.o raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ @@ -36,8 +38,11 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_DELAY) += dm-delay.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o +obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o +obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o +obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o obj-$(CONFIG_DM_ZERO) += dm-zero.o quiet_cmd_unroll = UNROLL $@ diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 56df1ce..3319c2f 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -232,7 +232,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, target = rdev->sb_start + offset + index * (PAGE_SIZE/512); if (sync_page_io(rdev->bdev, target, - roundup(size, bdev_hardsect_size(rdev->bdev)), + roundup(size, bdev_logical_block_size(rdev->bdev)), page, READ)) { page->index = index; attach_page_buffers(page, NULL); /* so that free_buffer will @@ -287,7 +287,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) int size = PAGE_SIZE; if (page->index == bitmap->file_pages-1) size = roundup(bitmap->last_page_size, - bdev_hardsect_size(rdev->bdev)); + bdev_logical_block_size(rdev->bdev)); /* Just make sure we aren't corrupting data or * metadata */ diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 53394e8..9933eb8 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1132,6 +1132,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_crypt_queue; } + ti->num_flush_requests = 1; ti->private = cc; return 0; @@ -1189,6 +1190,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { struct dm_crypt_io *io; + struct crypt_config *cc; + + if (unlikely(bio_empty_barrier(bio))) { + cc = ti->private; + bio->bi_bdev = cc->dev->bdev; + return DM_MAPIO_REMAPPED; + } io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); @@ -1305,9 +1313,17 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } +static int crypt_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct crypt_config *cc = ti->private; + + return fn(ti, cc->dev, cc->start, data); +} + static struct target_type crypt_target = { .name = "crypt", - .version= {1, 6, 0}, + .version = {1, 7, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, @@ -1318,6 +1334,7 @@ static struct target_type crypt_target = { .resume = crypt_resume, .message = crypt_message, .merge = crypt_merge, + .iterate_devices = crypt_iterate_devices, }; static int __init dm_crypt_init(void) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 559dbb5..4e5b843 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -197,6 +197,7 @@ out: mutex_init(&dc->timer_lock); atomic_set(&dc->may_delay, 1); + ti->num_flush_requests = 1; ti->private = dc; return 0; @@ -278,8 +279,9 @@ static int delay_map(struct dm_target *ti, struct bio *bio, if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { bio->bi_bdev = dc->dev_write->bdev; - bio->bi_sector = dc->start_write + - (bio->bi_sector - ti->begin); + if (bio_sectors(bio)) + bio->bi_sector = dc->start_write + + (bio->bi_sector - ti->begin); return delay_bio(dc, dc->write_delay, bio); } @@ -316,9 +318,26 @@ static int delay_status(struct dm_target *ti, status_type_t type, return 0; } +static int delay_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct delay_c *dc = ti->private; + int ret = 0; + + ret = fn(ti, dc->dev_read, dc->start_read, data); + if (ret) + goto out; + + if (dc->dev_write) + ret = fn(ti, dc->dev_write, dc->start_write, data); + +out: + return ret; +} + static struct target_type delay_target = { .name = "delay", - .version = {1, 0, 2}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = delay_ctr, .dtr = delay_dtr, @@ -326,6 +345,7 @@ static struct target_type delay_target = { .presuspend = delay_presuspend, .resume = delay_resume, .status = delay_status, + .iterate_devices = delay_iterate_devices, }; static int __init dm_delay_init(void) diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index a2e26c2..c3ae515 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -178,7 +178,7 @@ static int set_chunk_size(struct dm_exception_store *store, } /* Validate the chunk size against the device block size */ - if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) { + if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) { *error = "Chunk size is not a multiple of device blocksize"; return -EINVAL; } @@ -216,7 +216,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, return -EINVAL; } - type = get_type(argv[1]); + type = get_type(&persistent); if (!type) { ti->error = "Exception store type not recognised"; r = -EINVAL; diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 0a2e6e7..2442c8c 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -111,7 +111,7 @@ struct dm_exception_store { /* * Funtions to manipulate consecutive chunks */ -# if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) +# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) # define DM_CHUNK_CONSECUTIVE_BITS 8 # define DM_CHUNK_NUMBER_BITS 56 @@ -156,7 +156,7 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) */ static inline sector_t get_dev_size(struct block_device *bdev) { - return bdev->bd_inode->i_size >> SECTOR_SHIFT; + return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; } static inline chunk_t sector_to_chunk(struct dm_exception_store *store, diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index e73aabd..3a2e6a2 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -22,6 +22,7 @@ struct dm_io_client { /* FIXME: can we shrink this ? */ struct io { unsigned long error_bits; + unsigned long eopnotsupp_bits; atomic_t count; struct task_struct *sleeper; struct dm_io_client *client; @@ -107,8 +108,11 @@ static inline unsigned bio_get_region(struct bio *bio) *---------------------------------------------------------------*/ static void dec_count(struct io *io, unsigned int region, int error) { - if (error) + if (error) { set_bit(region, &io->error_bits); + if (error == -EOPNOTSUPP) + set_bit(region, &io->eopnotsupp_bits); + } if (atomic_dec_and_test(&io->count)) { if (io->sleeper) @@ -360,7 +364,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, return -EIO; } +retry: io.error_bits = 0; + io.eopnotsupp_bits = 0; atomic_set(&io.count, 1); /* see dispatch_io() */ io.sleeper = current; io.client = client; @@ -377,6 +383,11 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, } set_current_state(TASK_RUNNING); + if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { + rw &= ~(1 << BIO_RW_BARRIER); + goto retry; + } + if (error_bits) *error_bits = io.error_bits; @@ -397,6 +408,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io = mempool_alloc(client->pool, GFP_NOIO); io->error_bits = 0; + io->eopnotsupp_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ io->sleeper = NULL; io->client = client; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 823ceba..7f77f18 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -276,7 +276,7 @@ retry: up_write(&_hash_lock); } -static int dm_hash_rename(const char *old, const char *new) +static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) { char *new_name, *old_name; struct hash_cell *hc; @@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new) dm_table_put(table); } - dm_kobject_uevent(hc->md); + dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie); dm_put(hc->md); up_write(&_hash_lock); @@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) __hash_remove(hc); up_write(&_hash_lock); + + dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr); + dm_put(md); param->data_size = 0; return 0; @@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) return r; param->data_size = 0; - return dm_hash_rename(param->name, new_name); + return dm_hash_rename(param->event_nr, param->name, new_name); } static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) @@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param) if (dm_suspended(md)) r = dm_resume(md); - if (!r) + + if (!r) { + dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); r = __dev_status(md, param); + } dm_put(md); return r; @@ -1044,6 +1050,12 @@ static int populate_table(struct dm_table *table, next = spec->next; } + r = dm_table_set_type(table); + if (r) { + DMWARN("unable to set table type"); + return r; + } + return dm_table_complete(table); } @@ -1089,6 +1101,13 @@ static int table_load(struct dm_ioctl *param, size_t param_size) goto out; } + r = dm_table_alloc_md_mempools(t); + if (r) { + DMWARN("unable to allocate mempools for this table"); + dm_table_destroy(t); + goto out; + } + down_write(&_hash_lock); hc = dm_get_mdptr(md); if (!hc || hc->md != md) { @@ -1513,6 +1532,7 @@ static const struct file_operations _ctl_fops = { static struct miscdevice _dm_misc = { .minor = MISC_DYNAMIC_MINOR, .name = DM_NAME, + .devnode = "mapper/control", .fops = &_ctl_fops }; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 79fb53e..9184b6de 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + ti->num_flush_requests = 1; ti->private = lc; return 0; @@ -81,7 +82,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) struct linear_c *lc = ti->private; bio->bi_bdev = lc->dev->bdev; - bio->bi_sector = linear_map_sector(ti, bio->bi_sector); + if (bio_sectors(bio)) + bio->bi_sector = linear_map_sector(ti, bio->bi_sector); } static int linear_map(struct dm_target *ti, struct bio *bio, @@ -132,9 +134,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } +static int linear_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct linear_c *lc = ti->private; + + return fn(ti, lc->dev, lc->start, data); +} + static struct target_type linear_target = { .name = "linear", - .version= {1, 0, 3}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = linear_ctr, .dtr = linear_dtr, @@ -142,6 +152,7 @@ static struct target_type linear_target = { .status = linear_status, .ioctl = linear_ioctl, .merge = linear_merge, + .iterate_devices = linear_iterate_devices, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c new file mode 100644 index 0000000..e69b965 --- /dev/null +++ b/drivers/md/dm-log-userspace-base.c @@ -0,0 +1,696 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. + */ + +#include <linux/bio.h> +#include <linux/dm-dirty-log.h> +#include <linux/device-mapper.h> +#include <linux/dm-log-userspace.h> + +#include "dm-log-userspace-transfer.h" + +struct flush_entry { + int type; + region_t region; + struct list_head list; +}; + +struct log_c { + struct dm_target *ti; + uint32_t region_size; + region_t region_count; + char uuid[DM_UUID_LEN]; + + char *usr_argv_str; + uint32_t usr_argc; + + /* + * in_sync_hint gets set when doing is_remote_recovering. It + * represents the first region that needs recovery. IOW, the + * first zero bit of sync_bits. This can be useful for to limit + * traffic for calls like is_remote_recovering and get_resync_work, + * but be take care in its use for anything else. + */ + uint64_t in_sync_hint; + + spinlock_t flush_lock; + struct list_head flush_list; /* only for clear and mark requests */ +}; + +static mempool_t *flush_entry_pool; + +static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) +{ + return kmalloc(sizeof(struct flush_entry), gfp_mask); +} + +static void flush_entry_free(void *element, void *pool_data) +{ + kfree(element); +} + +static int userspace_do_request(struct log_c *lc, const char *uuid, + int request_type, char *data, size_t data_size, + char *rdata, size_t *rdata_size) +{ + int r; + + /* + * If the server isn't there, -ESRCH is returned, + * and we must keep trying until the server is + * restored. + */ +retry: + r = dm_consult_userspace(uuid, request_type, data, + data_size, rdata, rdata_size); + + if (r != -ESRCH) + return r; + + DMERR(" Userspace log server not found."); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(2*HZ); + DMWARN("Attempting to contact userspace log server..."); + r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str, + strlen(lc->usr_argv_str) + 1, + NULL, NULL); + if (!r) + break; + } + DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); + r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL, + 0, NULL, NULL); + if (!r) + goto retry; + + DMERR("Error trying to resume userspace log: %d", r); + + return -ESRCH; +} + +static int build_constructor_string(struct dm_target *ti, + unsigned argc, char **argv, + char **ctr_str) +{ + int i, str_size; + char *str = NULL; + + *ctr_str = NULL; + + for (i = 0, str_size = 0; i < argc; i++) + str_size += strlen(argv[i]) + 1; /* +1 for space between args */ + + str_size += 20; /* Max number of chars in a printed u64 number */ + + str = kzalloc(str_size, GFP_KERNEL); + if (!str) { + DMWARN("Unable to allocate memory for constructor string"); + return -ENOMEM; + } + + for (i = 0, str_size = 0; i < argc; i++) + str_size += sprintf(str + str_size, "%s ", argv[i]); + str_size += sprintf(str + str_size, "%llu", + (unsigned long long)ti->len); + + *ctr_str = str; + return str_size; +} + +/* + * userspace_ctr + * + * argv contains: + * <UUID> <other args> + * Where 'other args' is the userspace implementation specific log + * arguments. An example might be: + * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync] + * + * So, this module will strip off the <UUID> for identification purposes + * when communicating with userspace about a log; but will pass on everything + * else. + */ +static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, + unsigned argc, char **argv) +{ + int r = 0; + int str_size; + char *ctr_str = NULL; + struct log_c *lc = NULL; + uint64_t rdata; + size_t rdata_size = sizeof(rdata); + + if (argc < 3) { + DMWARN("Too few arguments to userspace dirty log"); + return -EINVAL; + } + + lc = kmalloc(sizeof(*lc), GFP_KERNEL); + if (!lc) { + DMWARN("Unable to allocate userspace log context."); + return -ENOMEM; + } + + lc->ti = ti; + + if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { + DMWARN("UUID argument too long."); + kfree(lc); + return -EINVAL; + } + + strncpy(lc->uuid, argv[0], DM_UUID_LEN); + spin_lock_init(&lc->flush_lock); + INIT_LIST_HEAD(&lc->flush_list); + + str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); + if (str_size < 0) { + kfree(lc); + return str_size; + } + + /* Send table string */ + r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR, + ctr_str, str_size, NULL, NULL); + + if (r == -ESRCH) { + DMERR("Userspace log server not found"); + goto out; + } + + /* Since the region size does not change, get it now */ + rdata_size = sizeof(rdata); + r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE, + NULL, 0, (char *)&rdata, &rdata_size); + + if (r) { + DMERR("Failed to get region size of dirty log"); + goto out; + } + + lc->region_size = (uint32_t)rdata; + lc->region_count = dm_sector_div_up(ti->len, lc->region_size); + +out: + if (r) { + kfree(lc); + kfree(ctr_str); + } else { + lc->usr_argv_str = ctr_str; + lc->usr_argc = argc; + log->context = lc; + } + + return r; +} + +static void userspace_dtr(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR, + NULL, 0, + NULL, NULL); + + kfree(lc->usr_argv_str); + kfree(lc); + + return; +} + +static int userspace_presuspend(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND, + NULL, 0, + NULL, NULL); + + return r; +} + +static int userspace_postsuspend(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND, + NULL, 0, + NULL, NULL); + + return r; +} + +static int userspace_resume(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + lc->in_sync_hint = 0; + r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME, + NULL, 0, + NULL, NULL); + + return r; +} + +static uint32_t userspace_get_region_size(struct dm_dirty_log *log) +{ + struct log_c *lc = log->context; + + return lc->region_size; +} + +/* + * userspace_is_clean + * + * Check whether a region is clean. If there is any sort of + * failure when consulting the server, we return not clean. + * + * Returns: 1 if clean, 0 otherwise + */ +static int userspace_is_clean(struct dm_dirty_log *log, region_t region) +{ + int r; + uint64_t region64 = (uint64_t)region; + int64_t is_clean; + size_t rdata_size; + struct log_c *lc = log->context; + + rdata_size = sizeof(is_clean); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, + (char *)®ion64, sizeof(region64), + (char *)&is_clean, &rdata_size); + + return (r) ? 0 : (int)is_clean; +} + +/* + * userspace_in_sync + * + * Check if the region is in-sync. If there is any sort + * of failure when consulting the server, we assume that + * the region is not in sync. + * + * If 'can_block' is set, return immediately + * + * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK + */ +static int userspace_in_sync(struct dm_dirty_log *log, region_t region, + int can_block) +{ + int r; + uint64_t region64 = region; + int64_t in_sync; + size_t rdata_size; + struct log_c *lc = log->context; + + /* + * We can never respond directly - even if in_sync_hint is + * set. This is because another machine could see a device + * failure and mark the region out-of-sync. If we don't go + * to userspace to ask, we might think the region is in-sync + * and allow a read to pick up data that is stale. (This is + * very unlikely if a device actually fails; but it is very + * likely if a connection to one device from one machine fails.) + * + * There still might be a problem if the mirror caches the region + * state as in-sync... but then this call would not be made. So, + * that is a mirror problem. + */ + if (!can_block) + return -EWOULDBLOCK; + + rdata_size = sizeof(in_sync); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, + (char *)®ion64, sizeof(region64), + (char *)&in_sync, &rdata_size); + return (r) ? 0 : (int)in_sync; +} + +/* + * userspace_flush + * + * This function is ok to block. + * The flush happens in two stages. First, it sends all + * clear/mark requests that are on the list. Then it + * tells the server to commit them. This gives the + * server a chance to optimise the commit, instead of + * doing it for every request. + * + * Additionally, we could implement another thread that + * sends the requests up to the server - reducing the + * load on flush. Then the flush would have less in + * the list and be responsible for the finishing commit. + * + * Returns: 0 on success, < 0 on failure + */ +static int userspace_flush(struct dm_dirty_log *log) +{ + int r = 0; + unsigned long flags; + struct log_c *lc = log->context; + LIST_HEAD(flush_list); + struct flush_entry *fe, *tmp_fe; + + spin_lock_irqsave(&lc->flush_lock, flags); + list_splice_init(&lc->flush_list, &flush_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + if (list_empty(&flush_list)) + return 0; + + /* + * FIXME: Count up requests, group request types, + * allocate memory to stick all requests in and + * send to server in one go. Failing the allocation, + * do it one by one. + */ + + list_for_each_entry(fe, &flush_list, list) { + r = userspace_do_request(lc, lc->uuid, fe->type, + (char *)&fe->region, + sizeof(fe->region), + NULL, NULL); + if (r) + goto fail; + } + + r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, + NULL, 0, NULL, NULL); + +fail: + /* + * We can safely remove these entries, even if failure. + * Calling code will receive an error and will know that + * the log facility has failed. + */ + list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { + list_del(&fe->list); + mempool_free(fe, flush_entry_pool); + } + + if (r) + dm_table_event(lc->ti->table); + + return r; +} + +/* + * userspace_mark_region + * + * This function should avoid blocking unless absolutely required. + * (Memory allocation is valid for blocking.) + */ +static void userspace_mark_region(struct dm_dirty_log *log, region_t region) +{ + unsigned long flags; + struct log_c *lc = log->context; + struct flush_entry *fe; + + /* Wait for an allocation, but _never_ fail */ + fe = mempool_alloc(flush_entry_pool, GFP_NOIO); + BUG_ON(!fe); + + spin_lock_irqsave(&lc->flush_lock, flags); + fe->type = DM_ULOG_MARK_REGION; + fe->region = region; + list_add(&fe->list, &lc->flush_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + return; +} + +/* + * userspace_clear_region + * + * This function must not block. + * So, the alloc can't block. In the worst case, it is ok to + * fail. It would simply mean we can't clear the region. + * Does nothing to current sync context, but does mean + * the region will be re-sync'ed on a reload of the mirror + * even though it is in-sync. + */ +static void userspace_clear_region(struct dm_dirty_log *log, region_t region) +{ + unsigned long flags; + struct log_c *lc = log->context; + struct flush_entry *fe; + + /* + * If we fail to allocate, we skip the clearing of + * the region. This doesn't hurt us in any way, except + * to cause the region to be resync'ed when the + * device is activated next time. + */ + fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); + if (!fe) { + DMERR("Failed to allocate memory to clear region."); + return; + } + + spin_lock_irqsave(&lc->flush_lock, flags); + fe->type = DM_ULOG_CLEAR_REGION; + fe->region = region; + list_add(&fe->list, &lc->flush_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + return; +} + +/* + * userspace_get_resync_work + * + * Get a region that needs recovery. It is valid to return + * an error for this function. + * + * Returns: 1 if region filled, 0 if no work, <0 on error + */ +static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) +{ + int r; + size_t rdata_size; + struct log_c *lc = log->context; + struct { + int64_t i; /* 64-bit for mix arch compatibility */ + region_t r; + } pkg; + + if (lc->in_sync_hint >= lc->region_count) + return 0; + + rdata_size = sizeof(pkg); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, + NULL, 0, + (char *)&pkg, &rdata_size); + + *region = pkg.r; + return (r) ? r : (int)pkg.i; +} + +/* + * userspace_set_region_sync + * + * Set the sync status of a given region. This function + * must not fail. + */ +static void userspace_set_region_sync(struct dm_dirty_log *log, + region_t region, int in_sync) +{ + int r; + struct log_c *lc = log->context; + struct { + region_t r; + int64_t i; + } pkg; + + pkg.r = region; + pkg.i = (int64_t)in_sync; + + r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, + (char *)&pkg, sizeof(pkg), + NULL, NULL); + + /* + * It would be nice to be able to report failures. + * However, it is easy emough to detect and resolve. + */ + return; +} + +/* + * userspace_get_sync_count + * + * If there is any sort of failure when consulting the server, + * we assume that the sync count is zero. + * + * Returns: sync count on success, 0 on failure + */ +static region_t userspace_get_sync_count(struct dm_dirty_log *log) +{ + int r; + size_t rdata_size; + uint64_t sync_count; + struct log_c *lc = log->context; + + rdata_size = sizeof(sync_count); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, + NULL, 0, + (char *)&sync_count, &rdata_size); + + if (r) + return 0; + + if (sync_count >= lc->region_count) + lc->in_sync_hint = lc->region_count; + + return (region_t)sync_count; +} + +/* + * userspace_status + * + * Returns: amount of space consumed + */ +static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, + char *result, unsigned maxlen) +{ + int r = 0; + size_t sz = (size_t)maxlen; + struct log_c *lc = log->context; + + switch (status_type) { + case STATUSTYPE_INFO: + r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, + NULL, 0, + result, &sz); + + if (r) { + sz = 0; + DMEMIT("%s 1 COM_FAILURE", log->type->name); + } + break; + case STATUSTYPE_TABLE: + sz = 0; + DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1, + lc->uuid, lc->usr_argv_str); + break; + } + return (r) ? 0 : (int)sz; +} + +/* + * userspace_is_remote_recovering + * + * Returns: 1 if region recovering, 0 otherwise + */ +static int userspace_is_remote_recovering(struct dm_dirty_log *log, + region_t region) +{ + int r; + uint64_t region64 = region; + struct log_c *lc = log->context; + static unsigned long long limit; + struct { + int64_t is_recovering; + uint64_t in_sync_hint; + } pkg; + size_t rdata_size = sizeof(pkg); + + /* + * Once the mirror has been reported to be in-sync, + * it will never again ask for recovery work. So, + * we can safely say there is not a remote machine + * recovering if the device is in-sync. (in_sync_hint + * must be reset at resume time.) + */ + if (region < lc->in_sync_hint) + return 0; + else if (jiffies < limit) + return 1; + + limit = jiffies + (HZ / 4); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, + (char *)®ion64, sizeof(region64), + (char *)&pkg, &rdata_size); + if (r) + return 1; + + lc->in_sync_hint = pkg.in_sync_hint; + + return (int)pkg.is_recovering; +} + +static struct dm_dirty_log_type _userspace_type = { + .name = "userspace", + .module = THIS_MODULE, + .ctr = userspace_ctr, + .dtr = userspace_dtr, + .presuspend = userspace_presuspend, + .postsuspend = userspace_postsuspend, + .resume = userspace_resume, + .get_region_size = userspace_get_region_size, + .is_clean = userspace_is_clean, + .in_sync = userspace_in_sync, + .flush = userspace_flush, + .mark_region = userspace_mark_region, + .clear_region = userspace_clear_region, + .get_resync_work = userspace_get_resync_work, + .set_region_sync = userspace_set_region_sync, + .get_sync_count = userspace_get_sync_count, + .status = userspace_status, + .is_remote_recovering = userspace_is_remote_recovering, +}; + +static int __init userspace_dirty_log_init(void) +{ + int r = 0; + + flush_entry_pool = mempool_create(100, flush_entry_alloc, + flush_entry_free, NULL); + + if (!flush_entry_pool) { + DMWARN("Unable to create flush_entry_pool: No memory."); + return -ENOMEM; + } + + r = dm_ulog_tfr_init(); + if (r) { + DMWARN("Unable to initialize userspace log communications"); + mempool_destroy(flush_entry_pool); + return r; + } + + r = dm_dirty_log_type_register(&_userspace_type); + if (r) { + DMWARN("Couldn't register userspace dirty log type"); + dm_ulog_tfr_exit(); + mempool_destroy(flush_entry_pool); + return r; + } + + DMINFO("version 1.0.0 loaded"); + return 0; +} + +static void __exit userspace_dirty_log_exit(void) +{ + dm_dirty_log_type_unregister(&_userspace_type); + dm_ulog_tfr_exit(); + mempool_destroy(flush_entry_pool); + + DMINFO("version 1.0.0 unloaded"); + return; +} + +module_init(userspace_dirty_log_init); +module_exit(userspace_dirty_log_exit); + +MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); +MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c new file mode 100644 index 0000000..0ca1ee7 --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.c @@ -0,0 +1,276 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <net/sock.h> +#include <linux/workqueue.h> +#include <linux/connector.h> +#include <linux/device-mapper.h> +#include <linux/dm-log-userspace.h> + +#include "dm-log-userspace-transfer.h" + +static uint32_t dm_ulog_seq; + +/* + * Netlink/Connector is an unreliable protocol. How long should + * we wait for a response before assuming it was lost and retrying? + * (If we do receive a response after this time, it will be discarded + * and the response to the resent request will be waited for. + */ +#define DM_ULOG_RETRY_TIMEOUT (15 * HZ) + +/* + * Pre-allocated space for speed + */ +#define DM_ULOG_PREALLOCED_SIZE 512 +static struct cn_msg *prealloced_cn_msg; +static struct dm_ulog_request *prealloced_ulog_tfr; + +static struct cb_id ulog_cn_id = { + .idx = CN_IDX_DM, + .val = CN_VAL_DM_USERSPACE_LOG +}; + +static DEFINE_MUTEX(dm_ulog_lock); + +struct receiving_pkg { + struct list_head list; + struct completion complete; + + uint32_t seq; + + int error; + size_t *data_size; + char *data; +}; + +static DEFINE_SPINLOCK(receiving_list_lock); +static struct list_head receiving_list; + +static int dm_ulog_sendto_server(struct dm_ulog_request *tfr) +{ + int r; + struct cn_msg *msg = prealloced_cn_msg; + + memset(msg, 0, sizeof(struct cn_msg)); + + msg->id.idx = ulog_cn_id.idx; + msg->id.val = ulog_cn_id.val; + msg->ack = 0; + msg->seq = tfr->seq; + msg->len = sizeof(struct dm_ulog_request) + tfr->data_size; + + r = cn_netlink_send(msg, 0, gfp_any()); + + return r; +} + +/* + * Parameters for this function can be either msg or tfr, but not + * both. This function fills in the reply for a waiting request. + * If just msg is given, then the reply is simply an ACK from userspace + * that the request was received. + * + * Returns: 0 on success, -ENOENT on failure + */ +static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr) +{ + uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0; + struct receiving_pkg *pkg; + + /* + * The 'receiving_pkg' entries in this list are statically + * allocated on the stack in 'dm_consult_userspace'. + * Each process that is waiting for a reply from the user + * space server will have an entry in this list. + * + * We are safe to do it this way because the stack space + * is unique to each process, but still addressable by + * other processes. + */ + list_for_each_entry(pkg, &receiving_list, list) { + if (rtn_seq != pkg->seq) + continue; + + if (msg) { + pkg->error = -msg->ack; + /* + * If we are trying again, we will need to know our + * storage capacity. Otherwise, along with the + * error code, we make explicit that we have no data. + */ + if (pkg->error != -EAGAIN) + *(pkg->data_size) = 0; + } else if (tfr->data_size > *(pkg->data_size)) { + DMERR("Insufficient space to receive package [%u] " + "(%u vs %lu)", tfr->request_type, + tfr->data_size, *(pkg->data_size)); + + *(pkg->data_size) = 0; + pkg->error = -ENOSPC; + } else { + pkg->error = tfr->error; + memcpy(pkg->data, tfr->data, tfr->data_size); + *(pkg->data_size) = tfr->data_size; + } + complete(&pkg->complete); + return 0; + } + + return -ENOENT; +} + +/* + * This is the connector callback that delivers data + * that was sent from userspace. + */ +static void cn_ulog_callback(void *data) +{ + struct cn_msg *msg = (struct cn_msg *)data; + struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); + + spin_lock(&receiving_list_lock); + if (msg->len == 0) + fill_pkg(msg, NULL); + else if (msg->len < sizeof(*tfr)) + DMERR("Incomplete message received (expected %u, got %u): [%u]", + (unsigned)sizeof(*tfr), msg->len, msg->seq); + else + fill_pkg(NULL, tfr); + spin_unlock(&receiving_list_lock); +} + +/** + * dm_consult_userspace + * @uuid: log's uuid (must be DM_UUID_LEN in size) + * @request_type: found in include/linux/dm-log-userspace.h + * @data: data to tx to the server + * @data_size: size of data in bytes + * @rdata: place to put return data from server + * @rdata_size: value-result (amount of space given/amount of space used) + * + * rdata_size is undefined on failure. + * + * Memory used to communicate with userspace is zero'ed + * before populating to ensure that no unwanted bits leak + * from kernel space to user-space. All userspace log communications + * between kernel and user space go through this function. + * + * Returns: 0 on success, -EXXX on failure + **/ +int dm_consult_userspace(const char *uuid, int request_type, + char *data, size_t data_size, + char *rdata, size_t *rdata_size) +{ + int r = 0; + size_t dummy = 0; + int overhead_size = + sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg); + struct dm_ulog_request *tfr = prealloced_ulog_tfr; + struct receiving_pkg pkg; + + if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { + DMINFO("Size of tfr exceeds preallocated size"); + return -EINVAL; + } + + if (!rdata_size) + rdata_size = &dummy; +resend: + /* + * We serialize the sending of requests so we can + * use the preallocated space. + */ + mutex_lock(&dm_ulog_lock); + + memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); + memcpy(tfr->uuid, uuid, DM_UUID_LEN); + tfr->seq = dm_ulog_seq++; + + /* + * Must be valid request type (all other bits set to + * zero). This reserves other bits for possible future + * use. + */ + tfr->request_type = request_type & DM_ULOG_REQUEST_MASK; + + tfr->data_size = data_size; + if (data && data_size) + memcpy(tfr->data, data, data_size); + + memset(&pkg, 0, sizeof(pkg)); + init_completion(&pkg.complete); + pkg.seq = tfr->seq; + pkg.data_size = rdata_size; + pkg.data = rdata; + spin_lock(&receiving_list_lock); + list_add(&(pkg.list), &receiving_list); + spin_unlock(&receiving_list_lock); + + r = dm_ulog_sendto_server(tfr); + + mutex_unlock(&dm_ulog_lock); + + if (r) { + DMERR("Unable to send log request [%u] to userspace: %d", + request_type, r); + spin_lock(&receiving_list_lock); + list_del_init(&(pkg.list)); + spin_unlock(&receiving_list_lock); + + goto out; + } + + r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); + spin_lock(&receiving_list_lock); + list_del_init(&(pkg.list)); + spin_unlock(&receiving_list_lock); + if (!r) { + DMWARN("[%s] Request timed out: [%u/%u] - retrying", + (strlen(uuid) > 8) ? + (uuid + (strlen(uuid) - 8)) : (uuid), + request_type, pkg.seq); + goto resend; + } + + r = pkg.error; + if (r == -EAGAIN) + goto resend; + +out: + return r; +} + +int dm_ulog_tfr_init(void) +{ + int r; + void *prealloced; + + INIT_LIST_HEAD(&receiving_list); + + prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL); + if (!prealloced) + return -ENOMEM; + + prealloced_cn_msg = prealloced; + prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg); + + r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback); + if (r) { + cn_del_callback(&ulog_cn_id); + return r; + } + + return 0; +} + +void dm_ulog_tfr_exit(void) +{ + cn_del_callback(&ulog_cn_id); + kfree(prealloced_cn_msg); +} diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h new file mode 100644 index 0000000..c26d8e4 --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. + */ + +#ifndef __DM_LOG_USERSPACE_TRANSFER_H__ +#define __DM_LOG_USERSPACE_TRANSFER_H__ + +#define DM_MSG_PREFIX "dm-log-userspace" + +int dm_ulog_tfr_init(void); +void dm_ulog_tfr_exit(void); +int dm_consult_userspace(const char *uuid, int request_type, + char *data, size_t data_size, + char *rdata, size_t *rdata_size); + +#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */ diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index be233bc..9443896 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -412,10 +412,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, /* * Buffer holds both header and bitset. */ - buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + - bitset_size, ti->limits.hardsect_size); + buf_size = + dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size, + bdev_logical_block_size(lc->header_location. + bdev)); - if (buf_size > dev->bdev->bd_inode->i_size) { + if (buf_size > i_size_read(dev->bdev->bd_inode)) { DMWARN("log device %s too small: need %llu bytes", dev->name, (unsigned long long)buf_size); kfree(lc); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6a386ab..c70604a 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -8,7 +8,6 @@ #include <linux/device-mapper.h> #include "dm-path-selector.h" -#include "dm-bio-record.h" #include "dm-uevent.h" #include <linux/ctype.h> @@ -35,6 +34,7 @@ struct pgpath { struct dm_path path; struct work_struct deactivate_path; + struct work_struct activate_path; }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -64,8 +64,6 @@ struct multipath { spinlock_t lock; const char *hw_handler_name; - struct work_struct activate_path; - struct pgpath *pgpath_to_activate; unsigned nr_priority_groups; struct list_head priority_groups; unsigned pg_init_required; /* pg_init needs calling? */ @@ -84,7 +82,7 @@ struct multipath { unsigned pg_init_count; /* Number of times pg_init called */ struct work_struct process_queued_ios; - struct bio_list queued_ios; + struct list_head queued_ios; unsigned queue_size; struct work_struct trigger_event; @@ -101,7 +99,7 @@ struct multipath { */ struct dm_mpath_io { struct pgpath *pgpath; - struct dm_bio_details details; + size_t nr_bytes; }; typedef int (*action_fn) (struct pgpath *pgpath); @@ -128,6 +126,7 @@ static struct pgpath *alloc_pgpath(void) if (pgpath) { pgpath->is_active = 1; INIT_WORK(&pgpath->deactivate_path, deactivate_path); + INIT_WORK(&pgpath->activate_path, activate_path); } return pgpath; @@ -160,7 +159,6 @@ static struct priority_group *alloc_priority_group(void) static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) { - unsigned long flags; struct pgpath *pgpath, *tmp; struct multipath *m = ti->private; @@ -169,10 +167,6 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) if (m->hw_handler_name) scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); dm_put_device(ti, pgpath->path.dev); - spin_lock_irqsave(&m->lock, flags); - if (m->pgpath_to_activate == pgpath) - m->pgpath_to_activate = NULL; - spin_unlock_irqrestore(&m->lock, flags); free_pgpath(pgpath); } } @@ -198,11 +192,11 @@ static struct multipath *alloc_multipath(struct dm_target *ti) m = kzalloc(sizeof(*m), GFP_KERNEL); if (m) { INIT_LIST_HEAD(&m->priority_groups); + INIT_LIST_HEAD(&m->queued_ios); spin_lock_init(&m->lock); m->queue_io = 1; INIT_WORK(&m->process_queued_ios, process_queued_ios); INIT_WORK(&m->trigger_event, trigger_event); - INIT_WORK(&m->activate_path, activate_path); m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); if (!m->mpio_pool) { kfree(m); @@ -250,11 +244,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) m->pg_init_count = 0; } -static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) +static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, + size_t nr_bytes) { struct dm_path *path; - path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); + path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes); if (!path) return -ENXIO; @@ -266,7 +261,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) return 0; } -static void __choose_pgpath(struct multipath *m) +static void __choose_pgpath(struct multipath *m, size_t nr_bytes) { struct priority_group *pg; unsigned bypassed = 1; @@ -278,12 +273,12 @@ static void __choose_pgpath(struct multipath *m) if (m->next_pg) { pg = m->next_pg; m->next_pg = NULL; - if (!__choose_path_in_pg(m, pg)) + if (!__choose_path_in_pg(m, pg, nr_bytes)) return; } /* Don't change PG until it has no remaining paths */ - if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) + if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) return; /* @@ -295,7 +290,7 @@ static void __choose_pgpath(struct multipath *m) list_for_each_entry(pg, &m->priority_groups, list) { if (pg->bypassed == bypassed) continue; - if (!__choose_path_in_pg(m, pg)) + if (!__choose_path_in_pg(m, pg, nr_bytes)) return; } } while (bypassed--); @@ -322,19 +317,21 @@ static int __must_push_back(struct multipath *m) dm_noflush_suspending(m->ti)); } -static int map_io(struct multipath *m, struct bio *bio, +static int map_io(struct multipath *m, struct request *clone, struct dm_mpath_io *mpio, unsigned was_queued) { int r = DM_MAPIO_REMAPPED; + size_t nr_bytes = blk_rq_bytes(clone); unsigned long flags; struct pgpath *pgpath; + struct block_device *bdev; spin_lock_irqsave(&m->lock, flags); /* Do we need to select a new pgpath? */ if (!m->current_pgpath || (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) - __choose_pgpath(m); + __choose_pgpath(m, nr_bytes); pgpath = m->current_pgpath; @@ -344,21 +341,28 @@ static int map_io(struct multipath *m, struct bio *bio, if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path)) { /* Queue for the daemon to resubmit */ - bio_list_add(&m->queued_ios, bio); + list_add_tail(&clone->queuelist, &m->queued_ios); m->queue_size++; if ((m->pg_init_required && !m->pg_init_in_progress) || !m->queue_io) queue_work(kmultipathd, &m->process_queued_ios); pgpath = NULL; r = DM_MAPIO_SUBMITTED; - } else if (pgpath) - bio->bi_bdev = pgpath->path.dev->bdev; - else if (__must_push_back(m)) + } else if (pgpath) { + bdev = pgpath->path.dev->bdev; + clone->q = bdev_get_queue(bdev); + clone->rq_disk = bdev->bd_disk; + } else if (__must_push_back(m)) r = DM_MAPIO_REQUEUE; else r = -EIO; /* Failed */ mpio->pgpath = pgpath; + mpio->nr_bytes = nr_bytes; + + if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io) + pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, + nr_bytes); spin_unlock_irqrestore(&m->lock, flags); @@ -396,30 +400,31 @@ static void dispatch_queued_ios(struct multipath *m) { int r; unsigned long flags; - struct bio *bio = NULL, *next; struct dm_mpath_io *mpio; union map_info *info; + struct request *clone, *n; + LIST_HEAD(cl); spin_lock_irqsave(&m->lock, flags); - bio = bio_list_get(&m->queued_ios); + list_splice_init(&m->queued_ios, &cl); spin_unlock_irqrestore(&m->lock, flags); - while (bio) { - next = bio->bi_next; - bio->bi_next = NULL; + list_for_each_entry_safe(clone, n, &cl, queuelist) { + list_del_init(&clone->queuelist); - info = dm_get_mapinfo(bio); + info = dm_get_rq_mapinfo(clone); mpio = info->ptr; - r = map_io(m, bio, mpio, 1); - if (r < 0) - bio_endio(bio, r); - else if (r == DM_MAPIO_REMAPPED) - generic_make_request(bio); - else if (r == DM_MAPIO_REQUEUE) - bio_endio(bio, -EIO); - - bio = next; + r = map_io(m, clone, mpio, 1); + if (r < 0) { + mempool_free(mpio, m->mpio_pool); + dm_kill_unmapped_request(clone, r); + } else if (r == DM_MAPIO_REMAPPED) + dm_dispatch_request(clone); + else if (r == DM_MAPIO_REQUEUE) { + mempool_free(mpio, m->mpio_pool); + dm_requeue_unmapped_request(clone); + } } } @@ -427,8 +432,8 @@ static void process_queued_ios(struct work_struct *work) { struct multipath *m = container_of(work, struct multipath, process_queued_ios); - struct pgpath *pgpath = NULL; - unsigned init_required = 0, must_queue = 1; + struct pgpath *pgpath = NULL, *tmp; + unsigned must_queue = 1; unsigned long flags; spin_lock_irqsave(&m->lock, flags); @@ -437,7 +442,7 @@ static void process_queued_ios(struct work_struct *work) goto out; if (!m->current_pgpath) - __choose_pgpath(m); + __choose_pgpath(m, 0); pgpath = m->current_pgpath; @@ -446,19 +451,15 @@ static void process_queued_ios(struct work_struct *work) must_queue = 0; if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { - m->pgpath_to_activate = pgpath; m->pg_init_count++; m->pg_init_required = 0; - m->pg_init_in_progress = 1; - init_required = 1; + list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) { + if (queue_work(kmpath_handlerd, &tmp->activate_path)) + m->pg_init_in_progress++; + } } - out: spin_unlock_irqrestore(&m->lock, flags); - - if (init_required) - queue_work(kmpath_handlerd, &m->activate_path); - if (!must_queue) dispatch_queued_ios(m); } @@ -553,6 +554,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, return -EINVAL; } + if (ps_argc > as->argc) { + dm_put_path_selector(pst); + ti->error = "not enough arguments for path selector"; + return -EINVAL; + } + r = pst->create(&pg->ps, ps_argc, as->argv); if (r) { dm_put_path_selector(pst); @@ -591,9 +598,20 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, } if (m->hw_handler_name) { - r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev), - m->hw_handler_name); + struct request_queue *q = bdev_get_queue(p->path.dev->bdev); + + r = scsi_dh_attach(q, m->hw_handler_name); + if (r == -EBUSY) { + /* + * Already attached to different hw_handler, + * try to reattach with correct one. + */ + scsi_dh_detach(q); + r = scsi_dh_attach(q, m->hw_handler_name); + } + if (r < 0) { + ti->error = "error attaching hardware handler"; dm_put_device(ti, p->path.dev); goto bad; } @@ -699,6 +717,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m) if (!hw_argc) return 0; + if (hw_argc > as->argc) { + ti->error = "not enough arguments for hardware handler"; + return -EINVAL; + } + m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); request_module("scsi_dh_%s", m->hw_handler_name); if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { @@ -823,6 +846,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, goto bad; } + ti->num_flush_requests = 1; + return 0; bad: @@ -836,25 +861,29 @@ static void multipath_dtr(struct dm_target *ti) flush_workqueue(kmpath_handlerd); flush_workqueue(kmultipathd); + flush_scheduled_work(); free_multipath(m); } /* - * Map bios, recording original fields for later in case we have to resubmit + * Map cloned requests */ -static int multipath_map(struct dm_target *ti, struct bio *bio, +static int multipath_map(struct dm_target *ti, struct request *clone, union map_info *map_context) { int r; struct dm_mpath_io *mpio; struct multipath *m = (struct multipath *) ti->private; - mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); - dm_bio_record(&mpio->details, bio); + mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); + if (!mpio) + /* ENOMEM, requeue */ + return DM_MAPIO_REQUEUE; + memset(mpio, 0, sizeof(*mpio)); map_context->ptr = mpio; - bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT); - r = map_io(m, bio, mpio, 0); + clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; + r = map_io(m, clone, mpio, 0); if (r < 0 || r == DM_MAPIO_REQUEUE) mempool_free(mpio, m->mpio_pool); @@ -924,9 +953,13 @@ static int reinstate_path(struct pgpath *pgpath) pgpath->is_active = 1; - m->current_pgpath = NULL; - if (!m->nr_valid_paths++ && m->queue_size) + if (!m->nr_valid_paths++ && m->queue_size) { + m->current_pgpath = NULL; queue_work(kmultipathd, &m->process_queued_ios); + } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { + if (queue_work(kmpath_handlerd, &pgpath->activate_path)) + m->pg_init_in_progress++; + } dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, pgpath->path.dev->name, m->nr_valid_paths); @@ -1102,87 +1135,70 @@ static void pg_init_done(struct dm_path *path, int errors) spin_lock_irqsave(&m->lock, flags); if (errors) { - DMERR("Could not failover device. Error %d.", errors); - m->current_pgpath = NULL; - m->current_pg = NULL; + if (pgpath == m->current_pgpath) { + DMERR("Could not failover device. Error %d.", errors); + m->current_pgpath = NULL; + m->current_pg = NULL; + } } else if (!m->pg_init_required) { m->queue_io = 0; pg->bypassed = 0; } - m->pg_init_in_progress = 0; - queue_work(kmultipathd, &m->process_queued_ios); + m->pg_init_in_progress--; + if (!m->pg_init_in_progress) + queue_work(kmultipathd, &m->process_queued_ios); spin_unlock_irqrestore(&m->lock, flags); } static void activate_path(struct work_struct *work) { int ret; - struct multipath *m = - container_of(work, struct multipath, activate_path); - struct dm_path *path; - unsigned long flags; + struct pgpath *pgpath = + container_of(work, struct pgpath, activate_path); - spin_lock_irqsave(&m->lock, flags); - path = &m->pgpath_to_activate->path; - m->pgpath_to_activate = NULL; - spin_unlock_irqrestore(&m->lock, flags); - if (!path) - return; - ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev)); - pg_init_done(path, ret); + ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev)); + pg_init_done(&pgpath->path, ret); } /* * end_io handling */ -static int do_end_io(struct multipath *m, struct bio *bio, +static int do_end_io(struct multipath *m, struct request *clone, int error, struct dm_mpath_io *mpio) { + /* + * We don't queue any clone request inside the multipath target + * during end I/O handling, since those clone requests don't have + * bio clones. If we queue them inside the multipath target, + * we need to make bio clones, that requires memory allocation. + * (See drivers/md/dm.c:end_clone_bio() about why the clone requests + * don't have bio clones.) + * Instead of queueing the clone request here, we queue the original + * request into dm core, which will remake a clone request and + * clone bios for it and resubmit it later. + */ + int r = DM_ENDIO_REQUEUE; unsigned long flags; - if (!error) + if (!error && !clone->errors) return 0; /* I/O complete */ - if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) - return error; - if (error == -EOPNOTSUPP) return error; - spin_lock_irqsave(&m->lock, flags); - if (!m->nr_valid_paths) { - if (__must_push_back(m)) { - spin_unlock_irqrestore(&m->lock, flags); - return DM_ENDIO_REQUEUE; - } else if (!m->queue_if_no_path) { - spin_unlock_irqrestore(&m->lock, flags); - return -EIO; - } else { - spin_unlock_irqrestore(&m->lock, flags); - goto requeue; - } - } - spin_unlock_irqrestore(&m->lock, flags); - if (mpio->pgpath) fail_path(mpio->pgpath); - requeue: - dm_bio_restore(&mpio->details, bio); - - /* queue for the daemon to resubmit or fail */ spin_lock_irqsave(&m->lock, flags); - bio_list_add(&m->queued_ios, bio); - m->queue_size++; - if (!m->queue_io) - queue_work(kmultipathd, &m->process_queued_ios); + if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m)) + r = -EIO; spin_unlock_irqrestore(&m->lock, flags); - return DM_ENDIO_INCOMPLETE; /* io not complete */ + return r; } -static int multipath_end_io(struct dm_target *ti, struct bio *bio, +static int multipath_end_io(struct dm_target *ti, struct request *clone, int error, union map_info *map_context) { struct multipath *m = ti->private; @@ -1191,14 +1207,13 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio, struct path_selector *ps; int r; - r = do_end_io(m, bio, error, mpio); + r = do_end_io(m, clone, error, mpio); if (pgpath) { ps = &pgpath->pg->ps; if (ps->type->end_io) - ps->type->end_io(ps, &pgpath->path); + ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); } - if (r != DM_ENDIO_INCOMPLETE) - mempool_free(mpio, m->mpio_pool); + mempool_free(mpio, m->mpio_pool); return r; } @@ -1411,7 +1426,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, spin_lock_irqsave(&m->lock, flags); if (!m->current_pgpath) - __choose_pgpath(m); + __choose_pgpath(m, 0); if (m->current_pgpath) { bdev = m->current_pgpath->path.dev->bdev; @@ -1428,22 +1443,113 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); } +static int multipath_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct multipath *m = ti->private; + struct priority_group *pg; + struct pgpath *p; + int ret = 0; + + list_for_each_entry(pg, &m->priority_groups, list) { + list_for_each_entry(p, &pg->pgpaths, list) { + ret = fn(ti, p->path.dev, ti->begin, data); + if (ret) + goto out; + } + } + +out: + return ret; +} + +static int __pgpath_busy(struct pgpath *pgpath) +{ + struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); + + return dm_underlying_device_busy(q); +} + +/* + * We return "busy", only when we can map I/Os but underlying devices + * are busy (so even if we map I/Os now, the I/Os will wait on + * the underlying queue). + * In other words, if we want to kill I/Os or queue them inside us + * due to map unavailability, we don't return "busy". Otherwise, + * dm core won't give us the I/Os and we can't do what we want. + */ +static int multipath_busy(struct dm_target *ti) +{ + int busy = 0, has_active = 0; + struct multipath *m = ti->private; + struct priority_group *pg; + struct pgpath *pgpath; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + + /* Guess which priority_group will be used at next mapping time */ + if (unlikely(!m->current_pgpath && m->next_pg)) + pg = m->next_pg; + else if (likely(m->current_pg)) + pg = m->current_pg; + else + /* + * We don't know which pg will be used at next mapping time. + * We don't call __choose_pgpath() here to avoid to trigger + * pg_init just by busy checking. + * So we don't know whether underlying devices we will be using + * at next mapping time are busy or not. Just try mapping. + */ + goto out; + + /* + * If there is one non-busy active path at least, the path selector + * will be able to select it. So we consider such a pg as not busy. + */ + busy = 1; + list_for_each_entry(pgpath, &pg->pgpaths, list) + if (pgpath->is_active) { + has_active = 1; + + if (!__pgpath_busy(pgpath)) { + busy = 0; + break; + } + } + + if (!has_active) + /* + * No active path in this pg, so this pg won't be used and + * the current_pg will be changed at next mapping time. + * We need to try mapping to determine it. + */ + busy = 0; + +out: + spin_unlock_irqrestore(&m->lock, flags); + + return busy; +} + /*----------------------------------------------------------------- * Module setup *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 0, 5}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, - .map = multipath_map, - .end_io = multipath_end_io, + .map_rq = multipath_map, + .rq_end_io = multipath_end_io, .presuspend = multipath_presuspend, .resume = multipath_resume, .status = multipath_status, .message = multipath_message, .ioctl = multipath_ioctl, + .iterate_devices = multipath_iterate_devices, + .busy = multipath_busy, }; static int __init dm_multipath_init(void) diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index 27357b8..e7d1fa8 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -56,7 +56,8 @@ struct path_selector_type { * the path fails. */ struct dm_path *(*select_path) (struct path_selector *ps, - unsigned *repeat_count); + unsigned *repeat_count, + size_t nr_bytes); /* * Notify the selector that a path has failed. @@ -75,7 +76,10 @@ struct path_selector_type { int (*status) (struct path_selector *ps, struct dm_path *path, status_type_t type, char *result, unsigned int maxlen); - int (*end_io) (struct path_selector *ps, struct dm_path *path); + int (*start_io) (struct path_selector *ps, struct dm_path *path, + size_t nr_bytes); + int (*end_io) (struct path_selector *ps, struct dm_path *path, + size_t nr_bytes); }; /* Register a path selector */ diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c new file mode 100644 index 0000000..f92b6ce --- /dev/null +++ b/drivers/md/dm-queue-length.c @@ -0,0 +1,263 @@ +/* + * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved. + * Copyright (C) 2006-2009 NEC Corporation. + * + * dm-queue-length.c + * + * Module Author: Stefan Bader, IBM + * Modified by: Kiyoshi Ueda, NEC + * + * This file is released under the GPL. + * + * queue-length path selector - choose a path with the least number of + * in-flight I/Os. + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <asm/atomic.h> + +#define DM_MSG_PREFIX "multipath queue-length" +#define QL_MIN_IO 128 +#define QL_VERSION "0.1.0" + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned repeat_count; + atomic_t qlen; /* the number of in-flight I/Os */ +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + } + + return s; +} + +static int ql_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s = alloc_selector(); + + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void ql_free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void ql_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + ql_free_paths(&s->valid_paths); + ql_free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int ql_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned maxlen) +{ + unsigned sz = 0; + struct path_info *pi; + + /* When called with NULL path, return selector status/args. */ + if (!path) + DMEMIT("0 "); + else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d ", atomic_read(&pi->qlen)); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u ", pi->repeat_count); + break; + } + } + + return sz; +} + +static int ql_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned repeat_count = QL_MIN_IO; + + /* + * Arguments: [<repeat_count>] + * <repeat_count>: The number of I/Os before switching path. + * If not given, default (QL_MIN_IO) is used. + */ + if (argc > 1) { + *error = "queue-length ps: incorrect number of arguments"; + return -EINVAL; + } + + if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { + *error = "queue-length ps: invalid repeat count"; + return -EINVAL; + } + + /* Allocate the path information structure */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "queue-length ps: Error allocating path information"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + atomic_set(&pi->qlen, 0); + + path->pscontext = pi; + + list_add_tail(&pi->list, &s->valid_paths); + + return 0; +} + +static void ql_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move(&pi->list, &s->failed_paths); +} + +static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move_tail(&pi->list, &s->valid_paths); + + return 0; +} + +/* + * Select a path having the minimum number of in-flight I/Os + */ +static struct dm_path *ql_select_path(struct path_selector *ps, + unsigned *repeat_count, size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + + if (list_empty(&s->valid_paths)) + return NULL; + + /* Change preferred (first in list) path to evenly balance. */ + list_move_tail(s->valid_paths.next, &s->valid_paths); + + list_for_each_entry(pi, &s->valid_paths, list) { + if (!best || + (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) + best = pi; + + if (!atomic_read(&best->qlen)) + break; + } + + if (!best) + return NULL; + + *repeat_count = best->repeat_count; + + return best->path; +} + +static int ql_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_inc(&pi->qlen); + + return 0; +} + +static int ql_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_dec(&pi->qlen); + + return 0; +} + +static struct path_selector_type ql_ps = { + .name = "queue-length", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 1, + .create = ql_create, + .destroy = ql_destroy, + .status = ql_status, + .add_path = ql_add_path, + .fail_path = ql_fail_path, + .reinstate_path = ql_reinstate_path, + .select_path = ql_select_path, + .start_io = ql_start_io, + .end_io = ql_end_io, +}; + +static int __init dm_ql_init(void) +{ + int r = dm_register_path_selector(&ql_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " QL_VERSION " loaded"); + + return r; +} + +static void __exit dm_ql_exit(void) +{ + int r = dm_unregister_path_selector(&ql_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_ql_init); +module_exit(dm_ql_exit); + +MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>"); +MODULE_DESCRIPTION( + "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n" + DM_NAME " path selector to balance the number of in-flight I/Os" +); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 076fbb4..ce8868c 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1283,9 +1283,23 @@ static int mirror_status(struct dm_target *ti, status_type_t type, return 0; } +static int mirror_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct mirror_set *ms = ti->private; + int ret = 0; + unsigned i; + + for (i = 0; !ret && i < ms->nr_mirrors; i++) + ret = fn(ti, ms->mirror[i].dev, + ms->mirror[i].offset, data); + + return ret; +} + static struct target_type mirror_target = { .name = "mirror", - .version = {1, 0, 20}, + .version = {1, 12, 0}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, @@ -1295,6 +1309,7 @@ static struct target_type mirror_target = { .postsuspend = mirror_postsuspend, .resume = mirror_resume, .status = mirror_status, + .iterate_devices = mirror_iterate_devices, }; static int __init dm_mirror_init(void) diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 7b899be..36dbe29 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -283,7 +283,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); if (unlikely(!nreg)) - nreg = kmalloc(sizeof(*nreg), GFP_NOIO); + nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? DM_RH_CLEAN : DM_RH_NOSYNC; diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index cdfbf65..24752f4 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) } static struct dm_path *rr_select_path(struct path_selector *ps, - unsigned *repeat_count) + unsigned *repeat_count, size_t nr_bytes) { struct selector *s = (struct selector *) ps->context; struct path_info *pi = NULL; diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c new file mode 100644 index 0000000..cfa668f --- /dev/null +++ b/drivers/md/dm-service-time.c @@ -0,0 +1,339 @@ +/* + * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. + * + * Module Author: Kiyoshi Ueda + * + * This file is released under the GPL. + * + * Throughput oriented path selector. + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#define DM_MSG_PREFIX "multipath service-time" +#define ST_MIN_IO 1 +#define ST_MAX_RELATIVE_THROUGHPUT 100 +#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 +#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) +#define ST_VERSION "0.2.0" + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned repeat_count; + unsigned relative_throughput; + atomic_t in_flight_size; /* Total size of in-flight I/Os */ +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + } + + return s; +} + +static int st_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s = alloc_selector(); + + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void st_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int st_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned maxlen) +{ + unsigned sz = 0; + struct path_info *pi; + + if (!path) + DMEMIT("0 "); + else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), + pi->relative_throughput); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u %u ", pi->repeat_count, + pi->relative_throughput); + break; + } + } + + return sz; +} + +static int st_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned repeat_count = ST_MIN_IO; + unsigned relative_throughput = 1; + + /* + * Arguments: [<repeat_count> [<relative_throughput>]] + * <repeat_count>: The number of I/Os before switching path. + * If not given, default (ST_MIN_IO) is used. + * <relative_throughput>: The relative throughput value of + * the path among all paths in the path-group. + * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT> + * If not given, minimum value '1' is used. + * If '0' is given, the path isn't selected while + * other paths having a positive value are + * available. + */ + if (argc > 2) { + *error = "service-time ps: incorrect number of arguments"; + return -EINVAL; + } + + if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { + *error = "service-time ps: invalid repeat count"; + return -EINVAL; + } + + if ((argc == 2) && + (sscanf(argv[1], "%u", &relative_throughput) != 1 || + relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { + *error = "service-time ps: invalid relative_throughput value"; + return -EINVAL; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "service-time ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + pi->relative_throughput = relative_throughput; + atomic_set(&pi->in_flight_size, 0); + + path->pscontext = pi; + + list_add_tail(&pi->list, &s->valid_paths); + + return 0; +} + +static void st_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move(&pi->list, &s->failed_paths); +} + +static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move_tail(&pi->list, &s->valid_paths); + + return 0; +} + +/* + * Compare the estimated service time of 2 paths, pi1 and pi2, + * for the incoming I/O. + * + * Returns: + * < 0 : pi1 is better + * 0 : no difference between pi1 and pi2 + * > 0 : pi2 is better + * + * Description: + * Basically, the service time is estimated by: + * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' + * To reduce the calculation, some optimizations are made. + * (See comments inline) + */ +static int st_compare_load(struct path_info *pi1, struct path_info *pi2, + size_t incoming) +{ + size_t sz1, sz2, st1, st2; + + sz1 = atomic_read(&pi1->in_flight_size); + sz2 = atomic_read(&pi2->in_flight_size); + + /* + * Case 1: Both have same throughput value. Choose less loaded path. + */ + if (pi1->relative_throughput == pi2->relative_throughput) + return sz1 - sz2; + + /* + * Case 2a: Both have same load. Choose higher throughput path. + * Case 2b: One path has no throughput value. Choose the other one. + */ + if (sz1 == sz2 || + !pi1->relative_throughput || !pi2->relative_throughput) + return pi2->relative_throughput - pi1->relative_throughput; + + /* + * Case 3: Calculate service time. Choose faster path. + * Service time using pi1: + * st1 = (sz1 + incoming) / pi1->relative_throughput + * Service time using pi2: + * st2 = (sz2 + incoming) / pi2->relative_throughput + * + * To avoid the division, transform the expression to use + * multiplication. + * Because ->relative_throughput > 0 here, if st1 < st2, + * the expressions below are the same meaning: + * (sz1 + incoming) / pi1->relative_throughput < + * (sz2 + incoming) / pi2->relative_throughput + * (sz1 + incoming) * pi2->relative_throughput < + * (sz2 + incoming) * pi1->relative_throughput + * So use the later one. + */ + sz1 += incoming; + sz2 += incoming; + if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || + sz2 >= ST_MAX_INFLIGHT_SIZE)) { + /* + * Size may be too big for multiplying pi->relative_throughput + * and overflow. + * To avoid the overflow and mis-selection, shift down both. + */ + sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; + sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; + } + st1 = sz1 * pi2->relative_throughput; + st2 = sz2 * pi1->relative_throughput; + if (st1 != st2) + return st1 - st2; + + /* + * Case 4: Service time is equal. Choose higher throughput path. + */ + return pi2->relative_throughput - pi1->relative_throughput; +} + +static struct dm_path *st_select_path(struct path_selector *ps, + unsigned *repeat_count, size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + + if (list_empty(&s->valid_paths)) + return NULL; + + /* Change preferred (first in list) path to evenly balance. */ + list_move_tail(s->valid_paths.next, &s->valid_paths); + + list_for_each_entry(pi, &s->valid_paths, list) + if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) + best = pi; + + if (!best) + return NULL; + + *repeat_count = best->repeat_count; + + return best->path; +} + +static int st_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_add(nr_bytes, &pi->in_flight_size); + + return 0; +} + +static int st_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_sub(nr_bytes, &pi->in_flight_size); + + return 0; +} + +static struct path_selector_type st_ps = { + .name = "service-time", + .module = THIS_MODULE, + .table_args = 2, + .info_args = 2, + .create = st_create, + .destroy = st_destroy, + .status = st_status, + .add_path = st_add_path, + .fail_path = st_fail_path, + .reinstate_path = st_reinstate_path, + .select_path = st_select_path, + .start_io = st_start_io, + .end_io = st_end_io, +}; + +static int __init dm_st_init(void) +{ + int r = dm_register_path_selector(&st_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " ST_VERSION " loaded"); + + return r; +} + +static void __exit dm_st_exit(void) +{ + int r = dm_unregister_path_selector(&st_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_st_init); +module_exit(dm_st_exit); + +MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); +MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index e75c6dd..6e3fe4f 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -282,7 +282,7 @@ static int read_header(struct pstore *ps, int *new_snapshot) */ if (!ps->store->chunk_size) { ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, - bdev_hardsect_size(ps->store->cow->bdev) >> 9); + bdev_logical_block_size(ps->store->cow->bdev) >> 9); ps->store->chunk_mask = ps->store->chunk_size - 1; ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; chunk_size_supplied = 0; @@ -636,7 +636,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, /* * Commit exceptions to disk. */ - if (ps->valid && area_io(ps, WRITE)) + if (ps->valid && area_io(ps, WRITE_BARRIER)) ps->valid = 0; /* diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index d73f17f..d573165 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -678,6 +678,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = s; ti->split_io = s->store->chunk_size; + ti->num_flush_requests = 1; return 0; @@ -1030,6 +1031,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, chunk_t chunk; struct dm_snap_pending_exception *pe = NULL; + if (unlikely(bio_empty_barrier(bio))) { + bio->bi_bdev = s->store->cow->bdev; + return DM_MAPIO_REMAPPED; + } + chunk = sector_to_chunk(s->store, bio->bi_sector); /* Full snapshots are not usable */ @@ -1338,6 +1344,8 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ti->private = dev; + ti->num_flush_requests = 1; + return 0; } @@ -1353,6 +1361,9 @@ static int origin_map(struct dm_target *ti, struct bio *bio, struct dm_dev *dev = ti->private; bio->bi_bdev = dev->bdev; + if (unlikely(bio_empty_barrier(bio))) + return DM_MAPIO_REMAPPED; + /* Only tell snapshots if this is a write */ return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; } diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 41569bc..b240e85 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -167,6 +167,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) sc->stripes = stripes; sc->stripe_width = width; ti->split_io = chunk_size; + ti->num_flush_requests = stripes; sc->chunk_mask = ((sector_t) chunk_size) - 1; for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) @@ -211,10 +212,18 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { struct stripe_c *sc = (struct stripe_c *) ti->private; + sector_t offset, chunk; + uint32_t stripe; - sector_t offset = bio->bi_sector - ti->begin; - sector_t chunk = offset >> sc->chunk_shift; - uint32_t stripe = sector_div(chunk, sc->stripes); + if (unlikely(bio_empty_barrier(bio))) { + BUG_ON(map_context->flush_request >= sc->stripes); + bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev; + return DM_MAPIO_REMAPPED; + } + + offset = bio->bi_sector - ti->begin; + chunk = offset >> sc->chunk_shift; + stripe = sector_div(chunk, sc->stripes); bio->bi_bdev = sc->stripe[stripe].dev->bdev; bio->bi_sector = sc->stripe[stripe].physical_start + @@ -304,15 +313,31 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, return error; } +static int stripe_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct stripe_c *sc = ti->private; + int ret = 0; + unsigned i = 0; + + do + ret = fn(ti, sc->stripe[i].dev, + sc->stripe[i].physical_start, data); + while (!ret && ++i < sc->stripes); + + return ret; +} + static struct target_type stripe_target = { .name = "striped", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, .map = stripe_map, .end_io = stripe_end_io, .status = stripe_status, + .iterate_devices = stripe_iterate_devices, }; int __init dm_stripe_init(void) diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index a2a45e6..4b04590 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -57,12 +57,21 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) return strlen(buf); } +static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) +{ + sprintf(buf, "%d\n", dm_suspended(md)); + + return strlen(buf); +} + static DM_ATTR_RO(name); static DM_ATTR_RO(uuid); +static DM_ATTR_RO(suspended); static struct attribute *dm_attrs[] = { &dm_attr_name.attr, &dm_attr_uuid.attr, + &dm_attr_suspended.attr, NULL, }; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 429b50b..4899ebe 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -41,6 +41,7 @@ struct dm_table { struct mapped_device *md; atomic_t holders; + unsigned type; /* btree table */ unsigned int depth; @@ -62,15 +63,11 @@ struct dm_table { /* a list of devices used by this table */ struct list_head devices; - /* - * These are optimistic limits taken from all the - * targets, some targets will need smaller limits. - */ - struct io_restrictions limits; - /* events get handed up using this callback */ void (*event_fn)(void *); void *event_context; + + struct dm_md_mempools *mempools; }; /* @@ -89,42 +86,6 @@ static unsigned int int_log(unsigned int n, unsigned int base) } /* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - -/* - * Combine two io_restrictions, always taking the lower value. - */ -static void combine_restrictions_low(struct io_restrictions *lhs, - struct io_restrictions *rhs) -{ - lhs->max_sectors = - min_not_zero(lhs->max_sectors, rhs->max_sectors); - - lhs->max_phys_segments = - min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments); - - lhs->max_hw_segments = - min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); - - lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size); - - lhs->max_segment_size = - min_not_zero(lhs->max_segment_size, rhs->max_segment_size); - - lhs->max_hw_sectors = - min_not_zero(lhs->max_hw_sectors, rhs->max_hw_sectors); - - lhs->seg_boundary_mask = - min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask); - - lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn); - - lhs->no_cluster |= rhs->no_cluster; -} - -/* * Calculate the index of the child node of the n'th node k'th key. */ static inline unsigned int get_child(unsigned int n, unsigned int k) @@ -266,6 +227,8 @@ static void free_devices(struct list_head *devices) list_for_each_safe(tmp, next, devices) { struct dm_dev_internal *dd = list_entry(tmp, struct dm_dev_internal, list); + DMWARN("dm_table_destroy: dm_put_device call missing for %s", + dd->dm_dev.name); kfree(dd); } } @@ -295,12 +258,10 @@ void dm_table_destroy(struct dm_table *t) vfree(t->highs); /* free the device list */ - if (t->devices.next != &t->devices) { - DMWARN("devices still present during destroy: " - "dm_table_remove_device calls missing"); - + if (t->devices.next != &t->devices) free_devices(&t->devices); - } + + dm_free_md_mempools(t->mempools); kfree(t); } @@ -384,15 +345,48 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) /* * If possible, this checks an area of a destination device is valid. */ -static int check_device_area(struct dm_dev_internal *dd, sector_t start, - sector_t len) +static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev, + sector_t start, void *data) { - sector_t dev_size = dd->dm_dev.bdev->bd_inode->i_size >> SECTOR_SHIFT; + struct queue_limits *limits = data; + struct block_device *bdev = dev->bdev; + sector_t dev_size = + i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + unsigned short logical_block_size_sectors = + limits->logical_block_size >> SECTOR_SHIFT; + char b[BDEVNAME_SIZE]; if (!dev_size) return 1; - return ((start < dev_size) && (len <= (dev_size - start))); + if ((start >= dev_size) || (start + ti->len > dev_size)) { + DMWARN("%s: %s too small for target", + dm_device_name(ti->table->md), bdevname(bdev, b)); + return 0; + } + + if (logical_block_size_sectors <= 1) + return 1; + + if (start & (logical_block_size_sectors - 1)) { + DMWARN("%s: start=%llu not aligned to h/w " + "logical block size %hu of %s", + dm_device_name(ti->table->md), + (unsigned long long)start, + limits->logical_block_size, bdevname(bdev, b)); + return 0; + } + + if (ti->len & (logical_block_size_sectors - 1)) { + DMWARN("%s: len=%llu not aligned to h/w " + "logical block size %hu of %s", + dm_device_name(ti->table->md), + (unsigned long long)ti->len, + limits->logical_block_size, bdevname(bdev, b)); + return 0; + } + + return 1; } /* @@ -478,38 +472,32 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, } atomic_inc(&dd->count); - if (!check_device_area(dd, start, len)) { - DMWARN("device %s too small for target", path); - dm_put_device(ti, &dd->dm_dev); - return -EINVAL; - } - *result = &dd->dm_dev; - return 0; } -void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) +/* + * Returns the minimum that is _not_ zero, unless both are zero. + */ +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) + +int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, + sector_t start, void *data) { + struct queue_limits *limits = data; + struct block_device *bdev = dev->bdev; struct request_queue *q = bdev_get_queue(bdev); - struct io_restrictions *rs = &ti->limits; char b[BDEVNAME_SIZE]; if (unlikely(!q)) { DMWARN("%s: Cannot set limits for nonexistent device %s", dm_device_name(ti->table->md), bdevname(bdev, b)); - return; + return 0; } - /* - * Combine the device limits low. - * - * FIXME: if we move an io_restriction struct - * into q this would just be a call to - * combine_restrictions_low() - */ - rs->max_sectors = - min_not_zero(rs->max_sectors, q->max_sectors); + if (blk_stack_limits(limits, &q->limits, start) < 0) + DMWARN("%s: target device %s is misaligned", + dm_device_name(ti->table->md), bdevname(bdev, b)); /* * Check if merge fn is supported. @@ -518,47 +506,21 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) */ if (q->merge_bvec_fn && !ti->type->merge) - rs->max_sectors = - min_not_zero(rs->max_sectors, + limits->max_sectors = + min_not_zero(limits->max_sectors, (unsigned int) (PAGE_SIZE >> 9)); - - rs->max_phys_segments = - min_not_zero(rs->max_phys_segments, - q->max_phys_segments); - - rs->max_hw_segments = - min_not_zero(rs->max_hw_segments, q->max_hw_segments); - - rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); - - rs->max_segment_size = - min_not_zero(rs->max_segment_size, q->max_segment_size); - - rs->max_hw_sectors = - min_not_zero(rs->max_hw_sectors, q->max_hw_sectors); - - rs->seg_boundary_mask = - min_not_zero(rs->seg_boundary_mask, - q->seg_boundary_mask); - - rs->bounce_pfn = min_not_zero(rs->bounce_pfn, q->bounce_pfn); - - rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); + return 0; } EXPORT_SYMBOL_GPL(dm_set_device_limits); int dm_get_device(struct dm_target *ti, const char *path, sector_t start, sector_t len, fmode_t mode, struct dm_dev **result) { - int r = __table_get_device(ti->table, ti, path, - start, len, mode, result); - - if (!r) - dm_set_device_limits(ti, (*result)->bdev); - - return r; + return __table_get_device(ti->table, ti, path, + start, len, mode, result); } + /* * Decrement a devices use count and remove it if necessary. */ @@ -673,24 +635,78 @@ int dm_split_args(int *argc, char ***argvp, char *input) return 0; } -static void check_for_valid_limits(struct io_restrictions *rs) +/* + * Impose necessary and sufficient conditions on a devices's table such + * that any incoming bio which respects its logical_block_size can be + * processed successfully. If it falls across the boundary between + * two or more targets, the size of each piece it gets split into must + * be compatible with the logical_block_size of the target processing it. + */ +static int validate_hardware_logical_block_alignment(struct dm_table *table, + struct queue_limits *limits) { - if (!rs->max_sectors) - rs->max_sectors = SAFE_MAX_SECTORS; - if (!rs->max_hw_sectors) - rs->max_hw_sectors = SAFE_MAX_SECTORS; - if (!rs->max_phys_segments) - rs->max_phys_segments = MAX_PHYS_SEGMENTS; - if (!rs->max_hw_segments) - rs->max_hw_segments = MAX_HW_SEGMENTS; - if (!rs->hardsect_size) - rs->hardsect_size = 1 << SECTOR_SHIFT; - if (!rs->max_segment_size) - rs->max_segment_size = MAX_SEGMENT_SIZE; - if (!rs->seg_boundary_mask) - rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - if (!rs->bounce_pfn) - rs->bounce_pfn = -1; + /* + * This function uses arithmetic modulo the logical_block_size + * (in units of 512-byte sectors). + */ + unsigned short device_logical_block_size_sects = + limits->logical_block_size >> SECTOR_SHIFT; + + /* + * Offset of the start of the next table entry, mod logical_block_size. + */ + unsigned short next_target_start = 0; + + /* + * Given an aligned bio that extends beyond the end of a + * target, how many sectors must the next target handle? + */ + unsigned short remaining = 0; + + struct dm_target *uninitialized_var(ti); + struct queue_limits ti_limits; + unsigned i = 0; + + /* + * Check each entry in the table in turn. + */ + while (i < dm_table_get_num_targets(table)) { + ti = dm_table_get_target(table, i++); + + blk_set_default_limits(&ti_limits); + + /* combine all target devices' limits */ + if (ti->type->iterate_devices) + ti->type->iterate_devices(ti, dm_set_device_limits, + &ti_limits); + + /* + * If the remaining sectors fall entirely within this + * table entry are they compatible with its logical_block_size? + */ + if (remaining < ti->len && + remaining & ((ti_limits.logical_block_size >> + SECTOR_SHIFT) - 1)) + break; /* Error */ + + next_target_start = + (unsigned short) ((next_target_start + ti->len) & + (device_logical_block_size_sects - 1)); + remaining = next_target_start ? + device_logical_block_size_sects - next_target_start : 0; + } + + if (remaining) { + DMWARN("%s: table line %u (start sect %llu len %llu) " + "not aligned to h/w logical block size %hu", + dm_device_name(table->md), i, + (unsigned long long) ti->begin, + (unsigned long long) ti->len, + limits->logical_block_size); + return -EINVAL; + } + + return 0; } int dm_table_add_target(struct dm_table *t, const char *type, @@ -745,9 +761,6 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; - /* FIXME: the plan is to combine high here and then have - * the merge fn apply the target level restrictions. */ - combine_restrictions_low(&t->limits, &tgt->limits); return 0; bad: @@ -756,6 +769,104 @@ int dm_table_add_target(struct dm_table *t, const char *type, return r; } +int dm_table_set_type(struct dm_table *t) +{ + unsigned i; + unsigned bio_based = 0, request_based = 0; + struct dm_target *tgt; + struct dm_dev_internal *dd; + struct list_head *devices; + + for (i = 0; i < t->num_targets; i++) { + tgt = t->targets + i; + if (dm_target_request_based(tgt)) + request_based = 1; + else + bio_based = 1; + + if (bio_based && request_based) { + DMWARN("Inconsistent table: different target types" + " can't be mixed up"); + return -EINVAL; + } + } + + if (bio_based) { + /* We must use this table as bio-based */ + t->type = DM_TYPE_BIO_BASED; + return 0; + } + + BUG_ON(!request_based); /* No targets in this table */ + + /* Non-request-stackable devices can't be used for request-based dm */ + devices = dm_table_get_devices(t); + list_for_each_entry(dd, devices, list) { + if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) { + DMWARN("table load rejected: including" + " non-request-stackable devices"); + return -EINVAL; + } + } + + /* + * Request-based dm supports only tables that have a single target now. + * To support multiple targets, request splitting support is needed, + * and that needs lots of changes in the block-layer. + * (e.g. request completion process for partial completion.) + */ + if (t->num_targets > 1) { + DMWARN("Request-based dm doesn't support multiple targets yet"); + return -EINVAL; + } + + t->type = DM_TYPE_REQUEST_BASED; + + return 0; +} + +unsigned dm_table_get_type(struct dm_table *t) +{ + return t->type; +} + +bool dm_table_bio_based(struct dm_table *t) +{ + return dm_table_get_type(t) == DM_TYPE_BIO_BASED; +} + +bool dm_table_request_based(struct dm_table *t) +{ + return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; +} + +int dm_table_alloc_md_mempools(struct dm_table *t) +{ + unsigned type = dm_table_get_type(t); + + if (unlikely(type == DM_TYPE_NONE)) { + DMWARN("no table type is set, can't allocate mempools"); + return -EINVAL; + } + + t->mempools = dm_alloc_md_mempools(type); + if (!t->mempools) + return -ENOMEM; + + return 0; +} + +void dm_table_free_md_mempools(struct dm_table *t) +{ + dm_free_md_mempools(t->mempools); + t->mempools = NULL; +} + +struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t) +{ + return t->mempools; +} + static int setup_indexes(struct dm_table *t) { int i; @@ -790,8 +901,6 @@ int dm_table_complete(struct dm_table *t) int r = 0; unsigned int leaf_nodes; - check_for_valid_limits(&t->limits); - /* how many indexes will the btree have ? */ leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); @@ -867,6 +976,57 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) } /* + * Establish the new table's queue_limits and validate them. + */ +int dm_calculate_queue_limits(struct dm_table *table, + struct queue_limits *limits) +{ + struct dm_target *uninitialized_var(ti); + struct queue_limits ti_limits; + unsigned i = 0; + + blk_set_default_limits(limits); + + while (i < dm_table_get_num_targets(table)) { + blk_set_default_limits(&ti_limits); + + ti = dm_table_get_target(table, i++); + + if (!ti->type->iterate_devices) + goto combine_limits; + + /* + * Combine queue limits of all the devices this target uses. + */ + ti->type->iterate_devices(ti, dm_set_device_limits, + &ti_limits); + + /* + * Check each device area is consistent with the target's + * overall queue limits. + */ + if (!ti->type->iterate_devices(ti, device_area_is_valid, + &ti_limits)) + return -EINVAL; + +combine_limits: + /* + * Merge this target's queue limits into the overall limits + * for the table. + */ + if (blk_stack_limits(limits, &ti_limits, 0) < 0) + DMWARN("%s: target device " + "(start sect %llu len %llu) " + "is misaligned", + dm_device_name(table->md), + (unsigned long long) ti->begin, + (unsigned long long) ti->len); + } + + return validate_hardware_logical_block_alignment(table, limits); +} + +/* * Set the integrity profile for this device if all devices used have * matching profiles. */ @@ -905,27 +1065,42 @@ no_integrity: return; } -void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) +void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *limits) { /* - * Make sure we obey the optimistic sub devices - * restrictions. + * Each target device in the table has a data area that should normally + * be aligned such that the DM device's alignment_offset is 0. + * FIXME: Propagate alignment_offsets up the stack and warn of + * sub-optimal or inconsistent settings. + */ + limits->alignment_offset = 0; + limits->misaligned = 0; + + /* + * Copy table's limits to the DM device's request_queue */ - blk_queue_max_sectors(q, t->limits.max_sectors); - q->max_phys_segments = t->limits.max_phys_segments; - q->max_hw_segments = t->limits.max_hw_segments; - q->hardsect_size = t->limits.hardsect_size; - q->max_segment_size = t->limits.max_segment_size; - q->max_hw_sectors = t->limits.max_hw_sectors; - q->seg_boundary_mask = t->limits.seg_boundary_mask; - q->bounce_pfn = t->limits.bounce_pfn; - - if (t->limits.no_cluster) + q->limits = *limits; + + if (limits->no_cluster) queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); else queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); dm_table_set_integrity(t); + + /* + * QUEUE_FLAG_STACKABLE must be set after all queue settings are + * visible to other CPUs because, once the flag is set, incoming bios + * are processed by request-based dm, which refers to the queue + * settings. + * Until the flag set, bios are passed to bio-based dm and queued to + * md->deferred where queue settings are not needed yet. + * Those bios are passed to request-based dm at the resume time. + */ + smp_mb(); + if (dm_table_request_based(t)) + queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); } unsigned int dm_table_get_num_targets(struct dm_table *t) @@ -1021,6 +1196,20 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) return r; } +int dm_table_any_busy_target(struct dm_table *t) +{ + unsigned i; + struct dm_target *ti; + + for (i = 0; i < t->num_targets; i++) { + ti = t->targets + i; + if (ti->type->busy && ti->type->busy(ti)) + return 1; + } + + return 0; +} + void dm_table_unplug_all(struct dm_table *t) { struct dm_dev_internal *dd; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 424f7b0..3c6d4ee 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -19,11 +19,18 @@ #include <linux/slab.h> #include <linux/idr.h> #include <linux/hdreg.h> -#include <linux/blktrace_api.h> -#include <trace/block.h> + +#include <trace/events/block.h> #define DM_MSG_PREFIX "core" +/* + * Cookies are numeric values sent with CHANGE and REMOVE + * uevents while resuming, removing or renaming the device. + */ +#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" +#define DM_COOKIE_LENGTH 24 + static const char *_name = DM_NAME; static unsigned int major = 0; @@ -53,8 +60,6 @@ struct dm_target_io { union map_info info; }; -DEFINE_TRACE(block_bio_complete); - /* * For request-based dm. * One of these is allocated per request. @@ -73,7 +78,7 @@ struct dm_rq_target_io { */ struct dm_rq_clone_bio_info { struct bio *orig; - struct request *rq; + struct dm_rq_target_io *tio; }; union map_info *dm_get_mapinfo(struct bio *bio) @@ -83,6 +88,14 @@ union map_info *dm_get_mapinfo(struct bio *bio) return NULL; } +union map_info *dm_get_rq_mapinfo(struct request *rq) +{ + if (rq && rq->end_io_data) + return &((struct dm_rq_target_io *)rq->end_io_data)->info; + return NULL; +} +EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); + #define MINOR_ALLOCED ((void *)-1) /* @@ -159,13 +172,31 @@ struct mapped_device { * freeze/thaw support require holding onto a super block */ struct super_block *frozen_sb; - struct block_device *suspended_bdev; + struct block_device *bdev; /* forced geometry settings */ struct hd_geometry geometry; + /* marker of flush suspend for request-based dm */ + struct request suspend_rq; + + /* For saving the address of __make_request for request based dm */ + make_request_fn *saved_make_request_fn; + /* sysfs handle */ struct kobject kobj; + + /* zero-length barrier that will be cloned and submitted to targets */ + struct bio barrier_bio; +}; + +/* + * For mempools pre-allocation at the table loading time. + */ +struct dm_md_mempools { + mempool_t *io_pool; + mempool_t *tio_pool; + struct bio_set *bs; }; #define MIN_IOS 256 @@ -393,14 +424,29 @@ static void free_io(struct mapped_device *md, struct dm_io *io) mempool_free(io, md->io_pool); } -static struct dm_target_io *alloc_tio(struct mapped_device *md) +static void free_tio(struct mapped_device *md, struct dm_target_io *tio) +{ + mempool_free(tio, md->tio_pool); +} + +static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) { - return mempool_alloc(md->tio_pool, GFP_NOIO); + return mempool_alloc(md->tio_pool, GFP_ATOMIC); } -static void free_tio(struct mapped_device *md, struct dm_target_io *tio) +static void free_rq_tio(struct dm_rq_target_io *tio) { - mempool_free(tio, md->tio_pool); + mempool_free(tio, tio->md->tio_pool); +} + +static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) +{ + return mempool_alloc(md->io_pool, GFP_ATOMIC); +} + +static void free_bio_info(struct dm_rq_clone_bio_info *info) +{ + mempool_free(info, info->tio->md->io_pool); } static void start_io_acct(struct dm_io *io) @@ -466,12 +512,13 @@ static void queue_io(struct mapped_device *md, struct bio *bio) struct dm_table *dm_get_table(struct mapped_device *md) { struct dm_table *t; + unsigned long flags; - read_lock(&md->map_lock); + read_lock_irqsave(&md->map_lock, flags); t = md->map; if (t) dm_table_get(t); - read_unlock(&md->map_lock); + read_unlock_irqrestore(&md->map_lock, flags); return t; } @@ -538,9 +585,11 @@ static void dec_pending(struct dm_io *io, int error) * Target requested pushing back the I/O. */ spin_lock_irqsave(&md->deferred_lock, flags); - if (__noflush_suspending(md)) - bio_list_add_head(&md->deferred, io->bio); - else + if (__noflush_suspending(md)) { + if (!bio_barrier(io->bio)) + bio_list_add_head(&md->deferred, + io->bio); + } else /* noflush suspend was interrupted. */ io->error = -EIO; spin_unlock_irqrestore(&md->deferred_lock, flags); @@ -555,7 +604,8 @@ static void dec_pending(struct dm_io *io, int error) * a per-device variable for error reporting. * Note that you can't touch the bio after end_io_acct */ - md->barrier_error = io_error; + if (!md->barrier_error && io_error != -EOPNOTSUPP) + md->barrier_error = io_error; end_io_acct(io); } else { end_io_acct(io); @@ -609,6 +659,262 @@ static void clone_endio(struct bio *bio, int error) dec_pending(io, error); } +/* + * Partial completion handling for request-based dm + */ +static void end_clone_bio(struct bio *clone, int error) +{ + struct dm_rq_clone_bio_info *info = clone->bi_private; + struct dm_rq_target_io *tio = info->tio; + struct bio *bio = info->orig; + unsigned int nr_bytes = info->orig->bi_size; + + bio_put(clone); + + if (tio->error) + /* + * An error has already been detected on the request. + * Once error occurred, just let clone->end_io() handle + * the remainder. + */ + return; + else if (error) { + /* + * Don't notice the error to the upper layer yet. + * The error handling decision is made by the target driver, + * when the request is completed. + */ + tio->error = error; + return; + } + + /* + * I/O for the bio successfully completed. + * Notice the data completion to the upper layer. + */ + + /* + * bios are processed from the head of the list. + * So the completing bio should always be rq->bio. + * If it's not, something wrong is happening. + */ + if (tio->orig->bio != bio) + DMERR("bio completion is going in the middle of the request"); + + /* + * Update the original request. + * Do not use blk_end_request() here, because it may complete + * the original request before the clone, and break the ordering. + */ + blk_update_request(tio->orig, 0, nr_bytes); +} + +/* + * Don't touch any member of the md after calling this function because + * the md may be freed in dm_put() at the end of this function. + * Or do dm_get() before calling this function and dm_put() later. + */ +static void rq_completed(struct mapped_device *md, int run_queue) +{ + int wakeup_waiters = 0; + struct request_queue *q = md->queue; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + if (!queue_in_flight(q)) + wakeup_waiters = 1; + spin_unlock_irqrestore(q->queue_lock, flags); + + /* nudge anyone waiting on suspend queue */ + if (wakeup_waiters) + wake_up(&md->wait); + + if (run_queue) + blk_run_queue(q); + + /* + * dm_put() must be at the end of this function. See the comment above + */ + dm_put(md); +} + +static void dm_unprep_request(struct request *rq) +{ + struct request *clone = rq->special; + struct dm_rq_target_io *tio = clone->end_io_data; + + rq->special = NULL; + rq->cmd_flags &= ~REQ_DONTPREP; + + blk_rq_unprep_clone(clone); + free_rq_tio(tio); +} + +/* + * Requeue the original request of a clone. + */ +void dm_requeue_unmapped_request(struct request *clone) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + struct mapped_device *md = tio->md; + struct request *rq = tio->orig; + struct request_queue *q = rq->q; + unsigned long flags; + + dm_unprep_request(rq); + + spin_lock_irqsave(q->queue_lock, flags); + if (elv_queue_empty(q)) + blk_plug_device(q); + blk_requeue_request(q, rq); + spin_unlock_irqrestore(q->queue_lock, flags); + + rq_completed(md, 0); +} +EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); + +static void __stop_queue(struct request_queue *q) +{ + blk_stop_queue(q); +} + +static void stop_queue(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __stop_queue(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void __start_queue(struct request_queue *q) +{ + if (blk_queue_stopped(q)) + blk_start_queue(q); +} + +static void start_queue(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __start_queue(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +/* + * Complete the clone and the original request. + * Must be called without queue lock. + */ +static void dm_end_request(struct request *clone, int error) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + struct mapped_device *md = tio->md; + struct request *rq = tio->orig; + + if (blk_pc_request(rq)) { + rq->errors = clone->errors; + rq->resid_len = clone->resid_len; + + if (rq->sense) + /* + * We are using the sense buffer of the original + * request. + * So setting the length of the sense data is enough. + */ + rq->sense_len = clone->sense_len; + } + + BUG_ON(clone->bio); + free_rq_tio(tio); + + blk_end_request_all(rq, error); + + rq_completed(md, 1); +} + +/* + * Request completion handler for request-based dm + */ +static void dm_softirq_done(struct request *rq) +{ + struct request *clone = rq->completion_data; + struct dm_rq_target_io *tio = clone->end_io_data; + dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; + int error = tio->error; + + if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) + error = rq_end_io(tio->ti, clone, error, &tio->info); + + if (error <= 0) + /* The target wants to complete the I/O */ + dm_end_request(clone, error); + else if (error == DM_ENDIO_INCOMPLETE) + /* The target will handle the I/O */ + return; + else if (error == DM_ENDIO_REQUEUE) + /* The target wants to requeue the I/O */ + dm_requeue_unmapped_request(clone); + else { + DMWARN("unimplemented target endio return value: %d", error); + BUG(); + } +} + +/* + * Complete the clone and the original request with the error status + * through softirq context. + */ +static void dm_complete_request(struct request *clone, int error) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + struct request *rq = tio->orig; + + tio->error = error; + rq->completion_data = clone; + blk_complete_request(rq); +} + +/* + * Complete the not-mapped clone and the original request with the error status + * through softirq context. + * Target's rq_end_io() function isn't called. + * This may be used when the target's map_rq() function fails. + */ +void dm_kill_unmapped_request(struct request *clone, int error) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + struct request *rq = tio->orig; + + rq->cmd_flags |= REQ_FAILED; + dm_complete_request(clone, error); +} +EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); + +/* + * Called with the queue lock held + */ +static void end_clone_request(struct request *clone, int error) +{ + /* + * For just cleaning up the information of the queue in which + * the clone was dispatched. + * The clone is *NOT* freed actually here because it is alloced from + * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. + */ + __blk_put_request(clone->q, clone); + + /* + * Actual request completion is done in a softirq context which doesn't + * hold the queue lock. Otherwise, deadlock could occur because: + * - another request may be submitted by the upper level driver + * of the stacking during the completion + * - the submission which requires queue lock may be done + * against this queue + */ + dm_complete_request(clone, error); +} + static sector_t max_io_len(struct mapped_device *md, sector_t sector, struct dm_target *ti) { @@ -636,11 +942,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, sector_t sector; struct mapped_device *md; - /* - * Sanity checks. - */ - BUG_ON(!clone->bi_size); - clone->bi_end_io = clone_endio; clone->bi_private = tio; @@ -656,8 +957,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, /* the bio has been remapped so dispatch it */ trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, - tio->io->bio->bi_bdev->bd_dev, - clone->bi_sector, sector); + tio->io->bio->bi_bdev->bd_dev, sector); generic_make_request(clone); } else if (r < 0 || r == DM_MAPIO_REQUEUE) { @@ -755,6 +1055,48 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, return clone; } +static struct dm_target_io *alloc_tio(struct clone_info *ci, + struct dm_target *ti) +{ + struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); + + tio->io = ci->io; + tio->ti = ti; + memset(&tio->info, 0, sizeof(tio->info)); + + return tio; +} + +static void __flush_target(struct clone_info *ci, struct dm_target *ti, + unsigned flush_nr) +{ + struct dm_target_io *tio = alloc_tio(ci, ti); + struct bio *clone; + + tio->info.flush_request = flush_nr; + + clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); + __bio_clone(clone, ci->bio); + clone->bi_destructor = dm_bio_destructor; + + __map_bio(ti, clone, tio); +} + +static int __clone_and_map_empty_barrier(struct clone_info *ci) +{ + unsigned target_nr = 0, flush_nr; + struct dm_target *ti; + + while ((ti = dm_table_get_target(ci->map, target_nr++))) + for (flush_nr = 0; flush_nr < ti->num_flush_requests; + flush_nr++) + __flush_target(ci, ti, flush_nr); + + ci->sector_count = 0; + + return 0; +} + static int __clone_and_map(struct clone_info *ci) { struct bio *clone, *bio = ci->bio; @@ -762,6 +1104,9 @@ static int __clone_and_map(struct clone_info *ci) sector_t len = 0, max; struct dm_target_io *tio; + if (unlikely(bio_empty_barrier(bio))) + return __clone_and_map_empty_barrier(ci); + ti = dm_table_find_target(ci->map, ci->sector); if (!dm_target_is_valid(ti)) return -EIO; @@ -771,10 +1116,7 @@ static int __clone_and_map(struct clone_info *ci) /* * Allocate a target io object. */ - tio = alloc_tio(ci->md); - tio->io = ci->io; - tio->ti = ti; - memset(&tio->info, 0, sizeof(tio->info)); + tio = alloc_tio(ci, ti); if (ci->sector_count <= max) { /* @@ -830,10 +1172,7 @@ static int __clone_and_map(struct clone_info *ci) max = max_io_len(ci->md, ci->sector, ti); - tio = alloc_tio(ci->md); - tio->io = ci->io; - tio->ti = ti; - memset(&tio->info, 0, sizeof(tio->info)); + tio = alloc_tio(ci, ti); } len = min(remaining, max); @@ -868,7 +1207,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) if (!bio_barrier(bio)) bio_io_error(bio); else - md->barrier_error = -EIO; + if (!md->barrier_error) + md->barrier_error = -EIO; return; } @@ -881,6 +1221,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) ci.io->md = md; ci.sector = bio->bi_sector; ci.sector_count = bio_sectors(bio); + if (unlikely(bio_empty_barrier(bio))) + ci.sector_count = 1; ci.idx = bio->bi_idx; start_io_acct(ci.io); @@ -928,6 +1270,16 @@ static int dm_merge_bvec(struct request_queue *q, */ if (max_size && ti->type->merge) max_size = ti->type->merge(ti, bvm, biovec, max_size); + /* + * If the target doesn't support merge method and some of the devices + * provided their merge_bvec method (we know this by looking at + * queue_max_hw_sectors), then we can't allow bios with multiple vector + * entries. So always set max_size to 0, and the code below allows + * just one page. + */ + else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) + + max_size = 0; out_table: dm_table_put(map); @@ -946,7 +1298,7 @@ out: * The request function that just remaps the bio built up by * dm_merge_bvec. */ -static int dm_request(struct request_queue *q, struct bio *bio) +static int _dm_request(struct request_queue *q, struct bio *bio) { int rw = bio_data_dir(bio); struct mapped_device *md = q->queuedata; @@ -983,12 +1335,274 @@ static int dm_request(struct request_queue *q, struct bio *bio) return 0; } +static int dm_make_request(struct request_queue *q, struct bio *bio) +{ + struct mapped_device *md = q->queuedata; + + if (unlikely(bio_barrier(bio))) { + bio_endio(bio, -EOPNOTSUPP); + return 0; + } + + return md->saved_make_request_fn(q, bio); /* call __make_request() */ +} + +static int dm_request_based(struct mapped_device *md) +{ + return blk_queue_stackable(md->queue); +} + +static int dm_request(struct request_queue *q, struct bio *bio) +{ + struct mapped_device *md = q->queuedata; + + if (dm_request_based(md)) + return dm_make_request(q, bio); + + return _dm_request(q, bio); +} + +void dm_dispatch_request(struct request *rq) +{ + int r; + + if (blk_queue_io_stat(rq->q)) + rq->cmd_flags |= REQ_IO_STAT; + + rq->start_time = jiffies; + r = blk_insert_cloned_request(rq->q, rq); + if (r) + dm_complete_request(rq, r); +} +EXPORT_SYMBOL_GPL(dm_dispatch_request); + +static void dm_rq_bio_destructor(struct bio *bio) +{ + struct dm_rq_clone_bio_info *info = bio->bi_private; + struct mapped_device *md = info->tio->md; + + free_bio_info(info); + bio_free(bio, md->bs); +} + +static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, + void *data) +{ + struct dm_rq_target_io *tio = data; + struct mapped_device *md = tio->md; + struct dm_rq_clone_bio_info *info = alloc_bio_info(md); + + if (!info) + return -ENOMEM; + + info->orig = bio_orig; + info->tio = tio; + bio->bi_end_io = end_clone_bio; + bio->bi_private = info; + bio->bi_destructor = dm_rq_bio_destructor; + + return 0; +} + +static int setup_clone(struct request *clone, struct request *rq, + struct dm_rq_target_io *tio) +{ + int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, + dm_rq_bio_constructor, tio); + + if (r) + return r; + + clone->cmd = rq->cmd; + clone->cmd_len = rq->cmd_len; + clone->sense = rq->sense; + clone->buffer = rq->buffer; + clone->end_io = end_clone_request; + clone->end_io_data = tio; + + return 0; +} + +static int dm_rq_flush_suspending(struct mapped_device *md) +{ + return !md->suspend_rq.special; +} + +/* + * Called with the queue lock held. + */ +static int dm_prep_fn(struct request_queue *q, struct request *rq) +{ + struct mapped_device *md = q->queuedata; + struct dm_rq_target_io *tio; + struct request *clone; + + if (unlikely(rq == &md->suspend_rq)) { + if (dm_rq_flush_suspending(md)) + return BLKPREP_OK; + else + /* The flush suspend was interrupted */ + return BLKPREP_KILL; + } + + if (unlikely(rq->special)) { + DMWARN("Already has something in rq->special."); + return BLKPREP_KILL; + } + + tio = alloc_rq_tio(md); /* Only one for each original request */ + if (!tio) + /* -ENOMEM */ + return BLKPREP_DEFER; + + tio->md = md; + tio->ti = NULL; + tio->orig = rq; + tio->error = 0; + memset(&tio->info, 0, sizeof(tio->info)); + + clone = &tio->clone; + if (setup_clone(clone, rq, tio)) { + /* -ENOMEM */ + free_rq_tio(tio); + return BLKPREP_DEFER; + } + + rq->special = clone; + rq->cmd_flags |= REQ_DONTPREP; + + return BLKPREP_OK; +} + +static void map_request(struct dm_target *ti, struct request *rq, + struct mapped_device *md) +{ + int r; + struct request *clone = rq->special; + struct dm_rq_target_io *tio = clone->end_io_data; + + /* + * Hold the md reference here for the in-flight I/O. + * We can't rely on the reference count by device opener, + * because the device may be closed during the request completion + * when all bios are completed. + * See the comment in rq_completed() too. + */ + dm_get(md); + + tio->ti = ti; + r = ti->type->map_rq(ti, clone, &tio->info); + switch (r) { + case DM_MAPIO_SUBMITTED: + /* The target has taken the I/O to submit by itself later */ + break; + case DM_MAPIO_REMAPPED: + /* The target has remapped the I/O so dispatch it */ + dm_dispatch_request(clone); + break; + case DM_MAPIO_REQUEUE: + /* The target wants to requeue the I/O */ + dm_requeue_unmapped_request(clone); + break; + default: + if (r > 0) { + DMWARN("unimplemented target map return value: %d", r); + BUG(); + } + + /* The target wants to complete the I/O */ + dm_kill_unmapped_request(clone, r); + break; + } +} + +/* + * q->request_fn for request-based dm. + * Called with the queue lock held. + */ +static void dm_request_fn(struct request_queue *q) +{ + struct mapped_device *md = q->queuedata; + struct dm_table *map = dm_get_table(md); + struct dm_target *ti; + struct request *rq; + + /* + * For noflush suspend, check blk_queue_stopped() to immediately + * quit I/O dispatching. + */ + while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { + rq = blk_peek_request(q); + if (!rq) + goto plug_and_out; + + if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */ + if (queue_in_flight(q)) + /* Not quiet yet. Wait more */ + goto plug_and_out; + + /* This device should be quiet now */ + __stop_queue(q); + blk_start_request(rq); + __blk_end_request_all(rq, 0); + wake_up(&md->wait); + goto out; + } + + ti = dm_table_find_target(map, blk_rq_pos(rq)); + if (ti->type->busy && ti->type->busy(ti)) + goto plug_and_out; + + blk_start_request(rq); + spin_unlock(q->queue_lock); + map_request(ti, rq, md); + spin_lock_irq(q->queue_lock); + } + + goto out; + +plug_and_out: + if (!elv_queue_empty(q)) + /* Some requests still remain, retry later */ + blk_plug_device(q); + +out: + dm_table_put(map); + + return; +} + +int dm_underlying_device_busy(struct request_queue *q) +{ + return blk_lld_busy(q); +} +EXPORT_SYMBOL_GPL(dm_underlying_device_busy); + +static int dm_lld_busy(struct request_queue *q) +{ + int r; + struct mapped_device *md = q->queuedata; + struct dm_table *map = dm_get_table(md); + + if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) + r = 1; + else + r = dm_table_any_busy_target(map); + + dm_table_put(map); + + return r; +} + static void dm_unplug_all(struct request_queue *q) { struct mapped_device *md = q->queuedata; struct dm_table *map = dm_get_table(md); if (map) { + if (dm_request_based(md)) + generic_unplug_device(q); + dm_table_unplug_all(map); dm_table_put(map); } @@ -1003,7 +1617,16 @@ static int dm_any_congested(void *congested_data, int bdi_bits) if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { map = dm_get_table(md); if (map) { - r = dm_table_any_congested(map, bdi_bits); + /* + * Request-based dm cares about only own queue for + * the query about congestion status of request_queue + */ + if (dm_request_based(md)) + r = md->queue->backing_dev_info.state & + bdi_bits; + else + r = dm_table_any_congested(map, bdi_bits); + dm_table_put(map); } } @@ -1126,30 +1749,32 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->uevent_list); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue(GFP_KERNEL); + md->queue = blk_init_queue(dm_request_fn, NULL); if (!md->queue) goto bad_queue; + /* + * Request-based dm devices cannot be stacked on top of bio-based dm + * devices. The type of this dm device has not been decided yet, + * although we initialized the queue using blk_init_queue(). + * The type is decided at the first table loading time. + * To prevent problematic device stacking, clear the queue flag + * for request stacking support until then. + * + * This queue is new, so no concurrency on the queue_flags. + */ + queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); + md->saved_make_request_fn = md->queue->make_request_fn; md->queue->queuedata = md; md->queue->backing_dev_info.congested_fn = dm_any_congested; md->queue->backing_dev_info.congested_data = md; blk_queue_make_request(md->queue, dm_request); - blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); md->queue->unplug_fn = dm_unplug_all; blk_queue_merge_bvec(md->queue, dm_merge_bvec); - - md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); - if (!md->io_pool) - goto bad_io_pool; - - md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); - if (!md->tio_pool) - goto bad_tio_pool; - - md->bs = bioset_create(16, 0); - if (!md->bs) - goto bad_no_bioset; + blk_queue_softirq_done(md->queue, dm_softirq_done); + blk_queue_prep_rq(md->queue, dm_prep_fn); + blk_queue_lld_busy(md->queue, dm_lld_busy); md->disk = alloc_disk(1); if (!md->disk) @@ -1173,6 +1798,10 @@ static struct mapped_device *alloc_dev(int minor) if (!md->wq) goto bad_thread; + md->bdev = bdget_disk(md->disk, 0); + if (!md->bdev) + goto bad_bdev; + /* Populate the mapping, nobody knows we exist yet */ spin_lock(&_minor_lock); old_md = idr_replace(&_minor_idr, md, minor); @@ -1182,15 +1811,11 @@ static struct mapped_device *alloc_dev(int minor) return md; +bad_bdev: + destroy_workqueue(md->wq); bad_thread: put_disk(md->disk); bad_disk: - bioset_free(md->bs); -bad_no_bioset: - mempool_destroy(md->tio_pool); -bad_tio_pool: - mempool_destroy(md->io_pool); -bad_io_pool: blk_cleanup_queue(md->queue); bad_queue: free_minor(minor); @@ -1207,14 +1832,15 @@ static void free_dev(struct mapped_device *md) { int minor = MINOR(disk_devt(md->disk)); - if (md->suspended_bdev) { - unlock_fs(md); - bdput(md->suspended_bdev); - } + unlock_fs(md); + bdput(md->bdev); destroy_workqueue(md->wq); - mempool_destroy(md->tio_pool); - mempool_destroy(md->io_pool); - bioset_free(md->bs); + if (md->tio_pool) + mempool_destroy(md->tio_pool); + if (md->io_pool) + mempool_destroy(md->io_pool); + if (md->bs) + bioset_free(md->bs); blk_integrity_unregister(md->disk); del_gendisk(md->disk); free_minor(minor); @@ -1229,6 +1855,29 @@ static void free_dev(struct mapped_device *md) kfree(md); } +static void __bind_mempools(struct mapped_device *md, struct dm_table *t) +{ + struct dm_md_mempools *p; + + if (md->io_pool && md->tio_pool && md->bs) + /* the md already has necessary mempools */ + goto out; + + p = dm_table_get_md_mempools(t); + BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); + + md->io_pool = p->io_pool; + p->io_pool = NULL; + md->tio_pool = p->tio_pool; + p->tio_pool = NULL; + md->bs = p->bs; + p->bs = NULL; + +out: + /* mempool bind completed, now no need any mempools in the table */ + dm_table_free_md_mempools(t); +} + /* * Bind a table to the device. */ @@ -1252,15 +1901,17 @@ static void __set_size(struct mapped_device *md, sector_t size) { set_capacity(md->disk, size); - mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); - i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); - mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); + mutex_lock(&md->bdev->bd_inode->i_mutex); + i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); + mutex_unlock(&md->bdev->bd_inode->i_mutex); } -static int __bind(struct mapped_device *md, struct dm_table *t) +static int __bind(struct mapped_device *md, struct dm_table *t, + struct queue_limits *limits) { struct request_queue *q = md->queue; sector_t size; + unsigned long flags; size = dm_table_get_size(t); @@ -1270,8 +1921,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t) if (size != get_capacity(md->disk)) memset(&md->geometry, 0, sizeof(md->geometry)); - if (md->suspended_bdev) - __set_size(md, size); + __set_size(md, size); if (!size) { dm_table_destroy(t); @@ -1280,10 +1930,22 @@ static int __bind(struct mapped_device *md, struct dm_table *t) dm_table_event_callback(t, event_callback, md); - write_lock(&md->map_lock); + /* + * The queue hasn't been stopped yet, if the old table type wasn't + * for request-based during suspension. So stop it to prevent + * I/O mapping before resume. + * This must be done before setting the queue restrictions, + * because request-based dm may be run just after the setting. + */ + if (dm_table_request_based(t) && !blk_queue_stopped(q)) + stop_queue(q); + + __bind_mempools(md, t); + + write_lock_irqsave(&md->map_lock, flags); md->map = t; - dm_table_set_restrictions(t, q); - write_unlock(&md->map_lock); + dm_table_set_restrictions(t, q, limits); + write_unlock_irqrestore(&md->map_lock, flags); return 0; } @@ -1291,14 +1953,15 @@ static int __bind(struct mapped_device *md, struct dm_table *t) static void __unbind(struct mapped_device *md) { struct dm_table *map = md->map; + unsigned long flags; if (!map) return; dm_table_event_callback(map, NULL, NULL); - write_lock(&md->map_lock); + write_lock_irqsave(&md->map_lock, flags); md->map = NULL; - write_unlock(&md->map_lock); + write_unlock_irqrestore(&md->map_lock, flags); dm_table_destroy(map); } @@ -1402,6 +2065,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) { int r = 0; DECLARE_WAITQUEUE(wait, current); + struct request_queue *q = md->queue; + unsigned long flags; dm_unplug_all(md->queue); @@ -1411,7 +2076,14 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) set_current_state(interruptible); smp_mb(); - if (!atomic_read(&md->pending)) + if (dm_request_based(md)) { + spin_lock_irqsave(q->queue_lock, flags); + if (!queue_in_flight(q) && blk_queue_stopped(q)) { + spin_unlock_irqrestore(q->queue_lock, flags); + break; + } + spin_unlock_irqrestore(q->queue_lock, flags); + } else if (!atomic_read(&md->pending)) break; if (interruptible == TASK_INTERRUPTIBLE && @@ -1429,34 +2101,36 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) return r; } -static int dm_flush(struct mapped_device *md) +static void dm_flush(struct mapped_device *md) { dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); - return 0; + + bio_init(&md->barrier_bio); + md->barrier_bio.bi_bdev = md->bdev; + md->barrier_bio.bi_rw = WRITE_BARRIER; + __split_and_process_bio(md, &md->barrier_bio); + + dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); } static void process_barrier(struct mapped_device *md, struct bio *bio) { - int error = dm_flush(md); + md->barrier_error = 0; - if (unlikely(error)) { - bio_endio(bio, error); - return; - } - if (bio_empty_barrier(bio)) { - bio_endio(bio, 0); - return; - } - - __split_and_process_bio(md, bio); + dm_flush(md); - error = dm_flush(md); - - if (!error && md->barrier_error) - error = md->barrier_error; + if (!bio_empty_barrier(bio)) { + __split_and_process_bio(md, bio); + dm_flush(md); + } if (md->barrier_error != DM_ENDIO_REQUEUE) - bio_endio(bio, error); + bio_endio(bio, md->barrier_error); + else { + spin_lock_irq(&md->deferred_lock); + bio_list_add_head(&md->deferred, bio); + spin_unlock_irq(&md->deferred_lock); + } } /* @@ -1482,10 +2156,14 @@ static void dm_wq_work(struct work_struct *work) up_write(&md->io_lock); - if (bio_barrier(c)) - process_barrier(md, c); - else - __split_and_process_bio(md, c); + if (dm_request_based(md)) + generic_make_request(c); + else { + if (bio_barrier(c)) + process_barrier(md, c); + else + __split_and_process_bio(md, c); + } down_write(&md->io_lock); } @@ -1505,6 +2183,7 @@ static void dm_queue_flush(struct mapped_device *md) */ int dm_swap_table(struct mapped_device *md, struct dm_table *table) { + struct queue_limits limits; int r = -EINVAL; mutex_lock(&md->suspend_lock); @@ -1513,19 +2192,96 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) if (!dm_suspended(md)) goto out; - /* without bdev, the device size cannot be changed */ - if (!md->suspended_bdev) - if (get_capacity(md->disk) != dm_table_get_size(table)) - goto out; + r = dm_calculate_queue_limits(table, &limits); + if (r) + goto out; + + /* cannot change the device type, once a table is bound */ + if (md->map && + (dm_table_get_type(md->map) != dm_table_get_type(table))) { + DMWARN("can't change the device type after a table is bound"); + goto out; + } + + /* + * It is enought that blk_queue_ordered() is called only once when + * the first bio-based table is bound. + * + * This setting should be moved to alloc_dev() when request-based dm + * supports barrier. + */ + if (!md->map && dm_table_bio_based(table)) + blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); __unbind(md); - r = __bind(md, table); + r = __bind(md, table, &limits); out: mutex_unlock(&md->suspend_lock); return r; } +static void dm_rq_invalidate_suspend_marker(struct mapped_device *md) +{ + md->suspend_rq.special = (void *)0x1; +} + +static void dm_rq_abort_suspend(struct mapped_device *md, int noflush) +{ + struct request_queue *q = md->queue; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + if (!noflush) + dm_rq_invalidate_suspend_marker(md); + __start_queue(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void dm_rq_start_suspend(struct mapped_device *md, int noflush) +{ + struct request *rq = &md->suspend_rq; + struct request_queue *q = md->queue; + + if (noflush) + stop_queue(q); + else { + blk_rq_init(q, rq); + blk_insert_request(q, rq, 0, NULL); + } +} + +static int dm_rq_suspend_available(struct mapped_device *md, int noflush) +{ + int r = 1; + struct request *rq = &md->suspend_rq; + struct request_queue *q = md->queue; + unsigned long flags; + + if (noflush) + return r; + + /* The marker must be protected by queue lock if it is in use */ + spin_lock_irqsave(q->queue_lock, flags); + if (unlikely(rq->ref_count)) { + /* + * This can happen, when the previous flush suspend was + * interrupted, the marker is still in the queue and + * this flush suspend has been invoked, because we don't + * remove the marker at the time of suspend interruption. + * We have only one marker per mapped_device, so we can't + * start another flush suspend while it is in use. + */ + BUG_ON(!rq->special); /* The marker should be invalidated */ + DMWARN("Invalidating the previous flush suspend is still in" + " progress. Please retry later."); + r = 0; + } + spin_unlock_irqrestore(q->queue_lock, flags); + + return r; +} + /* * Functions to lock and unlock any filesystem running on the * device. @@ -1536,7 +2292,7 @@ static int lock_fs(struct mapped_device *md) WARN_ON(md->frozen_sb); - md->frozen_sb = freeze_bdev(md->suspended_bdev); + md->frozen_sb = freeze_bdev(md->bdev); if (IS_ERR(md->frozen_sb)) { r = PTR_ERR(md->frozen_sb); md->frozen_sb = NULL; @@ -1545,9 +2301,6 @@ static int lock_fs(struct mapped_device *md) set_bit(DMF_FROZEN, &md->flags); - /* don't bdput right now, we don't want the bdev - * to go away while it is locked. - */ return 0; } @@ -1556,7 +2309,7 @@ static void unlock_fs(struct mapped_device *md) if (!test_bit(DMF_FROZEN, &md->flags)) return; - thaw_bdev(md->suspended_bdev, md->frozen_sb); + thaw_bdev(md->bdev, md->frozen_sb); md->frozen_sb = NULL; clear_bit(DMF_FROZEN, &md->flags); } @@ -1568,6 +2321,53 @@ static void unlock_fs(struct mapped_device *md) * dm_bind_table, dm_suspend must be called to flush any in * flight bios and ensure that any further io gets deferred. */ +/* + * Suspend mechanism in request-based dm. + * + * After the suspend starts, further incoming requests are kept in + * the request_queue and deferred. + * Remaining requests in the request_queue at the start of suspend are flushed + * if it is flush suspend. + * The suspend completes when the following conditions have been satisfied, + * so wait for it: + * 1. q->in_flight is 0 (which means no in_flight request) + * 2. queue has been stopped (which means no request dispatching) + * + * + * Noflush suspend + * --------------- + * Noflush suspend doesn't need to dispatch remaining requests. + * So stop the queue immediately. Then, wait for all in_flight requests + * to be completed or requeued. + * + * To abort noflush suspend, start the queue. + * + * + * Flush suspend + * ------------- + * Flush suspend needs to dispatch remaining requests. So stop the queue + * after the remaining requests are completed. (Requeued request must be also + * re-dispatched and completed. Until then, we can't stop the queue.) + * + * During flushing the remaining requests, further incoming requests are also + * inserted to the same queue. To distinguish which requests are to be + * flushed, we insert a marker request to the queue at the time of starting + * flush suspend, like a barrier. + * The dispatching is blocked when the marker is found on the top of the queue. + * And the queue is stopped when all in_flight requests are completed, since + * that means the remaining requests are completely flushed. + * Then, the marker is removed from the queue. + * + * To abort flush suspend, we also need to take care of the marker, not only + * starting the queue. + * We don't remove the marker forcibly from the queue since it's against + * the block-layer manner. Instead, we put a invalidated mark on the marker. + * When the invalidated marker is found on the top of the queue, it is + * immediately removed from the queue, so it doesn't block dispatching. + * Because we have only one marker per mapped_device, we can't start another + * flush suspend until the invalidated marker is removed from the queue. + * So fail and return with -EBUSY in such a case. + */ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) { struct dm_table *map = NULL; @@ -1582,6 +2382,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) goto out_unlock; } + if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) { + r = -EBUSY; + goto out_unlock; + } + map = dm_get_table(md); /* @@ -1594,24 +2399,14 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) /* This does not get reverted if there's an error later. */ dm_table_presuspend_targets(map); - /* bdget() can stall if the pending I/Os are not flushed */ - if (!noflush) { - md->suspended_bdev = bdget_disk(md->disk, 0); - if (!md->suspended_bdev) { - DMWARN("bdget failed in dm_suspend"); - r = -ENOMEM; + /* + * Flush I/O to the device. noflush supersedes do_lockfs, + * because lock_fs() needs to flush I/Os. + */ + if (!noflush && do_lockfs) { + r = lock_fs(md); + if (r) goto out; - } - - /* - * Flush I/O to the device. noflush supersedes do_lockfs, - * because lock_fs() needs to flush I/Os. - */ - if (do_lockfs) { - r = lock_fs(md); - if (r) - goto out; - } } /* @@ -1637,6 +2432,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) flush_workqueue(md->wq); + if (dm_request_based(md)) + dm_rq_start_suspend(md, noflush); + /* * At this point no more requests are entering target request routines. * We call dm_wait_for_completion to wait for all existing requests @@ -1653,6 +2451,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) if (r < 0) { dm_queue_flush(md); + if (dm_request_based(md)) + dm_rq_abort_suspend(md, noflush); + unlock_fs(md); goto out; /* pushback list is already flushed, so skip flush */ } @@ -1668,11 +2469,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) set_bit(DMF_SUSPENDED, &md->flags); out: - if (r && md->suspended_bdev) { - bdput(md->suspended_bdev); - md->suspended_bdev = NULL; - } - dm_table_put(map); out_unlock: @@ -1699,21 +2495,20 @@ int dm_resume(struct mapped_device *md) dm_queue_flush(md); - unlock_fs(md); + /* + * Flushing deferred I/Os must be done after targets are resumed + * so that mapping of targets can work correctly. + * Request-based dm is queueing the deferred I/Os in its request_queue. + */ + if (dm_request_based(md)) + start_queue(md->queue); - if (md->suspended_bdev) { - bdput(md->suspended_bdev); - md->suspended_bdev = NULL; - } + unlock_fs(md); clear_bit(DMF_SUSPENDED, &md->flags); dm_table_unplug_all(map); - - dm_kobject_uevent(md); - r = 0; - out: dm_table_put(map); mutex_unlock(&md->suspend_lock); @@ -1724,9 +2519,19 @@ out: /*----------------------------------------------------------------- * Event notification. *---------------------------------------------------------------*/ -void dm_kobject_uevent(struct mapped_device *md) -{ - kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE); +void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, + unsigned cookie) +{ + char udev_cookie[DM_COOKIE_LENGTH]; + char *envp[] = { udev_cookie, NULL }; + + if (!cookie) + kobject_uevent(&disk_to_dev(md->disk)->kobj, action); + else { + snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", + DM_COOKIE_ENV_VAR_NAME, cookie); + kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); + } } uint32_t dm_next_uevent_seq(struct mapped_device *md) @@ -1780,6 +2585,10 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) if (&md->kobj != kobj) return NULL; + if (test_bit(DMF_FREEING, &md->flags) || + test_bit(DMF_DELETING, &md->flags)) + return NULL; + dm_get(md); return md; } @@ -1800,6 +2609,61 @@ int dm_noflush_suspending(struct dm_target *ti) } EXPORT_SYMBOL_GPL(dm_noflush_suspending); +struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) +{ + struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); + + if (!pools) + return NULL; + + pools->io_pool = (type == DM_TYPE_BIO_BASED) ? + mempool_create_slab_pool(MIN_IOS, _io_cache) : + mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); + if (!pools->io_pool) + goto free_pools_and_out; + + pools->tio_pool = (type == DM_TYPE_BIO_BASED) ? + mempool_create_slab_pool(MIN_IOS, _tio_cache) : + mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); + if (!pools->tio_pool) + goto free_io_pool_and_out; + + pools->bs = (type == DM_TYPE_BIO_BASED) ? + bioset_create(16, 0) : bioset_create(MIN_IOS, 0); + if (!pools->bs) + goto free_tio_pool_and_out; + + return pools; + +free_tio_pool_and_out: + mempool_destroy(pools->tio_pool); + +free_io_pool_and_out: + mempool_destroy(pools->io_pool); + +free_pools_and_out: + kfree(pools); + + return NULL; +} + +void dm_free_md_mempools(struct dm_md_mempools *pools) +{ + if (!pools) + return; + + if (pools->io_pool) + mempool_destroy(pools->io_pool); + + if (pools->tio_pool) + mempool_destroy(pools->tio_pool); + + if (pools->bs) + bioset_free(pools->bs); + + kfree(pools); +} + static struct block_device_operations dm_blk_dops = { .open = dm_blk_open, .release = dm_blk_close, diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a31506d..23278ae 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -23,6 +23,13 @@ #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) /* + * Type of table and mapped_device's mempool + */ +#define DM_TYPE_NONE 0 +#define DM_TYPE_BIO_BASED 1 +#define DM_TYPE_REQUEST_BASED 2 + +/* * List of devices that a metadevice uses and should open/close. */ struct dm_dev_internal { @@ -32,6 +39,7 @@ struct dm_dev_internal { }; struct dm_table; +struct dm_md_mempools; /*----------------------------------------------------------------- * Internal table functions. @@ -41,18 +49,34 @@ void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context); struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); -void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); +int dm_calculate_queue_limits(struct dm_table *table, + struct queue_limits *limits); +void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *limits); struct list_head *dm_table_get_devices(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); +int dm_table_any_busy_target(struct dm_table *t); +int dm_table_set_type(struct dm_table *t); +unsigned dm_table_get_type(struct dm_table *t); +bool dm_table_bio_based(struct dm_table *t); +bool dm_table_request_based(struct dm_table *t); +int dm_table_alloc_md_mempools(struct dm_table *t); +void dm_table_free_md_mempools(struct dm_table *t); +struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); /* * To check the return value from dm_table_find_target(). */ #define dm_target_is_valid(t) ((t)->table) +/* + * To check whether the target type is request-based or not (bio-based). + */ +#define dm_target_request_based(t) ((t)->type->map_rq != NULL) + /*----------------------------------------------------------------- * A registry of target types. *---------------------------------------------------------------*/ @@ -92,9 +116,16 @@ void dm_stripe_exit(void); int dm_open_count(struct mapped_device *md); int dm_lock_for_deletion(struct mapped_device *md); -void dm_kobject_uevent(struct mapped_device *md); +void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, + unsigned cookie); int dm_kcopyd_init(void); void dm_kcopyd_exit(void); +/* + * Mempool operations + */ +struct dm_md_mempools *dm_alloc_md_mempools(unsigned type); +void dm_free_md_mempools(struct dm_md_mempools *pools); + #endif diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 8695809..87d88db 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -255,14 +255,14 @@ static void status(struct seq_file *seq, mddev_t *mddev) } -static int reconfig(mddev_t *mddev, int layout, int chunk_size) +static int reshape(mddev_t *mddev) { - int mode = layout & ModeMask; - int count = layout >> ModeShift; + int mode = mddev->new_layout & ModeMask; + int count = mddev->new_layout >> ModeShift; conf_t *conf = mddev->private; - if (chunk_size != -1) - return -EINVAL; + if (mddev->new_layout < 0) + return 0; /* new layout */ if (mode == ClearFaults) @@ -279,6 +279,7 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size) atomic_set(&conf->counters[mode], count); } else return -EINVAL; + mddev->new_layout = -1; mddev->layout = -1; /* makes sure further changes come through */ return 0; } @@ -298,8 +299,12 @@ static int run(mddev_t *mddev) { mdk_rdev_t *rdev; int i; + conf_t *conf; + + if (md_check_no_bitmap(mddev)) + return -EINVAL; - conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL); + conf = kmalloc(sizeof(*conf), GFP_KERNEL); if (!conf) return -ENOMEM; @@ -315,7 +320,7 @@ static int run(mddev_t *mddev) md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); mddev->private = conf; - reconfig(mddev, mddev->layout, -1); + reshape(mddev); return 0; } @@ -338,7 +343,7 @@ static struct mdk_personality faulty_personality = .run = run, .stop = stop, .status = status, - .reconfig = reconfig, + .check_reshape = reshape, .size = faulty_size, }; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 7a36e38..15c8b7b 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -27,19 +27,27 @@ */ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) { - dev_info_t *hash; - linear_conf_t *conf = mddev_to_conf(mddev); - sector_t idx = sector >> conf->sector_shift; + int lo, mid, hi; + linear_conf_t *conf; + + lo = 0; + hi = mddev->raid_disks - 1; + conf = rcu_dereference(mddev->private); /* - * sector_div(a,b) returns the remainer and sets a to a/b + * Binary Search */ - (void)sector_div(idx, conf->spacing); - hash = conf->hash_table[idx]; - while (sector >= hash->num_sectors + hash->start_sector) - hash++; - return hash; + while (hi > lo) { + + mid = (hi + lo) / 2; + if (sector < conf->disks[mid].end_sector) + hi = mid; + else + lo = mid + 1; + } + + return conf->disks + lo; } /** @@ -59,8 +67,10 @@ static int linear_mergeable_bvec(struct request_queue *q, unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + rcu_read_lock(); dev0 = which_dev(mddev, sector); - maxsectors = dev0->num_sectors - (sector - dev0->start_sector); + maxsectors = dev0->end_sector - sector; + rcu_read_unlock(); if (maxsectors < bio_sectors) maxsectors = 0; @@ -79,46 +89,57 @@ static int linear_mergeable_bvec(struct request_queue *q, static void linear_unplug(struct request_queue *q) { mddev_t *mddev = q->queuedata; - linear_conf_t *conf = mddev_to_conf(mddev); + linear_conf_t *conf; int i; + rcu_read_lock(); + conf = rcu_dereference(mddev->private); + for (i=0; i < mddev->raid_disks; i++) { struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); blk_unplug(r_queue); } + rcu_read_unlock(); } static int linear_congested(void *data, int bits) { mddev_t *mddev = data; - linear_conf_t *conf = mddev_to_conf(mddev); + linear_conf_t *conf; int i, ret = 0; + rcu_read_lock(); + conf = rcu_dereference(mddev->private); + for (i = 0; i < mddev->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); } + + rcu_read_unlock(); return ret; } static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) { - linear_conf_t *conf = mddev_to_conf(mddev); + linear_conf_t *conf; + sector_t array_sectors; + rcu_read_lock(); + conf = rcu_dereference(mddev->private); WARN_ONCE(sectors || raid_disks, "%s does not support generic reshape\n", __func__); + array_sectors = conf->array_sectors; + rcu_read_unlock(); - return conf->array_sectors; + return array_sectors; } static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) { linear_conf_t *conf; - dev_info_t **table; mdk_rdev_t *rdev; - int i, nb_zone, cnt; - sector_t min_sectors; - sector_t curr_sector; + int i, cnt; conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t), GFP_KERNEL); @@ -131,6 +152,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) list_for_each_entry(rdev, &mddev->disks, same_set) { int j = rdev->raid_disk; dev_info_t *disk = conf->disks + j; + sector_t sectors; if (j < 0 || j >= raid_disks || disk->rdev) { printk("linear: disk numbering problem. Aborting!\n"); @@ -138,6 +160,11 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) } disk->rdev = rdev; + if (mddev->chunk_sectors) { + sectors = rdev->sectors; + sector_div(sectors, mddev->chunk_sectors); + rdev->sectors = sectors * mddev->chunk_sectors; + } blk_queue_stack_limits(mddev->queue, rdev->bdev->bd_disk->queue); @@ -146,105 +173,27 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - disk->num_sectors = rdev->sectors; conf->array_sectors += rdev->sectors; - cnt++; + } if (cnt != raid_disks) { printk("linear: not enough drives present. Aborting!\n"); goto out; } - min_sectors = conf->array_sectors; - sector_div(min_sectors, PAGE_SIZE/sizeof(struct dev_info *)); - if (min_sectors == 0) - min_sectors = 1; - - /* min_sectors is the minimum spacing that will fit the hash - * table in one PAGE. This may be much smaller than needed. - * We find the smallest non-terminal set of consecutive devices - * that is larger than min_sectors and use the size of that as - * the actual spacing - */ - conf->spacing = conf->array_sectors; - for (i=0; i < cnt-1 ; i++) { - sector_t tmp = 0; - int j; - for (j = i; j < cnt - 1 && tmp < min_sectors; j++) - tmp += conf->disks[j].num_sectors; - if (tmp >= min_sectors && tmp < conf->spacing) - conf->spacing = tmp; - } - - /* spacing may be too large for sector_div to work with, - * so we might need to pre-shift - */ - conf->sector_shift = 0; - if (sizeof(sector_t) > sizeof(u32)) { - sector_t space = conf->spacing; - while (space > (sector_t)(~(u32)0)) { - space >>= 1; - conf->sector_shift++; - } - } /* - * This code was restructured to work around a gcc-2.95.3 internal - * compiler error. Alter it with care. + * Here we calculate the device offsets. */ - { - sector_t sz; - unsigned round; - unsigned long base; - - sz = conf->array_sectors >> conf->sector_shift; - sz += 1; /* force round-up */ - base = conf->spacing >> conf->sector_shift; - round = sector_div(sz, base); - nb_zone = sz + (round ? 1 : 0); - } - BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *)); - - conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone, - GFP_KERNEL); - if (!conf->hash_table) - goto out; + conf->disks[0].end_sector = conf->disks[0].rdev->sectors; - /* - * Here we generate the linear hash table - * First calculate the device offsets. - */ - conf->disks[0].start_sector = 0; for (i = 1; i < raid_disks; i++) - conf->disks[i].start_sector = - conf->disks[i-1].start_sector + - conf->disks[i-1].num_sectors; - - table = conf->hash_table; - i = 0; - for (curr_sector = 0; - curr_sector < conf->array_sectors; - curr_sector += conf->spacing) { - - while (i < raid_disks-1 && - curr_sector >= conf->disks[i+1].start_sector) - i++; - - *table ++ = conf->disks + i; - } - - if (conf->sector_shift) { - conf->spacing >>= conf->sector_shift; - /* round spacing up so that when we divide by it, - * we err on the side of "too-low", which is safest. - */ - conf->spacing++; - } - - BUG_ON(table - conf->hash_table > nb_zone); + conf->disks[i].end_sector = + conf->disks[i-1].end_sector + + conf->disks[i].rdev->sectors; return conf; @@ -257,6 +206,8 @@ static int linear_run (mddev_t *mddev) { linear_conf_t *conf; + if (md_check_no_bitmap(mddev)) + return -EINVAL; mddev->queue->queue_lock = &mddev->queue->__queue_lock; conf = linear_conf(mddev, mddev->raid_disks); @@ -272,6 +223,12 @@ static int linear_run (mddev_t *mddev) return 0; } +static void free_conf(struct rcu_head *head) +{ + linear_conf_t *conf = container_of(head, linear_conf_t, rcu); + kfree(conf); +} + static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) { /* Adding a drive to a linear array allows the array to grow. @@ -282,7 +239,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) * The current one is never freed until the array is stopped. * This avoids races. */ - linear_conf_t *newconf; + linear_conf_t *newconf, *oldconf; if (rdev->saved_raid_disk != mddev->raid_disks) return -EINVAL; @@ -294,25 +251,29 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) if (!newconf) return -ENOMEM; - newconf->prev = mddev_to_conf(mddev); - mddev->private = newconf; + oldconf = rcu_dereference(mddev->private); mddev->raid_disks++; + rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); + call_rcu(&oldconf->rcu, free_conf); return 0; } static int linear_stop (mddev_t *mddev) { - linear_conf_t *conf = mddev_to_conf(mddev); - + linear_conf_t *conf = mddev->private; + + /* + * We do not require rcu protection here since + * we hold reconfig_mutex for both linear_add and + * linear_stop, so they cannot race. + * We should make sure any old 'conf's are properly + * freed though. + */ + rcu_barrier(); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - do { - linear_conf_t *t = conf->prev; - kfree(conf->hash_table); - kfree(conf); - conf = t; - } while (conf); + kfree(conf); return 0; } @@ -322,6 +283,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) const int rw = bio_data_dir(bio); mddev_t *mddev = q->queuedata; dev_info_t *tmp_dev; + sector_t start_sector; int cpu; if (unlikely(bio_barrier(bio))) { @@ -335,33 +297,36 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) bio_sectors(bio)); part_stat_unlock(); + rcu_read_lock(); tmp_dev = which_dev(mddev, bio->bi_sector); - - if (unlikely(bio->bi_sector >= (tmp_dev->num_sectors + - tmp_dev->start_sector) - || (bio->bi_sector < - tmp_dev->start_sector))) { + start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; + + + if (unlikely(bio->bi_sector >= (tmp_dev->end_sector) + || (bio->bi_sector < start_sector))) { char b[BDEVNAME_SIZE]; printk("linear_make_request: Sector %llu out of bounds on " "dev %s: %llu sectors, offset %llu\n", (unsigned long long)bio->bi_sector, bdevname(tmp_dev->rdev->bdev, b), - (unsigned long long)tmp_dev->num_sectors, - (unsigned long long)tmp_dev->start_sector); + (unsigned long long)tmp_dev->rdev->sectors, + (unsigned long long)start_sector); + rcu_read_unlock(); bio_io_error(bio); return 0; } if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > - tmp_dev->start_sector + tmp_dev->num_sectors)) { + tmp_dev->end_sector)) { /* This bio crosses a device boundary, so we have to * split it. */ struct bio_pair *bp; + sector_t end_sector = tmp_dev->end_sector; + + rcu_read_unlock(); - bp = bio_split(bio, - tmp_dev->start_sector + tmp_dev->num_sectors - - bio->bi_sector); + bp = bio_split(bio, end_sector - bio->bi_sector); if (linear_make_request(q, &bp->bio1)) generic_make_request(&bp->bio1); @@ -372,8 +337,9 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) } bio->bi_bdev = tmp_dev->rdev->bdev; - bio->bi_sector = bio->bi_sector - tmp_dev->start_sector + bio->bi_sector = bio->bi_sector - start_sector + tmp_dev->rdev->data_offset; + rcu_read_unlock(); return 1; } @@ -381,7 +347,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) static void linear_status (struct seq_file *seq, mddev_t *mddev) { - seq_printf(seq, " %dk rounding", mddev->chunk_size/1024); + seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); } diff --git a/drivers/md/linear.h b/drivers/md/linear.h index bf81795..0ce29b6 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -3,27 +3,19 @@ struct dev_info { mdk_rdev_t *rdev; - sector_t num_sectors; - sector_t start_sector; + sector_t end_sector; }; typedef struct dev_info dev_info_t; struct linear_private_data { - struct linear_private_data *prev; /* earlier version */ - dev_info_t **hash_table; - sector_t spacing; sector_t array_sectors; - int sector_shift; /* shift before dividing - * by spacing - */ dev_info_t disks[0]; + struct rcu_head rcu; }; typedef struct linear_private_data linear_conf_t; -#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) - #endif diff --git a/drivers/md/md.c b/drivers/md/md.c index 641b211..09be637 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -440,15 +440,6 @@ static inline sector_t calc_dev_sboffset(struct block_device *bdev) return MD_NEW_SIZE_SECTORS(num_sectors); } -static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size) -{ - sector_t num_sectors = rdev->sb_start; - - if (chunk_size) - num_sectors &= ~((sector_t)chunk_size/512 - 1); - return num_sectors; -} - static int alloc_disk_sb(mdk_rdev_t * rdev) { if (rdev->sb_page) @@ -745,6 +736,24 @@ struct super_type { }; /* + * Check that the given mddev has no bitmap. + * + * This function is called from the run method of all personalities that do not + * support bitmaps. It prints an error message and returns non-zero if mddev + * has a bitmap. Otherwise, it returns 0. + * + */ +int md_check_no_bitmap(mddev_t *mddev) +{ + if (!mddev->bitmap_file && !mddev->bitmap_offset) + return 0; + printk(KERN_ERR "%s: bitmaps are not supported for %s\n", + mdname(mddev), mddev->pers->name); + return 1; +} +EXPORT_SYMBOL(md_check_no_bitmap); + +/* * load_super for 0.90.0 */ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) @@ -797,17 +806,6 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version rdev->data_offset = 0; rdev->sb_size = MD_SB_BYTES; - if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { - if (sb->level != 1 && sb->level != 4 - && sb->level != 5 && sb->level != 6 - && sb->level != 10) { - /* FIXME use a better test */ - printk(KERN_WARNING - "md: bitmaps not supported for this level.\n"); - goto abort; - } - } - if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; else @@ -836,7 +834,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version else ret = 0; } - rdev->sectors = calc_num_sectors(rdev, sb->chunk_size); + rdev->sectors = rdev->sb_start; if (rdev->sectors < sb->size * 2 && sb->level > 1) /* "this cannot possibly happen" ... */ @@ -866,7 +864,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->minor_version = sb->minor_version; mddev->patch_version = sb->patch_version; mddev->external = 0; - mddev->chunk_size = sb->chunk_size; + mddev->chunk_sectors = sb->chunk_size >> 9; mddev->ctime = sb->ctime; mddev->utime = sb->utime; mddev->level = sb->level; @@ -883,13 +881,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->delta_disks = sb->delta_disks; mddev->new_level = sb->new_level; mddev->new_layout = sb->new_layout; - mddev->new_chunk = sb->new_chunk; + mddev->new_chunk_sectors = sb->new_chunk >> 9; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; } if (sb->state & (1<<MD_SB_CLEAN)) @@ -1004,7 +1002,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->new_level = mddev->new_level; sb->delta_disks = mddev->delta_disks; sb->new_layout = mddev->new_layout; - sb->new_chunk = mddev->new_chunk; + sb->new_chunk = mddev->new_chunk_sectors << 9; } mddev->minor_version = sb->minor_version; if (mddev->in_sync) @@ -1018,7 +1016,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->recovery_cp = 0; sb->layout = mddev->layout; - sb->chunk_size = mddev->chunk_size; + sb->chunk_size = mddev->chunk_sectors << 9; if (mddev->bitmap && mddev->bitmap_file == NULL) sb->state |= (1<<MD_SB_BITMAP_PRESENT); @@ -1185,24 +1183,13 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) bdevname(rdev->bdev,b)); return -EINVAL; } - if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { - if (sb->level != cpu_to_le32(1) && - sb->level != cpu_to_le32(4) && - sb->level != cpu_to_le32(5) && - sb->level != cpu_to_le32(6) && - sb->level != cpu_to_le32(10)) { - printk(KERN_WARNING - "md: bitmaps not supported for this level.\n"); - return -EINVAL; - } - } rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; - bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; @@ -1248,9 +1235,6 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) if (rdev->sectors < le64_to_cpu(sb->data_size)) return -EINVAL; rdev->sectors = le64_to_cpu(sb->data_size); - if (le32_to_cpu(sb->chunksize)) - rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1); - if (le64_to_cpu(sb->size) > rdev->sectors) return -EINVAL; return ret; @@ -1271,7 +1255,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->major_version = 1; mddev->patch_version = 0; mddev->external = 0; - mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; + mddev->chunk_sectors = le32_to_cpu(sb->chunksize); mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); mddev->level = le32_to_cpu(sb->level); @@ -1297,13 +1281,13 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->delta_disks = le32_to_cpu(sb->delta_disks); mddev->new_level = le32_to_cpu(sb->new_level); mddev->new_layout = le32_to_cpu(sb->new_layout); - mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; + mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; } } else if (mddev->pers == NULL) { @@ -1375,7 +1359,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->raid_disks = cpu_to_le32(mddev->raid_disks); sb->size = cpu_to_le64(mddev->dev_sectors); - sb->chunksize = cpu_to_le32(mddev->chunk_size >> 9); + sb->chunksize = cpu_to_le32(mddev->chunk_sectors); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); @@ -1402,7 +1386,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->new_layout = cpu_to_le32(mddev->new_layout); sb->delta_disks = cpu_to_le32(mddev->delta_disks); sb->new_level = cpu_to_le32(mddev->new_level); - sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); + sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); } max_dev = 0; @@ -1897,6 +1881,7 @@ static void md_update_sb(mddev_t * mddev, int force_change) int sync_req; int nospares = 0; + mddev->utime = get_seconds(); if (mddev->external) return; repeat: @@ -1926,7 +1911,6 @@ repeat: nospares = 0; sync_req = mddev->in_sync; - mddev->utime = get_seconds(); /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ @@ -2597,15 +2581,6 @@ static void analyze_sbs(mddev_t * mddev) clear_bit(In_sync, &rdev->flags); } } - - - - if (mddev->recovery_cp != MaxSector && - mddev->level >= 1) - printk(KERN_ERR "md: %s: raid array is not clean" - " -- starting background reconstruction\n", - mdname(mddev)); - } static void md_safemode_timeout(unsigned long data); @@ -2746,7 +2721,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) if (IS_ERR(priv)) { mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; module_put(pers->owner); @@ -2764,7 +2739,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; - mddev->chunk_size = mddev->new_chunk; + mddev->chunk_sectors = mddev->new_chunk_sectors; mddev->delta_disks = 0; pers->run(mddev); mddev_resume(mddev); @@ -2800,11 +2775,14 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) { int err; - if (mddev->pers->reconfig == NULL) + if (mddev->pers->check_reshape == NULL) return -EBUSY; - err = mddev->pers->reconfig(mddev, n, -1); - if (err) + mddev->new_layout = n; + err = mddev->pers->check_reshape(mddev); + if (err) { + mddev->new_layout = mddev->layout; return err; + } } else { mddev->new_layout = n; if (mddev->reshape_position == MaxSector) @@ -2857,10 +2835,11 @@ static ssize_t chunk_size_show(mddev_t *mddev, char *page) { if (mddev->reshape_position != MaxSector && - mddev->chunk_size != mddev->new_chunk) - return sprintf(page, "%d (%d)\n", mddev->new_chunk, - mddev->chunk_size); - return sprintf(page, "%d\n", mddev->chunk_size); + mddev->chunk_sectors != mddev->new_chunk_sectors) + return sprintf(page, "%d (%d)\n", + mddev->new_chunk_sectors << 9, + mddev->chunk_sectors << 9); + return sprintf(page, "%d\n", mddev->chunk_sectors << 9); } static ssize_t @@ -2874,15 +2853,18 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) { int err; - if (mddev->pers->reconfig == NULL) + if (mddev->pers->check_reshape == NULL) return -EBUSY; - err = mddev->pers->reconfig(mddev, -1, n); - if (err) + mddev->new_chunk_sectors = n >> 9; + err = mddev->pers->check_reshape(mddev); + if (err) { + mddev->new_chunk_sectors = mddev->chunk_sectors; return err; + } } else { - mddev->new_chunk = n; + mddev->new_chunk_sectors = n >> 9; if (mddev->reshape_position == MaxSector) - mddev->chunk_size = n; + mddev->chunk_sectors = n >> 9; } return len; } @@ -3527,8 +3509,9 @@ min_sync_store(mddev_t *mddev, const char *buf, size_t len) return -EBUSY; /* Must be a multiple of chunk_size */ - if (mddev->chunk_size) { - if (min & (sector_t)((mddev->chunk_size>>9)-1)) + if (mddev->chunk_sectors) { + sector_t temp = min; + if (sector_div(temp, mddev->chunk_sectors)) return -EINVAL; } mddev->resync_min = min; @@ -3564,8 +3547,9 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) return -EBUSY; /* Must be a multiple of chunk_size */ - if (mddev->chunk_size) { - if (max & (sector_t)((mddev->chunk_size>>9)-1)) + if (mddev->chunk_sectors) { + sector_t temp = max; + if (sector_div(temp, mddev->chunk_sectors)) return -EINVAL; } mddev->resync_max = max; @@ -3656,7 +3640,7 @@ reshape_position_store(mddev_t *mddev, const char *buf, size_t len) mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; return len; } @@ -3976,11 +3960,9 @@ static int start_dirty_degraded; static int do_md_run(mddev_t * mddev) { int err; - int chunk_size; mdk_rdev_t *rdev; struct gendisk *disk; struct mdk_personality *pers; - char b[BDEVNAME_SIZE]; if (list_empty(&mddev->disks)) /* cannot run an array with no devices.. */ @@ -3998,38 +3980,6 @@ static int do_md_run(mddev_t * mddev) analyze_sbs(mddev); } - chunk_size = mddev->chunk_size; - - if (chunk_size) { - if (chunk_size > MAX_CHUNK_SIZE) { - printk(KERN_ERR "too big chunk_size: %d > %d\n", - chunk_size, MAX_CHUNK_SIZE); - return -EINVAL; - } - /* - * chunk-size has to be a power of 2 - */ - if ( (1 << ffz(~chunk_size)) != chunk_size) { - printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); - return -EINVAL; - } - - /* devices must have minimum size of one chunk */ - list_for_each_entry(rdev, &mddev->disks, same_set) { - if (test_bit(Faulty, &rdev->flags)) - continue; - if (rdev->sectors < chunk_size / 512) { - printk(KERN_WARNING - "md: Dev %s smaller than chunk_size:" - " %llu < %d\n", - bdevname(rdev->bdev,b), - (unsigned long long)rdev->sectors, - chunk_size / 512); - return -EINVAL; - } - } - } - if (mddev->level != LEVEL_NONE) request_module("md-level-%d", mddev->level); else if (mddev->clevel[0]) @@ -4405,7 +4355,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->flags = 0; mddev->ro = 0; mddev->metadata_type[0] = 0; - mddev->chunk_size = 0; + mddev->chunk_sectors = 0; mddev->ctime = mddev->utime = 0; mddev->layout = 0; mddev->max_disks = 0; @@ -4413,7 +4363,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->delta_disks = 0; mddev->new_level = LEVEL_NONE; mddev->new_layout = 0; - mddev->new_chunk = 0; + mddev->new_chunk_sectors = 0; mddev->curr_resync = 0; mddev->resync_mismatches = 0; mddev->suspend_lo = mddev->suspend_hi = 0; @@ -4618,7 +4568,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) info.spare_disks = spare; info.layout = mddev->layout; - info.chunk_size = mddev->chunk_size; + info.chunk_size = mddev->chunk_sectors << 9; if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; @@ -4843,7 +4793,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; } else rdev->sb_start = calc_dev_sboffset(rdev->bdev); - rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); + rdev->sectors = rdev->sb_start; err = bind_rdev_to_array(rdev, mddev); if (err) { @@ -4913,7 +4863,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) else rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; - rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); + rdev->sectors = rdev->sb_start; if (test_bit(Faulty, &rdev->flags)) { printk(KERN_WARNING @@ -5062,7 +5012,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->external = 0; mddev->layout = info->layout; - mddev->chunk_size = info->chunk_size; + mddev->chunk_sectors = info->chunk_size >> 9; mddev->max_disks = MD_SB_DISKS; @@ -5081,7 +5031,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) get_random_bytes(mddev->uuid, 16); mddev->new_level = mddev->level; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_layout = mddev->layout; mddev->delta_disks = 0; @@ -5191,7 +5141,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->level != info->level || /* mddev->layout != info->layout || */ !mddev->persistent != info->not_persistent|| - mddev->chunk_size != info->chunk_size || + mddev->chunk_sectors != info->chunk_size >> 9 || /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ ((state^info->state) & 0xfffffe00) ) @@ -5215,10 +5165,15 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) * we don't need to do anything at the md level, the * personality will take care of it all. */ - if (mddev->pers->reconfig == NULL) + if (mddev->pers->check_reshape == NULL) return -EINVAL; - else - return mddev->pers->reconfig(mddev, info->layout, -1); + else { + mddev->new_layout = info->layout; + rv = mddev->pers->check_reshape(mddev); + if (rv) + mddev->new_layout = mddev->layout; + return rv; + } } if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); @@ -6717,7 +6672,8 @@ void md_check_recovery(mddev_t *mddev) */ if (mddev->reshape_position != MaxSector) { - if (mddev->pers->check_reshape(mddev) != 0) + if (mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev) != 0) /* Cannot proceed */ goto unlock; set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); diff --git a/drivers/md/md.h b/drivers/md/md.h index 8227ab9..9430a11 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -30,13 +30,6 @@ typedef struct mddev_s mddev_t; typedef struct mdk_rdev_s mdk_rdev_t; /* - * options passed in raidrun: - */ - -/* Currently this must fit in an 'int' */ -#define MAX_CHUNK_SIZE (1<<30) - -/* * MD's 'extended' device */ struct mdk_rdev_s @@ -145,7 +138,7 @@ struct mddev_s int external; /* metadata is * managed externally */ char metadata_type[17]; /* externally set*/ - int chunk_size; + int chunk_sectors; time_t ctime, utime; int level, layout; char clevel[16]; @@ -166,7 +159,8 @@ struct mddev_s * If reshape_position is MaxSector, then no reshape is happening (yet). */ sector_t reshape_position; - int delta_disks, new_level, new_layout, new_chunk; + int delta_disks, new_level, new_layout; + int new_chunk_sectors; struct mdk_thread_s *thread; /* management thread */ struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ @@ -325,7 +319,6 @@ struct mdk_personality int (*check_reshape) (mddev_t *mddev); int (*start_reshape) (mddev_t *mddev); void (*finish_reshape) (mddev_t *mddev); - int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); /* quiesce moves between quiescence states * 0 - fully active * 1 - no new requests allowed @@ -437,5 +430,6 @@ extern void md_new_event(mddev_t *mddev); extern int md_allow_write(mddev_t *mddev); extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); +extern int md_check_no_bitmap(mddev_t *mddev); #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 41ced0c..cbe368f 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -58,7 +58,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) { unsigned long flags; mddev_t *mddev = mp_bh->mddev; - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; spin_lock_irqsave(&conf->device_lock, flags); list_add(&mp_bh->retry_list, &conf->retry_list); @@ -75,7 +75,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) { struct bio *bio = mp_bh->master_bio; - multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); + multipath_conf_t *conf = mp_bh->mddev->private; bio_endio(bio, err); mempool_free(mp_bh, conf->pool); @@ -85,7 +85,7 @@ static void multipath_end_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); - multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); + multipath_conf_t *conf = mp_bh->mddev->private; mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; if (uptodate) @@ -107,7 +107,7 @@ static void multipath_end_request(struct bio *bio, int error) static void unplug_slaves(mddev_t *mddev) { - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; int i; rcu_read_lock(); @@ -138,7 +138,7 @@ static void multipath_unplug(struct request_queue *q) static int multipath_make_request (struct request_queue *q, struct bio * bio) { mddev_t *mddev = q->queuedata; - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; struct multipath_bh * mp_bh; struct multipath_info *multipath; const int rw = bio_data_dir(bio); @@ -180,7 +180,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) static void multipath_status (struct seq_file *seq, mddev_t *mddev) { - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; int i; seq_printf (seq, " [%d/%d] [", conf->raid_disks, @@ -195,7 +195,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev) static int multipath_congested(void *data, int bits) { mddev_t *mddev = data; - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; int i, ret = 0; rcu_read_lock(); @@ -220,7 +220,7 @@ static int multipath_congested(void *data, int bits) */ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) { - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; if (conf->working_disks <= 1) { /* @@ -303,7 +303,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * merge_bvec_fn will be involved in multipath.) */ if (q->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(q) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); conf->working_disks++; @@ -367,7 +367,7 @@ static void multipathd (mddev_t *mddev) struct multipath_bh *mp_bh; struct bio *bio; unsigned long flags; - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; md_check_recovery(mddev); @@ -421,6 +421,9 @@ static int multipath_run (mddev_t *mddev) struct multipath_info *disk; mdk_rdev_t *rdev; + if (md_check_no_bitmap(mddev)) + return -EINVAL; + if (mddev->level != LEVEL_MULTIPATH) { printk("multipath: %s: raid level not set to multipath IO (%d)\n", mdname(mddev), mddev->level); @@ -467,7 +470,7 @@ static int multipath_run (mddev_t *mddev) * violating it, not that we ever expect a device with * a merge_bvec_fn to be involved in multipath */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); if (!test_bit(Faulty, &rdev->flags)) @@ -531,7 +534,7 @@ out: static int multipath_stop (mddev_t *mddev) { - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; md_unregister_thread(mddev->thread); mddev->thread = NULL; diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h index 6fa70b4..d1c2a8d 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/multipath.h @@ -19,12 +19,6 @@ struct multipath_private_data { typedef struct multipath_private_data multipath_conf_t; /* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) - -/* * this is our 'private' 'collective' MULTIPATH buffer head. * it contains information about what kind of IO operations were started * for this MULTIPATH operation, and about their status: diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c08d755..ab4a489 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -26,8 +26,8 @@ static void raid0_unplug(struct request_queue *q) { mddev_t *mddev = q->queuedata; - raid0_conf_t *conf = mddev_to_conf(mddev); - mdk_rdev_t **devlist = conf->strip_zone[0].dev; + raid0_conf_t *conf = mddev->private; + mdk_rdev_t **devlist = conf->devlist; int i; for (i=0; i<mddev->raid_disks; i++) { @@ -40,8 +40,8 @@ static void raid0_unplug(struct request_queue *q) static int raid0_congested(void *data, int bits) { mddev_t *mddev = data; - raid0_conf_t *conf = mddev_to_conf(mddev); - mdk_rdev_t **devlist = conf->strip_zone[0].dev; + raid0_conf_t *conf = mddev->private; + mdk_rdev_t **devlist = conf->devlist; int i, ret = 0; for (i = 0; i < mddev->raid_disks && !ret ; i++) { @@ -52,27 +52,60 @@ static int raid0_congested(void *data, int bits) return ret; } +/* + * inform the user of the raid configuration +*/ +static void dump_zones(mddev_t *mddev) +{ + int j, k, h; + sector_t zone_size = 0; + sector_t zone_start = 0; + char b[BDEVNAME_SIZE]; + raid0_conf_t *conf = mddev->private; + printk(KERN_INFO "******* %s configuration *********\n", + mdname(mddev)); + h = 0; + for (j = 0; j < conf->nr_strip_zones; j++) { + printk(KERN_INFO "zone%d=[", j); + for (k = 0; k < conf->strip_zone[j].nb_dev; k++) + printk("%s/", + bdevname(conf->devlist[j*mddev->raid_disks + + k]->bdev, b)); + printk("]\n"); + + zone_size = conf->strip_zone[j].zone_end - zone_start; + printk(KERN_INFO " zone offset=%llukb " + "device offset=%llukb size=%llukb\n", + (unsigned long long)zone_start>>1, + (unsigned long long)conf->strip_zone[j].dev_start>>1, + (unsigned long long)zone_size>>1); + zone_start = conf->strip_zone[j].zone_end; + } + printk(KERN_INFO "**********************************\n\n"); +} -static int create_strip_zones (mddev_t *mddev) +static int create_strip_zones(mddev_t *mddev) { - int i, c, j; - sector_t current_start, curr_zone_start; - sector_t min_spacing; - raid0_conf_t *conf = mddev_to_conf(mddev); - mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; + int i, c, j, err; + sector_t curr_zone_end, sectors; + mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev; struct strip_zone *zone; int cnt; char b[BDEVNAME_SIZE]; - - /* - * The number of 'same size groups' - */ - conf->nr_strip_zones = 0; - + raid0_conf_t *conf = kzalloc(sizeof(*conf), GFP_KERNEL); + + if (!conf) + return -ENOMEM; list_for_each_entry(rdev1, &mddev->disks, same_set) { printk(KERN_INFO "raid0: looking at %s\n", bdevname(rdev1->bdev,b)); c = 0; + + /* round size to chunk_size */ + sectors = rdev1->sectors; + sector_div(sectors, mddev->chunk_sectors); + rdev1->sectors = sectors * mddev->chunk_sectors; + list_for_each_entry(rdev2, &mddev->disks, same_set) { printk(KERN_INFO "raid0: comparing %s(%llu)", bdevname(rdev1->bdev,b), @@ -103,16 +136,16 @@ static int create_strip_zones (mddev_t *mddev) } } printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); - + err = -ENOMEM; conf->strip_zone = kzalloc(sizeof(struct strip_zone)* conf->nr_strip_zones, GFP_KERNEL); if (!conf->strip_zone) - return 1; + goto abort; conf->devlist = kzalloc(sizeof(mdk_rdev_t*)* conf->nr_strip_zones*mddev->raid_disks, GFP_KERNEL); if (!conf->devlist) - return 1; + goto abort; /* The first zone must contain all devices, so here we check that * there is a proper alignment of slots to devices and find them all @@ -120,7 +153,8 @@ static int create_strip_zones (mddev_t *mddev) zone = &conf->strip_zone[0]; cnt = 0; smallest = NULL; - zone->dev = conf->devlist; + dev = conf->devlist; + err = -EINVAL; list_for_each_entry(rdev1, &mddev->disks, same_set) { int j = rdev1->raid_disk; @@ -129,12 +163,12 @@ static int create_strip_zones (mddev_t *mddev) "aborting!\n", j); goto abort; } - if (zone->dev[j]) { + if (dev[j]) { printk(KERN_ERR "raid0: multiple devices for %d - " "aborting!\n", j); goto abort; } - zone->dev[j] = rdev1; + dev[j] = rdev1; blk_queue_stack_limits(mddev->queue, rdev1->bdev->bd_disk->queue); @@ -144,7 +178,7 @@ static int create_strip_zones (mddev_t *mddev) */ if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); if (!smallest || (rdev1->sectors < smallest->sectors)) @@ -157,34 +191,32 @@ static int create_strip_zones (mddev_t *mddev) goto abort; } zone->nb_dev = cnt; - zone->sectors = smallest->sectors * cnt; - zone->zone_start = 0; + zone->zone_end = smallest->sectors * cnt; - current_start = smallest->sectors; - curr_zone_start = zone->sectors; + curr_zone_end = zone->zone_end; /* now do the other zones */ for (i = 1; i < conf->nr_strip_zones; i++) { zone = conf->strip_zone + i; - zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks; + dev = conf->devlist + i * mddev->raid_disks; printk(KERN_INFO "raid0: zone %d\n", i); - zone->dev_start = current_start; + zone->dev_start = smallest->sectors; smallest = NULL; c = 0; for (j=0; j<cnt; j++) { char b[BDEVNAME_SIZE]; - rdev = conf->strip_zone[0].dev[j]; + rdev = conf->devlist[j]; printk(KERN_INFO "raid0: checking %s ...", bdevname(rdev->bdev, b)); - if (rdev->sectors <= current_start) { + if (rdev->sectors <= zone->dev_start) { printk(KERN_INFO " nope.\n"); continue; } printk(KERN_INFO " contained as device %d\n", c); - zone->dev[c] = rdev; + dev[c] = rdev; c++; if (!smallest || rdev->sectors < smallest->sectors) { smallest = rdev; @@ -194,47 +226,39 @@ static int create_strip_zones (mddev_t *mddev) } zone->nb_dev = c; - zone->sectors = (smallest->sectors - current_start) * c; + sectors = (smallest->sectors - zone->dev_start) * c; printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", - zone->nb_dev, (unsigned long long)zone->sectors); + zone->nb_dev, (unsigned long long)sectors); - zone->zone_start = curr_zone_start; - curr_zone_start += zone->sectors; + curr_zone_end += sectors; + zone->zone_end = curr_zone_end; - current_start = smallest->sectors; printk(KERN_INFO "raid0: current zone start: %llu\n", - (unsigned long long)current_start); - } - - /* Now find appropriate hash spacing. - * We want a number which causes most hash entries to cover - * at most two strips, but the hash table must be at most - * 1 PAGE. We choose the smallest strip, or contiguous collection - * of strips, that has big enough size. We never consider the last - * strip though as it's size has no bearing on the efficacy of the hash - * table. - */ - conf->spacing = curr_zone_start; - min_spacing = curr_zone_start; - sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*)); - for (i=0; i < conf->nr_strip_zones-1; i++) { - sector_t s = 0; - for (j = i; j < conf->nr_strip_zones - 1 && - s < min_spacing; j++) - s += conf->strip_zone[j].sectors; - if (s >= min_spacing && s < conf->spacing) - conf->spacing = s; + (unsigned long long)smallest->sectors); } - mddev->queue->unplug_fn = raid0_unplug; - mddev->queue->backing_dev_info.congested_fn = raid0_congested; mddev->queue->backing_dev_info.congested_data = mddev; + /* + * now since we have the hard sector sizes, we can make sure + * chunk size is a multiple of that sector size + */ + if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { + printk(KERN_ERR "%s chunk_size of %d not valid\n", + mdname(mddev), + mddev->chunk_sectors << 9); + goto abort; + } printk(KERN_INFO "raid0: done.\n"); + mddev->private = conf; return 0; - abort: - return 1; +abort: + kfree(conf->strip_zone); + kfree(conf->devlist); + kfree(conf); + mddev->private = NULL; + return err; } /** @@ -252,10 +276,15 @@ static int raid0_mergeable_bvec(struct request_queue *q, mddev_t *mddev = q->queuedata; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; - unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; + if (is_power_of_2(chunk_sectors)) + max = (chunk_sectors - ((sector & (chunk_sectors-1)) + + bio_sectors)) << 9; + else + max = (chunk_sectors - (sector_div(sector, chunk_sectors) + + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ if (max <= biovec->bv_len && bio_sectors == 0) return biovec->bv_len; @@ -277,84 +306,28 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks) return array_sectors; } -static int raid0_run (mddev_t *mddev) +static int raid0_run(mddev_t *mddev) { - unsigned cur=0, i=0, nb_zone; - s64 sectors; - raid0_conf_t *conf; + int ret; - if (mddev->chunk_size == 0) { - printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); + if (mddev->chunk_sectors == 0) { + printk(KERN_ERR "md/raid0: chunk size must be set.\n"); return -EINVAL; } - printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n", - mdname(mddev), - mddev->chunk_size >> 9, - (mddev->chunk_size>>1)-1); - blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9); - blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1); + if (md_check_no_bitmap(mddev)) + return -EINVAL; + blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors); mddev->queue->queue_lock = &mddev->queue->__queue_lock; - conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL); - if (!conf) - goto out; - mddev->private = (void *)conf; - - conf->strip_zone = NULL; - conf->devlist = NULL; - if (create_strip_zones (mddev)) - goto out_free_conf; + ret = create_strip_zones(mddev); + if (ret < 0) + return ret; /* calculate array device size */ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", (unsigned long long)mddev->array_sectors); - printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n", - (unsigned long long)conf->spacing); - { - sector_t s = raid0_size(mddev, 0, 0); - sector_t space = conf->spacing; - int round; - conf->sector_shift = 0; - if (sizeof(sector_t) > sizeof(u32)) { - /*shift down space and s so that sector_div will work */ - while (space > (sector_t) (~(u32)0)) { - s >>= 1; - space >>= 1; - s += 1; /* force round-up */ - conf->sector_shift++; - } - } - round = sector_div(s, (u32)space) ? 1 : 0; - nb_zone = s + round; - } - printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone); - - printk(KERN_INFO "raid0 : Allocating %zu bytes for hash.\n", - nb_zone*sizeof(struct strip_zone*)); - conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL); - if (!conf->hash_table) - goto out_free_conf; - sectors = conf->strip_zone[cur].sectors; - - conf->hash_table[0] = conf->strip_zone + cur; - for (i=1; i< nb_zone; i++) { - while (sectors <= conf->spacing) { - cur++; - sectors += conf->strip_zone[cur].sectors; - } - sectors -= conf->spacing; - conf->hash_table[i] = conf->strip_zone + cur; - } - if (conf->sector_shift) { - conf->spacing >>= conf->sector_shift; - /* round spacing up so when we divide by it, we - * err on the side of too-low, which is safest - */ - conf->spacing++; - } - /* calculate the max read-ahead size. * For read-ahead of large files to be effective, we need to * readahead at least twice a whole stripe. i.e. number of devices @@ -365,48 +338,107 @@ static int raid0_run (mddev_t *mddev) * chunksize should be used in that case. */ { - int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE; + int stripe = mddev->raid_disks * + (mddev->chunk_sectors << 9) / PAGE_SIZE; if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) mddev->queue->backing_dev_info.ra_pages = 2* stripe; } - blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); + dump_zones(mddev); return 0; +} -out_free_conf: +static int raid0_stop(mddev_t *mddev) +{ + raid0_conf_t *conf = mddev->private; + + blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf->strip_zone); kfree(conf->devlist); kfree(conf); mddev->private = NULL; -out: - return -ENOMEM; + return 0; } -static int raid0_stop (mddev_t *mddev) +/* Find the zone which holds a particular offset + * Update *sectorp to be an offset in that zone + */ +static struct strip_zone *find_zone(struct raid0_private_data *conf, + sector_t *sectorp) { - raid0_conf_t *conf = mddev_to_conf(mddev); + int i; + struct strip_zone *z = conf->strip_zone; + sector_t sector = *sectorp; + + for (i = 0; i < conf->nr_strip_zones; i++) + if (sector < z[i].zone_end) { + if (i) + *sectorp = sector - z[i-1].zone_end; + return z + i; + } + BUG(); +} - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - kfree(conf->hash_table); - conf->hash_table = NULL; - kfree(conf->strip_zone); - conf->strip_zone = NULL; - kfree(conf); - mddev->private = NULL; +/* + * remaps the bio to the target device. we separate two flows. + * power 2 flow and a general flow for the sake of perfromance +*/ +static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone, + sector_t sector, sector_t *sector_offset) +{ + unsigned int sect_in_chunk; + sector_t chunk; + raid0_conf_t *conf = mddev->private; + unsigned int chunk_sects = mddev->chunk_sectors; + + if (is_power_of_2(chunk_sects)) { + int chunksect_bits = ffz(~chunk_sects); + /* find the sector offset inside the chunk */ + sect_in_chunk = sector & (chunk_sects - 1); + sector >>= chunksect_bits; + /* chunk in zone */ + chunk = *sector_offset; + /* quotient is the chunk in real device*/ + sector_div(chunk, zone->nb_dev << chunksect_bits); + } else{ + sect_in_chunk = sector_div(sector, chunk_sects); + chunk = *sector_offset; + sector_div(chunk, chunk_sects * zone->nb_dev); + } + /* + * position the bio over the real device + * real sector = chunk in device + starting of zone + * + the position in the chunk + */ + *sector_offset = (chunk * chunk_sects) + sect_in_chunk; + return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks + + sector_div(sector, zone->nb_dev)]; +} - return 0; +/* + * Is io distribute over 1 or more chunks ? +*/ +static inline int is_io_in_chunk_boundary(mddev_t *mddev, + unsigned int chunk_sects, struct bio *bio) +{ + if (likely(is_power_of_2(chunk_sects))) { + return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) + + (bio->bi_size >> 9)); + } else{ + sector_t sector = bio->bi_sector; + return chunk_sects >= (sector_div(sector, chunk_sects) + + (bio->bi_size >> 9)); + } } -static int raid0_make_request (struct request_queue *q, struct bio *bio) +static int raid0_make_request(struct request_queue *q, struct bio *bio) { mddev_t *mddev = q->queuedata; - unsigned int sect_in_chunk, chunksect_bits, chunk_sects; - raid0_conf_t *conf = mddev_to_conf(mddev); + unsigned int chunk_sects; + sector_t sector_offset; struct strip_zone *zone; mdk_rdev_t *tmp_dev; - sector_t chunk; - sector_t sector, rsect; const int rw = bio_data_dir(bio); int cpu; @@ -421,11 +453,9 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) bio_sectors(bio)); part_stat_unlock(); - chunk_sects = mddev->chunk_size >> 9; - chunksect_bits = ffz(~chunk_sects); - sector = bio->bi_sector; - - if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) { + chunk_sects = mddev->chunk_sectors; + if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { + sector_t sector = bio->bi_sector; struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ if (bio->bi_vcnt != 1 || @@ -434,7 +464,12 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. */ - bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1))); + if (likely(is_power_of_2(chunk_sects))) + bp = bio_split(bio, chunk_sects - (sector & + (chunk_sects-1))); + else + bp = bio_split(bio, chunk_sects - + sector_div(sector, chunk_sects)); if (raid0_make_request(q, &bp->bio1)) generic_make_request(&bp->bio1); if (raid0_make_request(q, &bp->bio2)) @@ -443,34 +478,14 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) bio_pair_release(bp); return 0; } - - - { - sector_t x = sector >> conf->sector_shift; - sector_div(x, (u32)conf->spacing); - zone = conf->hash_table[x]; - } - while (sector >= zone->zone_start + zone->sectors) - zone++; - - sect_in_chunk = bio->bi_sector & (chunk_sects - 1); - - - { - sector_t x = (sector - zone->zone_start) >> chunksect_bits; - - sector_div(x, zone->nb_dev); - chunk = x; - - x = sector >> chunksect_bits; - tmp_dev = zone->dev[sector_div(x, zone->nb_dev)]; - } - rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk; - + sector_offset = bio->bi_sector; + zone = find_zone(mddev->private, §or_offset); + tmp_dev = map_sector(mddev, zone, bio->bi_sector, + §or_offset); bio->bi_bdev = tmp_dev->bdev; - bio->bi_sector = rsect + tmp_dev->data_offset; - + bio->bi_sector = sector_offset + zone->dev_start + + tmp_dev->data_offset; /* * Let the main block layer submit the IO and resolve recursion: */ @@ -485,31 +500,35 @@ bad_map: return 0; } -static void raid0_status (struct seq_file *seq, mddev_t *mddev) +static void raid0_status(struct seq_file *seq, mddev_t *mddev) { #undef MD_DEBUG #ifdef MD_DEBUG int j, k, h; char b[BDEVNAME_SIZE]; - raid0_conf_t *conf = mddev_to_conf(mddev); + raid0_conf_t *conf = mddev->private; + sector_t zone_size; + sector_t zone_start = 0; h = 0; + for (j = 0; j < conf->nr_strip_zones; j++) { seq_printf(seq, " z%d", j); - if (conf->hash_table[h] == conf->strip_zone+j) - seq_printf(seq, "(h%d)", h++); seq_printf(seq, "=["); for (k = 0; k < conf->strip_zone[j].nb_dev; k++) seq_printf(seq, "%s/", bdevname( - conf->strip_zone[j].dev[k]->bdev,b)); - - seq_printf(seq, "] zs=%d ds=%d s=%d\n", - conf->strip_zone[j].zone_start, - conf->strip_zone[j].dev_start, - conf->strip_zone[j].sectors); + conf->devlist[j*mddev->raid_disks + k] + ->bdev, b)); + + zone_size = conf->strip_zone[j].zone_end - zone_start; + seq_printf(seq, "] ze=%lld ds=%lld s=%lld\n", + (unsigned long long)zone_start>>1, + (unsigned long long)conf->strip_zone[j].dev_start>>1, + (unsigned long long)zone_size>>1); + zone_start = conf->strip_zone[j].zone_end; } #endif - seq_printf(seq, " %dk chunks", mddev->chunk_size/1024); + seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2); return; } diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 824b12e..91f8e87 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -3,26 +3,18 @@ struct strip_zone { - sector_t zone_start; /* Zone offset in md_dev (in sectors) */ + sector_t zone_end; /* Start of the next zone (in sectors) */ sector_t dev_start; /* Zone offset in real dev (in sectors) */ - sector_t sectors; /* Zone size in sectors */ int nb_dev; /* # of devices attached to the zone */ - mdk_rdev_t **dev; /* Devices attached to the zone */ }; struct raid0_private_data { - struct strip_zone **hash_table; /* Table of indexes into strip_zone */ struct strip_zone *strip_zone; mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ int nr_strip_zones; - - sector_t spacing; - int sector_shift; /* shift this before divide by spacing */ }; typedef struct raid0_private_data raid0_conf_t; -#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) - #endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 36df910..89939a7 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -182,7 +182,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) static void free_r1bio(r1bio_t *r1_bio) { - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; /* * Wake up any possible resync thread that waits for the device @@ -196,7 +196,7 @@ static void free_r1bio(r1bio_t *r1_bio) static void put_buf(r1bio_t *r1_bio) { - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; int i; for (i=0; i<conf->raid_disks; i++) { @@ -214,7 +214,7 @@ static void reschedule_retry(r1bio_t *r1_bio) { unsigned long flags; mddev_t *mddev = r1_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; spin_lock_irqsave(&conf->device_lock, flags); list_add(&r1_bio->retry_list, &conf->retry_list); @@ -253,7 +253,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio) */ static inline void update_head_pos(int disk, r1bio_t *r1_bio) { - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; conf->mirrors[disk].head_position = r1_bio->sector + (r1_bio->sectors); @@ -264,7 +264,7 @@ static void raid1_end_read_request(struct bio *bio, int error) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); int mirror; - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; mirror = r1_bio->read_disk; /* @@ -309,7 +309,7 @@ static void raid1_end_write_request(struct bio *bio, int error) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; struct bio *to_put = NULL; @@ -541,7 +541,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) static void unplug_slaves(mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; rcu_read_lock(); @@ -573,7 +573,7 @@ static void raid1_unplug(struct request_queue *q) static int raid1_congested(void *data, int bits) { mddev_t *mddev = data; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i, ret = 0; rcu_read_lock(); @@ -772,7 +772,7 @@ do_sync_io: static int make_request(struct request_queue *q, struct bio * bio) { mddev_t *mddev = q->queuedata; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; mirror_info_t *mirror; r1bio_t *r1_bio; struct bio *read_bio; @@ -991,7 +991,7 @@ static int make_request(struct request_queue *q, struct bio * bio) static void status(struct seq_file *seq, mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; seq_printf(seq, " [%d/%d] [", conf->raid_disks, @@ -1010,7 +1010,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) static void error(mddev_t *mddev, mdk_rdev_t *rdev) { char b[BDEVNAME_SIZE]; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; /* * If it is not operational, then we have already marked it as dead @@ -1130,7 +1130,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); p->head_position = 0; @@ -1214,7 +1214,7 @@ static void end_sync_write(struct bio *bio, int error) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); mddev_t *mddev = r1_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; int mirror=0; @@ -1248,7 +1248,7 @@ static void end_sync_write(struct bio *bio, int error) static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; int disks = conf->raid_disks; struct bio *bio, *wbio; @@ -1562,7 +1562,7 @@ static void raid1d(mddev_t *mddev) r1bio_t *r1_bio; struct bio *bio; unsigned long flags; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; int unplug=0; mdk_rdev_t *rdev; @@ -1585,7 +1585,7 @@ static void raid1d(mddev_t *mddev) spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r1_bio->mddev; - conf = mddev_to_conf(mddev); + conf = mddev->private; if (test_bit(R1BIO_IsSync, &r1_bio->state)) { sync_request_write(mddev, r1_bio); unplug = 1; @@ -1706,7 +1706,7 @@ static int init_resync(conf_t *conf) static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; r1bio_t *r1_bio; struct bio *bio; sector_t max_sector, nr_sectors; @@ -1996,7 +1996,7 @@ static int run(mddev_t *mddev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; @@ -2052,6 +2052,10 @@ static int run(mddev_t *mddev) goto out_free_conf; } + if (mddev->recovery_cp != MaxSector) + printk(KERN_NOTICE "raid1: %s is not clean" + " -- starting background reconstruction\n", + mdname(mddev)); printk(KERN_INFO "raid1: raid set %s active with %d out of %d mirrors\n", mdname(mddev), mddev->raid_disks - mddev->degraded, @@ -2087,7 +2091,7 @@ out: static int stop(mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; struct bitmap *bitmap = mddev->bitmap; int behind_wait = 0; @@ -2155,16 +2159,16 @@ static int raid1_reshape(mddev_t *mddev) mempool_t *newpool, *oldpool; struct pool_info *newpoolinfo; mirror_info_t *newmirrors; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int cnt, raid_disks; unsigned long flags; int d, d2, err; /* Cannot change chunk_size, layout, or level */ - if (mddev->chunk_size != mddev->new_chunk || + if (mddev->chunk_sectors != mddev->new_chunk_sectors || mddev->layout != mddev->new_layout || mddev->level != mddev->new_level) { - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_layout = mddev->layout; mddev->new_level = mddev->level; return -EINVAL; @@ -2252,7 +2256,7 @@ static int raid1_reshape(mddev_t *mddev) static void raid1_quiesce(mddev_t *mddev, int state) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; switch(state) { case 1: diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 1620eea..e87b84d 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -64,12 +64,6 @@ struct r1_private_data_s { typedef struct r1_private_data_s conf_t; /* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((conf_t *) mddev->private) - -/* * this is our 'private' RAID1 bio. * * it contains information about what kind of IO operations were started diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 499620a..ae12cea 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -188,7 +188,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) static void free_r10bio(r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; /* * Wake up any possible resync thread that waits for the device @@ -202,7 +202,7 @@ static void free_r10bio(r10bio_t *r10_bio) static void put_buf(r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; mempool_free(r10_bio, conf->r10buf_pool); @@ -213,7 +213,7 @@ static void reschedule_retry(r10bio_t *r10_bio) { unsigned long flags; mddev_t *mddev = r10_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; spin_lock_irqsave(&conf->device_lock, flags); list_add(&r10_bio->retry_list, &conf->retry_list); @@ -245,7 +245,7 @@ static void raid_end_bio_io(r10bio_t *r10_bio) */ static inline void update_head_pos(int slot, r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; conf->mirrors[r10_bio->devs[slot].devnum].head_position = r10_bio->devs[slot].addr + (r10_bio->sectors); @@ -256,7 +256,7 @@ static void raid10_end_read_request(struct bio *bio, int error) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); int slot, dev; - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; slot = r10_bio->read_slot; @@ -297,7 +297,7 @@ static void raid10_end_write_request(struct bio *bio, int error) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); int slot, dev; - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; for (slot = 0; slot < conf->copies; slot++) if (r10_bio->devs[slot].bio == bio) @@ -461,7 +461,7 @@ static int raid10_mergeable_bvec(struct request_queue *q, mddev_t *mddev = q->queuedata; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; - unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; @@ -596,7 +596,7 @@ rb_out: static void unplug_slaves(mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; rcu_read_lock(); @@ -628,7 +628,7 @@ static void raid10_unplug(struct request_queue *q) static int raid10_congested(void *data, int bits) { mddev_t *mddev = data; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i, ret = 0; rcu_read_lock(); @@ -788,7 +788,7 @@ static void unfreeze_array(conf_t *conf) static int make_request(struct request_queue *q, struct bio * bio) { mddev_t *mddev = q->queuedata; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; mirror_info_t *mirror; r10bio_t *r10_bio; struct bio *read_bio; @@ -981,11 +981,11 @@ static int make_request(struct request_queue *q, struct bio * bio) static void status(struct seq_file *seq, mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; if (conf->near_copies < conf->raid_disks) - seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); + seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); if (conf->near_copies > 1) seq_printf(seq, " %d near-copies", conf->near_copies); if (conf->far_copies > 1) { @@ -1006,7 +1006,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) static void error(mddev_t *mddev, mdk_rdev_t *rdev) { char b[BDEVNAME_SIZE]; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; /* * If it is not operational, then we have already marked it as dead @@ -1158,8 +1158,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); p->head_position = 0; rdev->raid_disk = mirror; @@ -1215,7 +1215,7 @@ abort: static void end_sync_read(struct bio *bio, int error) { r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; int i,d; for (i=0; i<conf->copies; i++) @@ -1253,7 +1253,7 @@ static void end_sync_write(struct bio *bio, int error) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); mddev_t *mddev = r10_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i,d; for (i = 0; i < conf->copies; i++) @@ -1300,7 +1300,7 @@ static void end_sync_write(struct bio *bio, int error) */ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i, first; struct bio *tbio, *fbio; @@ -1400,7 +1400,7 @@ done: static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i, d; struct bio *bio, *wbio; @@ -1549,7 +1549,7 @@ static void raid10d(mddev_t *mddev) r10bio_t *r10_bio; struct bio *bio; unsigned long flags; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; int unplug=0; mdk_rdev_t *rdev; @@ -1572,7 +1572,7 @@ static void raid10d(mddev_t *mddev) spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r10_bio->mddev; - conf = mddev_to_conf(mddev); + conf = mddev->private; if (test_bit(R10BIO_IsSync, &r10_bio->state)) { sync_request_write(mddev, r10_bio); unplug = 1; @@ -1680,7 +1680,7 @@ static int init_resync(conf_t *conf) static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; r10bio_t *r10_bio; struct bio *biolist = NULL, *bio; sector_t max_sector, nr_sectors; @@ -2026,7 +2026,7 @@ static sector_t raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) { sector_t size; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; if (!raid_disks) raid_disks = mddev->raid_disks; @@ -2050,9 +2050,10 @@ static int run(mddev_t *mddev) int nc, fc, fo; sector_t stride, size; - if (mddev->chunk_size < PAGE_SIZE) { + if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || + !is_power_of_2(mddev->chunk_sectors)) { printk(KERN_ERR "md/raid10: chunk size must be " - "at least PAGE_SIZE(%ld).\n", PAGE_SIZE); + "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); return -EINVAL; } @@ -2095,8 +2096,8 @@ static int run(mddev_t *mddev) conf->far_copies = fc; conf->copies = nc*fc; conf->far_offset = fo; - conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; - conf->chunk_shift = ffz(~mddev->chunk_size) - 9; + conf->chunk_mask = mddev->chunk_sectors - 1; + conf->chunk_shift = ffz(~mddev->chunk_sectors); size = mddev->dev_sectors >> conf->chunk_shift; sector_div(size, fc); size = size * conf->raid_disks; @@ -2145,8 +2146,8 @@ static int run(mddev_t *mddev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; } @@ -2185,6 +2186,10 @@ static int run(mddev_t *mddev) goto out_free_conf; } + if (mddev->recovery_cp != MaxSector) + printk(KERN_NOTICE "raid10: %s is not clean" + " -- starting background reconstruction\n", + mdname(mddev)); printk(KERN_INFO "raid10: raid set %s active with %d out of %d devices\n", mdname(mddev), mddev->raid_disks - mddev->degraded, @@ -2204,7 +2209,8 @@ static int run(mddev_t *mddev) * maybe... */ { - int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE); + int stripe = conf->raid_disks * + ((mddev->chunk_sectors << 9) / PAGE_SIZE); stripe /= conf->near_copies; if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) mddev->queue->backing_dev_info.ra_pages = 2* stripe; @@ -2227,7 +2233,7 @@ out: static int stop(mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; raise_barrier(conf, 0); lower_barrier(conf); @@ -2245,7 +2251,7 @@ static int stop(mddev_t *mddev) static void raid10_quiesce(mddev_t *mddev, int state) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; switch(state) { case 1: diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 244dbe5..59cd1ef 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -62,12 +62,6 @@ struct r10_private_data_s { typedef struct r10_private_data_s conf_t; /* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((conf_t *) mddev->private) - -/* * this is our 'private' RAID10 bio. * * it contains information about what kind of IO operations were started diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 54ef8d7..cac6f4d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1617,8 +1617,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, sector_t new_sector; int algorithm = previous ? conf->prev_algo : conf->algorithm; - int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) - : (conf->chunk_size >> 9); + int sectors_per_chunk = previous ? conf->prev_chunk_sectors + : conf->chunk_sectors; int raid_disks = previous ? conf->previous_raid_disks : conf->raid_disks; int data_disks = raid_disks - conf->max_degraded; @@ -1823,8 +1823,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) int raid_disks = sh->disks; int data_disks = raid_disks - conf->max_degraded; sector_t new_sector = sh->sector, check; - int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) - : (conf->chunk_size >> 9); + int sectors_per_chunk = previous ? conf->prev_chunk_sectors + : conf->chunk_sectors; int algorithm = previous ? conf->prev_algo : conf->algorithm; sector_t stripe; @@ -2098,8 +2098,7 @@ static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, struct stripe_head *sh) { int sectors_per_chunk = - previous ? (conf->prev_chunk >> 9) - : (conf->chunk_size >> 9); + previous ? conf->prev_chunk_sectors : conf->chunk_sectors; int dd_idx; int chunk_offset = sector_div(stripe, sectors_per_chunk); int disks = previous ? conf->previous_raid_disks : conf->raid_disks; @@ -3496,7 +3495,7 @@ static void activate_bit_delay(raid5_conf_t *conf) static void unplug_slaves(mddev_t *mddev) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; int i; rcu_read_lock(); @@ -3520,7 +3519,7 @@ static void unplug_slaves(mddev_t *mddev) static void raid5_unplug_device(struct request_queue *q) { mddev_t *mddev = q->queuedata; - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); @@ -3539,7 +3538,7 @@ static void raid5_unplug_device(struct request_queue *q) static int raid5_congested(void *data, int bits) { mddev_t *mddev = data; - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; /* No difference between reads and writes. Just check * how busy the stripe_cache is @@ -3564,14 +3563,14 @@ static int raid5_mergeable_bvec(struct request_queue *q, mddev_t *mddev = q->queuedata; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; - unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ - if (mddev->new_chunk < mddev->chunk_size) - chunk_sectors = mddev->new_chunk >> 9; + if (mddev->new_chunk_sectors < mddev->chunk_sectors) + chunk_sectors = mddev->new_chunk_sectors; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; if (max <= biovec->bv_len && bio_sectors == 0) @@ -3584,11 +3583,11 @@ static int raid5_mergeable_bvec(struct request_queue *q, static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) { sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); - unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bio->bi_size >> 9; - if (mddev->new_chunk < mddev->chunk_size) - chunk_sectors = mddev->new_chunk >> 9; + if (mddev->new_chunk_sectors < mddev->chunk_sectors) + chunk_sectors = mddev->new_chunk_sectors; return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); } @@ -3652,7 +3651,7 @@ static void raid5_align_endio(struct bio *bi, int error) bio_put(bi); mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; - conf = mddev_to_conf(mddev); + conf = mddev->private; rdev = (void*)raid_bi->bi_next; raid_bi->bi_next = NULL; @@ -3675,10 +3674,10 @@ static int bio_fits_rdev(struct bio *bi) { struct request_queue *q = bdev_get_queue(bi->bi_bdev); - if ((bi->bi_size>>9) > q->max_sectors) + if ((bi->bi_size>>9) > queue_max_sectors(q)) return 0; blk_recount_segments(q, bi); - if (bi->bi_phys_segments > q->max_phys_segments) + if (bi->bi_phys_segments > queue_max_phys_segments(q)) return 0; if (q->merge_bvec_fn) @@ -3694,7 +3693,7 @@ static int bio_fits_rdev(struct bio *bi) static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) { mddev_t *mddev = q->queuedata; - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; unsigned int dd_idx; struct bio* align_bi; mdk_rdev_t *rdev; @@ -3811,7 +3810,7 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) static int make_request(struct request_queue *q, struct bio * bi) { mddev_t *mddev = q->queuedata; - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; int dd_idx; sector_t new_sector; sector_t logical_sector, last_sector; @@ -3908,6 +3907,7 @@ static int make_request(struct request_queue *q, struct bio * bi) spin_unlock_irq(&conf->device_lock); if (must_retry) { release_stripe(sh); + schedule(); goto retry; } } @@ -4003,10 +4003,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * If old and new chunk sizes differ, we need to process the * largest of these */ - if (mddev->new_chunk > mddev->chunk_size) - reshape_sectors = mddev->new_chunk / 512; + if (mddev->new_chunk_sectors > mddev->chunk_sectors) + reshape_sectors = mddev->new_chunk_sectors; else - reshape_sectors = mddev->chunk_size / 512; + reshape_sectors = mddev->chunk_sectors; /* we update the metadata when there is more than 3Meg * in the block range (that is rather arbitrary, should @@ -4129,7 +4129,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped 1, &dd_idx, NULL); last_sector = raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) - *(new_data_disks) - 1), + * new_data_disks - 1), 1, &dd_idx, NULL); if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; @@ -4158,7 +4158,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes) == 0); mddev->reshape_position = conf->reshape_progress; - mddev->curr_resync_completed = mddev->curr_resync; + mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); @@ -4371,7 +4371,7 @@ static void synchronize_stripe_processing(struct list_head *domain) static void raid5d(mddev_t *mddev) { struct stripe_head *sh; - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; int handled; LIST_HEAD(raid_domain); @@ -4428,7 +4428,7 @@ static void raid5d(mddev_t *mddev) static ssize_t raid5_show_stripe_cache_size(mddev_t *mddev, char *page) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; if (conf) return sprintf(page, "%d\n", conf->max_nr_stripes); else @@ -4438,7 +4438,7 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page) static ssize_t raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; unsigned long new; int err; @@ -4476,7 +4476,7 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, static ssize_t raid5_show_preread_threshold(mddev_t *mddev, char *page) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; if (conf) return sprintf(page, "%d\n", conf->bypass_threshold); else @@ -4486,7 +4486,7 @@ raid5_show_preread_threshold(mddev_t *mddev, char *page) static ssize_t raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; unsigned long new; if (len >= PAGE_SIZE) return -EINVAL; @@ -4510,7 +4510,7 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, static ssize_t stripe_cache_active_show(mddev_t *mddev, char *page) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; if (conf) return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); else @@ -4534,7 +4534,7 @@ static struct attribute_group raid5_attrs_group = { static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; if (!sectors) sectors = mddev->dev_sectors; @@ -4546,8 +4546,8 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) raid_disks = conf->previous_raid_disks; } - sectors &= ~((sector_t)mddev->chunk_size/512 - 1); - sectors &= ~((sector_t)mddev->new_chunk/512 - 1); + sectors &= ~((sector_t)mddev->chunk_sectors - 1); + sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); return sectors * (raid_disks - conf->max_degraded); } @@ -4691,9 +4691,11 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) return ERR_PTR(-EINVAL); } - if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) { + if (!mddev->new_chunk_sectors || + (mddev->new_chunk_sectors << 9) % PAGE_SIZE || + !is_power_of_2(mddev->new_chunk_sectors)) { printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", - mddev->new_chunk, mdname(mddev)); + mddev->new_chunk_sectors << 9, mdname(mddev)); return ERR_PTR(-EINVAL); } @@ -4756,7 +4758,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) conf->fullsync = 1; } - conf->chunk_size = mddev->new_chunk; + conf->chunk_sectors = mddev->new_chunk_sectors; + conf->level = mddev->new_level; if (conf->level == 6) conf->max_degraded = 2; else @@ -4765,7 +4768,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) conf->max_nr_stripes = NR_STRIPES; conf->reshape_progress = mddev->reshape_position; if (conf->reshape_progress != MaxSector) { - conf->prev_chunk = mddev->chunk_size; + conf->prev_chunk_sectors = mddev->chunk_sectors; conf->prev_algo = mddev->layout; } @@ -4803,6 +4806,10 @@ static int run(mddev_t *mddev) int working_disks = 0; mdk_rdev_t *rdev; + if (mddev->recovery_cp != MaxSector) + printk(KERN_NOTICE "raid5: %s is not clean" + " -- starting background reconstruction\n", + mdname(mddev)); if (mddev->reshape_position != MaxSector) { /* Check that we can continue the reshape. * Currently only disks can change, it must @@ -4825,7 +4832,7 @@ static int run(mddev_t *mddev) * geometry. */ here_new = mddev->reshape_position; - if (sector_div(here_new, (mddev->new_chunk>>9)* + if (sector_div(here_new, mddev->new_chunk_sectors * (mddev->raid_disks - max_degraded))) { printk(KERN_ERR "raid5: reshape_position not " "on a stripe boundary\n"); @@ -4833,7 +4840,7 @@ static int run(mddev_t *mddev) } /* here_new is the stripe we will write to */ here_old = mddev->reshape_position; - sector_div(here_old, (mddev->chunk_size>>9)* + sector_div(here_old, mddev->chunk_sectors * (old_disks-max_degraded)); /* here_old is the first stripe that we might need to read * from */ @@ -4848,7 +4855,7 @@ static int run(mddev_t *mddev) } else { BUG_ON(mddev->level != mddev->new_level); BUG_ON(mddev->layout != mddev->new_layout); - BUG_ON(mddev->chunk_size != mddev->new_chunk); + BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); BUG_ON(mddev->delta_disks != 0); } @@ -4882,7 +4889,7 @@ static int run(mddev_t *mddev) } /* device size must be a multiple of chunk size */ - mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); + mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); mddev->resync_max_sectors = mddev->dev_sectors; if (mddev->degraded > 0 && @@ -4931,7 +4938,7 @@ static int run(mddev_t *mddev) { int data_disks = conf->previous_raid_disks - conf->max_degraded; int stripe = data_disks * - (mddev->chunk_size / PAGE_SIZE); + ((mddev->chunk_sectors << 9) / PAGE_SIZE); if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; } @@ -5021,7 +5028,8 @@ static void status(struct seq_file *seq, mddev_t *mddev) raid5_conf_t *conf = (raid5_conf_t *) mddev->private; int i; - seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); + seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, + mddev->chunk_sectors / 2, mddev->layout); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", @@ -5169,7 +5177,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. */ - sectors &= ~((sector_t)mddev->chunk_size/512 - 1); + sectors &= ~((sector_t)mddev->chunk_sectors - 1); md_set_array_sectors(mddev, raid5_size(mddev, sectors, mddev->raid_disks)); if (mddev->array_sectors > @@ -5186,14 +5194,37 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return 0; } -static int raid5_check_reshape(mddev_t *mddev) +static int check_stripe_cache(mddev_t *mddev) +{ + /* Can only proceed if there are plenty of stripe_heads. + * We need a minimum of one full stripe,, and for sensible progress + * it is best to have about 4 times that. + * If we require 4 times, then the default 256 4K stripe_heads will + * allow for chunk sizes up to 256K, which is probably OK. + * If the chunk size is greater, user-space should request more + * stripe_heads first. + */ + raid5_conf_t *conf = mddev->private; + if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 + > conf->max_nr_stripes || + ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 + > conf->max_nr_stripes) { + printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", + ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) + / STRIPE_SIZE)*4); + return 0; + } + return 1; +} + +static int check_reshape(mddev_t *mddev) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; if (mddev->delta_disks == 0 && mddev->new_layout == mddev->layout && - mddev->new_chunk == mddev->chunk_size) - return -EINVAL; /* nothing to do */ + mddev->new_chunk_sectors == mddev->chunk_sectors) + return 0; /* nothing to do */ if (mddev->bitmap) /* Cannot grow a bitmap yet */ return -EBUSY; @@ -5212,28 +5243,15 @@ static int raid5_check_reshape(mddev_t *mddev) return -EINVAL; } - /* Can only proceed if there are plenty of stripe_heads. - * We need a minimum of one full stripe,, and for sensible progress - * it is best to have about 4 times that. - * If we require 4 times, then the default 256 4K stripe_heads will - * allow for chunk sizes up to 256K, which is probably OK. - * If the chunk size is greater, user-space should request more - * stripe_heads first. - */ - if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || - (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { - printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", - (max(mddev->chunk_size, mddev->new_chunk) - / STRIPE_SIZE)*4); + if (!check_stripe_cache(mddev)) return -ENOSPC; - } return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); } static int raid5_start_reshape(mddev_t *mddev) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; mdk_rdev_t *rdev; int spares = 0; int added_devices = 0; @@ -5242,6 +5260,9 @@ static int raid5_start_reshape(mddev_t *mddev) if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; + if (!check_stripe_cache(mddev)) + return -ENOSPC; + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) @@ -5268,8 +5289,8 @@ static int raid5_start_reshape(mddev_t *mddev) spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; conf->raid_disks += mddev->delta_disks; - conf->prev_chunk = conf->chunk_size; - conf->chunk_size = mddev->new_chunk; + conf->prev_chunk_sectors = conf->chunk_sectors; + conf->chunk_sectors = mddev->new_chunk_sectors; conf->prev_algo = conf->algorithm; conf->algorithm = mddev->new_layout; if (mddev->delta_disks < 0) @@ -5351,7 +5372,7 @@ static void end_reshape(raid5_conf_t *conf) */ { int data_disks = conf->raid_disks - conf->max_degraded; - int stripe = data_disks * (conf->chunk_size + int stripe = data_disks * ((conf->chunk_sectors << 9) / PAGE_SIZE); if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; @@ -5365,7 +5386,7 @@ static void end_reshape(raid5_conf_t *conf) static void raid5_finish_reshape(mddev_t *mddev) { struct block_device *bdev; - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -5396,7 +5417,7 @@ static void raid5_finish_reshape(mddev_t *mddev) raid5_remove_disk(mddev, d); } mddev->layout = conf->algorithm; - mddev->chunk_size = conf->chunk_size; + mddev->chunk_sectors = conf->chunk_sectors; mddev->reshape_position = MaxSector; mddev->delta_disks = 0; } @@ -5404,7 +5425,7 @@ static void raid5_finish_reshape(mddev_t *mddev) static void raid5_quiesce(mddev_t *mddev, int state) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; switch(state) { case 2: /* resume for a suspend */ @@ -5454,7 +5475,7 @@ static void *raid5_takeover_raid1(mddev_t *mddev) mddev->new_level = 5; mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; - mddev->new_chunk = chunksect << 9; + mddev->new_chunk_sectors = chunksect; return setup_conf(mddev); } @@ -5493,24 +5514,24 @@ static void *raid5_takeover_raid6(mddev_t *mddev) } -static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) +static int raid5_check_reshape(mddev_t *mddev) { /* For a 2-drive array, the layout and chunk size can be changed * immediately as not restriping is needed. * For larger arrays we record the new value - after validation * to be used by a reshape pass. */ - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; + int new_chunk = mddev->new_chunk_sectors; - if (new_layout >= 0 && !algorithm_valid_raid5(new_layout)) + if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) return -EINVAL; if (new_chunk > 0) { - if (new_chunk & (new_chunk-1)) - /* not a power of 2 */ + if (!is_power_of_2(new_chunk)) return -EINVAL; - if (new_chunk < PAGE_SIZE) + if (new_chunk < (PAGE_SIZE>>9)) return -EINVAL; - if (mddev->array_sectors & ((new_chunk>>9)-1)) + if (mddev->array_sectors & (new_chunk-1)) /* not factor of array size */ return -EINVAL; } @@ -5518,49 +5539,39 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) /* They look valid */ if (mddev->raid_disks == 2) { - - if (new_layout >= 0) { - conf->algorithm = new_layout; - mddev->layout = mddev->new_layout = new_layout; + /* can make the change immediately */ + if (mddev->new_layout >= 0) { + conf->algorithm = mddev->new_layout; + mddev->layout = mddev->new_layout; } if (new_chunk > 0) { - conf->chunk_size = new_chunk; - mddev->chunk_size = mddev->new_chunk = new_chunk; + conf->chunk_sectors = new_chunk ; + mddev->chunk_sectors = new_chunk; } set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); - } else { - if (new_layout >= 0) - mddev->new_layout = new_layout; - if (new_chunk > 0) - mddev->new_chunk = new_chunk; } - return 0; + return check_reshape(mddev); } -static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk) +static int raid6_check_reshape(mddev_t *mddev) { - if (new_layout >= 0 && !algorithm_valid_raid6(new_layout)) + int new_chunk = mddev->new_chunk_sectors; + + if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) return -EINVAL; if (new_chunk > 0) { - if (new_chunk & (new_chunk-1)) - /* not a power of 2 */ + if (!is_power_of_2(new_chunk)) return -EINVAL; - if (new_chunk < PAGE_SIZE) + if (new_chunk < (PAGE_SIZE >> 9)) return -EINVAL; - if (mddev->array_sectors & ((new_chunk>>9)-1)) + if (mddev->array_sectors & (new_chunk-1)) /* not factor of array size */ return -EINVAL; } /* They look valid */ - - if (new_layout >= 0) - mddev->new_layout = new_layout; - if (new_chunk > 0) - mddev->new_chunk = new_chunk; - - return 0; + return check_reshape(mddev); } static void *raid5_takeover(mddev_t *mddev) @@ -5570,8 +5581,6 @@ static void *raid5_takeover(mddev_t *mddev) * raid1 - if there are two drives. We need to know the chunk size * raid4 - trivial - just use a raid4 layout. * raid6 - Providing it is a *_6 layout - * - * For now, just do raid1 */ if (mddev->level == 1) @@ -5653,12 +5662,11 @@ static struct mdk_personality raid6_personality = .sync_request = sync_request, .resize = raid5_resize, .size = raid5_size, - .check_reshape = raid5_check_reshape, + .check_reshape = raid6_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid6_takeover, - .reconfig = raid6_reconfig, }; static struct mdk_personality raid5_personality = { @@ -5681,7 +5689,6 @@ static struct mdk_personality raid5_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid5_takeover, - .reconfig = raid5_reconfig, }; static struct mdk_personality raid4_personality = diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 116d0b4..2390e0e 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -337,7 +337,8 @@ struct raid5_private_data { struct hlist_head *stripe_hashtbl; mddev_t *mddev; struct disk_info *spare; - int chunk_size, level, algorithm; + int chunk_sectors; + int level, algorithm; int max_degraded; int raid_disks; int max_nr_stripes; @@ -353,7 +354,8 @@ struct raid5_private_data { */ sector_t reshape_safe; int previous_raid_disks; - int prev_chunk, prev_algo; + int prev_chunk_sectors; + int prev_algo; short generation; /* increments with every reshape */ unsigned long reshape_checkpoint; /* Time we last updated * metadata */ @@ -424,8 +426,6 @@ struct raid5_private_data { typedef struct raid5_private_data raid5_conf_t; -#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) - /* * Our supported algorithms */ |