diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-24 09:28:01 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-24 09:28:01 -0700 |
commit | 474095e46cd14421821da3201a9fd6a4c070996b (patch) | |
tree | 7203d36f53c376a96099ed0310787b1fb0c4f7a5 /drivers/md/md.c | |
parent | d56a669ca59c37ed0a7282a251b2f2f22533343a (diff) | |
parent | 9ffc8f7cb9647b13dfe4d1ad0d5e1427bb8b46d6 (diff) | |
download | op-kernel-dev-474095e46cd14421821da3201a9fd6a4c070996b.zip op-kernel-dev-474095e46cd14421821da3201a9fd6a4c070996b.tar.gz |
Merge tag 'md/4.1' of git://neil.brown.name/md
Pull md updates from Neil Brown:
"More updates that usual this time. A few have performance impacts
which hould mostly be positive, but RAID5 (in particular) can be very
work-load ensitive... We'll have to wait and see.
Highlights:
- "experimental" code for managing md/raid1 across a cluster using
DLM. Code is not ready for general use and triggers a WARNING if
used. However it is looking good and mostly done and having in
mainline will help co-ordinate development.
- RAID5/6 can now batch multiple (4K wide) stripe_heads so as to
handle a full (chunk wide) stripe as a single unit.
- RAID6 can now perform read-modify-write cycles which should help
performance on larger arrays: 6 or more devices.
- RAID5/6 stripe cache now grows and shrinks dynamically. The value
set is used as a minimum.
- Resync is now allowed to go a little faster than the 'mininum' when
there is competing IO. How much faster depends on the speed of the
devices, so the effective minimum should scale with device speed to
some extent"
* tag 'md/4.1' of git://neil.brown.name/md: (58 commits)
md/raid5: don't do chunk aligned read on degraded array.
md/raid5: allow the stripe_cache to grow and shrink.
md/raid5: change ->inactive_blocked to a bit-flag.
md/raid5: move max_nr_stripes management into grow_one_stripe and drop_one_stripe
md/raid5: pass gfp_t arg to grow_one_stripe()
md/raid5: introduce configuration option rmw_level
md/raid5: activate raid6 rmw feature
md/raid6 algorithms: xor_syndrome() for SSE2
md/raid6 algorithms: xor_syndrome() for generic int
md/raid6 algorithms: improve test program
md/raid6 algorithms: delta syndrome functions
raid5: handle expansion/resync case with stripe batching
raid5: handle io error of batch list
RAID5: batch adjacent full stripe write
raid5: track overwrite disk count
raid5: add a new flag to track if a stripe can be batched
raid5: use flex_array for scribble data
md raid0: access mddev->queue (request queue member) conditionally because it is not set when accessed from dm-raid
md: allow resync to go faster when there is competing IO.
md: remove 'go_faster' option from ->sync_request()
...
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 382 |
1 files changed, 313 insertions, 69 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index e617878..d4f31e1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -53,6 +53,7 @@ #include <linux/slab.h> #include "md.h" #include "bitmap.h" +#include "md-cluster.h" #ifndef MODULE static void autostart_arrays(int part); @@ -66,6 +67,11 @@ static void autostart_arrays(int part); static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); +struct md_cluster_operations *md_cluster_ops; +EXPORT_SYMBOL(md_cluster_ops); +struct module *md_cluster_mod; +EXPORT_SYMBOL(md_cluster_mod); + static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; @@ -640,7 +646,7 @@ void mddev_unlock(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_unlock); -static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) +struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) { struct md_rdev *rdev; @@ -650,6 +656,7 @@ static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) return NULL; } +EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) { @@ -2047,11 +2054,11 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) int choice = 0; if (mddev->pers) choice = mddev->raid_disks; - while (find_rdev_nr_rcu(mddev, choice)) + while (md_find_rdev_nr_rcu(mddev, choice)) choice++; rdev->desc_nr = choice; } else { - if (find_rdev_nr_rcu(mddev, rdev->desc_nr)) { + if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { rcu_read_unlock(); return -EBUSY; } @@ -2166,11 +2173,12 @@ static void export_rdev(struct md_rdev *rdev) kobject_put(&rdev->kobj); } -static void kick_rdev_from_array(struct md_rdev *rdev) +void md_kick_rdev_from_array(struct md_rdev *rdev) { unbind_rdev_from_array(rdev); export_rdev(rdev); } +EXPORT_SYMBOL_GPL(md_kick_rdev_from_array); static void export_array(struct mddev *mddev) { @@ -2179,7 +2187,7 @@ static void export_array(struct mddev *mddev) while (!list_empty(&mddev->disks)) { rdev = list_first_entry(&mddev->disks, struct md_rdev, same_set); - kick_rdev_from_array(rdev); + md_kick_rdev_from_array(rdev); } mddev->raid_disks = 0; mddev->major_version = 0; @@ -2208,7 +2216,7 @@ static void sync_sbs(struct mddev *mddev, int nospares) } } -static void md_update_sb(struct mddev *mddev, int force_change) +void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; int sync_req; @@ -2369,6 +2377,37 @@ repeat: wake_up(&rdev->blocked_wait); } } +EXPORT_SYMBOL(md_update_sb); + +static int add_bound_rdev(struct md_rdev *rdev) +{ + struct mddev *mddev = rdev->mddev; + int err = 0; + + if (!mddev->pers->hot_remove_disk) { + /* If there is hot_add_disk but no hot_remove_disk + * then added disks for geometry changes, + * and should be added immediately. + */ + super_types[mddev->major_version]. + validate_super(mddev, rdev); + err = mddev->pers->hot_add_disk(mddev, rdev); + if (err) { + unbind_rdev_from_array(rdev); + export_rdev(rdev); + return err; + } + } + sysfs_notify_dirent_safe(rdev->sysfs_state); + + set_bit(MD_CHANGE_DEVS, &mddev->flags); + if (mddev->degraded) + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_new_event(mddev); + md_wakeup_thread(mddev->thread); + return 0; +} /* words written to sysfs files may, or may not, be \n terminated. * We want to accept with case. For this we use cmd_match. @@ -2471,10 +2510,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) err = -EBUSY; else { struct mddev *mddev = rdev->mddev; - kick_rdev_from_array(rdev); + if (mddev_is_clustered(mddev)) + md_cluster_ops->remove_disk(mddev, rdev); + md_kick_rdev_from_array(rdev); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); if (mddev->pers) md_update_sb(mddev, 1); md_new_event(mddev); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); err = 0; } } else if (cmd_match(buf, "writemostly")) { @@ -2553,6 +2598,21 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) clear_bit(Replacement, &rdev->flags); err = 0; } + } else if (cmd_match(buf, "re-add")) { + if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) { + /* clear_bit is performed _after_ all the devices + * have their local Faulty bit cleared. If any writes + * happen in the meantime in the local node, they + * will land in the local bitmap, which will be synced + * by this node eventually + */ + if (!mddev_is_clustered(rdev->mddev) || + (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { + clear_bit(Faulty, &rdev->flags); + err = add_bound_rdev(rdev); + } + } else + err = -EBUSY; } if (!err) sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -3127,7 +3187,7 @@ static void analyze_sbs(struct mddev *mddev) "md: fatal superblock inconsistency in %s" " -- removing from array\n", bdevname(rdev->bdev,b)); - kick_rdev_from_array(rdev); + md_kick_rdev_from_array(rdev); } super_types[mddev->major_version]. @@ -3142,18 +3202,27 @@ static void analyze_sbs(struct mddev *mddev) "md: %s: %s: only %d devices permitted\n", mdname(mddev), bdevname(rdev->bdev, b), mddev->max_disks); - kick_rdev_from_array(rdev); + md_kick_rdev_from_array(rdev); continue; } - if (rdev != freshest) + if (rdev != freshest) { if (super_types[mddev->major_version]. validate_super(mddev, rdev)) { printk(KERN_WARNING "md: kicking non-fresh %s" " from array!\n", bdevname(rdev->bdev,b)); - kick_rdev_from_array(rdev); + md_kick_rdev_from_array(rdev); continue; } + /* No device should have a Candidate flag + * when reading devices + */ + if (test_bit(Candidate, &rdev->flags)) { + pr_info("md: kicking Cluster Candidate %s from array!\n", + bdevname(rdev->bdev, b)); + md_kick_rdev_from_array(rdev); + } + } if (mddev->level == LEVEL_MULTIPATH) { rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; @@ -4008,8 +4077,12 @@ size_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; if (mddev->pers) { + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); err = update_size(mddev, sectors); md_update_sb(mddev, 1); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); } else { if (mddev->dev_sectors == 0 || mddev->dev_sectors > sectors) @@ -4354,7 +4427,6 @@ min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; int err; - int chunk; if (kstrtoull(buf, 10, &min)) return -EINVAL; @@ -4368,16 +4440,8 @@ min_sync_store(struct mddev *mddev, const char *buf, size_t len) if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) goto out_unlock; - /* Must be a multiple of chunk_size */ - chunk = mddev->chunk_sectors; - if (chunk) { - sector_t temp = min; - - err = -EINVAL; - if (sector_div(temp, chunk)) - goto out_unlock; - } - mddev->resync_min = min; + /* Round down to multiple of 4K for safety */ + mddev->resync_min = round_down(min, 8); err = 0; out_unlock: @@ -5077,10 +5141,16 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - err = bitmap_create(mddev); - if (err) + struct bitmap *bitmap; + + bitmap = bitmap_create(mddev, -1); + if (IS_ERR(bitmap)) { + err = PTR_ERR(bitmap); printk(KERN_ERR "%s: failed to create bitmap (%d)\n", mdname(mddev), err); + } else + mddev->bitmap = bitmap; + } if (err) { mddev_detach(mddev); @@ -5232,6 +5302,8 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); flush_workqueue(md_misc_wq); if (mddev->sync_thread) { @@ -5250,6 +5322,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); } void md_stop_writes(struct mddev *mddev) @@ -5636,6 +5710,8 @@ static int get_array_info(struct mddev *mddev, void __user *arg) info.state = (1<<MD_SB_CLEAN); if (mddev->bitmap && mddev->bitmap_info.offset) info.state |= (1<<MD_SB_BITMAP_PRESENT); + if (mddev_is_clustered(mddev)) + info.state |= (1<<MD_SB_CLUSTERED); info.active_disks = insync; info.working_disks = working; info.failed_disks = failed; @@ -5691,7 +5767,7 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) return -EFAULT; rcu_read_lock(); - rdev = find_rdev_nr_rcu(mddev, info.number); + rdev = md_find_rdev_nr_rcu(mddev, info.number); if (rdev) { info.major = MAJOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev); @@ -5724,6 +5800,13 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) struct md_rdev *rdev; dev_t dev = MKDEV(info->major,info->minor); + if (mddev_is_clustered(mddev) && + !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { + pr_err("%s: Cannot add to clustered mddev.\n", + mdname(mddev)); + return -EINVAL; + } + if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) return -EOVERFLOW; @@ -5810,31 +5893,38 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) else clear_bit(WriteMostly, &rdev->flags); + /* + * check whether the device shows up in other nodes + */ + if (mddev_is_clustered(mddev)) { + if (info->state & (1 << MD_DISK_CANDIDATE)) { + /* Through --cluster-confirm */ + set_bit(Candidate, &rdev->flags); + err = md_cluster_ops->new_disk_ack(mddev, true); + if (err) { + export_rdev(rdev); + return err; + } + } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { + /* --add initiated by this node */ + err = md_cluster_ops->add_new_disk_start(mddev, rdev); + if (err) { + md_cluster_ops->add_new_disk_finish(mddev); + export_rdev(rdev); + return err; + } + } + } + rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); - if (!err && !mddev->pers->hot_remove_disk) { - /* If there is hot_add_disk but no hot_remove_disk - * then added disks for geometry changes, - * and should be added immediately. - */ - super_types[mddev->major_version]. - validate_super(mddev, rdev); - err = mddev->pers->hot_add_disk(mddev, rdev); - if (err) - unbind_rdev_from_array(rdev); - } if (err) export_rdev(rdev); else - sysfs_notify_dirent_safe(rdev->sysfs_state); - - set_bit(MD_CHANGE_DEVS, &mddev->flags); - if (mddev->degraded) - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (!err) - md_new_event(mddev); - md_wakeup_thread(mddev->thread); + err = add_bound_rdev(rdev); + if (mddev_is_clustered(mddev) && + (info->state & (1 << MD_DISK_CLUSTER_ADD))) + md_cluster_ops->add_new_disk_finish(mddev); return err; } @@ -5895,18 +5985,29 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) if (!rdev) return -ENXIO; + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); + clear_bit(Blocked, &rdev->flags); remove_and_add_spares(mddev, rdev); if (rdev->raid_disk >= 0) goto busy; - kick_rdev_from_array(rdev); + if (mddev_is_clustered(mddev)) + md_cluster_ops->remove_disk(mddev, rdev); + + md_kick_rdev_from_array(rdev); md_update_sb(mddev, 1); md_new_event(mddev); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); + return 0; busy: + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_cancel(mddev); printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", bdevname(rdev->bdev,b), mdname(mddev)); return -EBUSY; @@ -5956,12 +6057,15 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) err = -EINVAL; goto abort_export; } + + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; rdev->saved_raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) - goto abort_export; + goto abort_clustered; /* * The rest should better be atomic, we can have disk failures @@ -5972,6 +6076,8 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) md_update_sb(mddev, 1); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); /* * Kick recovery, maybe this spare has to be added to the * array immediately. @@ -5981,6 +6087,9 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) md_new_event(mddev); return 0; +abort_clustered: + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_cancel(mddev); abort_export: export_rdev(rdev); return err; @@ -6038,9 +6147,14 @@ static int set_bitmap_file(struct mddev *mddev, int fd) if (mddev->pers) { mddev->pers->quiesce(mddev, 1); if (fd >= 0) { - err = bitmap_create(mddev); - if (!err) + struct bitmap *bitmap; + + bitmap = bitmap_create(mddev, -1); + if (!IS_ERR(bitmap)) { + mddev->bitmap = bitmap; err = bitmap_load(mddev); + } else + err = PTR_ERR(bitmap); } if (fd < 0 || err) { bitmap_destroy(mddev); @@ -6293,6 +6407,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) return rv; } } + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); @@ -6300,33 +6416,49 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = update_raid_disks(mddev, info->raid_disks); if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { - if (mddev->pers->quiesce == NULL || mddev->thread == NULL) - return -EINVAL; - if (mddev->recovery || mddev->sync_thread) - return -EBUSY; + if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { + rv = -EINVAL; + goto err; + } + if (mddev->recovery || mddev->sync_thread) { + rv = -EBUSY; + goto err; + } if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { + struct bitmap *bitmap; /* add the bitmap */ - if (mddev->bitmap) - return -EEXIST; - if (mddev->bitmap_info.default_offset == 0) - return -EINVAL; + if (mddev->bitmap) { + rv = -EEXIST; + goto err; + } + if (mddev->bitmap_info.default_offset == 0) { + rv = -EINVAL; + goto err; + } mddev->bitmap_info.offset = mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; mddev->pers->quiesce(mddev, 1); - rv = bitmap_create(mddev); - if (!rv) + bitmap = bitmap_create(mddev, -1); + if (!IS_ERR(bitmap)) { + mddev->bitmap = bitmap; rv = bitmap_load(mddev); + } else + rv = PTR_ERR(bitmap); if (rv) bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); } else { /* remove the bitmap */ - if (!mddev->bitmap) - return -ENOENT; - if (mddev->bitmap->storage.file) - return -EINVAL; + if (!mddev->bitmap) { + rv = -ENOENT; + goto err; + } + if (mddev->bitmap->storage.file) { + rv = -EINVAL; + goto err; + } mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); @@ -6334,6 +6466,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) } } md_update_sb(mddev, 1); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); + return rv; +err: + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_cancel(mddev); return rv; } @@ -6393,6 +6531,7 @@ static inline bool md_ioctl_valid(unsigned int cmd) case SET_DISK_FAULTY: case STOP_ARRAY: case STOP_ARRAY_RO: + case CLUSTERED_DISK_NACK: return true; default: return false; @@ -6665,6 +6804,13 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto unlock; } + case CLUSTERED_DISK_NACK: + if (mddev_is_clustered(mddev)) + md_cluster_ops->new_disk_ack(mddev, false); + else + err = -EINVAL; + goto unlock; + case HOT_ADD_DISK: err = hot_add_disk(mddev, new_decode_dev(arg)); goto unlock; @@ -7238,6 +7384,55 @@ int unregister_md_personality(struct md_personality *p) } EXPORT_SYMBOL(unregister_md_personality); +int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module) +{ + if (md_cluster_ops != NULL) + return -EALREADY; + spin_lock(&pers_lock); + md_cluster_ops = ops; + md_cluster_mod = module; + spin_unlock(&pers_lock); + return 0; +} +EXPORT_SYMBOL(register_md_cluster_operations); + +int unregister_md_cluster_operations(void) +{ + spin_lock(&pers_lock); + md_cluster_ops = NULL; + spin_unlock(&pers_lock); + return 0; +} +EXPORT_SYMBOL(unregister_md_cluster_operations); + +int md_setup_cluster(struct mddev *mddev, int nodes) +{ + int err; + + err = request_module("md-cluster"); + if (err) { + pr_err("md-cluster module not found.\n"); + return err; + } + + spin_lock(&pers_lock); + if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { + spin_unlock(&pers_lock); + return -ENOENT; + } + spin_unlock(&pers_lock); + + return md_cluster_ops->join(mddev, nodes); +} + +void md_cluster_stop(struct mddev *mddev) +{ + if (!md_cluster_ops) + return; + md_cluster_ops->leave(mddev); + module_put(md_cluster_mod); +} + static int is_mddev_idle(struct mddev *mddev, int init) { struct md_rdev *rdev; @@ -7375,7 +7570,11 @@ int md_allow_write(struct mddev *mddev) mddev->safemode == 0) mddev->safemode = 1; spin_unlock(&mddev->lock); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); md_update_sb(mddev, 0); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); sysfs_notify_dirent_safe(mddev->sysfs_state); } else spin_unlock(&mddev->lock); @@ -7576,6 +7775,9 @@ void md_do_sync(struct md_thread *thread) md_new_event(mddev); update_time = jiffies; + if (mddev_is_clustered(mddev)) + md_cluster_ops->resync_start(mddev, j, max_sectors); + blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; @@ -7618,8 +7820,7 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; - sectors = mddev->pers->sync_request(mddev, j, &skipped, - currspeed < speed_min(mddev)); + sectors = mddev->pers->sync_request(mddev, j, &skipped); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); break; @@ -7636,6 +7837,8 @@ void md_do_sync(struct md_thread *thread) j += sectors; if (j > 2) mddev->curr_resync = j; + if (mddev_is_clustered(mddev)) + md_cluster_ops->resync_info_update(mddev, j, max_sectors); mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be @@ -7677,11 +7880,18 @@ void md_do_sync(struct md_thread *thread) /((jiffies-mddev->resync_mark)/HZ +1) +1; if (currspeed > speed_min(mddev)) { - if ((currspeed > speed_max(mddev)) || - !is_mddev_idle(mddev, 0)) { + if (currspeed > speed_max(mddev)) { msleep(500); goto repeat; } + if (!is_mddev_idle(mddev, 0)) { + /* + * Give other IO more of a chance. + * The faster the devices, the less we wait. + */ + wait_event(mddev->recovery_wait, + !atomic_read(&mddev->recovery_active)); + } } } printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, @@ -7694,7 +7904,10 @@ void md_do_sync(struct md_thread *thread) wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); /* tell personality that we are finished */ - mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); + mddev->pers->sync_request(mddev, max_sectors, &skipped); + + if (mddev_is_clustered(mddev)) + md_cluster_ops->resync_finish(mddev); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > 2) { @@ -7925,8 +8138,13 @@ void md_check_recovery(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); } - if (mddev->flags & MD_UPDATE_SB_FLAGS) + if (mddev->flags & MD_UPDATE_SB_FLAGS) { + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); md_update_sb(mddev, 0); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); + } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { @@ -8024,6 +8242,8 @@ void md_reap_sync_thread(struct mddev *mddev) set_bit(MD_CHANGE_DEVS, &mddev->flags); } } + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && mddev->pers->finish_reshape) mddev->pers->finish_reshape(mddev); @@ -8036,6 +8256,8 @@ void md_reap_sync_thread(struct mddev *mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -8656,6 +8878,28 @@ err_wq: return ret; } +void md_reload_sb(struct mddev *mddev) +{ + struct md_rdev *rdev, *tmp; + + rdev_for_each_safe(rdev, tmp, mddev) { + rdev->sb_loaded = 0; + ClearPageUptodate(rdev->sb_page); + } + mddev->raid_disks = 0; + analyze_sbs(mddev); + rdev_for_each_safe(rdev, tmp, mddev) { + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + /* since we don't write to faulty devices, we figure out if the + * disk is faulty by comparing events + */ + if (mddev->events > sb->events) + set_bit(Faulty, &rdev->flags); + } + +} +EXPORT_SYMBOL(md_reload_sb); + #ifndef MODULE /* |