diff options
author | Shaohua Li <shli@fb.com> | 2015-12-21 10:51:02 +1100 |
---|---|---|
committer | NeilBrown <neilb@suse.com> | 2016-01-06 11:39:57 +1100 |
commit | f6b6ec5cfac306c1eea66f074050864efcb11851 (patch) | |
tree | 68c0b7210bb1ce3f23bf59332c47ef35d09cf8eb /drivers/md | |
parent | 9ebc6ef188a0656f3620835f9be7fe22c1644c1c (diff) | |
download | op-kernel-dev-f6b6ec5cfac306c1eea66f074050864efcb11851.zip op-kernel-dev-f6b6ec5cfac306c1eea66f074050864efcb11851.tar.gz |
raid5-cache: add journal hot add/remove support
Add support for journal disk hot add/remove. Mostly trival checks in md
part. The raid5 part is a little tricky. For hot-remove, we can't wait
pending write as it's called from raid5d. The wait will cause deadlock.
We simplily fail the hot-remove. A hot-remove retry can success
eventually since if journal disk is faulty all pending write will be
failed and finish. For hot-add, since an array supporting journal but
without journal disk will be marked read-only, we are safe to hot add
journal without stopping IO (should be read IO, while journal only
handles write IO).
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/md.c | 42 | ||||
-rw-r--r-- | drivers/md/raid5-cache.c | 16 | ||||
-rw-r--r-- | drivers/md/raid5.c | 34 |
3 files changed, 68 insertions, 24 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index d0f0621..c0c3e6d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2055,8 +2055,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) return -EEXIST; /* make sure rdev->sectors exceeds mddev->dev_sectors */ - if (rdev->sectors && (mddev->dev_sectors == 0 || - rdev->sectors < mddev->dev_sectors)) { + if (!test_bit(Journal, &rdev->flags) && + rdev->sectors && + (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -2087,7 +2088,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) } } rcu_read_unlock(); - if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { + if (!test_bit(Journal, &rdev->flags) && + mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { printk(KERN_WARNING "md: %s: array is limited to %d devices\n", mdname(mddev), mddev->max_disks); return -EBUSY; @@ -6044,8 +6046,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) else clear_bit(WriteMostly, &rdev->flags); - if (info->state & (1<<MD_DISK_JOURNAL)) + if (info->state & (1<<MD_DISK_JOURNAL)) { + struct md_rdev *rdev2; + bool has_journal = false; + + /* make sure no existing journal disk */ + rdev_for_each(rdev2, mddev) { + if (test_bit(Journal, &rdev2->flags)) { + has_journal = true; + break; + } + } + if (has_journal) { + export_rdev(rdev); + return -EBUSY; + } set_bit(Journal, &rdev->flags); + } /* * check whether the device shows up in other nodes */ @@ -8181,19 +8198,20 @@ static int remove_and_add_spares(struct mddev *mddev, continue; if (test_bit(Faulty, &rdev->flags)) continue; - if (test_bit(Journal, &rdev->flags)) - continue; - if (mddev->ro && - ! (rdev->saved_raid_disk >= 0 && - !test_bit(Bitmap_sync, &rdev->flags))) - continue; + if (!test_bit(Journal, &rdev->flags)) { + if (mddev->ro && + ! (rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) + continue; - rdev->recovery_offset = 0; + rdev->recovery_offset = 0; + } if (mddev->pers-> hot_add_disk(mddev, rdev) == 0) { if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; - spares++; + if (!test_bit(Journal, &rdev->flags)) + spares++; md_new_event(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 668e973..c1c4d21 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -799,10 +799,18 @@ void r5l_quiesce(struct r5l_log *log, int state) bool r5l_log_disk_error(struct r5conf *conf) { + struct r5l_log *log; + bool ret; /* don't allow write if journal disk is missing */ - if (!conf->log) - return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); - return test_bit(Faulty, &conf->log->rdev->flags); + rcu_read_lock(); + log = rcu_dereference(conf->log); + + if (!log) + ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); + else + ret = test_bit(Faulty, &log->rdev->flags); + rcu_read_unlock(); + return ret; } struct r5l_recovery_ctx { @@ -1165,7 +1173,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (r5l_load_log(log)) goto error; - conf->log = log; + rcu_assign_pointer(conf->log, log); return 0; error: md_unregister_thread(&log->reclaim_thread); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2236250..a086014 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7139,14 +7139,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct disk_info *p = conf->disks + number; print_raid5_conf(conf); - if (test_bit(Journal, &rdev->flags)) { + if (test_bit(Journal, &rdev->flags) && conf->log) { + struct r5l_log *log; /* - * journal disk is not removable, but we need give a chance to - * update superblock of other disks. Otherwise journal disk - * will be considered as 'fresh' + * we can't wait pending write here, as this is called in + * raid5d, wait will deadlock. */ - set_bit(MD_CHANGE_DEVS, &mddev->flags); - return -EINVAL; + if (atomic_read(&mddev->writes_pending)) + return -EBUSY; + log = conf->log; + conf->log = NULL; + synchronize_rcu(); + r5l_exit_log(log); + return 0; } if (rdev == p->rdev) rdevp = &p->rdev; @@ -7210,8 +7215,21 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) int first = 0; int last = conf->raid_disks - 1; - if (test_bit(Journal, &rdev->flags)) - return -EINVAL; + if (test_bit(Journal, &rdev->flags)) { + char b[BDEVNAME_SIZE]; + if (conf->log) + return -EBUSY; + + rdev->raid_disk = 0; + /* + * The array is in readonly mode if journal is missing, so no + * write requests running. We should be safe + */ + r5l_init_log(conf, rdev); + printk(KERN_INFO"md/raid:%s: using device %s as journal\n", + mdname(mddev), bdevname(rdev->bdev, b)); + return 0; + } if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; |