diff options
Diffstat (limited to 'drivers/md')
75 files changed, 3328 insertions, 2231 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index d5415eed..3e01e6f 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -393,7 +393,7 @@ config DM_MULTIPATH # of SCSI_DH if the latter isn't defined but if # it is, DM_MULTIPATH must depend on it. We get a build # error if SCSI_DH=m and DM_MULTIPATH=y - depends on SCSI_DH || !SCSI_DH + depends on !SCSI_DH || SCSI ---help--- Allow volume managers to support multipath hardware. diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 462f443..f34979c 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -17,7 +17,7 @@ dm-cache-smq-y += dm-cache-policy-smq.o dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o md-mod-y += md.o bitmap.o -raid456-y += raid5.o +raid456-y += raid5.o raid5-cache.o # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 04f7bc2..6b420a5 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -243,19 +243,6 @@ struct keybuf { DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); }; -struct bio_split_pool { - struct bio_set *bio_split; - mempool_t *bio_split_hook; -}; - -struct bio_split_hook { - struct closure cl; - struct bio_split_pool *p; - struct bio *bio; - bio_end_io_t *bi_end_io; - void *bi_private; -}; - struct bcache_device { struct closure cl; @@ -288,8 +275,6 @@ struct bcache_device { int (*cache_miss)(struct btree *, struct search *, struct bio *, unsigned); int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long); - - struct bio_split_pool bio_split_hook; }; struct io { @@ -454,8 +439,6 @@ struct cache { atomic_long_t meta_sectors_written; atomic_long_t btree_sectors_written; atomic_long_t sectors_written; - - struct bio_split_pool bio_split_hook; }; struct gc_stat { @@ -873,7 +856,6 @@ void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); void bch_bbio_free(struct bio *, struct cache_set *); struct bio *bch_bbio_alloc(struct cache_set *); -void bch_generic_make_request(struct bio *, struct bio_split_pool *); void __bch_submit_bbio(struct bio *, struct cache_set *); void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 00cde40..83392f8 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -278,7 +278,7 @@ err: goto out; } -static void btree_node_read_endio(struct bio *bio, int error) +static void btree_node_read_endio(struct bio *bio) { struct closure *cl = bio->bi_private; closure_put(cl); @@ -305,7 +305,7 @@ static void bch_btree_node_read(struct btree *b) bch_submit_bbio(bio, b->c, &b->key, 0); closure_sync(&cl); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (bio->bi_error) set_btree_node_io_error(b); bch_bbio_free(bio, b->c); @@ -371,15 +371,15 @@ static void btree_node_write_done(struct closure *cl) __btree_node_write_done(cl); } -static void btree_node_write_endio(struct bio *bio, int error) +static void btree_node_write_endio(struct bio *bio) { struct closure *cl = bio->bi_private; struct btree *b = container_of(cl, struct btree, io); - if (error) + if (bio->bi_error) set_btree_node_io_error(b); - bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); + bch_bbio_count_io_errors(b->c, bio, bio->bi_error, "writing btree"); closure_put(cl); } diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 79a6d63..782cc2c 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -38,7 +38,7 @@ * they are running owned by the thread that is running them. Otherwise, suppose * you submit some bios and wish to have a function run when they all complete: * - * foo_endio(struct bio *bio, int error) + * foo_endio(struct bio *bio) * { * closure_put(cl); * } diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index bf6a9ca..86a0bb8 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -11,105 +11,6 @@ #include <linux/blkdev.h> -static unsigned bch_bio_max_sectors(struct bio *bio) -{ - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - struct bio_vec bv; - struct bvec_iter iter; - unsigned ret = 0, seg = 0; - - if (bio->bi_rw & REQ_DISCARD) - return min(bio_sectors(bio), q->limits.max_discard_sectors); - - bio_for_each_segment(bv, bio, iter) { - struct bvec_merge_data bvm = { - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_iter.bi_sector, - .bi_size = ret << 9, - .bi_rw = bio->bi_rw, - }; - - if (seg == min_t(unsigned, BIO_MAX_PAGES, - queue_max_segments(q))) - break; - - if (q->merge_bvec_fn && - q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len) - break; - - seg++; - ret += bv.bv_len >> 9; - } - - ret = min(ret, queue_max_sectors(q)); - - WARN_ON(!ret); - ret = max_t(int, ret, bio_iovec(bio).bv_len >> 9); - - return ret; -} - -static void bch_bio_submit_split_done(struct closure *cl) -{ - struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); - - s->bio->bi_end_io = s->bi_end_io; - s->bio->bi_private = s->bi_private; - bio_endio(s->bio, 0); - - closure_debug_destroy(&s->cl); - mempool_free(s, s->p->bio_split_hook); -} - -static void bch_bio_submit_split_endio(struct bio *bio, int error) -{ - struct closure *cl = bio->bi_private; - struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); - - if (error) - clear_bit(BIO_UPTODATE, &s->bio->bi_flags); - - bio_put(bio); - closure_put(cl); -} - -void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) -{ - struct bio_split_hook *s; - struct bio *n; - - if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) - goto submit; - - if (bio_sectors(bio) <= bch_bio_max_sectors(bio)) - goto submit; - - s = mempool_alloc(p->bio_split_hook, GFP_NOIO); - closure_init(&s->cl, NULL); - - s->bio = bio; - s->p = p; - s->bi_end_io = bio->bi_end_io; - s->bi_private = bio->bi_private; - bio_get(bio); - - do { - n = bio_next_split(bio, bch_bio_max_sectors(bio), - GFP_NOIO, s->p->bio_split); - - n->bi_end_io = bch_bio_submit_split_endio; - n->bi_private = &s->cl; - - closure_get(&s->cl); - generic_make_request(n); - } while (n != bio); - - continue_at(&s->cl, bch_bio_submit_split_done, NULL); - return; -submit: - generic_make_request(bio); -} - /* Bios with headers */ void bch_bbio_free(struct bio *bio, struct cache_set *c) @@ -139,7 +40,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; b->submit_time_us = local_clock_us(); - closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); + closure_bio_submit(bio, bio->bi_private); } void bch_submit_bbio(struct bio *bio, struct cache_set *c, diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 418607a..29eba72 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -24,7 +24,7 @@ * bit. */ -static void journal_read_endio(struct bio *bio, int error) +static void journal_read_endio(struct bio *bio) { struct closure *cl = bio->bi_private; closure_put(cl); @@ -61,7 +61,7 @@ reread: left = ca->sb.bucket_size - offset; bio->bi_private = &cl; bch_bio_map(bio, data); - closure_bio_submit(bio, &cl, ca); + closure_bio_submit(bio, &cl); closure_sync(&cl); /* This function could be simpler now since we no longer write @@ -401,7 +401,7 @@ retry: #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) -static void journal_discard_endio(struct bio *bio, int error) +static void journal_discard_endio(struct bio *bio) { struct journal_device *ja = container_of(bio, struct journal_device, discard_bio); @@ -547,11 +547,11 @@ void bch_journal_next(struct journal *j) pr_debug("journal_pin full (%zu)", fifo_used(&j->pin)); } -static void journal_write_endio(struct bio *bio, int error) +static void journal_write_endio(struct bio *bio) { struct journal_write *w = bio->bi_private; - cache_set_err_on(error, w->c, "journal io error"); + cache_set_err_on(bio->bi_error, w->c, "journal io error"); closure_put(&w->c->journal.io); } @@ -648,7 +648,7 @@ static void journal_write_unlocked(struct closure *cl) spin_unlock(&c->journal.lock); while ((bio = bio_list_pop(&list))) - closure_bio_submit(bio, cl, c->cache[0]); + closure_bio_submit(bio, cl); continue_at(cl, journal_write_done, NULL); } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index cd74903..b929fc9 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -60,20 +60,20 @@ static void write_moving_finish(struct closure *cl) closure_return_with_destructor(cl, moving_io_destructor); } -static void read_moving_endio(struct bio *bio, int error) +static void read_moving_endio(struct bio *bio) { struct bbio *b = container_of(bio, struct bbio, bio); struct moving_io *io = container_of(bio->bi_private, struct moving_io, cl); - if (error) - io->op.error = error; + if (bio->bi_error) + io->op.error = bio->bi_error; else if (!KEY_DIRTY(&b->key) && ptr_stale(io->op.c, &b->key, 0)) { io->op.error = -EINTR; } - bch_bbio_endio(io->op.c, bio, error, "reading data to move"); + bch_bbio_endio(io->op.c, bio, bio->bi_error, "reading data to move"); } static void moving_init(struct moving_io *io) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index f292790..8e9877b 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -173,22 +173,22 @@ static void bch_data_insert_error(struct closure *cl) bch_data_insert_keys(cl); } -static void bch_data_insert_endio(struct bio *bio, int error) +static void bch_data_insert_endio(struct bio *bio) { struct closure *cl = bio->bi_private; struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); - if (error) { + if (bio->bi_error) { /* TODO: We could try to recover from this. */ if (op->writeback) - op->error = error; + op->error = bio->bi_error; else if (!op->replace) set_closure_fn(cl, bch_data_insert_error, op->wq); else set_closure_fn(cl, NULL, NULL); } - bch_bbio_endio(op->c, bio, error, "writing data to cache"); + bch_bbio_endio(op->c, bio, bio->bi_error, "writing data to cache"); } static void bch_data_insert_start(struct closure *cl) @@ -477,7 +477,7 @@ struct search { struct data_insert_op iop; }; -static void bch_cache_read_endio(struct bio *bio, int error) +static void bch_cache_read_endio(struct bio *bio) { struct bbio *b = container_of(bio, struct bbio, bio); struct closure *cl = bio->bi_private; @@ -490,15 +490,15 @@ static void bch_cache_read_endio(struct bio *bio, int error) * from the backing device. */ - if (error) - s->iop.error = error; + if (bio->bi_error) + s->iop.error = bio->bi_error; else if (!KEY_DIRTY(&b->key) && ptr_stale(s->iop.c, &b->key, 0)) { atomic_long_inc(&s->iop.c->cache_read_races); s->iop.error = -EINTR; } - bch_bbio_endio(s->iop.c, bio, error, "reading from cache"); + bch_bbio_endio(s->iop.c, bio, bio->bi_error, "reading from cache"); } /* @@ -591,13 +591,13 @@ static void cache_lookup(struct closure *cl) /* Common code for the make_request functions */ -static void request_endio(struct bio *bio, int error) +static void request_endio(struct bio *bio) { struct closure *cl = bio->bi_private; - if (error) { + if (bio->bi_error) { struct search *s = container_of(cl, struct search, cl); - s->iop.error = error; + s->iop.error = bio->bi_error; /* Only cache read errors are recoverable */ s->recoverable = false; } @@ -613,7 +613,8 @@ static void bio_complete(struct search *s) &s->d->disk->part0, s->start_time); trace_bcache_request_end(s->d, s->orig_bio); - bio_endio(s->orig_bio, s->iop.error); + s->orig_bio->bi_error = s->iop.error; + bio_endio(s->orig_bio); s->orig_bio = NULL; } } @@ -718,7 +719,7 @@ static void cached_dev_read_error(struct closure *cl) /* XXX: invalidate cache */ - closure_bio_submit(bio, cl, s->d); + closure_bio_submit(bio, cl); } continue_at(cl, cached_dev_cache_miss_done, NULL); @@ -841,7 +842,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->cache_miss = miss; s->iop.bio = cache_bio; bio_get(cache_bio); - closure_bio_submit(cache_bio, &s->cl, s->d); + closure_bio_submit(cache_bio, &s->cl); return ret; out_put: @@ -849,7 +850,7 @@ out_put: out_submit: miss->bi_end_io = request_endio; miss->bi_private = &s->cl; - closure_bio_submit(miss, &s->cl, s->d); + closure_bio_submit(miss, &s->cl); return ret; } @@ -914,7 +915,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) if (!(bio->bi_rw & REQ_DISCARD) || blk_queue_discard(bdev_get_queue(dc->bdev))) - closure_bio_submit(bio, cl, s->d); + closure_bio_submit(bio, cl); } else if (s->iop.writeback) { bch_writeback_add(dc); s->iop.bio = bio; @@ -929,12 +930,12 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) flush->bi_end_io = request_endio; flush->bi_private = cl; - closure_bio_submit(flush, cl, s->d); + closure_bio_submit(flush, cl); } } else { s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); - closure_bio_submit(bio, cl, s->d); + closure_bio_submit(bio, cl); } closure_call(&s->iop.cl, bch_data_insert, NULL, cl); @@ -950,7 +951,7 @@ static void cached_dev_nodata(struct closure *cl) bch_journal_meta(s->iop.c, cl); /* If it's a flush, we send the flush to the backing device too */ - closure_bio_submit(bio, cl, s->d); + closure_bio_submit(bio, cl); continue_at(cl, cached_dev_bio_complete, NULL); } @@ -992,9 +993,9 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) } else { if ((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(dc->bdev))) - bio_endio(bio, 0); + bio_endio(bio); else - bch_generic_make_request(bio, &d->bio_split_hook); + generic_make_request(bio); } } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 94980bf..679a093 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -59,29 +59,6 @@ struct workqueue_struct *bcache_wq; #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) -static void bio_split_pool_free(struct bio_split_pool *p) -{ - if (p->bio_split_hook) - mempool_destroy(p->bio_split_hook); - - if (p->bio_split) - bioset_free(p->bio_split); -} - -static int bio_split_pool_init(struct bio_split_pool *p) -{ - p->bio_split = bioset_create(4, 0); - if (!p->bio_split) - return -ENOMEM; - - p->bio_split_hook = mempool_create_kmalloc_pool(4, - sizeof(struct bio_split_hook)); - if (!p->bio_split_hook) - return -ENOMEM; - - return 0; -} - /* Superblock */ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, @@ -221,7 +198,7 @@ err: return err; } -static void write_bdev_super_endio(struct bio *bio, int error) +static void write_bdev_super_endio(struct bio *bio) { struct cached_dev *dc = bio->bi_private; /* XXX: error checking */ @@ -290,11 +267,11 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) closure_return_with_destructor(cl, bch_write_bdev_super_unlock); } -static void write_super_endio(struct bio *bio, int error) +static void write_super_endio(struct bio *bio) { struct cache *ca = bio->bi_private; - bch_count_io_errors(ca, error, "writing superblock"); + bch_count_io_errors(ca, bio->bi_error, "writing superblock"); closure_put(&ca->set->sb_write); } @@ -339,12 +316,12 @@ void bcache_write_super(struct cache_set *c) /* UUID io */ -static void uuid_endio(struct bio *bio, int error) +static void uuid_endio(struct bio *bio) { struct closure *cl = bio->bi_private; struct cache_set *c = container_of(cl, struct cache_set, uuid_write); - cache_set_err_on(error, c, "accessing uuids"); + cache_set_err_on(bio->bi_error, c, "accessing uuids"); bch_bbio_free(bio, c); closure_put(cl); } @@ -512,11 +489,11 @@ static struct uuid_entry *uuid_find_empty(struct cache_set *c) * disk. */ -static void prio_endio(struct bio *bio, int error) +static void prio_endio(struct bio *bio) { struct cache *ca = bio->bi_private; - cache_set_err_on(error, ca->set, "accessing priorities"); + cache_set_err_on(bio->bi_error, ca->set, "accessing priorities"); bch_bbio_free(bio, ca->set); closure_put(&ca->prio); } @@ -537,7 +514,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw) bio->bi_private = ca; bch_bio_map(bio, ca->disk_buckets); - closure_bio_submit(bio, &ca->prio, ca); + closure_bio_submit(bio, &ca->prio); closure_sync(cl); } @@ -757,7 +734,6 @@ static void bcache_device_free(struct bcache_device *d) put_disk(d->disk); } - bio_split_pool_free(&d->bio_split_hook); if (d->bio_split) bioset_free(d->bio_split); kvfree(d->full_dirty_stripes); @@ -804,7 +780,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, return minor; if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || - bio_split_pool_init(&d->bio_split_hook) || !(d->disk = alloc_disk(1))) { ida_simple_remove(&bcache_minor, minor); return -ENOMEM; @@ -830,7 +805,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, q->limits.max_sectors = UINT_MAX; q->limits.max_segment_size = UINT_MAX; q->limits.max_segments = BIO_MAX_PAGES; - q->limits.max_discard_sectors = UINT_MAX; + blk_queue_max_discard_sectors(q, UINT_MAX); q->limits.discard_granularity = 512; q->limits.io_min = block_size; q->limits.logical_block_size = block_size; @@ -1793,8 +1768,6 @@ void bch_cache_release(struct kobject *kobj) ca->set->cache[ca->sb.nr_this_dev] = NULL; } - bio_split_pool_free(&ca->bio_split_hook); - free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); kfree(ca->prio_buckets); vfree(ca->buckets); @@ -1839,8 +1812,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) ca->sb.nbuckets)) || !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * 2, GFP_KERNEL)) || - !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || - bio_split_pool_init(&ca->bio_split_hook)) + !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca))) return -ENOMEM; ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 1d04c48..cf2cbc2 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -4,6 +4,7 @@ #include <linux/blkdev.h> #include <linux/errno.h> +#include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/llist.h> #include <linux/ratelimit.h> @@ -570,10 +571,10 @@ static inline sector_t bdev_sectors(struct block_device *bdev) return bdev->bd_inode->i_size >> 9; } -#define closure_bio_submit(bio, cl, dev) \ +#define closure_bio_submit(bio, cl) \ do { \ closure_get(cl); \ - bch_generic_make_request(bio, &(dev)->bio_split_hook); \ + generic_make_request(bio); \ } while (0) uint64_t bch_crc64_update(uint64_t, const void *, size_t); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index f1986bc..b23f88d 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -166,12 +166,12 @@ static void write_dirty_finish(struct closure *cl) closure_return_with_destructor(cl, dirty_io_destructor); } -static void dirty_endio(struct bio *bio, int error) +static void dirty_endio(struct bio *bio) { struct keybuf_key *w = bio->bi_private; struct dirty_io *io = w->private; - if (error) + if (bio->bi_error) SET_KEY_DIRTY(&w->key, false); closure_put(&io->cl); @@ -188,27 +188,27 @@ static void write_dirty(struct closure *cl) io->bio.bi_bdev = io->dc->bdev; io->bio.bi_end_io = dirty_endio; - closure_bio_submit(&io->bio, cl, &io->dc->disk); + closure_bio_submit(&io->bio, cl); continue_at(cl, write_dirty_finish, system_wq); } -static void read_dirty_endio(struct bio *bio, int error) +static void read_dirty_endio(struct bio *bio) { struct keybuf_key *w = bio->bi_private; struct dirty_io *io = w->private; bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), - error, "reading dirty data from cache"); + bio->bi_error, "reading dirty data from cache"); - dirty_endio(bio, error); + dirty_endio(bio); } static void read_dirty_submit(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); - closure_bio_submit(&io->bio, cl, &io->dc->disk); + closure_bio_submit(&io->bio, cl); continue_at(cl, write_dirty, system_wq); } diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index e51de52..4f22e91 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -613,12 +613,10 @@ re_read: daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); sectors_reserved = le32_to_cpu(sb->sectors_reserved); - /* XXX: This is a hack to ensure that we don't use clustering - * in case: - * - dm-raid is in use and - * - the nodes written in bitmap_sb is erroneous. + /* Setup nodes/clustername only if bitmap version is + * cluster-compatible */ - if (!bitmap->mddev->sync_super) { + if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { nodes = le32_to_cpu(sb->nodes); strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); @@ -628,7 +626,7 @@ re_read: if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) reason = "bad magic"; else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || - le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) + le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED) reason = "unrecognized superblock version"; else if (chunksize < 512) reason = "bitmap chunksize too small"; @@ -1572,7 +1570,7 @@ void bitmap_close_sync(struct bitmap *bitmap) } EXPORT_SYMBOL(bitmap_close_sync); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) { sector_t s = 0; sector_t blocks; @@ -1583,7 +1581,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) bitmap->last_end_sync = jiffies; return; } - if (time_before(jiffies, (bitmap->last_end_sync + if (!force && time_before(jiffies, (bitmap->last_end_sync + bitmap->mddev->bitmap_info.daemon_sleep))) return; wait_event(bitmap->mddev->recovery_wait, @@ -1997,7 +1995,8 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) ret = bitmap_storage_alloc(&store, chunks, !bitmap->mddev->bitmap_info.external, - bitmap->cluster_slot); + mddev_is_clustered(bitmap->mddev) + ? bitmap->cluster_slot : 0); if (ret) goto err; diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index f1f4dd0..7d5c3a6 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -9,8 +9,10 @@ #define BITMAP_MAJOR_LO 3 /* version 4 insists the bitmap is in little-endian order * with version 3, it is host-endian which is non-portable + * Version 5 is currently set only for clustered devices */ #define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_CLUSTERED 5 #define BITMAP_MAJOR_HOSTENDIAN 3 /* @@ -255,7 +257,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); void bitmap_close_sync(struct bitmap *bitmap); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); void bitmap_unplug(struct bitmap *bitmap); void bitmap_daemon_work(struct mddev *mddev); diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c index cd6d1d2..03af174 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison.c @@ -236,8 +236,10 @@ void dm_cell_error(struct dm_bio_prison *prison, bio_list_init(&bios); dm_cell_release(prison, cell, &bios); - while ((bio = bio_list_pop(&bios))) - bio_endio(bio, error); + while ((bio = bio_list_pop(&bios))) { + bio->bi_error = error; + bio_endio(bio); + } } EXPORT_SYMBOL_GPL(dm_cell_error); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 86dbbc7..2dd3308 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -545,7 +545,8 @@ static void dmio_complete(unsigned long error, void *context) { struct dm_buffer *b = context; - b->bio.bi_end_io(&b->bio, error ? -EIO : 0); + b->bio.bi_error = error ? -EIO : 0; + b->bio.bi_end_io(&b->bio); } static void use_dmio(struct dm_buffer *b, int rw, sector_t block, @@ -575,13 +576,16 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block, b->bio.bi_end_io = end_io; r = dm_io(&io_req, 1, ®ion, NULL); - if (r) - end_io(&b->bio, r); + if (r) { + b->bio.bi_error = r; + end_io(&b->bio); + } } -static void inline_endio(struct bio *bio, int error) +static void inline_endio(struct bio *bio) { bio_end_io_t *end_fn = bio->bi_private; + int error = bio->bi_error; /* * Reset the bio to free any attached resources @@ -589,7 +593,8 @@ static void inline_endio(struct bio *bio, int error) */ bio_reset(bio); - end_fn(bio, error); + bio->bi_error = error; + end_fn(bio); } static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, @@ -661,13 +666,14 @@ static void submit_io(struct dm_buffer *b, int rw, sector_t block, * Set the error, clear B_WRITING bit and wake anyone who was waiting on * it. */ -static void write_endio(struct bio *bio, int error) +static void write_endio(struct bio *bio) { struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); - b->write_error = error; - if (unlikely(error)) { + b->write_error = bio->bi_error; + if (unlikely(bio->bi_error)) { struct dm_bufio_client *c = b->c; + int error = bio->bi_error; (void)cmpxchg(&c->async_write_error, 0, error); } @@ -1026,11 +1032,11 @@ found_buffer: * The endio routine for reading: set the error, clear the bit and wake up * anyone waiting on the buffer. */ -static void read_endio(struct bio *bio, int error) +static void read_endio(struct bio *bio) { struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); - b->read_error = error; + b->read_error = bio->bi_error; BUG_ON(!test_bit(B_READING, &b->state)); @@ -1592,11 +1598,11 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign c->bdev = bdev; c->block_size = block_size; - c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT; - c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ? - ffs(block_size) - 1 - PAGE_SHIFT : 0; - c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ? - PAGE_SHIFT - (ffs(block_size) - 1) : 0); + c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT; + c->pages_per_block_bits = (__ffs(block_size) >= PAGE_SHIFT) ? + __ffs(block_size) - PAGE_SHIFT : 0; + c->blocks_per_page_bits = (__ffs(block_size) < PAGE_SHIFT ? + PAGE_SHIFT - __ffs(block_size) : 0); c->aux_size = aux_size; c->alloc_callback = alloc_callback; @@ -1855,12 +1861,8 @@ static void __exit dm_bufio_exit(void) cancel_delayed_work_sync(&dm_bufio_work); destroy_workqueue(dm_bufio_wq); - for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) { - struct kmem_cache *kc = dm_bufio_caches[i]; - - if (kc) - kmem_cache_destroy(kc); - } + for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) + kmem_cache_destroy(dm_bufio_caches[i]); for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++) kfree(dm_bufio_cache_names[i]); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 20cc36b..f6543f3 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -260,7 +260,9 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result) } } - return dm_bm_unlock(b); + dm_bm_unlock(b); + + return 0; } static void __setup_mapping_info(struct dm_cache_metadata *cmd) @@ -465,7 +467,9 @@ static int __open_metadata(struct dm_cache_metadata *cmd) dm_disk_bitset_init(cmd->tm, &cmd->discard_info); sb_flags = le32_to_cpu(disk_super->flags); cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags); - return dm_bm_unlock(sblock); + dm_bm_unlock(sblock); + + return 0; bad: dm_bm_unlock(sblock); @@ -634,10 +638,10 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, disk_super = dm_block_data(sblock); + disk_super->flags = cpu_to_le32(cmd->flags); if (mutator) update_flags(disk_super, mutator); - disk_super->flags = cpu_to_le32(cmd->flags); disk_super->mapping_root = cpu_to_le64(cmd->root); disk_super->hint_root = cpu_to_le64(cmd->hint_root); disk_super->discard_root = cpu_to_le64(cmd->discard_root); diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c index 240c9f0..14aaaf0 100644 --- a/drivers/md/dm-cache-policy-cleaner.c +++ b/drivers/md/dm-cache-policy-cleaner.c @@ -83,7 +83,7 @@ static struct list_head *list_pop(struct list_head *q) static int alloc_hash(struct hash *hash, unsigned elts) { hash->nr_buckets = next_power(elts >> 4, 16); - hash->hash_bits = ffs(hash->nr_buckets) - 1; + hash->hash_bits = __ffs(hash->nr_buckets); hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets); return hash->table ? 0 : -ENOMEM; @@ -436,7 +436,7 @@ static struct dm_cache_policy *wb_create(dm_cblock_t cache_size, static struct dm_cache_policy_type wb_policy_type = { .name = "cleaner", .version = {1, 0, 0}, - .hint_size = 0, + .hint_size = 4, .owner = THIS_MODULE, .create = wb_create }; diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 3281437..ddb2698 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -1410,7 +1410,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); - mq->hash_bits = ffs(mq->nr_buckets) - 1; + mq->hash_bits = __ffs(mq->nr_buckets); mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets); if (!mq->table) goto bad_alloc_table; @@ -1471,5 +1471,3 @@ module_exit(mq_exit); MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("mq cache policy"); - -MODULE_ALIAS("dm-cache-default"); diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 48a4a82..28d4586 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -566,7 +566,7 @@ static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_ent ht->es = es; nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u)); - ht->hash_bits = ffs(nr_buckets) - 1; + ht->hash_bits = __ffs(nr_buckets); ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets); if (!ht->buckets) @@ -772,7 +772,7 @@ struct smq_policy { struct dm_cache_policy policy; /* protects everything */ - struct mutex lock; + spinlock_t lock; dm_cblock_t cache_size; sector_t cache_block_size; @@ -807,13 +807,7 @@ struct smq_policy { /* * Keeps track of time, incremented by the core. We use this to * avoid attributing multiple hits within the same tick. - * - * Access to tick_protected should be done with the spin lock held. - * It's copied to tick at the start of the map function (within the - * mutex). */ - spinlock_t tick_lock; - unsigned tick_protected; unsigned tick; /* @@ -1296,46 +1290,20 @@ static void smq_destroy(struct dm_cache_policy *p) kfree(mq); } -static void copy_tick(struct smq_policy *mq) -{ - unsigned long flags, tick; - - spin_lock_irqsave(&mq->tick_lock, flags); - tick = mq->tick_protected; - if (tick != mq->tick) { - update_sentinels(mq); - end_hotspot_period(mq); - end_cache_period(mq); - mq->tick = tick; - } - spin_unlock_irqrestore(&mq->tick_lock, flags); -} - -static bool maybe_lock(struct smq_policy *mq, bool can_block) -{ - if (can_block) { - mutex_lock(&mq->lock); - return true; - } else - return mutex_trylock(&mq->lock); -} - static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock, bool can_block, bool can_migrate, bool fast_promote, struct bio *bio, struct policy_locker *locker, struct policy_result *result) { int r; + unsigned long flags; struct smq_policy *mq = to_smq_policy(p); result->op = POLICY_MISS; - if (!maybe_lock(mq, can_block)) - return -EWOULDBLOCK; - - copy_tick(mq); + spin_lock_irqsave(&mq->lock, flags); r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result); - mutex_unlock(&mq->lock); + spin_unlock_irqrestore(&mq->lock, flags); return r; } @@ -1343,20 +1311,18 @@ static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock, static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) { int r; + unsigned long flags; struct smq_policy *mq = to_smq_policy(p); struct entry *e; - if (!mutex_trylock(&mq->lock)) - return -EWOULDBLOCK; - + spin_lock_irqsave(&mq->lock, flags); e = h_lookup(&mq->table, oblock); if (e) { *cblock = infer_cblock(mq, e); r = 0; } else r = -ENOENT; - - mutex_unlock(&mq->lock); + spin_unlock_irqrestore(&mq->lock, flags); return r; } @@ -1375,20 +1341,22 @@ static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, boo static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) { + unsigned long flags; struct smq_policy *mq = to_smq_policy(p); - mutex_lock(&mq->lock); + spin_lock_irqsave(&mq->lock, flags); __smq_set_clear_dirty(mq, oblock, true); - mutex_unlock(&mq->lock); + spin_unlock_irqrestore(&mq->lock, flags); } static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) { struct smq_policy *mq = to_smq_policy(p); + unsigned long flags; - mutex_lock(&mq->lock); + spin_lock_irqsave(&mq->lock, flags); __smq_set_clear_dirty(mq, oblock, false); - mutex_unlock(&mq->lock); + spin_unlock_irqrestore(&mq->lock, flags); } static int smq_load_mapping(struct dm_cache_policy *p, @@ -1433,14 +1401,14 @@ static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, struct smq_policy *mq = to_smq_policy(p); int r = 0; - mutex_lock(&mq->lock); - + /* + * We don't need to lock here since this method is only called once + * the IO has stopped. + */ r = smq_save_hints(mq, &mq->clean, fn, context); if (!r) r = smq_save_hints(mq, &mq->dirty, fn, context); - mutex_unlock(&mq->lock); - return r; } @@ -1458,10 +1426,11 @@ static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock) static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) { struct smq_policy *mq = to_smq_policy(p); + unsigned long flags; - mutex_lock(&mq->lock); + spin_lock_irqsave(&mq->lock, flags); __remove_mapping(mq, oblock); - mutex_unlock(&mq->lock); + spin_unlock_irqrestore(&mq->lock, flags); } static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock) @@ -1480,11 +1449,12 @@ static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock) static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) { int r; + unsigned long flags; struct smq_policy *mq = to_smq_policy(p); - mutex_lock(&mq->lock); + spin_lock_irqsave(&mq->lock, flags); r = __remove_cblock(mq, cblock); - mutex_unlock(&mq->lock); + spin_unlock_irqrestore(&mq->lock, flags); return r; } @@ -1537,11 +1507,12 @@ static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock, bool critical_only) { int r; + unsigned long flags; struct smq_policy *mq = to_smq_policy(p); - mutex_lock(&mq->lock); + spin_lock_irqsave(&mq->lock, flags); r = __smq_writeback_work(mq, oblock, cblock, critical_only); - mutex_unlock(&mq->lock); + spin_unlock_irqrestore(&mq->lock, flags); return r; } @@ -1562,21 +1533,23 @@ static void __force_mapping(struct smq_policy *mq, static void smq_force_mapping(struct dm_cache_policy *p, dm_oblock_t current_oblock, dm_oblock_t new_oblock) { + unsigned long flags; struct smq_policy *mq = to_smq_policy(p); - mutex_lock(&mq->lock); + spin_lock_irqsave(&mq->lock, flags); __force_mapping(mq, current_oblock, new_oblock); - mutex_unlock(&mq->lock); + spin_unlock_irqrestore(&mq->lock, flags); } static dm_cblock_t smq_residency(struct dm_cache_policy *p) { dm_cblock_t r; + unsigned long flags; struct smq_policy *mq = to_smq_policy(p); - mutex_lock(&mq->lock); + spin_lock_irqsave(&mq->lock, flags); r = to_cblock(mq->cache_alloc.nr_allocated); - mutex_unlock(&mq->lock); + spin_unlock_irqrestore(&mq->lock, flags); return r; } @@ -1586,15 +1559,12 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block) struct smq_policy *mq = to_smq_policy(p); unsigned long flags; - spin_lock_irqsave(&mq->tick_lock, flags); - mq->tick_protected++; - spin_unlock_irqrestore(&mq->tick_lock, flags); - - if (can_block) { - mutex_lock(&mq->lock); - copy_tick(mq); - mutex_unlock(&mq->lock); - } + spin_lock_irqsave(&mq->lock, flags); + mq->tick++; + update_sentinels(mq); + end_hotspot_period(mq); + end_cache_period(mq); + spin_unlock_irqrestore(&mq->lock, flags); } /* Init the policy plugin interface function pointers. */ @@ -1694,10 +1664,8 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, } else mq->cache_hit_bits = NULL; - mq->tick_protected = 0; mq->tick = 0; - mutex_init(&mq->lock); - spin_lock_init(&mq->tick_lock); + spin_lock_init(&mq->lock); q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS); mq->hotspot.nr_top_levels = 8; @@ -1789,3 +1757,5 @@ module_exit(smq_exit); MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("smq cache policy"); + +MODULE_ALIAS("dm-cache-default"); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 1fe93cf..2fd4c82 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -424,7 +424,6 @@ static void free_migration(struct dm_cache_migration *mg) wake_up(&cache->migration_wait); mempool_free(mg, cache->migration_pool); - wake_worker(cache); } static int prealloc_data_structs(struct cache *cache, struct prealloc *p) @@ -919,14 +918,14 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio) wake_worker(cache); } -static void writethrough_endio(struct bio *bio, int err) +static void writethrough_endio(struct bio *bio) { struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); dm_unhook_bio(&pb->hook_info, bio); - if (err) { - bio_endio(bio, err); + if (bio->bi_error) { + bio_endio(bio); return; } @@ -1064,14 +1063,6 @@ static void dec_io_migrations(struct cache *cache) atomic_dec(&cache->nr_io_migrations); } -static void __cell_release(struct cache *cache, struct dm_bio_prison_cell *cell, - bool holder, struct bio_list *bios) -{ - (holder ? dm_cell_release : dm_cell_release_no_holder) - (cache->prison, cell, bios); - free_prison_cell(cache, cell); -} - static bool discard_or_flush(struct bio *bio) { return bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD); @@ -1079,14 +1070,13 @@ static bool discard_or_flush(struct bio *bio) static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) { - if (discard_or_flush(cell->holder)) + if (discard_or_flush(cell->holder)) { /* - * We have to handle these bios - * individually. + * We have to handle these bios individually. */ - __cell_release(cache, cell, true, &cache->deferred_bios); - - else + dm_cell_release(cache->prison, cell, &cache->deferred_bios); + free_prison_cell(cache, cell); + } else list_add_tail(&cell->user_list, &cache->deferred_cells); } @@ -1113,7 +1103,7 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, boo static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) { dm_cell_error(cache->prison, cell, err); - dm_bio_prison_free_cell(cache->prison, cell); + free_prison_cell(cache, cell); } static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) @@ -1123,8 +1113,11 @@ static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) static void free_io_migration(struct dm_cache_migration *mg) { - dec_io_migrations(mg->cache); + struct cache *cache = mg->cache; + + dec_io_migrations(cache); free_migration(mg); + wake_worker(cache); } static void migration_failure(struct dm_cache_migration *mg) @@ -1231,7 +1224,7 @@ static void migration_success_post_commit(struct dm_cache_migration *mg) * The block was promoted via an overwrite, so it's dirty. */ set_dirty(cache, mg->new_oblock, mg->cblock); - bio_endio(mg->new_ocell->holder, 0); + bio_endio(mg->new_ocell->holder); cell_defer(cache, mg->new_ocell, false); } free_io_migration(mg); @@ -1284,7 +1277,7 @@ static void issue_copy(struct dm_cache_migration *mg) } } -static void overwrite_endio(struct bio *bio, int err) +static void overwrite_endio(struct bio *bio) { struct dm_cache_migration *mg = bio->bi_private; struct cache *cache = mg->cache; @@ -1294,7 +1287,7 @@ static void overwrite_endio(struct bio *bio, int err) dm_unhook_bio(&pb->hook_info, bio); - if (err) + if (bio->bi_error) mg->err = true; mg->requeue_holder = false; @@ -1351,16 +1344,18 @@ static void issue_discard(struct dm_cache_migration *mg) { dm_dblock_t b, e; struct bio *bio = mg->new_ocell->holder; + struct cache *cache = mg->cache; - calc_discard_block_range(mg->cache, bio, &b, &e); + calc_discard_block_range(cache, bio, &b, &e); while (b != e) { - set_discard(mg->cache, b); + set_discard(cache, b); b = to_dblock(from_dblock(b) + 1); } - bio_endio(bio, 0); - cell_defer(mg->cache, mg->new_ocell, false); + bio_endio(bio); + cell_defer(cache, mg->new_ocell, false); free_migration(mg); + wake_worker(cache); } static void issue_copy_or_discard(struct dm_cache_migration *mg) @@ -1631,7 +1626,7 @@ static void process_discard_bio(struct cache *cache, struct prealloc *structs, calc_discard_block_range(cache, bio, &b, &e); if (b == e) { - bio_endio(bio, 0); + bio_endio(bio); return; } @@ -1729,6 +1724,8 @@ static void remap_cell_to_origin_clear_discard(struct cache *cache, remap_to_origin(cache, bio); issue(cache, bio); } + + free_prison_cell(cache, cell); } static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, @@ -1763,6 +1760,8 @@ static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_ remap_to_cache(cache, bio, cblock); issue(cache, bio); } + + free_prison_cell(cache, cell); } /*----------------------------------------------------------------*/ @@ -2217,8 +2216,10 @@ static void requeue_deferred_bios(struct cache *cache) bio_list_merge(&bios, &cache->deferred_bios); bio_list_init(&cache->deferred_bios); - while ((bio = bio_list_pop(&bios))) - bio_endio(bio, DM_ENDIO_REQUEUE); + while ((bio = bio_list_pop(&bios))) { + bio->bi_error = DM_ENDIO_REQUEUE; + bio_endio(bio); + } } static int more_work(struct cache *cache) @@ -2308,8 +2309,7 @@ static void destroy(struct cache *cache) { unsigned i; - if (cache->migration_pool) - mempool_destroy(cache->migration_pool); + mempool_destroy(cache->migration_pool); if (cache->all_io_ds) dm_deferred_set_destroy(cache->all_io_ds); @@ -3123,7 +3123,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio) * This is a duplicate writethrough io that is no * longer needed because the block has been demoted. */ - bio_endio(bio, 0); + bio_endio(bio); // FIXME: remap everything as a miss cell_defer(cache, cell, false); r = DM_MAPIO_SUBMITTED; @@ -3778,26 +3778,6 @@ static int cache_iterate_devices(struct dm_target *ti, return r; } -/* - * We assume I/O is going to the origin (which is the volume - * more likely to have restrictions e.g. by being striped). - * (Looking up the exact location of the data would be expensive - * and could always be out of date by the time the bio is submitted.) - */ -static int cache_bvec_merge(struct dm_target *ti, - struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct cache *cache = ti->private; - struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = cache->origin_dev->bdev; - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - static void set_discard_limits(struct cache *cache, struct queue_limits *limits) { /* @@ -3841,7 +3821,6 @@ static struct target_type cache_target = { .status = cache_status, .message = cache_message, .iterate_devices = cache_iterate_devices, - .merge = cache_bvec_merge, .io_hints = cache_io_hints, }; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 0f48fed..3729b39 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -968,7 +968,8 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone); /* * Generate a new unfragmented bio with the given size - * This should never violate the device limitations + * This should never violate the device limitations (but only because + * max_segment_size is being constrained to PAGE_SIZE). * * This function may be called concurrently. If we allocate from the mempool * concurrently, there is a possibility of deadlock. For example, if we have @@ -1076,7 +1077,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io) if (io->ctx.req) crypt_free_req(cc, io->ctx.req, base_bio); - bio_endio(base_bio, error); + base_bio->bi_error = error; + bio_endio(base_bio); } /* @@ -1096,14 +1098,12 @@ static void crypt_dec_pending(struct dm_crypt_io *io) * The work is done per CPU global for all dm-crypt instances. * They should not depend on each other and do not block. */ -static void crypt_endio(struct bio *clone, int error) +static void crypt_endio(struct bio *clone) { struct dm_crypt_io *io = clone->bi_private; struct crypt_config *cc = io->cc; unsigned rw = bio_data_dir(clone); - - if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) - error = -EIO; + int error; /* * free the processed pages @@ -1111,6 +1111,7 @@ static void crypt_endio(struct bio *clone, int error) if (rw == WRITE) crypt_free_buffer_pages(cc, clone); + error = clone->bi_error; bio_put(clone); if (rw == READ && !error) { @@ -1543,10 +1544,8 @@ static void crypt_dtr(struct dm_target *ti) if (cc->bs) bioset_free(cc->bs); - if (cc->page_pool) - mempool_destroy(cc->page_pool); - if (cc->req_pool) - mempool_destroy(cc->req_pool); + mempool_destroy(cc->page_pool); + mempool_destroy(cc->req_pool); if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); @@ -1811,11 +1810,13 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->iv_offset = tmpll; - if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev)) { + ret = dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev); + if (ret) { ti->error = "Device lookup failed"; goto bad; } + ret = -EINVAL; if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { ti->error = "Invalid device sector"; goto bad; @@ -2035,21 +2036,6 @@ error: return -EINVAL; } -static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct crypt_config *cc = ti->private; - struct request_queue *q = bdev_get_queue(cc->dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = cc->dev->bdev; - bvm->bi_sector = cc->start + dm_target_offset(ti, bvm->bi_sector); - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - static int crypt_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { @@ -2058,9 +2044,20 @@ static int crypt_iterate_devices(struct dm_target *ti, return fn(ti, cc->dev, cc->start, ti->len, data); } +static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + /* + * Unfortunate constraint that is required to avoid the potential + * for exceeding underlying device's max_segments limits -- due to + * crypt_alloc_buffer() possibly allocating pages for the encryption + * bio that are not as physically contiguous as the original bio. + */ + limits->max_segment_size = PAGE_SIZE; +} + static struct target_type crypt_target = { .name = "crypt", - .version = {1, 14, 0}, + .version = {1, 14, 1}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, @@ -2070,8 +2067,8 @@ static struct target_type crypt_target = { .preresume = crypt_preresume, .resume = crypt_resume, .message = crypt_message, - .merge = crypt_merge, .iterate_devices = crypt_iterate_devices, + .io_hints = crypt_io_hints, }; static int __init dm_crypt_init(void) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 57b6a19..b4c356a 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -122,6 +122,7 @@ static void flush_expired_bios(struct work_struct *work) * <device> <offset> <delay> [<write_device> <write_offset> <write_delay>] * * With separate write parameters, the first set is only used for reads. + * Offsets are specified in sectors. * Delays are specified in milliseconds. */ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) @@ -129,9 +130,10 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) struct delay_c *dc; unsigned long long tmpll; char dummy; + int ret; if (argc != 3 && argc != 6) { - ti->error = "requires exactly 3 or 6 arguments"; + ti->error = "Requires exactly 3 or 6 arguments"; return -EINVAL; } @@ -143,6 +145,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) dc->reads = dc->writes = 0; + ret = -EINVAL; if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { ti->error = "Invalid device sector"; goto bad; @@ -154,12 +157,14 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), - &dc->dev_read)) { + ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), + &dc->dev_read); + if (ret) { ti->error = "Device lookup failed"; goto bad; } + ret = -EINVAL; dc->dev_write = NULL; if (argc == 3) goto out; @@ -175,13 +180,15 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_dev_read; } - if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), - &dc->dev_write)) { + ret = dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), + &dc->dev_write); + if (ret) { ti->error = "Write device lookup failed"; goto bad_dev_read; } out: + ret = -EINVAL; dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); if (!dc->kdelayd_wq) { DMERR("Couldn't start kdelayd"); @@ -208,7 +215,7 @@ bad_dev_read: dm_put_device(ti, dc->dev_read); bad: kfree(dc); - return -EINVAL; + return ret; } static void delay_dtr(struct dm_target *ti) @@ -231,7 +238,7 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio) unsigned long expires = 0; if (!delay || !atomic_read(&dc->may_delay)) - return 1; + return DM_MAPIO_REMAPPED; delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); @@ -251,7 +258,7 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio) queue_timeout(dc, expires); - return 0; + return DM_MAPIO_SUBMITTED; } static void delay_presuspend(struct dm_target *ti) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index ad913cd..665bf32 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -343,7 +343,9 @@ static int superblock_all_zeroes(struct dm_block_manager *bm, bool *result) } } - return dm_bm_unlock(b); + dm_bm_unlock(b); + + return 0; } /*----------------------------------------------------------------*/ @@ -582,7 +584,9 @@ static int open_metadata(struct era_metadata *md) md->metadata_snap = le64_to_cpu(disk->metadata_snap); md->archived_writesets = true; - return dm_bm_unlock(sblock); + dm_bm_unlock(sblock); + + return 0; bad: dm_bm_unlock(sblock); @@ -1046,12 +1050,7 @@ static int metadata_take_snap(struct era_metadata *md) md->metadata_snap = dm_block_location(clone); - r = dm_tm_unlock(md->tm, clone); - if (r) { - DMERR("%s: couldn't unlock clone", __func__); - md->metadata_snap = SUPERBLOCK_LOCATION; - return r; - } + dm_tm_unlock(md->tm, clone); return 0; } @@ -1673,20 +1672,6 @@ static int era_iterate_devices(struct dm_target *ti, return fn(ti, era->origin_dev, 0, get_dev_size(era->origin_dev), data); } -static int era_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct era *era = ti->private; - struct request_queue *q = bdev_get_queue(era->origin_dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = era->origin_dev->bdev; - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - static void era_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct era *era = ti->private; @@ -1717,7 +1702,6 @@ static struct target_type era_target = { .status = era_status, .message = era_message, .iterate_devices = era_iterate_devices, - .merge = era_merge, .io_hints = era_io_hints }; diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index ebaa4f8..3997f34 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -183,7 +183,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store, store->chunk_size = chunk_size; store->chunk_mask = chunk_size - 1; - store->chunk_shift = ffs(chunk_size) - 1; + store->chunk_shift = __ffs(chunk_size); return 0; } @@ -203,7 +203,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, return -EINVAL; } - tmp_store = kmalloc(sizeof(*tmp_store), GFP_KERNEL); + tmp_store = kzalloc(sizeof(*tmp_store), GFP_KERNEL); if (!tmp_store) { ti->error = "Exception store allocation failed"; return -ENOMEM; @@ -215,7 +215,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, else if (persistent == 'N') type = get_type("N"); else { - ti->error = "Persistent flag is not P or N"; + ti->error = "Exception store type is not P or N"; r = -EINVAL; goto bad_type; } @@ -233,7 +233,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, if (r) goto bad; - r = type->ctr(tmp_store, 0, NULL); + r = type->ctr(tmp_store, (strlen(argv[0]) > 1 ? &argv[0][1] : NULL)); if (r) { ti->error = "Exception store type constructor failed"; goto bad; diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 0b25362..fae34e7 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -42,8 +42,7 @@ struct dm_exception_store_type { const char *name; struct module *module; - int (*ctr) (struct dm_exception_store *store, - unsigned argc, char **argv); + int (*ctr) (struct dm_exception_store *store, char *options); /* * Destroys this object when you've finished with it. @@ -123,6 +122,8 @@ struct dm_exception_store { unsigned chunk_shift; void *context; + + bool userspace_supports_overflow; }; /* diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index b257e46..09e2afc 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -183,6 +183,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) devname = dm_shift_arg(&as); + r = -EINVAL; if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) { ti->error = "Invalid device sector"; goto bad; @@ -211,7 +212,8 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; - if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) { + r = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev); + if (r) { ti->error = "Device lookup failed"; goto bad; } @@ -224,7 +226,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) bad: kfree(fc); - return -EINVAL; + return r; } static void flakey_dtr(struct dm_target *ti) @@ -296,7 +298,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) * Drop writes? */ if (test_bit(DROP_WRITES, &fc->flags)) { - bio_endio(bio, 0); + bio_endio(bio); return DM_MAPIO_SUBMITTED; } @@ -371,35 +373,20 @@ static void flakey_status(struct dm_target *ti, status_type_t type, } } -static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) +static int flakey_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, fmode_t *mode) { struct flakey_c *fc = ti->private; - struct dm_dev *dev = fc->dev; - int r = 0; + + *bdev = fc->dev->bdev; /* * Only pass ioctls through if the device sizes match exactly. */ if (fc->start || - ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) - r = scsi_verify_blk_ioctl(NULL, cmd); - - return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg); -} - -static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct flakey_c *fc = ti->private; - struct request_queue *q = bdev_get_queue(fc->dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = fc->dev->bdev; - bvm->bi_sector = flakey_map_sector(ti, bvm->bi_sector); - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); + ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT) + return 1; + return 0; } static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) @@ -418,8 +405,7 @@ static struct target_type flakey_target = { .map = flakey_map, .end_io = flakey_end_io, .status = flakey_status, - .ioctl = flakey_ioctl, - .merge = flakey_merge, + .prepare_ioctl = flakey_prepare_ioctl, .iterate_devices = flakey_iterate_devices, }; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 74adcd2..81c5e1a 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -65,8 +65,7 @@ struct dm_io_client *dm_io_client_create(void) return client; bad: - if (client->pool) - mempool_destroy(client->pool); + mempool_destroy(client->pool); kfree(client); return ERR_PTR(-ENOMEM); } @@ -134,12 +133,13 @@ static void dec_count(struct io *io, unsigned int region, int error) complete_io(io); } -static void endio(struct bio *bio, int error) +static void endio(struct bio *bio) { struct io *io; unsigned region; + int error; - if (error && bio_data_dir(bio) == READ) + if (bio->bi_error && bio_data_dir(bio) == READ) zero_fill_bio(bio); /* @@ -147,6 +147,7 @@ static void endio(struct bio *bio, int error) */ retrieve_io_and_region_from_bio(bio, &io, ®ion); + error = bio->bi_error; bio_put(bio); dec_count(io, region, error); @@ -314,7 +315,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, if ((rw & REQ_DISCARD) || (rw & REQ_WRITE_SAME)) num_bvecs = 1; else - num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), + num_bvecs = min_t(int, BIO_MAX_PAGES, dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 720ceeb..80a4395 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1919,9 +1919,7 @@ int __init dm_interface_init(void) void dm_interface_exit(void) { - if (misc_deregister(&_dm_misc) < 0) - DMERR("misc_deregister failed for control device"); - + misc_deregister(&_dm_misc); dm_hash_exit(); } diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 53e848c..05c35aa 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -30,6 +30,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) struct linear_c *lc; unsigned long long tmp; char dummy; + int ret; if (argc != 2) { ti->error = "Invalid argument count"; @@ -38,18 +39,20 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) lc = kmalloc(sizeof(*lc), GFP_KERNEL); if (lc == NULL) { - ti->error = "dm-linear: Cannot allocate linear context"; + ti->error = "Cannot allocate linear context"; return -ENOMEM; } + ret = -EINVAL; if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) { - ti->error = "dm-linear: Invalid device sector"; + ti->error = "Invalid device sector"; goto bad; } lc->start = tmp; - if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev)) { - ti->error = "dm-linear: Device lookup failed"; + ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev); + if (ret) { + ti->error = "Device lookup failed"; goto bad; } @@ -61,7 +64,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) bad: kfree(lc); - return -EINVAL; + return ret; } static void linear_dtr(struct dm_target *ti) @@ -113,36 +116,21 @@ static void linear_status(struct dm_target *ti, status_type_t type, } } -static int linear_ioctl(struct dm_target *ti, unsigned int cmd, - unsigned long arg) +static int linear_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, fmode_t *mode) { struct linear_c *lc = (struct linear_c *) ti->private; struct dm_dev *dev = lc->dev; - int r = 0; + + *bdev = dev->bdev; /* * Only pass ioctls through if the device sizes match exactly. */ if (lc->start || ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) - r = scsi_verify_blk_ioctl(NULL, cmd); - - return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg); -} - -static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct linear_c *lc = ti->private; - struct request_queue *q = bdev_get_queue(lc->dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = lc->dev->bdev; - bvm->bi_sector = linear_map_sector(ti, bvm->bi_sector); - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); + return 1; + return 0; } static int linear_iterate_devices(struct dm_target *ti, @@ -161,8 +149,7 @@ static struct target_type linear_target = { .dtr = linear_dtr, .map = linear_map, .status = linear_status, - .ioctl = linear_ioctl, - .merge = linear_merge, + .prepare_ioctl = linear_prepare_ioctl, .iterate_devices = linear_iterate_devices, }; diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 058256d..53b7b06d 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -313,8 +313,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, out: kfree(devices_rdata); if (r) { - if (lc->flush_entry_pool) - mempool_destroy(lc->flush_entry_pool); + mempool_destroy(lc->flush_entry_pool); kfree(lc); kfree(ctr_str); } else { diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index ad1b049..624589d 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -146,16 +146,16 @@ static void put_io_block(struct log_writes_c *lc) } } -static void log_end_io(struct bio *bio, int err) +static void log_end_io(struct bio *bio) { struct log_writes_c *lc = bio->bi_private; struct bio_vec *bvec; int i; - if (err) { + if (bio->bi_error) { unsigned long flags; - DMERR("Error writing log block, error=%d", err); + DMERR("Error writing log block, error=%d", bio->bi_error); spin_lock_irqsave(&lc->blocks_lock, flags); lc->logging_enabled = false; spin_unlock_irqrestore(&lc->blocks_lock, flags); @@ -205,7 +205,6 @@ static int write_metadata(struct log_writes_c *lc, void *entry, bio->bi_bdev = lc->logdev->bdev; bio->bi_end_io = log_end_io; bio->bi_private = lc; - set_bit(BIO_UPTODATE, &bio->bi_flags); page = alloc_page(GFP_KERNEL); if (!page) { @@ -270,7 +269,6 @@ static int log_one_block(struct log_writes_c *lc, bio->bi_bdev = lc->logdev->bdev; bio->bi_end_io = log_end_io; bio->bi_private = lc; - set_bit(BIO_UPTODATE, &bio->bi_flags); for (i = 0; i < block->vec_cnt; i++) { /* @@ -292,7 +290,6 @@ static int log_one_block(struct log_writes_c *lc, bio->bi_bdev = lc->logdev->bdev; bio->bi_end_io = log_end_io; bio->bi_private = lc; - set_bit(BIO_UPTODATE, &bio->bi_flags); ret = bio_add_page(bio, block->vecs[i].bv_page, block->vecs[i].bv_len, 0); @@ -420,6 +417,7 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) struct log_writes_c *lc; struct dm_arg_set as; const char *devname, *logdevname; + int ret; as.argc = argc; as.argv = argv; @@ -443,18 +441,22 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) atomic_set(&lc->pending_blocks, 0); devname = dm_shift_arg(&as); - if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev)) { + ret = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev); + if (ret) { ti->error = "Device lookup failed"; goto bad; } logdevname = dm_shift_arg(&as); - if (dm_get_device(ti, logdevname, dm_table_get_mode(ti->table), &lc->logdev)) { + ret = dm_get_device(ti, logdevname, dm_table_get_mode(ti->table), + &lc->logdev); + if (ret) { ti->error = "Log device lookup failed"; dm_put_device(ti, lc->dev); goto bad; } + ret = -EINVAL; lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write"); if (!lc->log_kthread) { ti->error = "Couldn't alloc kthread"; @@ -479,7 +481,7 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) bad: kfree(lc); - return -EINVAL; + return ret; } static int log_mark(struct log_writes_c *lc, char *data) @@ -606,7 +608,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio) WARN_ON(flush_bio || fua_bio); if (lc->device_supports_discard) goto map_bio; - bio_endio(bio, 0); + bio_endio(bio); return DM_MAPIO_SUBMITTED; } @@ -712,35 +714,19 @@ static void log_writes_status(struct dm_target *ti, status_type_t type, } } -static int log_writes_ioctl(struct dm_target *ti, unsigned int cmd, - unsigned long arg) +static int log_writes_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, fmode_t *mode) { struct log_writes_c *lc = ti->private; struct dm_dev *dev = lc->dev; - int r = 0; + *bdev = dev->bdev; /* * Only pass ioctls through if the device sizes match exactly. */ if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) - r = scsi_verify_blk_ioctl(NULL, cmd); - - return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg); -} - -static int log_writes_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct log_writes_c *lc = ti->private; - struct request_queue *q = bdev_get_queue(lc->dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = lc->dev->bdev; - bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector); - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); + return 1; + return 0; } static int log_writes_iterate_devices(struct dm_target *ti, @@ -795,8 +781,7 @@ static struct target_type log_writes_target = { .map = log_writes_map, .end_io = normal_end_io, .status = log_writes_status, - .ioctl = log_writes_ioctl, - .merge = log_writes_merge, + .prepare_ioctl = log_writes_prepare_ioctl, .message = log_writes_message, .iterate_devices = log_writes_iterate_devices, .io_hints = log_writes_io_hints, diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index eff7bdd..aaa6caa 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -159,12 +159,9 @@ static struct priority_group *alloc_priority_group(void) static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) { struct pgpath *pgpath, *tmp; - struct multipath *m = ti->private; list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { list_del(&pgpath->list); - if (m->hw_handler_name) - scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); dm_put_device(ti, pgpath->path.dev); free_pgpath(pgpath); } @@ -580,6 +577,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps q = bdev_get_queue(p->path.dev->bdev); if (m->retain_attached_hw_handler) { +retain: attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL); if (attached_handler_name) { /* @@ -599,20 +597,14 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps } if (m->hw_handler_name) { - /* - * Increments scsi_dh reference, even when using an - * already-attached handler. - */ r = scsi_dh_attach(q, m->hw_handler_name); if (r == -EBUSY) { - /* - * Already attached to different hw_handler: - * try to reattach with correct one. - */ - scsi_dh_detach(q); - r = scsi_dh_attach(q, m->hw_handler_name); - } + char b[BDEVNAME_SIZE]; + printk(KERN_INFO "dm-mpath: retaining handler on device %s\n", + bdevname(p->path.dev->bdev, b)); + goto retain; + } if (r < 0) { ti->error = "error attaching hardware handler"; dm_put_device(ti, p->path.dev); @@ -624,7 +616,6 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps if (r < 0) { ti->error = "unable to set hardware " "handler parameters"; - scsi_dh_detach(q); dm_put_device(ti, p->path.dev); goto bad; } @@ -734,12 +725,6 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) return 0; m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); - if (!try_then_request_module(scsi_dh_handler_exist(m->hw_handler_name), - "scsi_dh_%s", m->hw_handler_name)) { - ti->error = "unknown hardware handler type"; - ret = -EINVAL; - goto fail; - } if (hw_argc > 1) { char *p; @@ -1548,18 +1533,14 @@ out: return r; } -static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, - unsigned long arg) +static int multipath_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, fmode_t *mode) { struct multipath *m = ti->private; struct pgpath *pgpath; - struct block_device *bdev; - fmode_t mode; unsigned long flags; int r; - bdev = NULL; - mode = 0; r = 0; spin_lock_irqsave(&m->lock, flags); @@ -1570,26 +1551,17 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, pgpath = m->current_pgpath; if (pgpath) { - bdev = pgpath->path.dev->bdev; - mode = pgpath->path.dev->mode; + *bdev = pgpath->path.dev->bdev; + *mode = pgpath->path.dev->mode; } if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path)) r = -ENOTCONN; - else if (!bdev) + else if (!*bdev) r = -EIO; spin_unlock_irqrestore(&m->lock, flags); - /* - * Only pass ioctls through if the device sizes match exactly. - */ - if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) { - int err = scsi_verify_blk_ioctl(NULL, cmd); - if (err) - r = err; - } - if (r == -ENOTCONN && !fatal_signal_pending(current)) { spin_lock_irqsave(&m->lock, flags); if (!m->current_pg) { @@ -1602,7 +1574,12 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, dm_table_run_md_queue_async(m->ti->table); } - return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT) + return 1; + return r; } static int multipath_iterate_devices(struct dm_target *ti, @@ -1705,7 +1682,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, @@ -1718,7 +1695,7 @@ static struct target_type multipath_target = { .resume = multipath_resume, .status = multipath_status, .message = multipath_message, - .ioctl = multipath_ioctl, + .prepare_ioctl = multipath_prepare_ioctl, .iterate_devices = multipath_iterate_devices, .busy = multipath_busy, }; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 2daa677..a090121 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -329,8 +329,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) */ if (min_region_size > (1 << 13)) { /* If not a power of 2, make it the next power of 2 */ - if (min_region_size & (min_region_size - 1)) - region_size = 1 << fls(region_size); + region_size = roundup_pow_of_two(min_region_size); DMINFO("Choosing default region size of %lu sectors", region_size); } else { @@ -1717,24 +1716,6 @@ static void raid_resume(struct dm_target *ti) mddev_resume(&rs->md); } -static int raid_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct raid_set *rs = ti->private; - struct md_personality *pers = rs->md.pers; - - if (pers && pers->mergeable_bvec) - return min(max_size, pers->mergeable_bvec(&rs->md, bvm, biovec)); - - /* - * In case we can't request the personality because - * the raid set is not running yet - * - * -> return safe minimum - */ - return rs->md.chunk_sectors; -} - static struct target_type raid_target = { .name = "raid", .version = {1, 7, 0}, @@ -1749,7 +1730,6 @@ static struct target_type raid_target = { .presuspend = raid_presuspend, .postsuspend = raid_postsuspend, .resume = raid_resume, - .merge = raid_merge, }; static int __init dm_raid_init(void) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index d83696b..f2a363a 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -490,9 +490,11 @@ static void hold_bio(struct mirror_set *ms, struct bio *bio) * If device is suspended, complete the bio. */ if (dm_noflush_suspending(ms->ti)) - bio_endio(bio, DM_ENDIO_REQUEUE); + bio->bi_error = DM_ENDIO_REQUEUE; else - bio_endio(bio, -EIO); + bio->bi_error = -EIO; + + bio_endio(bio); return; } @@ -515,7 +517,7 @@ static void read_callback(unsigned long error, void *context) bio_set_m(bio, NULL); if (likely(!error)) { - bio_endio(bio, 0); + bio_endio(bio); return; } @@ -531,7 +533,7 @@ static void read_callback(unsigned long error, void *context) DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.", m->dev->name); - bio_endio(bio, -EIO); + bio_io_error(bio); } /* Asynchronous read. */ @@ -580,7 +582,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) if (likely(m)) read_async_bio(m, bio); else - bio_endio(bio, -EIO); + bio_io_error(bio); } } @@ -598,7 +600,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) static void write_callback(unsigned long error, void *context) { - unsigned i, ret = 0; + unsigned i; struct bio *bio = (struct bio *) context; struct mirror_set *ms; int should_wake = 0; @@ -614,7 +616,7 @@ static void write_callback(unsigned long error, void *context) * regions with the same code. */ if (likely(!error)) { - bio_endio(bio, ret); + bio_endio(bio); return; } @@ -623,7 +625,8 @@ static void write_callback(unsigned long error, void *context) * degrade the array. */ if (bio->bi_rw & REQ_DISCARD) { - bio_endio(bio, -EOPNOTSUPP); + bio->bi_error = -EOPNOTSUPP; + bio_endio(bio); return; } @@ -828,13 +831,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) * be wrong if the failed leg returned after reboot and * got replicated back to the good legs.) */ - if (unlikely(!get_valid_mirror(ms) || (keep_log(ms) && ms->log_failure))) - bio_endio(bio, -EIO); + bio_io_error(bio); else if (errors_handled(ms) && !keep_log(ms)) hold_bio(ms, bio); else - bio_endio(bio, 0); + bio_endio(bio); } } @@ -943,16 +945,18 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, { unsigned long long offset; char dummy; + int ret; if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) { ti->error = "Invalid offset"; return -EINVAL; } - if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), - &ms->mirror[mirror].dev)) { + ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), + &ms->mirror[mirror].dev); + if (ret) { ti->error = "Device lookup failure"; - return -ENXIO; + return ret; } ms->mirror[mirror].ms = ms; diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index b929fd5..74cb7b99 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -193,7 +193,7 @@ struct dm_region_hash *dm_region_hash_create( rh->max_recovery = max_recovery; rh->log = log; rh->region_size = region_size; - rh->region_shift = ffs(region_size) - 1; + rh->region_shift = __ffs(region_size); rwlock_init(&rh->hash_lock); rh->mask = nr_buckets - 1; rh->nr_buckets = nr_buckets; @@ -249,9 +249,7 @@ void dm_region_hash_destroy(struct dm_region_hash *rh) if (rh->log) dm_dirty_log_destroy(rh->log); - if (rh->region_pool) - mempool_destroy(rh->region_pool); - + mempool_destroy(rh->region_pool); vfree(rh->buckets); kfree(rh); } diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 808b841..3164b8b 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -7,6 +7,7 @@ #include "dm-exception-store.h" +#include <linux/ctype.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/vmalloc.h> @@ -321,7 +322,7 @@ static int read_header(struct pstore *ps, int *new_snapshot) bdev_logical_block_size(dm_snap_cow(ps->store->snap)-> bdev) >> 9); ps->store->chunk_mask = ps->store->chunk_size - 1; - ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; + ps->store->chunk_shift = __ffs(ps->store->chunk_size); chunk_size_supplied = 0; } @@ -533,7 +534,7 @@ static int read_exceptions(struct pstore *ps, chunk = area_location(ps, ps->current_area); area = dm_bufio_read(client, chunk, &bp); - if (unlikely(IS_ERR(area))) { + if (IS_ERR(area)) { r = PTR_ERR(area); goto ret_destroy_bufio; } @@ -843,10 +844,10 @@ static void persistent_drop_snapshot(struct dm_exception_store *store) DMWARN("write header failed"); } -static int persistent_ctr(struct dm_exception_store *store, - unsigned argc, char **argv) +static int persistent_ctr(struct dm_exception_store *store, char *options) { struct pstore *ps; + int r; /* allocate the pstore */ ps = kzalloc(sizeof(*ps), GFP_KERNEL); @@ -868,14 +869,32 @@ static int persistent_ctr(struct dm_exception_store *store, ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0); if (!ps->metadata_wq) { - kfree(ps); DMERR("couldn't start header metadata update thread"); - return -ENOMEM; + r = -ENOMEM; + goto err_workqueue; + } + + if (options) { + char overflow = toupper(options[0]); + if (overflow == 'O') + store->userspace_supports_overflow = true; + else { + DMERR("Unsupported persistent store option: %s", options); + r = -EINVAL; + goto err_options; + } } store->context = ps; return 0; + +err_options: + destroy_workqueue(ps->metadata_wq); +err_workqueue: + kfree(ps); + + return r; } static unsigned persistent_status(struct dm_exception_store *store, @@ -888,7 +907,8 @@ static unsigned persistent_status(struct dm_exception_store *store, case STATUSTYPE_INFO: break; case STATUSTYPE_TABLE: - DMEMIT(" P %llu", (unsigned long long)store->chunk_size); + DMEMIT(" %s %llu", store->userspace_supports_overflow ? "PO" : "P", + (unsigned long long)store->chunk_size); } return sz; diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c index 1ce9a25..9b7c8c8 100644 --- a/drivers/md/dm-snap-transient.c +++ b/drivers/md/dm-snap-transient.c @@ -70,8 +70,7 @@ static void transient_usage(struct dm_exception_store *store, *metadata_sectors = 0; } -static int transient_ctr(struct dm_exception_store *store, - unsigned argc, char **argv) +static int transient_ctr(struct dm_exception_store *store, char *options) { struct transient_c *tc; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 7c82d3c..c06b74e 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -63,6 +63,13 @@ struct dm_snapshot { */ int valid; + /* + * The snapshot overflowed because of a write to the snapshot device. + * We don't have to invalidate the snapshot in this case, but we need + * to prevent further writes. + */ + int snapshot_overflowed; + /* Origin writes don't trigger exceptions until this is set */ int active; @@ -1091,7 +1098,7 @@ static void stop_merge(struct dm_snapshot *s) } /* - * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> + * Construct a snapshot mapping: <origin_dev> <COW-dev> <p|po|n> <chunk-size> */ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) { @@ -1152,6 +1159,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->ti = ti; s->valid = 1; + s->snapshot_overflowed = 0; s->active = 0; atomic_set(&s->pending_exceptions_count, 0); s->exception_start_sequence = 0; @@ -1294,6 +1302,7 @@ static void __handover_exceptions(struct dm_snapshot *snap_src, u.store_swap = snap_dest->store; snap_dest->store = snap_src->store; + snap_dest->store->userspace_supports_overflow = u.store_swap->userspace_supports_overflow; snap_src->store = u.store_swap; snap_dest->store->snap = snap_dest; @@ -1301,6 +1310,7 @@ static void __handover_exceptions(struct dm_snapshot *snap_src, snap_dest->ti->max_io_len = snap_dest->store->chunk_size; snap_dest->valid = snap_src->valid; + snap_dest->snapshot_overflowed = snap_src->snapshot_overflowed; /* * Set source invalid to ensure it receives no further I/O. @@ -1490,7 +1500,7 @@ out: error_bios(snapshot_bios); } else { if (full_bio) - bio_endio(full_bio, 0); + bio_endio(full_bio); flush_bios(snapshot_bios); } @@ -1580,11 +1590,11 @@ static void start_copy(struct dm_snap_pending_exception *pe) dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); } -static void full_bio_end_io(struct bio *bio, int error) +static void full_bio_end_io(struct bio *bio) { void *callback_data = bio->bi_private; - dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0); + dm_kcopyd_do_callback(callback_data, 0, bio->bi_error ? 1 : 0); } static void start_full_bio(struct dm_snap_pending_exception *pe, @@ -1691,7 +1701,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) * to copy an exception */ down_write(&s->lock); - if (!s->valid) { + if (!s->valid || (unlikely(s->snapshot_overflowed) && bio_rw(bio) == WRITE)) { r = -EIO; goto out_unlock; } @@ -1715,7 +1725,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) pe = alloc_pending_exception(s); down_write(&s->lock); - if (!s->valid) { + if (!s->valid || s->snapshot_overflowed) { free_pending_exception(pe); r = -EIO; goto out_unlock; @@ -1730,7 +1740,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) pe = __find_pending_exception(s, pe, chunk); if (!pe) { - __invalidate_snapshot(s, -ENOMEM); + if (s->store->userspace_supports_overflow) { + s->snapshot_overflowed = 1; + DMERR("Snapshot overflowed: Unable to allocate exception."); + } else + __invalidate_snapshot(s, -ENOMEM); r = -EIO; goto out_unlock; } @@ -1990,6 +2004,8 @@ static void snapshot_status(struct dm_target *ti, status_type_t type, DMEMIT("Invalid"); else if (snap->merge_failed) DMEMIT("Merge failed"); + else if (snap->snapshot_overflowed) + DMEMIT("Overflow"); else { if (snap->store->type->usage) { sector_t total_sectors, sectors_allocated, @@ -2330,20 +2346,6 @@ static void origin_status(struct dm_target *ti, status_type_t type, } } -static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct dm_origin *o = ti->private; - struct request_queue *q = bdev_get_queue(o->dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = o->dev->bdev; - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - static int origin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { @@ -2362,13 +2364,12 @@ static struct target_type origin_target = { .resume = origin_resume, .postsuspend = origin_postsuspend, .status = origin_status, - .merge = origin_merge, .iterate_devices = origin_iterate_devices, }; static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 13, 0}, + .version = {1, 15, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, @@ -2382,7 +2383,7 @@ static struct target_type snapshot_target = { static struct target_type merge_target = { .name = dm_snapshot_merge_target_name, - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 8a8b48f..8289804 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -457,12 +457,24 @@ static int dm_stats_list(struct dm_stats *stats, const char *program, list_for_each_entry(s, &stats->list, list_entry) { if (!program || !strcmp(program, s->program_id)) { len = s->end - s->start; - DMEMIT("%d: %llu+%llu %llu %s %s\n", s->id, + DMEMIT("%d: %llu+%llu %llu %s %s", s->id, (unsigned long long)s->start, (unsigned long long)len, (unsigned long long)s->step, s->program_id, s->aux_data); + if (s->stat_flags & STAT_PRECISE_TIMESTAMPS) + DMEMIT(" precise_timestamps"); + if (s->n_histogram_entries) { + unsigned i; + DMEMIT(" histogram:"); + for (i = 0; i < s->n_histogram_entries; i++) { + if (i) + DMEMIT(","); + DMEMIT("%llu", s->histogram_boundaries[i]); + } + } + DMEMIT("\n"); } } mutex_unlock(&stats->mutex); diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index a672a15..797ddb9 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -75,13 +75,15 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, { unsigned long long start; char dummy; + int ret; if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1) return -EINVAL; - if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), - &sc->stripe[stripe].dev)) - return -ENXIO; + ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), + &sc->stripe[stripe].dev); + if (ret) + return ret; sc->stripe[stripe].physical_start = start; @@ -273,7 +275,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio, return DM_MAPIO_REMAPPED; } else { /* The range doesn't map to the target stripe */ - bio_endio(bio, 0); + bio_endio(bio); return DM_MAPIO_SUBMITTED; } } @@ -412,26 +414,6 @@ static void stripe_io_hints(struct dm_target *ti, blk_limits_io_opt(limits, chunk_size * sc->stripes); } -static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct stripe_c *sc = ti->private; - sector_t bvm_sector = bvm->bi_sector; - uint32_t stripe; - struct request_queue *q; - - stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector); - - q = bdev_get_queue(sc->stripe[stripe].dev->bdev); - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = sc->stripe[stripe].dev->bdev; - bvm->bi_sector = sc->stripe[stripe].physical_start + bvm_sector; - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - static struct target_type stripe_target = { .name = "striped", .version = {1, 5, 1}, @@ -443,7 +425,6 @@ static struct target_type stripe_target = { .status = stripe_status, .iterate_devices = stripe_iterate_devices, .io_hints = stripe_io_hints, - .merge = stripe_merge, }; int __init dm_stripe_init(void) diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 50fca46..871c18f 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -99,11 +99,11 @@ static int alloc_region_table(struct dm_target *ti, unsigned nr_paths) if (sector_div(nr_regions, sctx->region_size)) nr_regions++; - sctx->nr_regions = nr_regions; - if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) { + if (nr_regions >= ULONG_MAX) { ti->error = "Region table too large"; return -EINVAL; } + sctx->nr_regions = nr_regions; nr_slots = nr_regions; if (sector_div(nr_slots, sctx->region_entries_per_slot)) @@ -511,27 +511,24 @@ static void switch_status(struct dm_target *ti, status_type_t type, * * Passthrough all ioctls to the path for sector 0 */ -static int switch_ioctl(struct dm_target *ti, unsigned cmd, - unsigned long arg) +static int switch_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, fmode_t *mode) { struct switch_ctx *sctx = ti->private; - struct block_device *bdev; - fmode_t mode; unsigned path_nr; - int r = 0; path_nr = switch_get_path_nr(sctx, 0); - bdev = sctx->path_list[path_nr].dmdev->bdev; - mode = sctx->path_list[path_nr].dmdev->mode; + *bdev = sctx->path_list[path_nr].dmdev->bdev; + *mode = sctx->path_list[path_nr].dmdev->mode; /* * Only pass ioctls through if the device sizes match exactly. */ - if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) - r = scsi_verify_blk_ioctl(NULL, cmd); - - return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (ti->len + sctx->path_list[path_nr].start != + i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT) + return 1; + return 0; } static int switch_iterate_devices(struct dm_target *ti, @@ -560,7 +557,7 @@ static struct target_type switch_target = { .map = switch_map, .message = switch_message, .status = switch_status, - .ioctl = switch_ioctl, + .prepare_ioctl = switch_prepare_ioctl, .iterate_devices = switch_iterate_devices, }; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 16ba55a..061152a 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -440,14 +440,6 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, q->limits.alignment_offset, (unsigned long long) start << SECTOR_SHIFT); - /* - * Check if merge fn is supported. - * If not we'll force DM to use PAGE_SIZE or - * smaller I/O, just to be safe. - */ - if (dm_queue_merge_is_compulsory(q) && !ti->type->merge) - blk_limits_max_hw_sectors(limits, - (unsigned int) (PAGE_SIZE >> 9)); return 0; } @@ -1022,15 +1014,16 @@ static int dm_table_build_index(struct dm_table *t) return r; } +static bool integrity_profile_exists(struct gendisk *disk) +{ + return !!blk_get_integrity(disk); +} + /* * Get a disk whose integrity profile reflects the table's profile. - * If %match_all is true, all devices' profiles must match. - * If %match_all is false, all devices must at least have an - * allocated integrity profile; but uninitialized is ok. * Returns NULL if integrity support was inconsistent or unavailable. */ -static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t, - bool match_all) +static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t) { struct list_head *devices = dm_table_get_devices(t); struct dm_dev_internal *dd = NULL; @@ -1038,10 +1031,8 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t, list_for_each_entry(dd, devices, list) { template_disk = dd->dm_dev->bdev->bd_disk; - if (!blk_get_integrity(template_disk)) + if (!integrity_profile_exists(template_disk)) goto no_integrity; - if (!match_all && !blk_integrity_is_initialized(template_disk)) - continue; /* skip uninitialized profiles */ else if (prev_disk && blk_integrity_compare(prev_disk, template_disk) < 0) goto no_integrity; @@ -1060,34 +1051,40 @@ no_integrity: } /* - * Register the mapped device for blk_integrity support if - * the underlying devices have an integrity profile. But all devices - * may not have matching profiles (checking all devices isn't reliable + * Register the mapped device for blk_integrity support if the + * underlying devices have an integrity profile. But all devices may + * not have matching profiles (checking all devices isn't reliable * during table load because this table may use other DM device(s) which - * must be resumed before they will have an initialized integity profile). - * Stacked DM devices force a 2 stage integrity profile validation: - * 1 - during load, validate all initialized integrity profiles match - * 2 - during resume, validate all integrity profiles match + * must be resumed before they will have an initialized integity + * profile). Consequently, stacked DM devices force a 2 stage integrity + * profile validation: First pass during table load, final pass during + * resume. */ -static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md) +static int dm_table_register_integrity(struct dm_table *t) { + struct mapped_device *md = t->md; struct gendisk *template_disk = NULL; - template_disk = dm_table_get_integrity_disk(t, false); + template_disk = dm_table_get_integrity_disk(t); if (!template_disk) return 0; - if (!blk_integrity_is_initialized(dm_disk(md))) { + if (!integrity_profile_exists(dm_disk(md))) { t->integrity_supported = 1; - return blk_integrity_register(dm_disk(md), NULL); + /* + * Register integrity profile during table load; we can do + * this because the final profile must match during resume. + */ + blk_integrity_register(dm_disk(md), + blk_get_integrity(template_disk)); + return 0; } /* - * If DM device already has an initalized integrity + * If DM device already has an initialized integrity * profile the new profile should not conflict. */ - if (blk_integrity_is_initialized(template_disk) && - blk_integrity_compare(dm_disk(md), template_disk) < 0) { + if (blk_integrity_compare(dm_disk(md), template_disk) < 0) { DMWARN("%s: conflict with existing integrity profile: " "%s profile mismatch", dm_device_name(t->md), @@ -1095,7 +1092,7 @@ static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device return 1; } - /* Preserve existing initialized integrity profile */ + /* Preserve existing integrity profile */ t->integrity_supported = 1; return 0; } @@ -1120,7 +1117,7 @@ int dm_table_complete(struct dm_table *t) return r; } - r = dm_table_prealloc_integrity(t, t->md); + r = dm_table_register_integrity(t); if (r) { DMERR("could not register integrity profile."); return r; @@ -1286,29 +1283,30 @@ combine_limits: } /* - * Set the integrity profile for this device if all devices used have - * matching profiles. We're quite deep in the resume path but still - * don't know if all devices (particularly DM devices this device - * may be stacked on) have matching profiles. Even if the profiles - * don't match we have no way to fail (to resume) at this point. + * Verify that all devices have an integrity profile that matches the + * DM device's registered integrity profile. If the profiles don't + * match then unregister the DM device's integrity profile. */ -static void dm_table_set_integrity(struct dm_table *t) +static void dm_table_verify_integrity(struct dm_table *t) { struct gendisk *template_disk = NULL; - if (!blk_get_integrity(dm_disk(t->md))) - return; + if (t->integrity_supported) { + /* + * Verify that the original integrity profile + * matches all the devices in this table. + */ + template_disk = dm_table_get_integrity_disk(t); + if (template_disk && + blk_integrity_compare(dm_disk(t->md), template_disk) >= 0) + return; + } - template_disk = dm_table_get_integrity_disk(t, true); - if (template_disk) - blk_integrity_register(dm_disk(t->md), - blk_get_integrity(template_disk)); - else if (blk_integrity_is_initialized(dm_disk(t->md))) - DMWARN("%s: device no longer has a valid integrity profile", - dm_device_name(t->md)); - else + if (integrity_profile_exists(dm_disk(t->md))) { DMWARN("%s: unable to establish an integrity profile", dm_device_name(t->md)); + blk_integrity_unregister(dm_disk(t->md)); + } } static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, @@ -1388,14 +1386,6 @@ static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev, return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); } -static int queue_supports_sg_gaps(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - - return q && !test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags); -} - static bool dm_table_all_devices_attribute(struct dm_table *t, iterate_devices_callout_fn func) { @@ -1516,12 +1506,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); - if (dm_table_all_devices_attribute(t, queue_supports_sg_gaps)) - queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, q); - else - queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, q); - - dm_table_set_integrity(t); + dm_table_verify_integrity(t); /* * Determine whether or not this queue's I/O timings contribute diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 48dfe3c..1fa4569 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -396,7 +396,9 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) } } - return dm_bm_unlock(b); + dm_bm_unlock(b); + + return 0; } static void __setup_btree_details(struct dm_pool_metadata *pmd) @@ -650,7 +652,9 @@ static int __open_metadata(struct dm_pool_metadata *pmd) } __setup_btree_details(pmd); - return dm_bm_unlock(sblock); + dm_bm_unlock(sblock); + + return 0; bad_cleanup_data_sm: dm_sm_destroy(pmd->data_sm); @@ -1293,11 +1297,13 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd) return r; disk_super = dm_block_data(copy); - dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root)); - dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root)); + dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root)); + dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root)); dm_sm_dec_block(pmd->metadata_sm, held_root); - return dm_tm_unlock(pmd->tm, copy); + dm_tm_unlock(pmd->tm, copy); + + return 0; } int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd) @@ -1327,7 +1333,9 @@ static int __get_metadata_snap(struct dm_pool_metadata *pmd, disk_super = dm_block_data(sblock); *result = le64_to_cpu(disk_super->held_root); - return dm_bm_unlock(sblock); + dm_bm_unlock(sblock); + + return 0; } int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index d2bbe8c..3897b90 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -332,9 +332,6 @@ struct thin_c { * * Description: * Asynchronously issue a discard request for the sectors in question. - * NOTE: this variant of blk-core's blkdev_issue_discard() is a stop-gap - * that is being kept local to DM thinp until the block changes to allow - * late bio splitting land upstream. */ static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags, @@ -342,91 +339,36 @@ static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sect { struct request_queue *q = bdev_get_queue(bdev); int type = REQ_WRITE | REQ_DISCARD; - unsigned int max_discard_sectors, granularity; - int alignment; struct bio *bio; - int ret = 0; - struct blk_plug plug; - if (!q) + if (!q || !nr_sects) return -ENXIO; if (!blk_queue_discard(q)) return -EOPNOTSUPP; - /* Zero-sector (unknown) and one-sector granularities are the same. */ - granularity = max(q->limits.discard_granularity >> 9, 1U); - alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; - - /* - * Ensure that max_discard_sectors is of the proper - * granularity, so that requests stay aligned after a split. - */ - max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); - max_discard_sectors -= max_discard_sectors % granularity; - if (unlikely(!max_discard_sectors)) { - /* Avoid infinite loop below. Being cautious never hurts. */ - return -EOPNOTSUPP; - } - if (flags & BLKDEV_DISCARD_SECURE) { if (!blk_queue_secdiscard(q)) return -EOPNOTSUPP; type |= REQ_SECURE; } - blk_start_plug(&plug); - while (nr_sects) { - unsigned int req_sects; - sector_t end_sect, tmp; - - /* - * Required bio_put occurs in bio_endio thanks to bio_chain below - */ - bio = bio_alloc(gfp_mask, 1); - if (!bio) { - ret = -ENOMEM; - break; - } - - req_sects = min_t(sector_t, nr_sects, max_discard_sectors); - - /* - * If splitting a request, and the next starting sector would be - * misaligned, stop the discard at the previous aligned sector. - */ - end_sect = sector + req_sects; - tmp = end_sect; - if (req_sects < nr_sects && - sector_div(tmp, granularity) != alignment) { - end_sect = end_sect - alignment; - sector_div(end_sect, granularity); - end_sect = end_sect * granularity + alignment; - req_sects = end_sect - sector; - } - - bio_chain(bio, parent_bio); + /* + * Required bio_put occurs in bio_endio thanks to bio_chain below + */ + bio = bio_alloc(gfp_mask, 1); + if (!bio) + return -ENOMEM; - bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_chain(bio, parent_bio); - bio->bi_iter.bi_size = req_sects << 9; - nr_sects -= req_sects; - sector = end_sect; + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_iter.bi_size = nr_sects << 9; - submit_bio(type, bio); + submit_bio(type, bio); - /* - * We can loop for a long time in here, if someone does - * full device discards (like mkfs). Be nice and allow - * us to schedule out to avoid softlocking if preempt - * is disabled. - */ - cond_resched(); - } - blk_finish_plug(&plug); - - return ret; + return 0; } static bool block_size_is_power_of_two(struct pool *pool) @@ -615,8 +557,10 @@ static void error_bio_list(struct bio_list *bios, int error) { struct bio *bio; - while ((bio = bio_list_pop(bios))) - bio_endio(bio, error); + while ((bio = bio_list_pop(bios))) { + bio->bi_error = error; + bio_endio(bio); + } } static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error) @@ -870,14 +814,14 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) complete_mapping_preparation(m); } -static void overwrite_endio(struct bio *bio, int err) +static void overwrite_endio(struct bio *bio) { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); struct dm_thin_new_mapping *m = h->overwrite_mapping; bio->bi_end_io = m->saved_bi_end_io; - m->err = err; + m->err = bio->bi_error; complete_mapping_preparation(m); } @@ -1002,7 +946,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) */ if (bio) { inc_remap_and_issue_cell(tc, m->cell, m->data_block); - bio_endio(bio, 0); + bio_endio(bio); } else { inc_all_io_entry(tc->pool, m->cell->holder); remap_and_issue(tc, m->cell->holder, m->data_block); @@ -1032,7 +976,7 @@ static void process_prepared_discard_fail(struct dm_thin_new_mapping *m) static void process_prepared_discard_success(struct dm_thin_new_mapping *m) { - bio_endio(m->bio, 0); + bio_endio(m->bio); free_discard_mapping(m); } @@ -1046,7 +990,7 @@ static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m) metadata_operation_failed(tc->pool, "dm_thin_remove_range", r); bio_io_error(m->bio); } else - bio_endio(m->bio, 0); + bio_endio(m->bio); cell_defer_no_holder(tc, m->cell); mempool_free(m, tc->pool->mapping_pool); @@ -1117,7 +1061,8 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) * Even if r is set, there could be sub discards in flight that we * need to wait for. */ - bio_endio(m->bio, r); + m->bio->bi_error = r; + bio_endio(m->bio); cell_defer_no_holder(tc, m->cell); mempool_free(m, pool->mapping_pool); } @@ -1493,9 +1438,10 @@ static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) { int error = should_error_unserviceable_bio(pool); - if (error) - bio_endio(bio, error); - else + if (error) { + bio->bi_error = error; + bio_endio(bio); + } else retry_on_resume(bio); } @@ -1539,9 +1485,8 @@ static void process_discard_cell_no_passdown(struct thin_c *tc, } /* - * FIXME: DM local hack to defer parent bios's end_io until we - * _know_ all chained sub range discard bios have completed. - * Will go away once late bio splitting lands upstream! + * __bio_inc_remaining() is used to defer parent bios's end_io until + * we _know_ all chained sub range discard bios have completed. */ static inline void __bio_inc_remaining(struct bio *bio) { @@ -1631,7 +1576,7 @@ static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_priso * will prevent completion until the sub range discards have * completed. */ - bio_endio(bio, 0); + bio_endio(bio); } static void process_discard_bio(struct thin_c *tc, struct bio *bio) @@ -1645,7 +1590,7 @@ static void process_discard_bio(struct thin_c *tc, struct bio *bio) /* * The discard covers less than a block. */ - bio_endio(bio, 0); + bio_endio(bio); return; } @@ -1790,7 +1735,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block if (bio_data_dir(bio) == READ) { zero_fill_bio(bio); cell_defer_no_holder(tc, cell); - bio_endio(bio, 0); + bio_endio(bio); return; } @@ -1855,7 +1800,7 @@ static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) } else { zero_fill_bio(bio); - bio_endio(bio, 0); + bio_endio(bio); } } else provision_block(tc, bio, block, cell); @@ -1926,7 +1871,7 @@ static void __process_bio_read_only(struct thin_c *tc, struct bio *bio, } zero_fill_bio(bio); - bio_endio(bio, 0); + bio_endio(bio); break; default: @@ -1951,7 +1896,7 @@ static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell static void process_bio_success(struct thin_c *tc, struct bio *bio) { - bio_endio(bio, 0); + bio_endio(bio); } static void process_bio_fail(struct thin_c *tc, struct bio *bio) @@ -2600,7 +2545,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) thin_hook_bio(tc, bio); if (tc->requeue_mode) { - bio_endio(bio, DM_ENDIO_REQUEUE); + bio->bi_error = DM_ENDIO_REQUEUE; + bio_endio(bio); return DM_MAPIO_SUBMITTED; } @@ -3255,7 +3201,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) metadata_low_callback, pool); if (r) - goto out_free_pt; + goto out_flags_changed; pt->callbacks.congested_fn = pool_is_congested; dm_table_add_target_callbacks(ti->table, &pt->callbacks); @@ -3875,20 +3821,6 @@ static int pool_iterate_devices(struct dm_target *ti, return fn(ti, pt->data_dev, 0, ti->len, data); } -static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct pool_c *pt = ti->private; - struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = pt->data_dev->bdev; - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct pool_c *pt = ti->private; @@ -3965,7 +3897,6 @@ static struct target_type pool_target = { .resume = pool_resume, .message = pool_message, .status = pool_status, - .merge = pool_merge, .iterate_devices = pool_iterate_devices, .io_hints = pool_io_hints, }; @@ -4292,21 +4223,6 @@ err: DMEMIT("Error"); } -static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct thin_c *tc = ti->private; - struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = tc->pool_dev->bdev; - bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector); - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - static int thin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { @@ -4333,6 +4249,10 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct thin_c *tc = ti->private; struct pool *pool = tc->pool; + struct queue_limits *pool_limits = dm_get_queue_limits(pool->pool_md); + + if (!pool_limits->discard_granularity) + return; /* pool's discard support is disabled */ limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */ @@ -4350,7 +4270,6 @@ static struct target_type thin_target = { .presuspend = thin_presuspend, .postsuspend = thin_postsuspend, .status = thin_status, - .merge = thin_merge, .iterate_devices = thin_iterate_devices, .io_hints = thin_io_hints, }; diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index bb9c6a0..ccf4188 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -26,8 +26,6 @@ #define DM_VERITY_ENV_LENGTH 42 #define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR" -#define DM_VERITY_IO_VEC_INLINE 16 -#define DM_VERITY_MEMPOOL_SIZE 4 #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 #define DM_VERITY_MAX_LEVELS 63 @@ -76,8 +74,6 @@ struct dm_verity { enum verity_mode mode; /* mode for handling verification errors */ unsigned corrupted_errs;/* Number of errors for corrupted blocks */ - mempool_t *vec_mempool; /* mempool of bio vector */ - struct workqueue_struct *verify_wq; /* starting blocks for each tree level. 0 is the lowest level. */ @@ -271,7 +267,7 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block, verity_hash_at_level(v, block, level, &hash_block, &offset); data = dm_bufio_read(v->bufio, hash_block, &buf); - if (unlikely(IS_ERR(data))) + if (IS_ERR(data)) return PTR_ERR(data); aux = dm_bufio_get_aux_data(buf); @@ -458,8 +454,9 @@ static void verity_finish_io(struct dm_verity_io *io, int error) bio->bi_end_io = io->orig_bi_end_io; bio->bi_private = io->orig_bi_private; + bio->bi_error = error; - bio_endio(bio, error); + bio_endio(bio); } static void verity_work(struct work_struct *w) @@ -469,12 +466,12 @@ static void verity_work(struct work_struct *w) verity_finish_io(io, verity_verify_io(io)); } -static void verity_end_io(struct bio *bio, int error) +static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; - if (error) { - verity_finish_io(io, error); + if (bio->bi_error) { + verity_finish_io(io, bio->bi_error); return; } @@ -634,33 +631,17 @@ static void verity_status(struct dm_target *ti, status_type_t type, } } -static int verity_ioctl(struct dm_target *ti, unsigned cmd, - unsigned long arg) +static int verity_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, fmode_t *mode) { struct dm_verity *v = ti->private; - int r = 0; + + *bdev = v->data_dev->bdev; if (v->data_start || ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT) - r = scsi_verify_blk_ioctl(NULL, cmd); - - return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode, - cmd, arg); -} - -static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct dm_verity *v = ti->private; - struct request_queue *q = bdev_get_queue(v->data_dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = v->data_dev->bdev; - bvm->bi_sector = verity_map_sector(v, bvm->bi_sector); - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); + return 1; + return 0; } static int verity_iterate_devices(struct dm_target *ti, @@ -691,9 +672,6 @@ static void verity_dtr(struct dm_target *ti) if (v->verify_wq) destroy_workqueue(v->verify_wq); - if (v->vec_mempool) - mempool_destroy(v->vec_mempool); - if (v->bufio) dm_bufio_client_destroy(v->bufio); @@ -962,14 +940,6 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io)); - v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE, - BIO_MAX_PAGES * sizeof(struct bio_vec)); - if (!v->vec_mempool) { - ti->error = "Cannot allocate vector mempool"; - r = -ENOMEM; - goto bad; - } - /* WQ_UNBOUND greatly improves performance when running on ramdisk */ v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); if (!v->verify_wq) { @@ -994,8 +964,7 @@ static struct target_type verity_target = { .dtr = verity_dtr, .map = verity_map, .status = verity_status, - .ioctl = verity_ioctl, - .merge = verity_merge, + .prepare_ioctl = verity_prepare_ioctl, .iterate_devices = verity_iterate_devices, .io_hints = verity_io_hints, }; diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index b9a64bb..766bc93 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -47,7 +47,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio) break; } - bio_endio(bio, 0); + bio_endio(bio); /* accepted bio, don't make new request */ return DM_MAPIO_SUBMITTED; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ab37ae1..32440ad 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -24,6 +24,7 @@ #include <linux/ktime.h> #include <linux/elevator.h> /* for rq_end_sector() */ #include <linux/blk-mq.h> +#include <linux/pr.h> #include <trace/events/block.h> @@ -124,9 +125,8 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_FREEING 3 #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 -#define DMF_MERGE_IS_OPTIONAL 6 -#define DMF_DEFERRED_REMOVE 7 -#define DMF_SUSPENDED_INTERNALLY 8 +#define DMF_DEFERRED_REMOVE 6 +#define DMF_SUSPENDED_INTERNALLY 7 /* * A dummy definition to make RCU happy. @@ -556,18 +556,16 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } -static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +static int dm_get_live_table_for_ioctl(struct mapped_device *md, + struct dm_target **tgt, struct block_device **bdev, + fmode_t *mode, int *srcu_idx) { - struct mapped_device *md = bdev->bd_disk->private_data; - int srcu_idx; struct dm_table *map; - struct dm_target *tgt; - int r = -ENOTTY; + int r; retry: - map = dm_get_live_table(md, &srcu_idx); - + r = -ENOTTY; + map = dm_get_live_table(md, srcu_idx); if (!map || !dm_table_get_size(map)) goto out; @@ -575,8 +573,9 @@ retry: if (dm_table_get_num_targets(map) != 1) goto out; - tgt = dm_table_get_target(map, 0); - if (!tgt->type->ioctl) + *tgt = dm_table_get_target(map, 0); + + if (!(*tgt)->type->prepare_ioctl) goto out; if (dm_suspended_md(md)) { @@ -584,16 +583,46 @@ retry: goto out; } - r = tgt->type->ioctl(tgt, cmd, arg); + r = (*tgt)->type->prepare_ioctl(*tgt, bdev, mode); + if (r < 0) + goto out; + + return r; out: - dm_put_live_table(md, srcu_idx); - + dm_put_live_table(md, *srcu_idx); if (r == -ENOTCONN) { msleep(10); goto retry; } + return r; +} +static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + struct dm_target *tgt; + int srcu_idx, r; + + r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + if (r < 0) + return r; + + if (r > 0) { + /* + * Target determined this ioctl is being issued against + * a logical partition of the parent bdev; so extra + * validation is needed. + */ + r = scsi_verify_blk_ioctl(NULL, cmd); + if (r) + goto out; + } + + r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); +out: + dm_put_live_table(md, srcu_idx); return r; } @@ -944,7 +973,8 @@ static void dec_pending(struct dm_io *io, int error) } else { /* done with normal IO or empty flush */ trace_block_bio_complete(md->queue, bio, io_error); - bio_endio(bio, io_error); + bio->bi_error = io_error; + bio_endio(bio); } } } @@ -957,17 +987,15 @@ static void disable_write_same(struct mapped_device *md) limits->max_write_same_sectors = 0; } -static void clone_endio(struct bio *bio, int error) +static void clone_endio(struct bio *bio) { + int error = bio->bi_error; int r = error; struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); struct dm_io *io = tio->io; struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; - if (!bio_flagged(bio, BIO_UPTODATE) && !error) - error = -EIO; - if (endio) { r = endio(tio->ti, bio, error); if (r < 0 || r == DM_ENDIO_REQUEUE) @@ -996,13 +1024,14 @@ static void clone_endio(struct bio *bio, int error) /* * Partial completion handling for request-based dm */ -static void end_clone_bio(struct bio *clone, int error) +static void end_clone_bio(struct bio *clone) { struct dm_rq_clone_bio_info *info = container_of(clone, struct dm_rq_clone_bio_info, clone); struct dm_rq_target_io *tio = info->tio; struct bio *bio = info->orig; unsigned int nr_bytes = info->orig->bi_iter.bi_size; + int error = clone->bi_error; bio_put(clone); @@ -1466,7 +1495,7 @@ static void __map_bio(struct dm_target_io *tio) md = tio->io->md; dec_pending(tio->io, r); free_tio(md, tio); - } else if (r) { + } else if (r != DM_MAPIO_SUBMITTED) { DMWARN("unimplemented target map return value: %d", r); BUG(); } @@ -1722,67 +1751,6 @@ static void __split_and_process_bio(struct mapped_device *md, * CRUD END *---------------------------------------------------------------*/ -static int dm_merge_bvec(struct request_queue *q, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table_fast(md); - struct dm_target *ti; - sector_t max_sectors, max_size = 0; - - if (unlikely(!map)) - goto out; - - ti = dm_table_find_target(map, bvm->bi_sector); - if (!dm_target_is_valid(ti)) - goto out; - - /* - * Find maximum amount of I/O that won't need splitting - */ - max_sectors = min(max_io_len(bvm->bi_sector, ti), - (sector_t) queue_max_sectors(q)); - max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; - - /* - * FIXME: this stop-gap fix _must_ be cleaned up (by passing a sector_t - * to the targets' merge function since it holds sectors not bytes). - * Just doing this as an interim fix for stable@ because the more - * comprehensive cleanup of switching to sector_t will impact every - * DM target that implements a ->merge hook. - */ - if (max_size > INT_MAX) - max_size = INT_MAX; - - /* - * merge_bvec_fn() returns number of bytes - * it can accept at this offset - * max is precomputed maximal io size - */ - if (max_size && ti->type->merge) - max_size = ti->type->merge(ti, bvm, biovec, (int) max_size); - /* - * If the target doesn't support merge method and some of the devices - * provided their merge_bvec method (we know this by looking for the - * max_hw_sectors that dm_set_device_limits may set), then we can't - * allow bios with multiple vector entries. So always set max_size - * to 0, and the code below allows just one page. - */ - else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) - max_size = 0; - -out: - dm_put_live_table_fast(md); - /* - * Always allow an entire first page - */ - if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) - max_size = biovec->bv_len; - - return max_size; -} - /* * The request function that just remaps the bio built up by * dm_merge_bvec. @@ -2258,6 +2226,13 @@ static void dm_init_md_queue(struct mapped_device *md) * This queue is new, so no concurrency on the queue_flags. */ queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); + + /* + * Initialize data that will only be used by a non-blk-mq DM queue + * - must do so here (in alloc_dev callchain) before queue is used + */ + md->queue->queuedata = md; + md->queue->backing_dev_info.congested_data = md; } static void dm_init_old_md_queue(struct mapped_device *md) @@ -2268,10 +2243,7 @@ static void dm_init_old_md_queue(struct mapped_device *md) /* * Initialize aspects of queue that aren't relevant for blk-mq */ - md->queue->queuedata = md; md->queue->backing_dev_info.congested_fn = dm_any_congested; - md->queue->backing_dev_info.congested_data = md; - blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); } @@ -2281,10 +2253,8 @@ static void cleanup_mapped_device(struct mapped_device *md) destroy_workqueue(md->wq); if (md->kworker_task) kthread_stop(md->kworker_task); - if (md->io_pool) - mempool_destroy(md->io_pool); - if (md->rq_pool) - mempool_destroy(md->rq_pool); + mempool_destroy(md->io_pool); + mempool_destroy(md->rq_pool); if (md->bs) bioset_free(md->bs); @@ -2294,8 +2264,6 @@ static void cleanup_mapped_device(struct mapped_device *md) spin_lock(&_minor_lock); md->disk->private_data = NULL; spin_unlock(&_minor_lock); - if (blk_get_integrity(md->disk)) - blk_integrity_unregister(md->disk); del_gendisk(md->disk); put_disk(md->disk); } @@ -2503,59 +2471,6 @@ static void __set_size(struct mapped_device *md, sector_t size) } /* - * Return 1 if the queue has a compulsory merge_bvec_fn function. - * - * If this function returns 0, then the device is either a non-dm - * device without a merge_bvec_fn, or it is a dm device that is - * able to split any bios it receives that are too big. - */ -int dm_queue_merge_is_compulsory(struct request_queue *q) -{ - struct mapped_device *dev_md; - - if (!q->merge_bvec_fn) - return 0; - - if (q->make_request_fn == dm_make_request) { - dev_md = q->queuedata; - if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) - return 0; - } - - return 1; -} - -static int dm_device_merge_is_compulsory(struct dm_target *ti, - struct dm_dev *dev, sector_t start, - sector_t len, void *data) -{ - struct block_device *bdev = dev->bdev; - struct request_queue *q = bdev_get_queue(bdev); - - return dm_queue_merge_is_compulsory(q); -} - -/* - * Return 1 if it is acceptable to ignore merge_bvec_fn based - * on the properties of the underlying devices. - */ -static int dm_table_merge_is_optional(struct dm_table *table) -{ - unsigned i = 0; - struct dm_target *ti; - - while (i < dm_table_get_num_targets(table)) { - ti = dm_table_get_target(table, i++); - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) - return 0; - } - - return 1; -} - -/* * Returns old map, which caller must destroy. */ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, @@ -2564,7 +2479,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, struct dm_table *old_map; struct request_queue *q = md->queue; sector_t size; - int merge_is_optional; size = dm_table_get_size(t); @@ -2590,17 +2504,11 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, __bind_mempools(md, t); - merge_is_optional = dm_table_merge_is_optional(t); - old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); rcu_assign_pointer(md->map, t); md->immutable_target_type = dm_table_get_immutable_target_type(t); dm_table_set_restrictions(t, q, limits); - if (merge_is_optional) - set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); - else - clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); if (old_map) dm_sync_table(md); @@ -2881,7 +2789,12 @@ int dm_setup_md_queue(struct mapped_device *md) case DM_TYPE_BIO_BASED: dm_init_old_md_queue(md); blk_queue_make_request(md->queue, dm_make_request); - blk_queue_merge_bvec(md->queue, dm_merge_bvec); + /* + * DM handles splitting bios as needed. Free the bio_split bioset + * since it won't be used (saves 1 process per bio-based DM device). + */ + bioset_free(md->queue->bio_split); + md->queue->bio_split = NULL; break; } @@ -2959,8 +2872,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait) might_sleep(); - map = dm_get_live_table(md, &srcu_idx); - spin_lock(&_minor_lock); idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); set_bit(DMF_FREEING, &md->flags); @@ -2974,14 +2885,14 @@ static void __dm_destroy(struct mapped_device *md, bool wait) * do not race with internal suspend. */ mutex_lock(&md->suspend_lock); + map = dm_get_live_table(md, &srcu_idx); if (!dm_suspended_md(md)) { dm_table_presuspend_targets(map); dm_table_postsuspend_targets(map); } - mutex_unlock(&md->suspend_lock); - /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ dm_put_live_table(md, srcu_idx); + mutex_unlock(&md->suspend_lock); /* * Rare, but there may be I/O requests still going to complete, @@ -3630,11 +3541,8 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) if (!pools) return; - if (pools->io_pool) - mempool_destroy(pools->io_pool); - - if (pools->rq_pool) - mempool_destroy(pools->rq_pool); + mempool_destroy(pools->io_pool); + mempool_destroy(pools->rq_pool); if (pools->bs) bioset_free(pools->bs); @@ -3642,11 +3550,133 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) kfree(pools); } +static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, + u32 flags) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + const struct pr_ops *ops; + struct dm_target *tgt; + fmode_t mode; + int srcu_idx, r; + + r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + if (r < 0) + return r; + + ops = bdev->bd_disk->fops->pr_ops; + if (ops && ops->pr_register) + r = ops->pr_register(bdev, old_key, new_key, flags); + else + r = -EOPNOTSUPP; + + dm_put_live_table(md, srcu_idx); + return r; +} + +static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, + u32 flags) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + const struct pr_ops *ops; + struct dm_target *tgt; + fmode_t mode; + int srcu_idx, r; + + r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + if (r < 0) + return r; + + ops = bdev->bd_disk->fops->pr_ops; + if (ops && ops->pr_reserve) + r = ops->pr_reserve(bdev, key, type, flags); + else + r = -EOPNOTSUPP; + + dm_put_live_table(md, srcu_idx); + return r; +} + +static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + const struct pr_ops *ops; + struct dm_target *tgt; + fmode_t mode; + int srcu_idx, r; + + r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + if (r < 0) + return r; + + ops = bdev->bd_disk->fops->pr_ops; + if (ops && ops->pr_release) + r = ops->pr_release(bdev, key, type); + else + r = -EOPNOTSUPP; + + dm_put_live_table(md, srcu_idx); + return r; +} + +static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, + enum pr_type type, bool abort) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + const struct pr_ops *ops; + struct dm_target *tgt; + fmode_t mode; + int srcu_idx, r; + + r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + if (r < 0) + return r; + + ops = bdev->bd_disk->fops->pr_ops; + if (ops && ops->pr_preempt) + r = ops->pr_preempt(bdev, old_key, new_key, type, abort); + else + r = -EOPNOTSUPP; + + dm_put_live_table(md, srcu_idx); + return r; +} + +static int dm_pr_clear(struct block_device *bdev, u64 key) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + const struct pr_ops *ops; + struct dm_target *tgt; + fmode_t mode; + int srcu_idx, r; + + r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + if (r < 0) + return r; + + ops = bdev->bd_disk->fops->pr_ops; + if (ops && ops->pr_clear) + r = ops->pr_clear(bdev, key); + else + r = -EOPNOTSUPP; + + dm_put_live_table(md, srcu_idx); + return r; +} + +static const struct pr_ops dm_pr_ops = { + .pr_register = dm_pr_register, + .pr_reserve = dm_pr_reserve, + .pr_release = dm_pr_release, + .pr_preempt = dm_pr_preempt, + .pr_clear = dm_pr_clear, +}; + static const struct block_device_operations dm_blk_dops = { .open = dm_blk_open, .release = dm_blk_close, .ioctl = dm_blk_ioctl, .getgeo = dm_blk_getgeo, + .pr_ops = &dm_pr_ops, .owner = THIS_MODULE }; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 4e98499..7edcf97 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -78,8 +78,6 @@ bool dm_table_mq_request_based(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); -int dm_queue_merge_is_compulsory(struct request_queue *q); - void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); void dm_set_md_type(struct mapped_device *md, unsigned type); diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 1277eb2..4a8e150 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -70,7 +70,7 @@ #include <linux/seq_file.h> -static void faulty_fail(struct bio *bio, int error) +static void faulty_fail(struct bio *bio) { struct bio *b = bio->bi_private; @@ -181,7 +181,7 @@ static void make_request(struct mddev *mddev, struct bio *bio) /* special case - don't decrement, don't generic_make_request, * just fail immediately */ - bio_endio(bio, -EIO); + bio_io_error(bio); return; } diff --git a/drivers/md/linear.c b/drivers/md/linear.c index fa7d577..b7fe7e9 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -52,48 +52,6 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) return conf->disks + lo; } -/** - * linear_mergeable_bvec -- tell bio layer if two requests can be merged - * @q: request queue - * @bvm: properties of new bio - * @biovec: the request that could be merged to it. - * - * Return amount of bytes we can take at this offset - */ -static int linear_mergeable_bvec(struct mddev *mddev, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - struct dev_info *dev0; - unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; - sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); - int maxbytes = biovec->bv_len; - struct request_queue *subq; - - dev0 = which_dev(mddev, sector); - maxsectors = dev0->end_sector - sector; - subq = bdev_get_queue(dev0->rdev->bdev); - if (subq->merge_bvec_fn) { - bvm->bi_bdev = dev0->rdev->bdev; - bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors; - maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm, - biovec)); - } - - if (maxsectors < bio_sectors) - maxsectors = 0; - else - maxsectors -= bio_sectors; - - if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) - return maxbytes; - - if (maxsectors > (maxbytes >> 9)) - return maxbytes; - else - return maxsectors << 9; -} - static int linear_congested(struct mddev *mddev, int bits) { struct linear_conf *conf; @@ -297,7 +255,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) if (unlikely((split->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { /* Just ignore it */ - bio_endio(split, 0); + bio_endio(split); } else generic_make_request(split); } while (split != bio); @@ -338,7 +296,6 @@ static struct md_personality linear_personality = .size = linear_size, .quiesce = linear_quiesce, .congested = linear_congested, - .mergeable_bvec = linear_mergeable_bvec, }; static int __init linear_init (void) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 0072190..d6a1126 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -28,6 +28,7 @@ struct dlm_lock_resource { struct completion completion; /* completion for synchronized locking */ void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ struct mddev *mddev; /* pointing back to mddev. */ + int mode; }; struct suspend_info { @@ -45,6 +46,7 @@ struct resync_info { /* md_cluster_info flags */ #define MD_CLUSTER_WAITING_FOR_NEWDISK 1 #define MD_CLUSTER_SUSPEND_READ_BALANCING 2 +#define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3 struct md_cluster_info { @@ -52,9 +54,8 @@ struct md_cluster_info { dlm_lockspace_t *lockspace; int slot_number; struct completion completion; - struct dlm_lock_resource *sb_lock; - struct mutex sb_mutex; struct dlm_lock_resource *bitmap_lockres; + struct dlm_lock_resource *resync_lockres; struct list_head suspend_list; spinlock_t suspend_lock; struct md_thread *recovery_thread; @@ -75,23 +76,24 @@ enum msg_type { NEWDISK, REMOVE, RE_ADD, + BITMAP_NEEDS_SYNC, }; struct cluster_msg { - int type; - int slot; + __le32 type; + __le32 slot; /* TODO: Unionize this for smaller footprint */ - sector_t low; - sector_t high; + __le64 low; + __le64 high; char uuid[16]; - int raid_slot; + __le32 raid_slot; }; static void sync_ast(void *arg) { struct dlm_lock_resource *res; - res = (struct dlm_lock_resource *) arg; + res = arg; complete(&res->completion); } @@ -99,13 +101,14 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) { int ret = 0; - init_completion(&res->completion); ret = dlm_lock(res->ls, mode, &res->lksb, res->flags, res->name, strlen(res->name), 0, sync_ast, res, res->bast); if (ret) return ret; wait_for_completion(&res->completion); + if (res->lksb.sb_status == 0) + res->mode = mode; return res->lksb.sb_status; } @@ -124,8 +127,10 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev, res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); if (!res) return NULL; + init_completion(&res->completion); res->ls = cinfo->lockspace; res->mddev = mddev; + res->mode = DLM_LOCK_IV; namelen = strlen(name); res->name = kzalloc(namelen + 1, GFP_KERNEL); if (!res->name) { @@ -165,11 +170,24 @@ out_err: static void lockres_free(struct dlm_lock_resource *res) { + int ret; + if (!res) return; - init_completion(&res->completion); - dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res); + /* cancel a lock request or a conversion request that is blocked */ + res->flags |= DLM_LKF_CANCEL; +retry: + ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res); + if (unlikely(ret != 0)) { + pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret); + + /* if a lock conversion is cancelled, then the lock is put + * back to grant queue, need to ensure it is unlocked */ + if (ret == -DLM_ECANCEL) + goto retry; + } + res->flags &= ~DLM_LKF_CANCEL; wait_for_completion(&res->completion); kfree(res->name); @@ -177,20 +195,8 @@ static void lockres_free(struct dlm_lock_resource *res) kfree(res); } -static char *pretty_uuid(char *dest, char *src) -{ - int i, len = 0; - - for (i = 0; i < 16; i++) { - if (i == 4 || i == 6 || i == 8 || i == 10) - len += sprintf(dest + len, "-"); - len += sprintf(dest + len, "%02x", (__u8)src[i]); - } - return dest; -} - -static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres, - sector_t lo, sector_t hi) +static void add_resync_info(struct dlm_lock_resource *lockres, + sector_t lo, sector_t hi) { struct resync_info *ri; @@ -208,7 +214,7 @@ static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_loc dlm_lock_sync(lockres, DLM_LOCK_CR); memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); hi = le64_to_cpu(ri.hi); - if (ri.hi > 0) { + if (hi > 0) { s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); if (!s) goto out; @@ -281,16 +287,11 @@ static void recover_prep(void *arg) set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state); } -static void recover_slot(void *arg, struct dlm_slot *slot) +static void __recover_slot(struct mddev *mddev, int slot) { - struct mddev *mddev = arg; struct md_cluster_info *cinfo = mddev->cluster_info; - pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n", - mddev->bitmap_info.cluster_name, - slot->nodeid, slot->slot, - cinfo->slot_number); - set_bit(slot->slot - 1, &cinfo->recovery_map); + set_bit(slot, &cinfo->recovery_map); if (!cinfo->recovery_thread) { cinfo->recovery_thread = md_register_thread(recover_bitmaps, mddev, "recover"); @@ -302,6 +303,20 @@ static void recover_slot(void *arg, struct dlm_slot *slot) md_wakeup_thread(cinfo->recovery_thread); } +static void recover_slot(void *arg, struct dlm_slot *slot) +{ + struct mddev *mddev = arg; + struct md_cluster_info *cinfo = mddev->cluster_info; + + pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n", + mddev->bitmap_info.cluster_name, + slot->nodeid, slot->slot, + cinfo->slot_number); + /* deduct one since dlm slot starts from one while the num of + * cluster-md begins with 0 */ + __recover_slot(mddev, slot->slot - 1); +} + static void recover_done(void *arg, struct dlm_slot *slots, int num_slots, int our_slot, uint32_t generation) @@ -310,10 +325,17 @@ static void recover_done(void *arg, struct dlm_slot *slots, struct md_cluster_info *cinfo = mddev->cluster_info; cinfo->slot_number = our_slot; - complete(&cinfo->completion); + /* completion is only need to be complete when node join cluster, + * it doesn't need to run during another node's failure */ + if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) { + complete(&cinfo->completion); + clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); + } clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state); } +/* the ops is called when node join the cluster, and do lock recovery + * if node failure occurs */ static const struct dlm_lockspace_ops md_ls_ops = { .recover_prep = recover_prep, .recover_slot = recover_slot, @@ -327,7 +349,7 @@ static const struct dlm_lockspace_ops md_ls_ops = { */ static void ack_bast(void *arg, int mode) { - struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg; + struct dlm_lock_resource *res = arg; struct md_cluster_info *cinfo = res->mddev->cluster_info; if (mode == DLM_LOCK_EX) @@ -340,29 +362,32 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) if (slot == s->slot) { - pr_info("%s:%d Deleting suspend_info: %d\n", - __func__, __LINE__, slot); list_del(&s->list); kfree(s); break; } } -static void remove_suspend_info(struct md_cluster_info *cinfo, int slot) +static void remove_suspend_info(struct mddev *mddev, int slot) { + struct md_cluster_info *cinfo = mddev->cluster_info; spin_lock_irq(&cinfo->suspend_lock); __remove_suspend_info(cinfo, slot); spin_unlock_irq(&cinfo->suspend_lock); + mddev->pers->quiesce(mddev, 2); } -static void process_suspend_info(struct md_cluster_info *cinfo, +static void process_suspend_info(struct mddev *mddev, int slot, sector_t lo, sector_t hi) { + struct md_cluster_info *cinfo = mddev->cluster_info; struct suspend_info *s; if (!hi) { - remove_suspend_info(cinfo, slot); + remove_suspend_info(mddev, slot); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); return; } s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); @@ -371,11 +396,14 @@ static void process_suspend_info(struct md_cluster_info *cinfo, s->slot = slot; s->lo = lo; s->hi = hi; + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); spin_lock_irq(&cinfo->suspend_lock); /* Remove existing entry (if exists) before adding */ __remove_suspend_info(cinfo, slot); list_add(&s->list, &cinfo->suspend_list); spin_unlock_irq(&cinfo->suspend_lock); + mddev->pers->quiesce(mddev, 2); } static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) @@ -388,8 +416,8 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) int len; len = snprintf(disk_uuid, 64, "DEVICE_UUID="); - pretty_uuid(disk_uuid + len, cmsg->uuid); - snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot); + sprintf(disk_uuid + len, "%pU", cmsg->uuid); + snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot)); pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot); init_completion(&cinfo->newdisk_completion); set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); @@ -403,60 +431,60 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { struct md_cluster_info *cinfo = mddev->cluster_info; - - md_reload_sb(mddev); + md_reload_sb(mddev, le32_to_cpu(msg->raid_slot)); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); } static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) { - struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); + struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, + le32_to_cpu(msg->raid_slot)); if (rdev) md_kick_rdev_from_array(rdev); else - pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot); + pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", + __func__, __LINE__, le32_to_cpu(msg->raid_slot)); } static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) { - struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); + struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, + le32_to_cpu(msg->raid_slot)); if (rdev && test_bit(Faulty, &rdev->flags)) clear_bit(Faulty, &rdev->flags); else - pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot); + pr_warn("%s: %d Could not find disk(%d) which is faulty", + __func__, __LINE__, le32_to_cpu(msg->raid_slot)); } static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) { - switch (msg->type) { + if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot), + "node %d received it's own msg\n", le32_to_cpu(msg->slot))) + return; + switch (le32_to_cpu(msg->type)) { case METADATA_UPDATED: - pr_info("%s: %d Received message: METADATA_UPDATE from %d\n", - __func__, __LINE__, msg->slot); process_metadata_update(mddev, msg); break; case RESYNCING: - pr_info("%s: %d Received message: RESYNCING from %d\n", - __func__, __LINE__, msg->slot); - process_suspend_info(mddev->cluster_info, msg->slot, - msg->low, msg->high); + process_suspend_info(mddev, le32_to_cpu(msg->slot), + le64_to_cpu(msg->low), + le64_to_cpu(msg->high)); break; case NEWDISK: - pr_info("%s: %d Received message: NEWDISK from %d\n", - __func__, __LINE__, msg->slot); process_add_new_disk(mddev, msg); break; case REMOVE: - pr_info("%s: %d Received REMOVE from %d\n", - __func__, __LINE__, msg->slot); process_remove_disk(mddev, msg); break; case RE_ADD: - pr_info("%s: %d Received RE_ADD from %d\n", - __func__, __LINE__, msg->slot); process_readd_disk(mddev, msg); break; + case BITMAP_NEEDS_SYNC: + __recover_slot(mddev, le32_to_cpu(msg->slot)); + break; default: pr_warn("%s:%d Received unknown message from %d\n", __func__, __LINE__, msg->slot); @@ -472,6 +500,7 @@ static void recv_daemon(struct md_thread *thread) struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres; struct dlm_lock_resource *message_lockres = cinfo->message_lockres; struct cluster_msg msg; + int ret; /*get CR on Message*/ if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) { @@ -484,23 +513,37 @@ static void recv_daemon(struct md_thread *thread) process_recvd_msg(thread->mddev, &msg); /*release CR on ack_lockres*/ - dlm_unlock_sync(ack_lockres); - /*up-convert to EX on message_lockres*/ - dlm_lock_sync(message_lockres, DLM_LOCK_EX); + ret = dlm_unlock_sync(ack_lockres); + if (unlikely(ret != 0)) + pr_info("unlock ack failed return %d\n", ret); + /*up-convert to PR on message_lockres*/ + ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR); + if (unlikely(ret != 0)) + pr_info("lock PR on msg failed return %d\n", ret); /*get CR on ack_lockres again*/ - dlm_lock_sync(ack_lockres, DLM_LOCK_CR); + ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR); + if (unlikely(ret != 0)) + pr_info("lock CR on ack failed return %d\n", ret); /*release CR on message_lockres*/ - dlm_unlock_sync(message_lockres); + ret = dlm_unlock_sync(message_lockres); + if (unlikely(ret != 0)) + pr_info("unlock msg failed return %d\n", ret); } /* lock_comm() * Takes the lock on the TOKEN lock resource so no other * node can communicate while the operation is underway. + * If called again, and the TOKEN lock is alread in EX mode + * return success. However, care must be taken that unlock_comm() + * is called only once. */ static int lock_comm(struct md_cluster_info *cinfo) { int error; + if (cinfo->token_lockres->mode == DLM_LOCK_EX) + return 0; + error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); if (error) pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", @@ -510,6 +553,7 @@ static int lock_comm(struct md_cluster_info *cinfo) static void unlock_comm(struct md_cluster_info *cinfo) { + WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX); dlm_unlock_sync(cinfo->token_lockres); } @@ -519,7 +563,7 @@ static void unlock_comm(struct md_cluster_info *cinfo) * The function: * 1. Grabs the message lockresource in EX mode * 2. Copies the message to the message LVB - * 3. Downconverts message lockresource to CR + * 3. Downconverts message lockresource to CW * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes * and the other nodes read the message. The thread will wait here until all other * nodes have released ack lock resource. @@ -540,12 +584,12 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg, sizeof(struct cluster_msg)); - /*down-convert EX to CR on Message*/ - error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR); + /*down-convert EX to CW on Message*/ + error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW); if (error) { - pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n", + pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n", error); - goto failed_message; + goto failed_ack; } /*up-convert CR to EX on Ack*/ @@ -565,7 +609,13 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) } failed_ack: - dlm_unlock_sync(cinfo->message_lockres); + error = dlm_unlock_sync(cinfo->message_lockres); + if (unlikely(error != 0)) { + pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n", + error); + /* in case the message can't be released due to some reason */ + goto failed_ack; + } failed_message: return error; } @@ -587,6 +637,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) struct dlm_lock_resource *bm_lockres; struct suspend_info *s; char str[64]; + sector_t lo, hi; for (i = 0; i < total_slots; i++) { @@ -617,9 +668,24 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) lockres_free(bm_lockres); continue; } - if (ret) + if (ret) { + lockres_free(bm_lockres); goto out; - /* TODO: Read the disk bitmap sb and check if it needs recovery */ + } + + /* Read the disk bitmap sb and check if it needs recovery */ + ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false); + if (ret) { + pr_warn("md-cluster: Could not gather bitmaps from slot %d", i); + lockres_free(bm_lockres); + continue; + } + if ((hi > 0) && (lo < mddev->recovery_cp)) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + mddev->recovery_cp = lo; + md_check_recovery(mddev); + } + dlm_unlock_sync(bm_lockres); lockres_free(bm_lockres); } @@ -633,20 +699,19 @@ static int join(struct mddev *mddev, int nodes) int ret, ops_rv; char str[64]; - if (!try_module_get(THIS_MODULE)) - return -ENOENT; - cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL); if (!cinfo) return -ENOMEM; + INIT_LIST_HEAD(&cinfo->suspend_list); + spin_lock_init(&cinfo->suspend_lock); init_completion(&cinfo->completion); + set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); - mutex_init(&cinfo->sb_mutex); mddev->cluster_info = cinfo; memset(str, 0, 64); - pretty_uuid(str, mddev->uuid); + sprintf(str, "%pU", mddev->uuid); ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name, DLM_LSFL_FS, LVB_SIZE, &md_ls_ops, mddev, &ops_rv, &cinfo->lockspace); @@ -659,12 +724,6 @@ static int join(struct mddev *mddev, int nodes) ret = -ERANGE; goto err; } - cinfo->sb_lock = lockres_init(mddev, "cmd-super", - NULL, 0); - if (!cinfo->sb_lock) { - ret = -ENOMEM; - goto err; - } /* Initiate the communication resources */ ret = -ENOMEM; cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv"); @@ -705,8 +764,9 @@ static int join(struct mddev *mddev, int nodes) goto err; } - INIT_LIST_HEAD(&cinfo->suspend_list); - spin_lock_init(&cinfo->suspend_lock); + cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0); + if (!cinfo->resync_lockres) + goto err; ret = gather_all_resync_info(mddev, nodes); if (ret) @@ -718,29 +778,47 @@ err: lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); lockres_free(cinfo->no_new_dev_lockres); + lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); - lockres_free(cinfo->sb_lock); if (cinfo->lockspace) dlm_release_lockspace(cinfo->lockspace, 2); mddev->cluster_info = NULL; kfree(cinfo); - module_put(THIS_MODULE); return ret; } +static void resync_bitmap(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg = {0}; + int err; + + cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC); + err = sendmsg(cinfo, &cmsg); + if (err) + pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n", + __func__, __LINE__, err); +} + static int leave(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; if (!cinfo) return 0; + + /* BITMAP_NEEDS_SYNC message should be sent when node + * is leaving the cluster with dirty bitmap, also we + * can only deliver it when dlm connection is available */ + if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) + resync_bitmap(mddev); + md_unregister_thread(&cinfo->recovery_thread); md_unregister_thread(&cinfo->recv_thread); lockres_free(cinfo->message_lockres); lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); lockres_free(cinfo->no_new_dev_lockres); - lockres_free(cinfo->sb_lock); lockres_free(cinfo->bitmap_lockres); dlm_release_lockspace(cinfo->lockspace, 2); return 0; @@ -757,15 +835,6 @@ static int slot_number(struct mddev *mddev) return cinfo->slot_number - 1; } -static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) -{ - struct md_cluster_info *cinfo = mddev->cluster_info; - - add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi); - /* Re-acquire the lock to refresh LVB */ - dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); -} - static int metadata_update_start(struct mddev *mddev) { return lock_comm(mddev->cluster_info); @@ -775,50 +844,62 @@ static int metadata_update_finish(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; struct cluster_msg cmsg; - int ret; + struct md_rdev *rdev; + int ret = 0; + int raid_slot = -1; memset(&cmsg, 0, sizeof(cmsg)); cmsg.type = cpu_to_le32(METADATA_UPDATED); - ret = __sendmsg(cinfo, &cmsg); + /* Pick up a good active device number to send. + */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) { + raid_slot = rdev->desc_nr; + break; + } + if (raid_slot >= 0) { + cmsg.raid_slot = cpu_to_le32(raid_slot); + ret = __sendmsg(cinfo, &cmsg); + } else + pr_warn("md-cluster: No good device id found to send\n"); unlock_comm(cinfo); return ret; } -static int metadata_update_cancel(struct mddev *mddev) +static void metadata_update_cancel(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; + unlock_comm(cinfo); +} - return dlm_unlock_sync(cinfo->token_lockres); +static int resync_start(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE; + return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX); } -static int resync_send(struct mddev *mddev, enum msg_type type, - sector_t lo, sector_t hi) +static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; - struct cluster_msg cmsg; - int slot = cinfo->slot_number - 1; + struct cluster_msg cmsg = {0}; - pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__, - (unsigned long long)lo, - (unsigned long long)hi); - resync_info_update(mddev, lo, hi); - cmsg.type = cpu_to_le32(type); - cmsg.slot = cpu_to_le32(slot); + add_resync_info(cinfo->bitmap_lockres, lo, hi); + /* Re-acquire the lock to refresh LVB */ + dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); + cmsg.type = cpu_to_le32(RESYNCING); cmsg.low = cpu_to_le64(lo); cmsg.high = cpu_to_le64(hi); - return sendmsg(cinfo, &cmsg); -} -static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi) -{ - pr_info("%s:%d\n", __func__, __LINE__); - return resync_send(mddev, RESYNCING, lo, hi); + return sendmsg(cinfo, &cmsg); } -static void resync_finish(struct mddev *mddev) +static int resync_finish(struct mddev *mddev) { - pr_info("%s:%d\n", __func__, __LINE__); - resync_send(mddev, RESYNCING, 0, 0); + struct md_cluster_info *cinfo = mddev->cluster_info; + cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE; + dlm_unlock_sync(cinfo->resync_lockres); + return resync_info_update(mddev, 0, 0); } static int area_resyncing(struct mddev *mddev, int direction, @@ -845,7 +926,11 @@ out: return ret; } -static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) +/* add_new_disk() - initiates a disk add + * However, if this fails before writing md_update_sb(), + * add_new_disk_cancel() must be called to release token lock + */ +static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) { struct md_cluster_info *cinfo = mddev->cluster_info; struct cluster_msg cmsg; @@ -856,7 +941,7 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) memset(&cmsg, 0, sizeof(cmsg)); cmsg.type = cpu_to_le32(NEWDISK); memcpy(cmsg.uuid, uuid, 16); - cmsg.raid_slot = rdev->desc_nr; + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); lock_comm(cinfo); ret = __sendmsg(cinfo, &cmsg); if (ret) @@ -867,22 +952,17 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) /* Some node does not "see" the device */ if (ret == -EAGAIN) ret = -ENOENT; + if (ret) + unlock_comm(cinfo); else dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); return ret; } -static int add_new_disk_finish(struct mddev *mddev) +static void add_new_disk_cancel(struct mddev *mddev) { - struct cluster_msg cmsg; struct md_cluster_info *cinfo = mddev->cluster_info; - int ret; - /* Write sb and inform others */ - md_update_sb(mddev, 1); - cmsg.type = METADATA_UPDATED; - ret = __sendmsg(cinfo, &cmsg); unlock_comm(cinfo); - return ret; } static int new_disk_ack(struct mddev *mddev, bool ack) @@ -902,10 +982,10 @@ static int new_disk_ack(struct mddev *mddev, bool ack) static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) { - struct cluster_msg cmsg; + struct cluster_msg cmsg = {0}; struct md_cluster_info *cinfo = mddev->cluster_info; - cmsg.type = REMOVE; - cmsg.raid_slot = rdev->desc_nr; + cmsg.type = cpu_to_le32(REMOVE); + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); return __sendmsg(cinfo, &cmsg); } @@ -913,12 +993,12 @@ static int gather_bitmaps(struct md_rdev *rdev) { int sn, err; sector_t lo, hi; - struct cluster_msg cmsg; + struct cluster_msg cmsg = {0}; struct mddev *mddev = rdev->mddev; struct md_cluster_info *cinfo = mddev->cluster_info; - cmsg.type = RE_ADD; - cmsg.raid_slot = rdev->desc_nr; + cmsg.type = cpu_to_le32(RE_ADD); + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); err = sendmsg(cinfo, &cmsg); if (err) goto out; @@ -942,15 +1022,15 @@ static struct md_cluster_operations cluster_ops = { .join = join, .leave = leave, .slot_number = slot_number, - .resync_info_update = resync_info_update, .resync_start = resync_start, .resync_finish = resync_finish, + .resync_info_update = resync_info_update, .metadata_update_start = metadata_update_start, .metadata_update_finish = metadata_update_finish, .metadata_update_cancel = metadata_update_cancel, .area_resyncing = area_resyncing, - .add_new_disk_start = add_new_disk_start, - .add_new_disk_finish = add_new_disk_finish, + .add_new_disk = add_new_disk, + .add_new_disk_cancel = add_new_disk_cancel, .new_disk_ack = new_disk_ack, .remove_disk = remove_disk, .gather_bitmaps = gather_bitmaps, @@ -971,5 +1051,6 @@ static void cluster_exit(void) module_init(cluster_init); module_exit(cluster_exit); +MODULE_AUTHOR("SUSE"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Clustering support for MD"); diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 00defe2..e75ea26 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -12,15 +12,15 @@ struct md_cluster_operations { int (*join)(struct mddev *mddev, int nodes); int (*leave)(struct mddev *mddev); int (*slot_number)(struct mddev *mddev); - void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); - int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi); - void (*resync_finish)(struct mddev *mddev); + int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev); - int (*metadata_update_cancel)(struct mddev *mddev); + void (*metadata_update_cancel)(struct mddev *mddev); + int (*resync_start)(struct mddev *mddev); + int (*resync_finish)(struct mddev *mddev); int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi); - int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); - int (*add_new_disk_finish)(struct mddev *mddev); + int (*add_new_disk)(struct mddev *mddev, struct md_rdev *rdev); + void (*add_new_disk_cancel)(struct mddev *mddev); int (*new_disk_ack)(struct mddev *mddev, bool ack); int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); int (*gather_bitmaps)(struct md_rdev *rdev); diff --git a/drivers/md/md.c b/drivers/md/md.c index 0c2a4e8..3f9a514 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -257,13 +257,17 @@ static void md_make_request(struct request_queue *q, struct bio *bio) unsigned int sectors; int cpu; + blk_queue_split(q, &bio, q->bio_split); + if (mddev == NULL || mddev->pers == NULL || !mddev->ready) { bio_io_error(bio); return; } if (mddev->ro == 1 && unlikely(rw == WRITE)) { - bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS); + if (bio_sectors(bio) != 0) + bio->bi_error = -EROFS; + bio_endio(bio); return; } smp_rmb(); /* Ensure implications of 'active' are visible */ @@ -350,34 +354,11 @@ static int md_congested(void *data, int bits) return mddev_congested(mddev, bits); } -static int md_mergeable_bvec(struct request_queue *q, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - struct mddev *mddev = q->queuedata; - int ret; - rcu_read_lock(); - if (mddev->suspended) { - /* Must always allow one vec */ - if (bvm->bi_size == 0) - ret = biovec->bv_len; - else - ret = 0; - } else { - struct md_personality *pers = mddev->pers; - if (pers && pers->mergeable_bvec) - ret = pers->mergeable_bvec(mddev, bvm, biovec); - else - ret = biovec->bv_len; - } - rcu_read_unlock(); - return ret; -} /* * Generic flush handling for md */ -static void md_end_flush(struct bio *bio, int err) +static void md_end_flush(struct bio *bio) { struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; @@ -433,7 +414,7 @@ static void md_submit_flush_data(struct work_struct *ws) if (bio->bi_iter.bi_size == 0) /* an empty barrier - all done */ - bio_endio(bio, 0); + bio_endio(bio); else { bio->bi_rw &= ~REQ_FLUSH; mddev->pers->make_request(mddev, bio); @@ -502,6 +483,8 @@ static void mddev_put(struct mddev *mddev) bioset_free(bs); } +static void md_safemode_timeout(unsigned long data); + void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); @@ -509,7 +492,8 @@ void mddev_init(struct mddev *mddev) mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); - init_timer(&mddev->safemode_timer); + setup_timer(&mddev->safemode_timer, md_safemode_timeout, + (unsigned long) mddev); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); @@ -728,15 +712,13 @@ void md_rdev_clear(struct md_rdev *rdev) } EXPORT_SYMBOL_GPL(md_rdev_clear); -static void super_written(struct bio *bio, int error) +static void super_written(struct bio *bio) { struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; - if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { - printk("md: super_written gets error=%d, uptodate=%d\n", - error, test_bit(BIO_UPTODATE, &bio->bi_flags)); - WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); + if (bio->bi_error) { + printk("md: super_written gets error=%d\n", bio->bi_error); md_error(mddev, rdev); } @@ -791,7 +773,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, bio_add_page(bio, page, size, 0); submit_bio_wait(rw, bio); - ret = test_bit(BIO_UPTODATE, &bio->bi_flags); + ret = !bio->bi_error; bio_put(bio); return ret; } @@ -1626,7 +1608,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) ++ev1; if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && - le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) if (ev1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { @@ -1646,16 +1629,29 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) int role; if (rdev->desc_nr < 0 || rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { - role = 0xffff; + role = MD_DISK_ROLE_SPARE; rdev->desc_nr = -1; } else role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { - case 0xffff: /* spare */ + case MD_DISK_ROLE_SPARE: /* spare */ break; - case 0xfffe: /* faulty */ + case MD_DISK_ROLE_FAULTY: /* faulty */ set_bit(Faulty, &rdev->flags); break; + case MD_DISK_ROLE_JOURNAL: /* journal device */ + if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { + /* journal device without journal feature */ + printk(KERN_WARNING + "md: journal device provided without journal feature, ignoring the device\n"); + return -EINVAL; + } + set_bit(Journal, &rdev->flags); + rdev->journal_tail = le64_to_cpu(sb->journal_tail); + if (mddev->recovery_cp == MaxSector) + set_bit(MD_JOURNAL_CLEAN, &mddev->flags); + rdev->raid_disk = mddev->raid_disks; + break; default: rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) & @@ -1673,6 +1669,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) set_bit(WriteMostly, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) set_bit(Replacement, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) + set_bit(MD_HAS_JOURNAL, &mddev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); @@ -1697,6 +1695,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->events = cpu_to_le64(mddev->events); if (mddev->in_sync) sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) + sb->resync_offset = cpu_to_le64(MaxSector); else sb->resync_offset = cpu_to_le64(0); @@ -1720,7 +1720,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } - if (rdev->raid_disk >= 0 && + if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); @@ -1730,6 +1730,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); } + /* Note: recovery_offset and journal_tail share space */ + if (test_bit(Journal, &rdev->flags)) + sb->journal_tail = cpu_to_le64(rdev->journal_tail); if (test_bit(Replacement, &rdev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_REPLACEMENT); @@ -1753,6 +1756,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) } } + if (mddev_is_clustered(mddev)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); + if (rdev->badblocks.count == 0) /* Nothing to do for bad blocks*/ ; else if (sb->bblog_offset == 0) @@ -1803,18 +1809,23 @@ retry: max_dev = le32_to_cpu(sb->max_dev); for (i=0; i<max_dev;i++) - sb->dev_roles[i] = cpu_to_le16(0xfffe); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) - sb->dev_roles[i] = cpu_to_le16(0xfffe); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else if (test_bit(Journal, &rdev2->flags)) + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else - sb->dev_roles[i] = cpu_to_le16(0xffff); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); } sb->sb_csum = calc_sb_1_csum(sb); @@ -1930,13 +1941,23 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) struct md_rdev *rdev, *rdev2; rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev1) - rdev_for_each_rcu(rdev2, mddev2) + rdev_for_each_rcu(rdev, mddev1) { + if (test_bit(Faulty, &rdev->flags) || + test_bit(Journal, &rdev->flags) || + rdev->raid_disk == -1) + continue; + rdev_for_each_rcu(rdev2, mddev2) { + if (test_bit(Faulty, &rdev2->flags) || + test_bit(Journal, &rdev2->flags) || + rdev2->raid_disk == -1) + continue; if (rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { rcu_read_unlock(); return 1; } + } + } rcu_read_unlock(); return 0; } @@ -1980,12 +2001,9 @@ int md_integrity_register(struct mddev *mddev) * All component devices are integrity capable and have matching * profiles, register the common profile for the md device. */ - if (blk_integrity_register(mddev->gendisk, - bdev_get_integrity(reference->bdev)) != 0) { - printk(KERN_ERR "md: failed to register integrity for %s\n", - mdname(mddev)); - return -EINVAL; - } + blk_integrity_register(mddev->gendisk, + bdev_get_integrity(reference->bdev)); + printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { printk(KERN_ERR "md: failed to create integrity pool for %s\n", @@ -2015,6 +2033,7 @@ void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) if (bi_rdev && blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) >= 0) return; + WARN_ON_ONCE(!mddev->suspended); printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); blk_integrity_unregister(mddev->gendisk); } @@ -2214,23 +2233,77 @@ static void sync_sbs(struct mddev *mddev, int nospares) } } +static bool does_sb_need_changing(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct mdp_superblock_1 *sb; + int role; + + /* Find a good rdev */ + rdev_for_each(rdev, mddev) + if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) + break; + + /* No good device found. */ + if (!rdev) + return false; + + sb = page_address(rdev->sb_page); + /* Check if a device has become faulty or a spare become active */ + rdev_for_each(rdev, mddev) { + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + /* Device activated? */ + if (role == 0xffff && rdev->raid_disk >=0 && + !test_bit(Faulty, &rdev->flags)) + return true; + /* Device turned faulty? */ + if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) + return true; + } + + /* Check if any mddev parameters have changed */ + if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || + (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || + (mddev->layout != le64_to_cpu(sb->layout)) || + (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || + (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) + return true; + + return false; +} + void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; int sync_req; int nospares = 0; int any_badblocks_changed = 0; + int ret = -1; if (mddev->ro) { if (force_change) set_bit(MD_CHANGE_DEVS, &mddev->flags); return; } + + if (mddev_is_clustered(mddev)) { + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + force_change = 1; + ret = md_cluster_ops->metadata_update_start(mddev); + /* Has someone else has updated the sb */ + if (!does_sb_need_changing(mddev)) { + if (ret == 0) + md_cluster_ops->metadata_update_cancel(mddev); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + return; + } + } repeat: /* First make sure individual recovery_offsets are correct */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) rdev->recovery_offset = mddev->curr_resync_completed; @@ -2374,6 +2447,9 @@ repeat: clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } + + if (mddev_is_clustered(mddev) && ret == 0) + md_cluster_ops->metadata_update_finish(mddev); } EXPORT_SYMBOL(md_update_sb); @@ -2449,6 +2525,10 @@ state_show(struct md_rdev *rdev, char *page) len += sprintf(page+len, "%sin_sync",sep); sep = ","; } + if (test_bit(Journal, &flags)) { + len += sprintf(page+len, "%sjournal",sep); + sep = ","; + } if (test_bit(WriteMostly, &flags)) { len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; @@ -2460,6 +2540,7 @@ state_show(struct md_rdev *rdev, char *page) sep = ","; } if (!test_bit(Faulty, &flags) && + !test_bit(Journal, &flags) && !test_bit(In_sync, &flags)) { len += sprintf(page+len, "%sspare", sep); sep = ","; @@ -2508,17 +2589,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) err = -EBUSY; else { struct mddev *mddev = rdev->mddev; - if (mddev_is_clustered(mddev)) - md_cluster_ops->remove_disk(mddev, rdev); - md_kick_rdev_from_array(rdev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); - if (mddev->pers) - md_update_sb(mddev, 1); - md_new_event(mddev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); err = 0; + if (mddev_is_clustered(mddev)) + err = md_cluster_ops->remove_disk(mddev, rdev); + + if (err == 0) { + md_kick_rdev_from_array(rdev); + if (mddev->pers) + md_update_sb(mddev, 1); + md_new_event(mddev); + } } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); @@ -2547,7 +2627,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { set_bit(In_sync, &rdev->flags); err = 0; - } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) { + } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags)) { if (rdev->mddev->pers == NULL) { clear_bit(In_sync, &rdev->flags); rdev->saved_raid_disk = rdev->raid_disk; @@ -2566,6 +2647,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * check if recovery is needed. */ if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(Replacement, &rdev->flags)) set_bit(WantReplacement, &rdev->flags); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); @@ -2643,7 +2725,9 @@ __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); static ssize_t slot_show(struct md_rdev *rdev, char *page) { - if (rdev->raid_disk < 0) + if (test_bit(Journal, &rdev->flags)) + return sprintf(page, "journal\n"); + else if (rdev->raid_disk < 0) return sprintf(page, "none\n"); else return sprintf(page, "%d\n", rdev->raid_disk); @@ -2655,6 +2739,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) int slot; int err; + if (test_bit(Journal, &rdev->flags)) + return -EBUSY; if (strncmp(buf, "none", 4)==0) slot = -1; else { @@ -2706,15 +2792,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); - err = rdev->mddev->pers-> - hot_add_disk(rdev->mddev, rdev); - if (err) { - rdev->raid_disk = -1; - return err; - } else - sysfs_notify_dirent_safe(rdev->sysfs_state); - if (sysfs_link_rdev(rdev->mddev, rdev)) - /* failure here is OK */; + remove_and_add_spares(rdev->mddev, rdev); + if (rdev->raid_disk == -1) + return -EBUSY; /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks && @@ -2859,6 +2939,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) sector_t oldsectors = rdev->sectors; sector_t sectors; + if (test_bit(Journal, &rdev->flags)) + return -EBUSY; if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; if (rdev->data_offset != rdev->new_data_offset) @@ -3216,20 +3298,14 @@ static void analyze_sbs(struct mddev *mddev) md_kick_rdev_from_array(rdev); continue; } - /* No device should have a Candidate flag - * when reading devices - */ - if (test_bit(Candidate, &rdev->flags)) { - pr_info("md: kicking Cluster Candidate %s from array!\n", - bdevname(rdev->bdev, b)); - md_kick_rdev_from_array(rdev); - } } if (mddev->level == LEVEL_MULTIPATH) { rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; set_bit(In_sync, &rdev->flags); - } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { + } else if (rdev->raid_disk >= + (mddev->raid_disks - min(0, mddev->delta_disks)) && + !test_bit(Journal, &rdev->flags)) { rdev->raid_disk = -1; clear_bit(In_sync, &rdev->flags); } @@ -3276,8 +3352,6 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) return 0; } -static void md_safemode_timeout(unsigned long data); - static ssize_t safe_delay_show(struct mddev *mddev, char *page) { @@ -3289,6 +3363,11 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) { unsigned long msec; + if (mddev_is_clustered(mddev)) { + pr_info("md: Safemode is disabled for clustered mode\n"); + return -EINVAL; + } + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) return -EINVAL; if (msec == 0) @@ -3889,7 +3968,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) break; case clean: if (mddev->pers) { - restart_array(mddev); + err = restart_array(mddev); + if (err) + break; spin_lock(&mddev->lock); if (atomic_read(&mddev->writes_pending) == 0) { if (mddev->in_sync == 0) { @@ -3907,7 +3988,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) break; case active: if (mddev->pers) { - restart_array(mddev); + err = restart_array(mddev); + if (err) + break; clear_bit(MD_CHANGE_PENDING, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; @@ -4086,12 +4169,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; if (mddev->pers) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); err = update_size(mddev, sectors); md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); } else { if (mddev->dev_sectors == 0 || mddev->dev_sectors > sectors) @@ -4210,6 +4289,8 @@ action_show(struct mddev *mddev, char *page) type = "repair"; } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) type = "recover"; + else if (mddev->reshape_position != MaxSector) + type = "reshape"; } return sprintf(page, "%s\n", type); } @@ -5186,7 +5267,6 @@ int md_run(struct mddev *mddev) if (mddev->queue) { mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = md_congested; - blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec); } if (pers->sync_request) { if (mddev->kobj.sd && @@ -5202,9 +5282,10 @@ int md_run(struct mddev *mddev) atomic_set(&mddev->max_corr_read_errors, MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; - mddev->safemode_timer.function = md_safemode_timeout; - mddev->safemode_timer.data = (unsigned long) mddev; - mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ + if (mddev_is_clustered(mddev)) + mddev->safemode_delay = 0; + else + mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; smp_wmb(); spin_lock(&mddev->lock); @@ -5216,6 +5297,11 @@ int md_run(struct mddev *mddev) if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; + if (mddev->degraded && !mddev->ro) + /* This ensures that recovering status is reported immediately + * via sysfs - until a lack of spares is confirmed. + */ + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); if (mddev->flags & MD_UPDATE_SB_FLAGS) @@ -5242,6 +5328,9 @@ static int do_md_run(struct mddev *mddev) goto out; } + if (mddev_is_clustered(mddev)) + md_allow_write(mddev); + md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ @@ -5264,6 +5353,25 @@ static int restart_array(struct mddev *mddev) return -EINVAL; if (!mddev->ro) return -EBUSY; + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + struct md_rdev *rdev; + bool has_journal = false; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) { + has_journal = true; + break; + } + } + rcu_read_unlock(); + + /* Don't restart rw with journal missing/faulty */ + if (!has_journal) + return -EINVAL; + } + mddev->safemode = 0; mddev->ro = 0; set_disk_ro(disk, 0); @@ -5315,7 +5423,6 @@ static void md_clean(struct mddev *mddev) mddev->degraded = 0; mddev->safemode = 0; mddev->private = NULL; - mddev->merge_check_needed = 0; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; mddev->bitmap_info.default_space = 0; @@ -5326,8 +5433,6 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); flush_workqueue(md_misc_wq); if (mddev->sync_thread) { @@ -5341,13 +5446,13 @@ static void __md_stop_writes(struct mddev *mddev) md_super_wait(mddev); if (mddev->ro == 0 && - (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) { + ((!mddev->in_sync && !mddev_is_clustered(mddev)) || + (mddev->flags & MD_UPDATE_SB_FLAGS))) { /* mark array as shutdown cleanly */ - mddev->in_sync = 1; + if (!mddev_is_clustered(mddev)) + mddev->in_sync = 1; md_update_sb(mddev, 1); } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); } void md_stop_writes(struct mddev *mddev) @@ -5426,9 +5531,13 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) * which will now never happen */ wake_up_process(mddev->sync_thread->tsk); + if (mddev->external && test_bit(MD_CHANGE_PENDING, &mddev->flags)) + return -EBUSY; mddev_unlock(mddev); wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); mddev_lock_nointr(mddev); mutex_lock(&mddev->open_mutex); @@ -5514,7 +5623,6 @@ static int do_md_stop(struct mddev *mddev, int mode, __md_stop_writes(mddev); __md_stop(mddev); - mddev->queue->merge_bvec_fn = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; /* tell userspace to handle 'inactive' */ @@ -5556,7 +5664,6 @@ static int do_md_stop(struct mddev *mddev, int mode, if (mddev->hold_active == UNTIL_STOP) mddev->hold_active = 0; } - blk_integrity_unregister(disk); md_new_event(mddev); sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; @@ -5759,22 +5866,22 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg) char *ptr; int err; - file = kmalloc(sizeof(*file), GFP_NOIO); + file = kzalloc(sizeof(*file), GFP_NOIO); if (!file) return -ENOMEM; err = 0; spin_lock(&mddev->lock); - /* bitmap disabled, zero the first byte and copy out */ - if (!mddev->bitmap_info.file) - file->pathname[0] = '\0'; - else if ((ptr = file_path(mddev->bitmap_info.file, - file->pathname, sizeof(file->pathname))), - IS_ERR(ptr)) - err = PTR_ERR(ptr); - else - memmove(file->pathname, ptr, - sizeof(file->pathname)-(ptr-file->pathname)); + /* bitmap enabled */ + if (mddev->bitmap_info.file) { + ptr = file_path(mddev->bitmap_info.file, file->pathname, + sizeof(file->pathname)); + if (IS_ERR(ptr)) + err = PTR_ERR(ptr); + else + memmove(file->pathname, ptr, + sizeof(file->pathname)-(ptr-file->pathname)); + } spin_unlock(&mddev->lock); if (err == 0 && @@ -5806,6 +5913,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) info.state |= (1<<MD_DISK_ACTIVE); info.state |= (1<<MD_DISK_SYNC); } + if (test_bit(Journal, &rdev->flags)) + info.state |= (1<<MD_DISK_JOURNAL); if (test_bit(WriteMostly, &rdev->flags)) info.state |= (1<<MD_DISK_WRITEMOSTLY); } else { @@ -5920,23 +6029,18 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) else clear_bit(WriteMostly, &rdev->flags); + if (info->state & (1<<MD_DISK_JOURNAL)) + set_bit(Journal, &rdev->flags); /* * check whether the device shows up in other nodes */ if (mddev_is_clustered(mddev)) { - if (info->state & (1 << MD_DISK_CANDIDATE)) { - /* Through --cluster-confirm */ + if (info->state & (1 << MD_DISK_CANDIDATE)) set_bit(Candidate, &rdev->flags); - err = md_cluster_ops->new_disk_ack(mddev, true); - if (err) { - export_rdev(rdev); - return err; - } - } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { + else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { /* --add initiated by this node */ - err = md_cluster_ops->add_new_disk_start(mddev, rdev); + err = md_cluster_ops->add_new_disk(mddev, rdev); if (err) { - md_cluster_ops->add_new_disk_finish(mddev); export_rdev(rdev); return err; } @@ -5945,13 +6049,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); + if (err) export_rdev(rdev); - else + + if (mddev_is_clustered(mddev)) { + if (info->state & (1 << MD_DISK_CANDIDATE)) + md_cluster_ops->new_disk_ack(mddev, (err == 0)); + else { + if (err) + md_cluster_ops->add_new_disk_cancel(mddev); + else + err = add_bound_rdev(rdev); + } + + } else if (!err) err = add_bound_rdev(rdev); - if (mddev_is_clustered(mddev) && - (info->state & (1 << MD_DISK_CLUSTER_ADD))) - md_cluster_ops->add_new_disk_finish(mddev); + return err; } @@ -6007,13 +6121,17 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) { char b[BDEVNAME_SIZE]; struct md_rdev *rdev; + int ret = -1; rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); + ret = md_cluster_ops->metadata_update_start(mddev); + + if (rdev->raid_disk < 0) + goto kick_rdev; clear_bit(Blocked, &rdev->flags); remove_and_add_spares(mddev, rdev); @@ -6021,20 +6139,19 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) if (rdev->raid_disk >= 0) goto busy; - if (mddev_is_clustered(mddev)) +kick_rdev: + if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->remove_disk(mddev, rdev); md_kick_rdev_from_array(rdev); md_update_sb(mddev, 1); md_new_event(mddev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); - return 0; busy: - if (mddev_is_clustered(mddev)) + if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->metadata_update_cancel(mddev); + printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", bdevname(rdev->bdev,b), mdname(mddev)); return -EBUSY; @@ -6085,14 +6202,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) goto abort_export; } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; rdev->saved_raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) - goto abort_clustered; + goto abort_export; /* * The rest should better be atomic, we can have disk failures @@ -6102,9 +6217,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) rdev->raid_disk = -1; md_update_sb(mddev, 1); - - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); /* * Kick recovery, maybe this spare has to be added to the * array immediately. @@ -6114,9 +6226,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) md_new_event(mddev); return 0; -abort_clustered: - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_cancel(mddev); abort_export: export_rdev(rdev); return err; @@ -6434,8 +6543,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) return rv; } } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); @@ -6493,12 +6600,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) } } md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); return rv; err: - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_cancel(mddev); return rv; } @@ -7093,7 +7196,7 @@ static void status_unused(struct seq_file *seq) seq_printf(seq, "\n"); } -static void status_resync(struct seq_file *seq, struct mddev *mddev) +static int status_resync(struct seq_file *seq, struct mddev *mddev) { sector_t max_sectors, resync, res; unsigned long dt, db; @@ -7101,18 +7204,32 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev) int scale; unsigned int per_milli; - if (mddev->curr_resync <= 3) - resync = 0; - else - resync = mddev->curr_resync - - atomic_read(&mddev->recovery_active); - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; else max_sectors = mddev->dev_sectors; + resync = mddev->curr_resync; + if (resync <= 3) { + if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + /* Still cleaning up */ + resync = max_sectors; + } else + resync -= atomic_read(&mddev->recovery_active); + + if (resync == 0) { + if (mddev->recovery_cp < MaxSector) { + seq_printf(seq, "\tresync=PENDING"); + return 1; + } + return 0; + } + if (resync < 3) { + seq_printf(seq, "\tresync=DELAYED"); + return 1; + } + WARN_ON(max_sectors == 0); /* Pick 'scale' such that (resync>>scale)*1000 will fit * in a sector_t, and (max_sectors>>scale) will fit in a @@ -7177,6 +7294,7 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev) ((unsigned long)rt % 60)/6); seq_printf(seq, " speed=%ldK/sec", db/2/dt); + return 1; } static void *md_seq_start(struct seq_file *seq, loff_t *pos) @@ -7284,6 +7402,8 @@ static int md_seq_show(struct seq_file *seq, void *v) bdevname(rdev->bdev,b), rdev->desc_nr); if (test_bit(WriteMostly, &rdev->flags)) seq_printf(seq, "(W)"); + if (test_bit(Journal, &rdev->flags)) + seq_printf(seq, "(J)"); if (test_bit(Faulty, &rdev->flags)) { seq_printf(seq, "(F)"); continue; @@ -7322,13 +7442,8 @@ static int md_seq_show(struct seq_file *seq, void *v) mddev->pers->status(seq, mddev); seq_printf(seq, "\n "); if (mddev->pers->sync_request) { - if (mddev->curr_resync > 2) { - status_resync(seq, mddev); + if (status_resync(seq, mddev)) seq_printf(seq, "\n "); - } else if (mddev->curr_resync >= 1) - seq_printf(seq, "\tresync=DELAYED\n "); - else if (mddev->recovery_cp < MaxSector) - seq_printf(seq, "\tresync=PENDING\n "); } } else seq_printf(seq, "\n "); @@ -7411,15 +7526,19 @@ int unregister_md_personality(struct md_personality *p) } EXPORT_SYMBOL(unregister_md_personality); -int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module) +int register_md_cluster_operations(struct md_cluster_operations *ops, + struct module *module) { - if (md_cluster_ops != NULL) - return -EALREADY; + int ret = 0; spin_lock(&pers_lock); - md_cluster_ops = ops; - md_cluster_mod = module; + if (md_cluster_ops != NULL) + ret = -EALREADY; + else { + md_cluster_ops = ops; + md_cluster_mod = module; + } spin_unlock(&pers_lock); - return 0; + return ret; } EXPORT_SYMBOL(register_md_cluster_operations); @@ -7597,11 +7716,7 @@ int md_allow_write(struct mddev *mddev) mddev->safemode == 0) mddev->safemode = 1; spin_unlock(&mddev->lock); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); md_update_sb(mddev, 0); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); sysfs_notify_dirent_safe(mddev->sysfs_state); } else spin_unlock(&mddev->lock); @@ -7633,6 +7748,7 @@ void md_do_sync(struct md_thread *thread) struct md_rdev *rdev; char *desc, *action = NULL; struct blk_plug plug; + bool cluster_resync_finished = false; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -7742,6 +7858,7 @@ void md_do_sync(struct md_thread *thread) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < j) @@ -7802,9 +7919,6 @@ void md_do_sync(struct md_thread *thread) md_new_event(mddev); update_time = jiffies; - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_start(mddev, j, max_sectors); - blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; @@ -7817,7 +7931,8 @@ void md_do_sync(struct md_thread *thread) > (max_sectors >> 4)) || time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || (j - mddev->curr_resync_completed)*2 - >= mddev->resync_max - mddev->curr_resync_completed + >= mddev->resync_max - mddev->curr_resync_completed || + mddev->curr_resync_completed > mddev->resync_max )) { /* time to update curr_resync_completed */ wait_event(mddev->recovery_wait, @@ -7862,10 +7977,11 @@ void md_do_sync(struct md_thread *thread) break; j += sectors; + if (j > max_sectors) + /* when skipping, extra large numbers can be returned. */ + j = max_sectors; if (j > 2) mddev->curr_resync = j; - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_info_update(mddev, j, max_sectors); mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be @@ -7930,11 +8046,18 @@ void md_do_sync(struct md_thread *thread) blk_finish_plug(&plug); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); - /* tell personality that we are finished */ - mddev->pers->sync_request(mddev, max_sectors, &skipped); - - if (mddev_is_clustered(mddev)) + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + mddev->curr_resync > 2) { + mddev->curr_resync_completed = mddev->curr_resync; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + } + /* tell personality and other nodes that we are finished */ + if (mddev_is_clustered(mddev)) { md_cluster_ops->resync_finish(mddev); + cluster_resync_finished = true; + } + mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > 2) { @@ -7961,6 +8084,7 @@ void md_do_sync(struct md_thread *thread) rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) @@ -7971,6 +8095,11 @@ void md_do_sync(struct md_thread *thread) skip: set_bit(MD_CHANGE_DEVS, &mddev->flags); + if (mddev_is_clustered(mddev) && + test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !cluster_resync_finished) + md_cluster_ops->resync_finish(mddev); + spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ @@ -7979,11 +8108,11 @@ void md_do_sync(struct md_thread *thread) mddev->resync_max = MaxSector; } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; + set_bit(MD_RECOVERY_DONE, &mddev->recovery); mddev->curr_resync = 0; spin_unlock(&mddev->lock); wake_up(&resync_wait); - set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); return; } @@ -8001,7 +8130,8 @@ static int remove_and_add_spares(struct mddev *mddev, rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || - ! test_bit(In_sync, &rdev->flags)) && + (!test_bit(In_sync, &rdev->flags) && + !test_bit(Journal, &rdev->flags))) && atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( mddev, rdev) == 0) { @@ -8013,25 +8143,31 @@ static int remove_and_add_spares(struct mddev *mddev, if (removed && mddev->kobj.sd) sysfs_notify(&mddev->kobj, NULL, "degraded"); - if (this) + if (this && removed) goto no_add; rdev_for_each(rdev, mddev) { + if (this && this != rdev) + continue; + if (test_bit(Candidate, &rdev->flags)) + continue; if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && + !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) spares++; if (rdev->raid_disk >= 0) continue; if (test_bit(Faulty, &rdev->flags)) continue; + if (test_bit(Journal, &rdev->flags)) + continue; if (mddev->ro && ! (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags))) continue; - if (rdev->saved_raid_disk < 0) - rdev->recovery_offset = 0; + rdev->recovery_offset = 0; if (mddev->pers-> hot_add_disk(mddev, rdev) == 0) { if (sysfs_link_rdev(mddev, rdev)) @@ -8050,14 +8186,25 @@ no_add: static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); + int ret = 0; + + if (mddev_is_clustered(mddev)) { + ret = md_cluster_ops->resync_start(mddev); + if (ret) { + mddev->sync_thread = NULL; + goto out; + } + } mddev->sync_thread = md_register_thread(md_do_sync, mddev, "resync"); +out: if (!mddev->sync_thread) { - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); + if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) + printk(KERN_ERR "%s: could not start resync" + " thread...\n", + mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -8152,7 +8299,9 @@ void md_check_recovery(struct mddev *mddev) */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); goto unlock; } @@ -8174,13 +8323,8 @@ void md_check_recovery(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); } - if (mddev->flags & MD_UPDATE_SB_FLAGS) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); + if (mddev->flags & MD_UPDATE_SB_FLAGS) md_update_sb(mddev, 0); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); - } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { @@ -8278,8 +8422,6 @@ void md_reap_sync_thread(struct mddev *mddev) set_bit(MD_CHANGE_DEVS, &mddev->flags); } } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && mddev->pers->finish_reshape) mddev->pers->finish_reshape(mddev); @@ -8292,8 +8434,6 @@ void md_reap_sync_thread(struct mddev *mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -8598,6 +8738,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, /* Make sure they get written out promptly */ sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); + set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); md_wakeup_thread(rdev->mddev->thread); } return rv; @@ -8915,25 +9056,128 @@ err_wq: return ret; } -void md_reload_sb(struct mddev *mddev) +static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) { - struct md_rdev *rdev, *tmp; + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + struct md_rdev *rdev2; + int role, ret; + char b[BDEVNAME_SIZE]; - rdev_for_each_safe(rdev, tmp, mddev) { - rdev->sb_loaded = 0; - ClearPageUptodate(rdev->sb_page); + /* Check for change of roles in the active devices */ + rdev_for_each(rdev2, mddev) { + if (test_bit(Faulty, &rdev2->flags)) + continue; + + /* Check if the roles changed */ + role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); + + if (test_bit(Candidate, &rdev2->flags)) { + if (role == 0xfffe) { + pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b)); + md_kick_rdev_from_array(rdev2); + continue; + } + else + clear_bit(Candidate, &rdev2->flags); + } + + if (role != rdev2->raid_disk) { + /* got activated */ + if (rdev2->raid_disk == -1 && role != 0xffff) { + rdev2->saved_raid_disk = role; + ret = remove_and_add_spares(mddev, rdev2); + pr_info("Activated spare: %s\n", + bdevname(rdev2->bdev,b)); + continue; + } + /* device faulty + * We just want to do the minimum to mark the disk + * as faulty. The recovery is performed by the + * one who initiated the error. + */ + if ((role == 0xfffe) || (role == 0xfffd)) { + md_error(mddev, rdev2); + clear_bit(Blocked, &rdev2->flags); + } + } } - mddev->raid_disks = 0; - analyze_sbs(mddev); - rdev_for_each_safe(rdev, tmp, mddev) { - struct mdp_superblock_1 *sb = page_address(rdev->sb_page); - /* since we don't write to faulty devices, we figure out if the - * disk is faulty by comparing events - */ - if (mddev->events > sb->events) - set_bit(Faulty, &rdev->flags); + + if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) + update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); + + /* Finally set the event to be up to date */ + mddev->events = le64_to_cpu(sb->events); +} + +static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + int err; + struct page *swapout = rdev->sb_page; + struct mdp_superblock_1 *sb; + + /* Store the sb page of the rdev in the swapout temporary + * variable in case we err in the future + */ + rdev->sb_page = NULL; + alloc_disk_sb(rdev); + ClearPageUptodate(rdev->sb_page); + rdev->sb_loaded = 0; + err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); + + if (err < 0) { + pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", + __func__, __LINE__, rdev->desc_nr, err); + put_page(rdev->sb_page); + rdev->sb_page = swapout; + rdev->sb_loaded = 1; + return err; + } + + sb = page_address(rdev->sb_page); + /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET + * is not set + */ + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + + /* The other node finished recovery, call spare_active to set + * device In_sync and mddev->degraded + */ + if (rdev->recovery_offset == MaxSector && + !test_bit(In_sync, &rdev->flags) && + mddev->pers->spare_active(mddev)) + sysfs_notify(&mddev->kobj, NULL, "degraded"); + + put_page(swapout); + return 0; +} + +void md_reload_sb(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev; + int err; + + /* Find the rdev */ + rdev_for_each_rcu(rdev, mddev) { + if (rdev->desc_nr == nr) + break; } + if (!rdev || rdev->desc_nr != nr) { + pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); + return; + } + + err = read_rdev(mddev, rdev); + if (err < 0) + return; + + check_sb_changes(mddev, rdev); + + /* Read all rdev's to update recovery_offset */ + rdev_for_each_rcu(rdev, mddev) + read_rdev(mddev, rdev); } EXPORT_SYMBOL(md_reload_sb); diff --git a/drivers/md/md.h b/drivers/md/md.h index 7da6e9c..2bea51e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -87,10 +87,16 @@ struct md_rdev { * array and could again if we did a partial * resync from the bitmap */ - sector_t recovery_offset;/* If this device has been partially + union { + sector_t recovery_offset;/* If this device has been partially * recovered, this is where we were * up to. */ + sector_t journal_tail; /* If this device is a journal device, + * this is the journal tail (journal + * recovery start point) + */ + }; atomic_t nr_pending; /* number of pending requests. * only maintained for arrays that @@ -134,10 +140,6 @@ enum flag_bits { Bitmap_sync, /* ..actually, not quite In_sync. Need a * bitmap-based recovery to get fully in sync */ - Unmerged, /* device is being added to array and should - * be considerred for bvec_merge_fn but not - * yet for actual IO - */ WriteMostly, /* Avoid reading if at all possible */ AutoDetected, /* added by auto-detect */ Blocked, /* An error occurred but has not yet @@ -176,6 +178,11 @@ enum flag_bits { * This device is seen locally but not * by the whole cluster */ + Journal, /* This device is used as journal for + * raid-5/6. + * Usually, this device should be faster + * than other devices in the array + */ }; #define BB_LEN_MASK (0x00000000000001FFULL) @@ -225,6 +232,8 @@ struct mddev { #define MD_STILL_CLOSED 4 /* If set, then array has not been opened since * md_ioctl checked on it. */ +#define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */ +#define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */ int suspended; atomic_t active_io; @@ -374,10 +383,6 @@ struct mddev { int degraded; /* whether md should consider * adding a spare */ - int merge_check_needed; /* at least one - * member device - * has a - * merge_bvec_fn */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; @@ -532,10 +537,6 @@ struct md_personality /* congested implements bdi.congested_fn(). * Will not be called while array is 'suspended' */ int (*congested)(struct mddev *mddev, int bits); - /* mergeable_bvec is use to implement ->merge_bvec_fn */ - int (*mergeable_bvec)(struct mddev *mddev, - struct bvec_merge_data *bvm, - struct bio_vec *biovec); }; struct md_sysfs_entry { @@ -670,7 +671,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); -extern void md_reload_sb(struct mddev *mddev); +extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index ac3ede2..7331a80 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -77,18 +77,18 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) struct bio *bio = mp_bh->master_bio; struct mpconf *conf = mp_bh->mddev->private; - bio_endio(bio, err); + bio->bi_error = err; + bio_endio(bio); mempool_free(mp_bh, conf->pool); } -static void multipath_end_request(struct bio *bio, int error) +static void multipath_end_request(struct bio *bio) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct multipath_bh *mp_bh = bio->bi_private; struct mpconf *conf = mp_bh->mddev->private; struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev; - if (uptodate) + if (!bio->bi_error) multipath_end_bh_io(mp_bh, 0); else if (!(bio->bi_rw & REQ_RAHEAD)) { /* @@ -101,7 +101,7 @@ static void multipath_end_request(struct bio *bio, int error) (unsigned long long)bio->bi_iter.bi_sector); multipath_reschedule_retry(mp_bh); } else - multipath_end_bh_io(mp_bh, error); + multipath_end_bh_io(mp_bh, bio->bi_error); rdev_dec_pending(rdev, conf->mddev); } @@ -123,7 +123,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) mp_bh->path = multipath_map(conf); if (mp_bh->path < 0) { - bio_endio(bio, -EIO); + bio_io_error(bio); mempool_free(mp_bh, conf->pool); return; } @@ -257,18 +257,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_segments to one, lying - * within a single page. - * (Note: it is very unlikely that a device with - * merge_bvec_fn will be involved in multipath.) - */ - if (q->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } - spin_lock_irq(&conf->device_lock); mddev->degraded--; rdev->raid_disk = path; @@ -276,7 +264,9 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) spin_unlock_irq(&conf->device_lock); rcu_assign_pointer(p->rdev, rdev); err = 0; + mddev_suspend(mddev); md_integrity_add_rdev(rdev, mddev); + mddev_resume(mddev); break; } @@ -432,15 +422,6 @@ static int multipath_run (struct mddev *mddev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, not that we ever expect a device with - * a merge_bvec_fn to be involved in multipath */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } - if (!test_bit(Faulty, &rdev->flags)) working_disks++; } @@ -491,8 +472,7 @@ static int multipath_run (struct mddev *mddev) return 0; out_free_conf: - if (conf->pool) - mempool_destroy(conf->pool); + mempool_destroy(conf->pool); kfree(conf->multipaths); kfree(conf); mddev->private = NULL; diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index e64b61ad..431a030 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -233,9 +233,9 @@ static int get_ablock(struct dm_array_info *info, dm_block_t b, /* * Unlocks an array block. */ -static int unlock_ablock(struct dm_array_info *info, struct dm_block *block) +static void unlock_ablock(struct dm_array_info *info, struct dm_block *block) { - return dm_tm_unlock(info->btree_info.tm, block); + dm_tm_unlock(info->btree_info.tm, block); } /*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 4d6c9b6..f2393ba 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -454,7 +454,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, int r; p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); - if (unlikely(IS_ERR(p))) + if (IS_ERR(p)) return PTR_ERR(p); aux = dm_bufio_get_aux_data(to_buffer(*result)); @@ -490,7 +490,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm, return -EPERM; p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); - if (unlikely(IS_ERR(p))) + if (IS_ERR(p)) return PTR_ERR(p); aux = dm_bufio_get_aux_data(to_buffer(*result)); @@ -523,7 +523,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm, int r; p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result); - if (unlikely(IS_ERR(p))) + if (IS_ERR(p)) return PTR_ERR(p); if (unlikely(!p)) return -EWOULDBLOCK; @@ -559,7 +559,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, return -EPERM; p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result); - if (unlikely(IS_ERR(p))) + if (IS_ERR(p)) return PTR_ERR(p); memset(p, 0, dm_bm_block_size(bm)); @@ -578,7 +578,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, } EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero); -int dm_bm_unlock(struct dm_block *b) +void dm_bm_unlock(struct dm_block *b) { struct buffer_aux *aux; aux = dm_bufio_get_aux_data(to_buffer(b)); @@ -590,8 +590,6 @@ int dm_bm_unlock(struct dm_block *b) bl_up_read(&aux->lock); dm_bufio_release(to_buffer(b)); - - return 0; } EXPORT_SYMBOL_GPL(dm_bm_unlock); diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index 84330f5..3627d1b 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h @@ -94,7 +94,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b, struct dm_block_validator *v, struct dm_block **result); -int dm_bm_unlock(struct dm_block *b); +void dm_bm_unlock(struct dm_block *b); /* * It's a common idiom to have a superblock that should be committed last. diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index bf2b80d..a240990 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -52,7 +52,7 @@ void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, struct dm_btree_value_type *vt); int new_block(struct dm_btree_info *info, struct dm_block **result); -int unlock_block(struct dm_btree_info *info, struct dm_block *b); +void unlock_block(struct dm_btree_info *info, struct dm_block *b); /* * Spines keep track of the rolling locks. There are 2 variants, read-only @@ -138,4 +138,10 @@ int lower_bound(struct btree_node *n, uint64_t key); extern struct dm_block_validator btree_node_validator; +/* + * Value type for upper levels of multi-level btrees. + */ +extern void init_le64_type(struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt); + #endif /* DM_BTREE_INTERNAL_H */ diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index 9836c0a..21ea537 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -165,9 +165,9 @@ static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt return 0; } -static int exit_child(struct dm_btree_info *info, struct child *c) +static void exit_child(struct dm_btree_info *info, struct child *c) { - return dm_tm_unlock(info->tm, c->block); + dm_tm_unlock(info->tm, c->block); } static void shift(struct btree_node *left, struct btree_node *right, int count) @@ -249,13 +249,10 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, __rebalance2(info, parent, &left, &right); - r = exit_child(info, &left); - if (r) { - exit_child(info, &right); - return r; - } + exit_child(info, &left); + exit_child(info, &right); - return exit_child(info, &right); + return 0; } /* @@ -301,11 +298,16 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent, { int s; uint32_t max_entries = le32_to_cpu(left->header.max_entries); - unsigned target = (nr_left + nr_center + nr_right) / 3; - BUG_ON(target > max_entries); + unsigned total = nr_left + nr_center + nr_right; + unsigned target_right = total / 3; + unsigned remainder = (target_right * 3) != total; + unsigned target_left = target_right + remainder; + + BUG_ON(target_left > max_entries); + BUG_ON(target_right > max_entries); if (nr_left < nr_right) { - s = nr_left - target; + s = nr_left - target_left; if (s < 0 && nr_center < -s) { /* not enough in central node */ @@ -316,10 +318,10 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent, } else shift(left, center, s); - shift(center, right, target - nr_right); + shift(center, right, target_right - nr_right); } else { - s = target - nr_right; + s = target_right - nr_right; if (s > 0 && nr_center < s) { /* not enough in central node */ shift(center, right, nr_center); @@ -329,7 +331,7 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent, } else shift(center, right, s); - shift(left, center, nr_left - target); + shift(left, center, nr_left - target_left); } *key_ptr(parent, c->index) = center->keys[0]; @@ -389,49 +391,18 @@ static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, __rebalance3(info, parent, &left, ¢er, &right); - r = exit_child(info, &left); - if (r) { - exit_child(info, ¢er); - exit_child(info, &right); - return r; - } - - r = exit_child(info, ¢er); - if (r) { - exit_child(info, &right); - return r; - } - - r = exit_child(info, &right); - if (r) - return r; + exit_child(info, &left); + exit_child(info, ¢er); + exit_child(info, &right); return 0; } -static int get_nr_entries(struct dm_transaction_manager *tm, - dm_block_t b, uint32_t *result) -{ - int r; - struct dm_block *block; - struct btree_node *n; - - r = dm_tm_read_lock(tm, b, &btree_node_validator, &block); - if (r) - return r; - - n = dm_block_data(block); - *result = le32_to_cpu(n->header.nr_entries); - - return dm_tm_unlock(tm, block); -} - static int rebalance_children(struct shadow_spine *s, struct dm_btree_info *info, struct dm_btree_value_type *vt, uint64_t key) { int i, r, has_left_sibling, has_right_sibling; - uint32_t child_entries; struct btree_node *n; n = dm_block_data(shadow_current(s)); @@ -446,9 +417,7 @@ static int rebalance_children(struct shadow_spine *s, memcpy(n, dm_block_data(child), dm_bm_block_size(dm_tm_get_bm(info->tm))); - r = dm_tm_unlock(info->tm, child); - if (r) - return r; + dm_tm_unlock(info->tm, child); dm_tm_dec(info->tm, dm_block_location(child)); return 0; @@ -458,10 +427,6 @@ static int rebalance_children(struct shadow_spine *s, if (i < 0) return -ENODATA; - r = get_nr_entries(info->tm, value64(n, i), &child_entries); - if (r) - return r; - has_left_sibling = i > 0; has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); @@ -544,14 +509,6 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, return r; } -static struct dm_btree_value_type le64_type = { - .context = NULL, - .size = sizeof(__le64), - .inc = NULL, - .dec = NULL, - .equal = NULL -}; - int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, dm_block_t *new_root) { @@ -559,12 +516,14 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, int index = 0, r = 0; struct shadow_spine spine; struct btree_node *n; + struct dm_btree_value_type le64_vt; + init_le64_type(info->tm, &le64_vt); init_shadow_spine(&spine, info); for (level = 0; level < info->levels; level++) { r = remove_raw(&spine, info, (level == last_level ? - &info->value_type : &le64_type), + &info->value_type : &le64_vt), root, keys[level], (unsigned *)&index); if (r < 0) break; @@ -654,11 +613,13 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root, int index = 0, r = 0; struct shadow_spine spine; struct btree_node *n; + struct dm_btree_value_type le64_vt; uint64_t k; + init_le64_type(info->tm, &le64_vt); init_shadow_spine(&spine, info); for (level = 0; level < last_level; level++) { - r = remove_raw(&spine, info, &le64_type, + r = remove_raw(&spine, info, &le64_vt, root, keys[level], (unsigned *) &index); if (r < 0) goto out; @@ -689,6 +650,7 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root, value_ptr(n, index)); delete_at(n, index); + keys[last_level] = k + 1ull; } else r = -ENODATA; diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index 1b5e13e..b27b8091 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -117,9 +117,9 @@ int new_block(struct dm_btree_info *info, struct dm_block **result) return dm_tm_new_block(info->tm, &btree_node_validator, result); } -int unlock_block(struct dm_btree_info *info, struct dm_block *b) +void unlock_block(struct dm_btree_info *info, struct dm_block *b) { - return dm_tm_unlock(info->tm, b); + dm_tm_unlock(info->tm, b); } /*----------------------------------------------------------------*/ @@ -137,9 +137,7 @@ int exit_ro_spine(struct ro_spine *s) int r = 0, i; for (i = 0; i < s->count; i++) { - int r2 = unlock_block(s->info, s->nodes[i]); - if (r2 < 0) - r = r2; + unlock_block(s->info, s->nodes[i]); } return r; @@ -150,9 +148,7 @@ int ro_step(struct ro_spine *s, dm_block_t new_child) int r; if (s->count == 2) { - r = unlock_block(s->info, s->nodes[0]); - if (r < 0) - return r; + unlock_block(s->info, s->nodes[0]); s->nodes[0] = s->nodes[1]; s->count--; } @@ -194,9 +190,7 @@ int exit_shadow_spine(struct shadow_spine *s) int r = 0, i; for (i = 0; i < s->count; i++) { - int r2 = unlock_block(s->info, s->nodes[i]); - if (r2 < 0) - r = r2; + unlock_block(s->info, s->nodes[i]); } return r; @@ -208,9 +202,7 @@ int shadow_step(struct shadow_spine *s, dm_block_t b, int r; if (s->count == 2) { - r = unlock_block(s->info, s->nodes[0]); - if (r < 0) - return r; + unlock_block(s->info, s->nodes[0]); s->nodes[0] = s->nodes[1]; s->count--; } @@ -249,3 +241,40 @@ int shadow_root(struct shadow_spine *s) { return s->root; } + +static void le64_inc(void *context, const void *value_le) +{ + struct dm_transaction_manager *tm = context; + __le64 v_le; + + memcpy(&v_le, value_le, sizeof(v_le)); + dm_tm_inc(tm, le64_to_cpu(v_le)); +} + +static void le64_dec(void *context, const void *value_le) +{ + struct dm_transaction_manager *tm = context; + __le64 v_le; + + memcpy(&v_le, value_le, sizeof(v_le)); + dm_tm_dec(tm, le64_to_cpu(v_le)); +} + +static int le64_equal(void *context, const void *value1_le, const void *value2_le) +{ + __le64 v1_le, v2_le; + + memcpy(&v1_le, value1_le, sizeof(v1_le)); + memcpy(&v2_le, value2_le, sizeof(v2_le)); + return v1_le == v2_le; +} + +void init_le64_type(struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt) +{ + vt->context = tm; + vt->size = sizeof(__le64); + vt->inc = le64_inc; + vt->dec = le64_dec; + vt->equal = le64_equal; +} diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index fdd3793..c573402 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -141,7 +141,9 @@ int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root) n->header.value_size = cpu_to_le32(info->value_type.size); *root = dm_block_location(b); - return unlock_block(info, b); + unlock_block(info, b); + + return 0; } EXPORT_SYMBOL_GPL(dm_btree_empty); @@ -420,8 +422,8 @@ EXPORT_SYMBOL_GPL(dm_btree_lookup); * * Where A* is a shadow of A. */ -static int btree_split_sibling(struct shadow_spine *s, dm_block_t root, - unsigned parent_index, uint64_t key) +static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, + uint64_t key) { int r; size_t size; @@ -523,7 +525,7 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) r = new_block(s->info, &right); if (r < 0) { - /* FIXME: put left */ + unlock_block(s->info, left); return r; } @@ -625,7 +627,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, if (top) r = btree_split_beneath(s, key); else - r = btree_split_sibling(s, root, i, key); + r = btree_split_sibling(s, i, key); if (r < 0) return r; @@ -667,12 +669,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root, struct btree_node *n; struct dm_btree_value_type le64_type; - le64_type.context = NULL; - le64_type.size = sizeof(__le64); - le64_type.inc = NULL; - le64_type.dec = NULL; - le64_type.equal = NULL; - + init_le64_type(info->tm, &le64_type); init_shadow_spine(&spine, info); for (level = 0; level < (info->levels - 1); level++) { diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index aacbe70..306d2e4 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -259,9 +259,7 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) idx.blocknr = cpu_to_le64(dm_block_location(b)); - r = dm_tm_unlock(ll->tm, b); - if (r < 0) - return r; + dm_tm_unlock(ll->tm, b); idx.nr_free = cpu_to_le32(ll->entries_per_block); idx.none_free_before = 0; @@ -293,7 +291,9 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result) *result = sm_lookup_bitmap(dm_bitmap_data(blk), b); - return dm_tm_unlock(ll->tm, blk); + dm_tm_unlock(ll->tm, blk); + + return 0; } static int sm_ll_lookup_big_ref_count(struct ll_disk *ll, dm_block_t b, @@ -373,9 +373,7 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, return r; } - r = dm_tm_unlock(ll->tm, blk); - if (r < 0) - return r; + dm_tm_unlock(ll->tm, blk); *result = i * ll->entries_per_block + (dm_block_t) position; return 0; @@ -429,9 +427,7 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, if (ref_count <= 2) { sm_set_bitmap(bm_le, bit, ref_count); - r = dm_tm_unlock(ll->tm, nb); - if (r < 0) - return r; + dm_tm_unlock(ll->tm, nb); if (old > 2) { r = dm_btree_remove(&ll->ref_count_info, @@ -445,9 +441,7 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, __le32 le_rc = cpu_to_le32(ref_count); sm_set_bitmap(bm_le, bit, 3); - r = dm_tm_unlock(ll->tm, nb); - if (r < 0) - return r; + dm_tm_unlock(ll->tm, nb); __dm_bless_for_disk(&le_rc); r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root, @@ -556,7 +550,9 @@ static int metadata_ll_init_index(struct ll_disk *ll) memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le)); ll->bitmap_root = dm_block_location(b); - return dm_tm_unlock(ll->tm, b); + dm_tm_unlock(ll->tm, b); + + return 0; } static int metadata_ll_open(struct ll_disk *ll) @@ -570,7 +566,9 @@ static int metadata_ll_open(struct ll_disk *ll) return r; memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le)); - return dm_tm_unlock(ll->tm, block); + dm_tm_unlock(ll->tm, block); + + return 0; } static dm_block_t metadata_ll_max_entries(struct ll_disk *ll) @@ -590,7 +588,9 @@ static int metadata_ll_commit(struct ll_disk *ll) memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le)); ll->bitmap_root = dm_block_location(b); - return dm_tm_unlock(ll->tm, b); + dm_tm_unlock(ll->tm, b); + + return 0; } int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm) diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index 9cb797d..abe2c5d 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -342,9 +342,9 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, } EXPORT_SYMBOL_GPL(dm_tm_read_lock); -int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b) +void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b) { - return dm_bm_unlock(b); + dm_bm_unlock(b); } EXPORT_SYMBOL_GPL(dm_tm_unlock); diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h index 2e0d4d6..f3a18be 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.h +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -94,7 +94,7 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, struct dm_block_validator *v, struct dm_block **result); -int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b); +void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b); /* * Functions for altering the reference count of a block directly. diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index efb654e..f8e5db0 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -83,7 +83,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) char b[BDEVNAME_SIZE]; char b2[BDEVNAME_SIZE]; struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); - bool discard_supported = false; + unsigned short blksize = 512; if (!conf) return -ENOMEM; @@ -98,6 +98,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) sector_div(sectors, mddev->chunk_sectors); rdev1->sectors = sectors * mddev->chunk_sectors; + blksize = max(blksize, queue_logical_block_size( + rdev1->bdev->bd_disk->queue)); + rdev_for_each(rdev2, mddev) { pr_debug("md/raid0:%s: comparing %s(%llu)" " with %s(%llu)\n", @@ -134,6 +137,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } pr_debug("md/raid0:%s: FINAL %d zones\n", mdname(mddev), conf->nr_strip_zones); + /* + * now since we have the hard sector sizes, we can make sure + * chunk size is a multiple of that sector size + */ + if ((mddev->chunk_sectors << 9) % blksize) { + printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n", + mdname(mddev), + mddev->chunk_sectors << 9, blksize); + err = -EINVAL; + goto abort; + } + err = -ENOMEM; conf->strip_zone = kzalloc(sizeof(struct strip_zone)* conf->nr_strip_zones, GFP_KERNEL); @@ -188,19 +203,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } dev[j] = rdev1; - if (mddev->queue) - disk_stack_limits(mddev->gendisk, rdev1->bdev, - rdev1->data_offset << 9); - - if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) - conf->has_merge_bvec = 1; - if (!smallest || (rdev1->sectors < smallest->sectors)) smallest = rdev1; cnt++; - - if (blk_queue_discard(bdev_get_queue(rdev1->bdev))) - discard_supported = true; } if (cnt != mddev->raid_disks) { printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " @@ -261,28 +266,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) (unsigned long long)smallest->sectors); } - /* - * now since we have the hard sector sizes, we can make sure - * chunk size is a multiple of that sector size - */ - if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { - printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n", - mdname(mddev), - mddev->chunk_sectors << 9); - goto abort; - } - - if (mddev->queue) { - blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); - blk_queue_io_opt(mddev->queue, - (mddev->chunk_sectors << 9) * mddev->raid_disks); - - if (!discard_supported) - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); - else - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); - } - pr_debug("md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; @@ -351,58 +334,6 @@ static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, + sector_div(sector, zone->nb_dev)]; } -/** - * raid0_mergeable_bvec -- tell bio layer if two requests can be merged - * @mddev: the md device - * @bvm: properties of new bio - * @biovec: the request that could be merged to it. - * - * Return amount of bytes we can accept at this offset - */ -static int raid0_mergeable_bvec(struct mddev *mddev, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - struct r0conf *conf = mddev->private; - sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); - sector_t sector_offset = sector; - int max; - unsigned int chunk_sectors = mddev->chunk_sectors; - unsigned int bio_sectors = bvm->bi_size >> 9; - struct strip_zone *zone; - struct md_rdev *rdev; - struct request_queue *subq; - - if (is_power_of_2(chunk_sectors)) - max = (chunk_sectors - ((sector & (chunk_sectors-1)) - + bio_sectors)) << 9; - else - max = (chunk_sectors - (sector_div(sector, chunk_sectors) - + bio_sectors)) << 9; - if (max < 0) - max = 0; /* bio_add cannot handle a negative return */ - if (max <= biovec->bv_len && bio_sectors == 0) - return biovec->bv_len; - if (max < biovec->bv_len) - /* too small already, no need to check further */ - return max; - if (!conf->has_merge_bvec) - return max; - - /* May need to check subordinate device */ - sector = sector_offset; - zone = find_zone(mddev->private, §or_offset); - rdev = map_sector(mddev, zone, sector, §or_offset); - subq = bdev_get_queue(rdev->bdev); - if (subq->merge_bvec_fn) { - bvm->bi_bdev = rdev->bdev; - bvm->bi_sector = sector_offset + zone->dev_start + - rdev->data_offset; - return min(max, subq->merge_bvec_fn(subq, bvm, biovec)); - } else - return max; -} - static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks) { sector_t array_sectors = 0; @@ -433,12 +364,6 @@ static int raid0_run(struct mddev *mddev) if (md_check_no_bitmap(mddev)) return -EINVAL; - if (mddev->queue) { - blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); - } - /* if private is not null, we are here after takeover */ if (mddev->private == NULL) { ret = create_strip_zones(mddev, &conf); @@ -447,6 +372,29 @@ static int raid0_run(struct mddev *mddev) mddev->private = conf; } conf = mddev->private; + if (mddev->queue) { + struct md_rdev *rdev; + bool discard_supported = false; + + blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); + + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); + blk_queue_io_opt(mddev->queue, + (mddev->chunk_sectors << 9) * mddev->raid_disks); + + rdev_for_each(rdev, mddev) { + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; + } + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + } /* calculate array device size */ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); @@ -543,7 +491,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) if (unlikely((split->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { /* Just ignore it */ - bio_endio(split, 0); + bio_endio(split); } else generic_make_request(split); } while (split != bio); @@ -727,7 +675,6 @@ static struct md_personality raid0_personality= .takeover = raid0_takeover, .quiesce = raid0_quiesce, .congested = raid0_congested, - .mergeable_bvec = raid0_mergeable_bvec, }; static int __init raid0_init (void) diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 05539d9..7127a62 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -12,8 +12,6 @@ struct r0conf { struct md_rdev **devlist; /* lists of rdevs, pointed to * by strip_zone->dev */ int nr_strip_zones; - int has_merge_bvec; /* at least one member has - * a merge_bvec_fn */ }; #endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 94f5b55..e2169ff 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -90,6 +90,8 @@ static void r1bio_pool_free(void *r1_bio, void *data) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) +#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) +#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) @@ -255,9 +257,10 @@ static void call_bio_endio(struct r1bio *r1_bio) done = 1; if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) - clear_bit(BIO_UPTODATE, &bio->bi_flags); + bio->bi_error = -EIO; + if (done) { - bio_endio(bio, 0); + bio_endio(bio); /* * Wake up any possible resync thread that waits for the device * to go idle. @@ -312,9 +315,9 @@ static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) return mirror; } -static void raid1_end_read_request(struct bio *bio, int error) +static void raid1_end_read_request(struct bio *bio) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + int uptodate = !bio->bi_error; struct r1bio *r1_bio = bio->bi_private; int mirror; struct r1conf *conf = r1_bio->mddev->private; @@ -397,9 +400,8 @@ static void r1_bio_write_done(struct r1bio *r1_bio) } } -static void raid1_end_write_request(struct bio *bio, int error) +static void raid1_end_write_request(struct bio *bio) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r1bio *r1_bio = bio->bi_private; int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); struct r1conf *conf = r1_bio->mddev->private; @@ -410,7 +412,7 @@ static void raid1_end_write_request(struct bio *bio, int error) /* * 'one mirror IO has finished' event handler: */ - if (!uptodate) { + if (bio->bi_error) { set_bit(WriteErrorSeen, &conf->mirrors[mirror].rdev->flags); if (!test_and_set_bit(WantReplacement, @@ -557,7 +559,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect rdev = rcu_dereference(conf->mirrors[disk].rdev); if (r1_bio->bios[disk] == IO_BLOCKED || rdev == NULL - || test_bit(Unmerged, &rdev->flags) || test_bit(Faulty, &rdev->flags)) continue; if (!test_bit(In_sync, &rdev->flags) && @@ -708,38 +709,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect return best_disk; } -static int raid1_mergeable_bvec(struct mddev *mddev, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - struct r1conf *conf = mddev->private; - sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); - int max = biovec->bv_len; - - if (mddev->merge_check_needed) { - int disk; - rcu_read_lock(); - for (disk = 0; disk < conf->raid_disks * 2; disk++) { - struct md_rdev *rdev = rcu_dereference( - conf->mirrors[disk].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct request_queue *q = - bdev_get_queue(rdev->bdev); - if (q->merge_bvec_fn) { - bvm->bi_sector = sector + - rdev->data_offset; - bvm->bi_bdev = rdev->bdev; - max = min(max, q->merge_bvec_fn( - q, bvm, biovec)); - } - } - } - rcu_read_unlock(); - } - return max; - -} - static int raid1_congested(struct mddev *mddev, int bits) { struct r1conf *conf = mddev->private; @@ -793,7 +762,7 @@ static void flush_pending_writes(struct r1conf *conf) if (unlikely((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ - bio_endio(bio, 0); + bio_endio(bio); else generic_make_request(bio); bio = next; @@ -914,8 +883,7 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) } if (bio && bio_data_dir(bio) == WRITE) { - if (bio->bi_iter.bi_sector >= - conf->mddev->curr_resync_completed) { + if (bio->bi_iter.bi_sector >= conf->next_resync) { if (conf->start_next_window == MaxSector) conf->start_next_window = conf->next_resync + @@ -1068,7 +1036,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) if (unlikely((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ - bio_endio(bio, 0); + bio_endio(bio); else generic_make_request(bio); bio = next; @@ -1158,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) * non-zero, then it is the number of not-completed requests. */ bio->bi_phys_segments = 0; - clear_bit(BIO_SEG_VALID, &bio->bi_flags); + bio_clear_flag(bio, BIO_SEG_VALID); if (rw == READ) { /* @@ -1269,8 +1237,7 @@ read_again: break; } r1_bio->bios[i] = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags) - || test_bit(Unmerged, &rdev->flags)) { + if (!rdev || test_bit(Faulty, &rdev->flags)) { if (i < conf->raid_disks) set_bit(R1BIO_Degraded, &r1_bio->state); continue; @@ -1476,6 +1443,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; struct r1conf *conf = mddev->private; + unsigned long flags; /* * If it is not operational, then we have already marked it as dead @@ -1495,19 +1463,19 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) return; } set_bit(Blocked, &rdev->flags); + spin_lock_irqsave(&conf->device_lock, flags); if (test_and_clear_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; set_bit(Faulty, &rdev->flags); - spin_unlock_irqrestore(&conf->device_lock, flags); } else set_bit(Faulty, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" "md/raid1:%s: Operation continuing on %d devices.\n", @@ -1549,7 +1517,7 @@ static void close_sync(struct r1conf *conf) conf->r1buf_pool = NULL; spin_lock_irq(&conf->resync_lock); - conf->next_resync = 0; + conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE; conf->start_next_window = MaxSector; conf->current_window_requests += conf->next_window_requests; @@ -1568,7 +1536,10 @@ static int raid1_spare_active(struct mddev *mddev) * Find all failed disks within the RAID1 configuration * and mark them readable. * Called under mddev lock, so rcu protection not needed. + * device_lock used to avoid races with raid1_end_read_request + * which expects 'In_sync' flags and ->degraded to be consistent. */ + spin_lock_irqsave(&conf->device_lock, flags); for (i = 0; i < conf->raid_disks; i++) { struct md_rdev *rdev = conf->mirrors[i].rdev; struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; @@ -1599,7 +1570,6 @@ static int raid1_spare_active(struct mddev *mddev) sysfs_notify_dirent_safe(rdev->sysfs_state); } } - spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded -= count; spin_unlock_irqrestore(&conf->device_lock, flags); @@ -1615,7 +1585,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) struct raid1_info *p; int first = 0; int last = conf->raid_disks - 1; - struct request_queue *q = bdev_get_queue(rdev->bdev); if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; @@ -1623,10 +1592,14 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; - if (q->merge_bvec_fn) { - set_bit(Unmerged, &rdev->flags); - mddev->merge_check_needed = 1; - } + /* + * find the disk ... but prefer rdev->saved_raid_disk + * if possible. + */ + if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + first = last = rdev->saved_raid_disk; for (mirror = first; mirror <= last; mirror++) { p = conf->mirrors+mirror; @@ -1659,20 +1632,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } } - if (err == 0 && test_bit(Unmerged, &rdev->flags)) { - /* Some requests might not have seen this new - * merge_bvec_fn. We must wait for them to complete - * before merging the device fully. - * First we make sure any code which has tested - * our function has submitted the request, then - * we wait for all outstanding requests to complete. - */ - synchronize_sched(); - freeze_array(conf, 0); - unfreeze_array(conf); - clear_bit(Unmerged, &rdev->flags); - } + mddev_suspend(mddev); md_integrity_add_rdev(rdev, mddev); + mddev_resume(mddev); if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); @@ -1735,7 +1697,7 @@ abort: return err; } -static void end_sync_read(struct bio *bio, int error) +static void end_sync_read(struct bio *bio) { struct r1bio *r1_bio = bio->bi_private; @@ -1746,16 +1708,16 @@ static void end_sync_read(struct bio *bio, int error) * or re-read if the read failed. * We don't do much here, just schedule handling by raid1d */ - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (!bio->bi_error) set_bit(R1BIO_Uptodate, &r1_bio->state); if (atomic_dec_and_test(&r1_bio->remaining)) reschedule_retry(r1_bio); } -static void end_sync_write(struct bio *bio, int error) +static void end_sync_write(struct bio *bio) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + int uptodate = !bio->bi_error; struct r1bio *r1_bio = bio->bi_private; struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; @@ -1942,7 +1904,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) idx ++; } set_bit(R1BIO_Uptodate, &r1_bio->state); - set_bit(BIO_UPTODATE, &bio->bi_flags); + bio->bi_error = 0; return 1; } @@ -1966,15 +1928,14 @@ static void process_checks(struct r1bio *r1_bio) for (i = 0; i < conf->raid_disks * 2; i++) { int j; int size; - int uptodate; + int error; struct bio *b = r1_bio->bios[i]; if (b->bi_end_io != end_sync_read) continue; - /* fixup the bio for reuse, but preserve BIO_UPTODATE */ - uptodate = test_bit(BIO_UPTODATE, &b->bi_flags); + /* fixup the bio for reuse, but preserve errno */ + error = b->bi_error; bio_reset(b); - if (!uptodate) - clear_bit(BIO_UPTODATE, &b->bi_flags); + b->bi_error = error; b->bi_vcnt = vcnt; b->bi_iter.bi_size = r1_bio->sectors << 9; b->bi_iter.bi_sector = r1_bio->sector + @@ -1997,7 +1958,7 @@ static void process_checks(struct r1bio *r1_bio) } for (primary = 0; primary < conf->raid_disks * 2; primary++) if (r1_bio->bios[primary]->bi_end_io == end_sync_read && - test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { + !r1_bio->bios[primary]->bi_error) { r1_bio->bios[primary]->bi_end_io = NULL; rdev_dec_pending(conf->mirrors[primary].rdev, mddev); break; @@ -2007,14 +1968,14 @@ static void process_checks(struct r1bio *r1_bio) int j; struct bio *pbio = r1_bio->bios[primary]; struct bio *sbio = r1_bio->bios[i]; - int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags); + int error = sbio->bi_error; if (sbio->bi_end_io != end_sync_read) continue; - /* Now we can 'fixup' the BIO_UPTODATE flag */ - set_bit(BIO_UPTODATE, &sbio->bi_flags); + /* Now we can 'fixup' the error value */ + sbio->bi_error = 0; - if (uptodate) { + if (!error) { for (j = vcnt; j-- ; ) { struct page *p, *s; p = pbio->bi_io_vec[j].bv_page; @@ -2029,7 +1990,7 @@ static void process_checks(struct r1bio *r1_bio) if (j >= 0) atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) - && uptodate)) { + && !error)) { /* No need to write to this device. */ sbio->bi_end_io = NULL; rdev_dec_pending(conf->mirrors[i].rdev, mddev); @@ -2247,7 +2208,7 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) bio_trim(wbio, sector - r1_bio->sector, sectors); wbio->bi_iter.bi_sector += rdev->data_offset; wbio->bi_bdev = rdev->bdev; - if (submit_bio_wait(WRITE, wbio) == 0) + if (submit_bio_wait(WRITE, wbio) < 0) /* failure! */ ok = rdev_set_badblocks(rdev, sector, sectors, 0) @@ -2270,11 +2231,11 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio struct bio *bio = r1_bio->bios[m]; if (bio->bi_end_io == NULL) continue; - if (test_bit(BIO_UPTODATE, &bio->bi_flags) && + if (!bio->bi_error && test_bit(R1BIO_MadeGood, &r1_bio->state)) { rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); } - if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && + if (bio->bi_error && test_bit(R1BIO_WriteError, &r1_bio->state)) { if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) md_error(conf->mddev, rdev); @@ -2287,6 +2248,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) { int m; + bool fail = false; for (m = 0; m < conf->raid_disks * 2 ; m++) if (r1_bio->bios[m] == IO_MADE_GOOD) { struct md_rdev *rdev = conf->mirrors[m].rdev; @@ -2299,6 +2261,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) * narrow down and record precise write * errors. */ + fail = true; if (!narrow_write_error(r1_bio, m)) { md_error(conf->mddev, conf->mirrors[m].rdev); @@ -2308,9 +2271,16 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) rdev_dec_pending(conf->mirrors[m].rdev, conf->mddev); } - if (test_bit(R1BIO_WriteError, &r1_bio->state)) - close_write(r1_bio); - raid_end_bio_io(r1_bio); + if (fail) { + spin_lock_irq(&conf->device_lock); + list_add(&r1_bio->retry_list, &conf->bio_end_io_list); + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(conf->mddev->thread); + } else { + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + close_write(r1_bio); + raid_end_bio_io(r1_bio); + } } static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) @@ -2416,6 +2386,27 @@ static void raid1d(struct md_thread *thread) md_check_recovery(mddev); + if (!list_empty_careful(&conf->bio_end_io_list) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + LIST_HEAD(tmp); + spin_lock_irqsave(&conf->device_lock, flags); + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + list_add(&tmp, &conf->bio_end_io_list); + list_del_init(&conf->bio_end_io_list); + } + spin_unlock_irqrestore(&conf->device_lock, flags); + while (!list_empty(&tmp)) { + r1_bio = list_first_entry(&tmp, struct r1bio, + retry_list); + list_del(&r1_bio->retry_list); + if (mddev->degraded) + set_bit(R1BIO_Degraded, &r1_bio->state); + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + close_write(r1_bio); + raid_end_bio_io(r1_bio); + } + } + blk_start_plug(&plug); for (;;) { @@ -2515,6 +2506,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bitmap_close_sync(mddev->bitmap); close_sync(conf); + + if (mddev_is_clustered(mddev)) { + conf->cluster_sync_low = 0; + conf->cluster_sync_high = 0; + } return 0; } @@ -2535,7 +2531,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp return sync_blocks; } - bitmap_cond_end_sync(mddev->bitmap, sector_nr); + /* we are incrementing sector_nr below. To be safe, we check against + * sector_nr + two times RESYNC_SECTORS + */ + + bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); raise_barrier(conf, sector_nr); @@ -2713,7 +2714,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp /* remove last page from this bio */ bio->bi_vcnt--; bio->bi_iter.bi_size -= len; - __clear_bit(BIO_SEG_VALID, &bio->bi_flags); + bio_clear_flag(bio, BIO_SEG_VALID); } goto bio_full; } @@ -2726,6 +2727,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bio_full: r1_bio->sectors = nr_sectors; + if (mddev_is_clustered(mddev) && + conf->cluster_sync_high < sector_nr + nr_sectors) { + conf->cluster_sync_low = mddev->curr_resync_completed; + conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS; + /* Send resync message */ + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + /* For a user-requested sync, we read all readable devices and do a * compare */ @@ -2808,8 +2819,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) goto abort; disk->rdev = rdev; q = bdev_get_queue(rdev->bdev); - if (q->merge_bvec_fn) - mddev->merge_check_needed = 1; disk->head_position = 0; disk->seq_start = MaxSector; @@ -2817,6 +2826,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; INIT_LIST_HEAD(&conf->retry_list); + INIT_LIST_HEAD(&conf->bio_end_io_list); spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); @@ -2870,8 +2880,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) abort: if (conf) { - if (conf->r1bio_pool) - mempool_destroy(conf->r1bio_pool); + mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); @@ -2973,8 +2982,7 @@ static void raid1_free(struct mddev *mddev, void *priv) { struct r1conf *conf = priv; - if (conf->r1bio_pool) - mempool_destroy(conf->r1bio_pool); + mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); @@ -3043,9 +3051,11 @@ static int raid1_reshape(struct mddev *mddev) return -EINVAL; } - err = md_allow_write(mddev); - if (err) - return err; + if (!mddev_is_clustered(mddev)) { + err = md_allow_write(mddev); + if (err) + return err; + } raid_disks = mddev->raid_disks + mddev->delta_disks; @@ -3111,6 +3121,7 @@ static int raid1_reshape(struct mddev *mddev) unfreeze_array(conf); + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -3174,7 +3185,6 @@ static struct md_personality raid1_personality = .quiesce = raid1_quiesce, .takeover = raid1_takeover, .congested = raid1_congested, - .mergeable_bvec = raid1_mergeable_bvec, }; static int __init raid_init(void) diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 14ebb28..61c39b3 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -61,6 +61,11 @@ struct r1conf { * block, or anything else. */ struct list_head retry_list; + /* A separate list of r1bio which just need raid_end_bio_io called. + * This mustn't happen for writes which had any errors if the superblock + * needs to be written. + */ + struct list_head bio_end_io_list; /* queue pending writes to be submitted on unplug */ struct bio_list pending_bio_list; @@ -106,6 +111,13 @@ struct r1conf { * the new thread here until we fully activate the array. */ struct md_thread *thread; + + /* Keep track of cluster resync window to send to other + * nodes. + */ + sector_t cluster_sync_low; + sector_t cluster_sync_high; + }; /* diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 38c58e1..41d70bc 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -39,6 +39,7 @@ * far_copies (stored in second byte of layout) * far_offset (stored in bit 16 of layout ) * use_far_sets (stored in bit 17 of layout ) + * use_far_sets_bugfixed (stored in bit 18 of layout ) * * The data to be stored is divided into chunks using chunksize. Each device * is divided into far_copies sections. In each section, chunks are laid out @@ -101,7 +102,7 @@ static int _enough(struct r10conf *conf, int previous, int ignore); static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped); static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); -static void end_reshape_write(struct bio *bio, int error); +static void end_reshape_write(struct bio *bio); static void end_reshape(struct r10conf *conf); static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) @@ -307,9 +308,9 @@ static void raid_end_bio_io(struct r10bio *r10_bio) } else done = 1; if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) - clear_bit(BIO_UPTODATE, &bio->bi_flags); + bio->bi_error = -EIO; if (done) { - bio_endio(bio, 0); + bio_endio(bio); /* * Wake up any possible resync thread that waits for the device * to go idle. @@ -358,9 +359,9 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, return r10_bio->devs[slot].devnum; } -static void raid10_end_read_request(struct bio *bio, int error) +static void raid10_end_read_request(struct bio *bio) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + int uptodate = !bio->bi_error; struct r10bio *r10_bio = bio->bi_private; int slot, dev; struct md_rdev *rdev; @@ -438,9 +439,8 @@ static void one_write_done(struct r10bio *r10_bio) } } -static void raid10_end_write_request(struct bio *bio, int error) +static void raid10_end_write_request(struct bio *bio) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r10bio *r10_bio = bio->bi_private; int dev; int dec_rdev = 1; @@ -460,7 +460,7 @@ static void raid10_end_write_request(struct bio *bio, int error) /* * this branch is our 'one mirror IO has finished' event handler: */ - if (!uptodate) { + if (bio->bi_error) { if (repl) /* Never record new bad blocks to replacement, * just fail it. @@ -672,93 +672,6 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) return (vchunk << geo->chunk_shift) + offset; } -/** - * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged - * @mddev: the md device - * @bvm: properties of new bio - * @biovec: the request that could be merged to it. - * - * Return amount of bytes we can accept at this offset - * This requires checking for end-of-chunk if near_copies != raid_disks, - * and for subordinate merge_bvec_fns if merge_check_needed. - */ -static int raid10_mergeable_bvec(struct mddev *mddev, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - struct r10conf *conf = mddev->private; - sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); - int max; - unsigned int chunk_sectors; - unsigned int bio_sectors = bvm->bi_size >> 9; - struct geom *geo = &conf->geo; - - chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1; - if (conf->reshape_progress != MaxSector && - ((sector >= conf->reshape_progress) != - conf->mddev->reshape_backwards)) - geo = &conf->prev; - - if (geo->near_copies < geo->raid_disks) { - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) - + bio_sectors)) << 9; - if (max < 0) - /* bio_add cannot handle a negative return */ - max = 0; - if (max <= biovec->bv_len && bio_sectors == 0) - return biovec->bv_len; - } else - max = biovec->bv_len; - - if (mddev->merge_check_needed) { - struct { - struct r10bio r10_bio; - struct r10dev devs[conf->copies]; - } on_stack; - struct r10bio *r10_bio = &on_stack.r10_bio; - int s; - if (conf->reshape_progress != MaxSector) { - /* Cannot give any guidance during reshape */ - if (max <= biovec->bv_len && bio_sectors == 0) - return biovec->bv_len; - return 0; - } - r10_bio->sector = sector; - raid10_find_phys(conf, r10_bio); - rcu_read_lock(); - for (s = 0; s < conf->copies; s++) { - int disk = r10_bio->devs[s].devnum; - struct md_rdev *rdev = rcu_dereference( - conf->mirrors[disk].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct request_queue *q = - bdev_get_queue(rdev->bdev); - if (q->merge_bvec_fn) { - bvm->bi_sector = r10_bio->devs[s].addr - + rdev->data_offset; - bvm->bi_bdev = rdev->bdev; - max = min(max, q->merge_bvec_fn( - q, bvm, biovec)); - } - } - rdev = rcu_dereference(conf->mirrors[disk].replacement); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct request_queue *q = - bdev_get_queue(rdev->bdev); - if (q->merge_bvec_fn) { - bvm->bi_sector = r10_bio->devs[s].addr - + rdev->data_offset; - bvm->bi_bdev = rdev->bdev; - max = min(max, q->merge_bvec_fn( - q, bvm, biovec)); - } - } - } - rcu_read_unlock(); - } - return max; -} - /* * This routine returns the disk from which the requested read should * be done. There is a per-array 'next expected sequential IO' sector @@ -821,12 +734,10 @@ retry: disk = r10_bio->devs[slot].devnum; rdev = rcu_dereference(conf->mirrors[disk].replacement); if (rdev == NULL || test_bit(Faulty, &rdev->flags) || - test_bit(Unmerged, &rdev->flags) || r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) rdev = rcu_dereference(conf->mirrors[disk].rdev); if (rdev == NULL || - test_bit(Faulty, &rdev->flags) || - test_bit(Unmerged, &rdev->flags)) + test_bit(Faulty, &rdev->flags)) continue; if (!test_bit(In_sync, &rdev->flags) && r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) @@ -957,7 +868,7 @@ static void flush_pending_writes(struct r10conf *conf) if (unlikely((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ - bio_endio(bio, 0); + bio_endio(bio); else generic_make_request(bio); bio = next; @@ -1133,7 +1044,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) if (unlikely((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ - bio_endio(bio, 0); + bio_endio(bio); else generic_make_request(bio); bio = next; @@ -1217,7 +1128,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio) * non-zero, then it is the number of not-completed requests. */ bio->bi_phys_segments = 0; - clear_bit(BIO_SEG_VALID, &bio->bi_flags); + bio_clear_flag(bio, BIO_SEG_VALID); if (rw == READ) { /* @@ -1326,11 +1237,9 @@ retry_write: blocked_rdev = rrdev; break; } - if (rdev && (test_bit(Faulty, &rdev->flags) - || test_bit(Unmerged, &rdev->flags))) + if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; - if (rrdev && (test_bit(Faulty, &rrdev->flags) - || test_bit(Unmerged, &rrdev->flags))) + if (rrdev && (test_bit(Faulty, &rrdev->flags))) rrdev = NULL; r10_bio->devs[i].bio = NULL; @@ -1589,6 +1498,8 @@ static void status(struct seq_file *seq, struct mddev *mddev) seq_printf(seq, " %d offset-copies", conf->geo.far_copies); else seq_printf(seq, " %d far-copies", conf->geo.far_copies); + if (conf->geo.far_set_size != conf->geo.raid_disks) + seq_printf(seq, " %d devices per set", conf->geo.far_set_size); } seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, conf->geo.raid_disks - mddev->degraded); @@ -1681,6 +1592,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); spin_unlock_irqrestore(&conf->device_lock, flags); printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" @@ -1777,7 +1689,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) int mirror; int first = 0; int last = conf->geo.raid_disks - 1; - struct request_queue *q = bdev_get_queue(rdev->bdev); if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is @@ -1790,11 +1701,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; - if (q->merge_bvec_fn) { - set_bit(Unmerged, &rdev->flags); - mddev->merge_check_needed = 1; - } - if (rdev->saved_raid_disk >= first && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; @@ -1833,20 +1739,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(p->rdev, rdev); break; } - if (err == 0 && test_bit(Unmerged, &rdev->flags)) { - /* Some requests might not have seen this new - * merge_bvec_fn. We must wait for them to complete - * before merging the device fully. - * First we make sure any code which has tested - * our function has submitted the request, then - * we wait for all outstanding requests to complete. - */ - synchronize_sched(); - freeze_array(conf, 0); - unfreeze_array(conf); - clear_bit(Unmerged, &rdev->flags); - } + mddev_suspend(mddev); md_integrity_add_rdev(rdev, mddev); + mddev_resume(mddev); if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); @@ -1916,7 +1811,7 @@ abort: return err; } -static void end_sync_read(struct bio *bio, int error) +static void end_sync_read(struct bio *bio) { struct r10bio *r10_bio = bio->bi_private; struct r10conf *conf = r10_bio->mddev->private; @@ -1928,7 +1823,7 @@ static void end_sync_read(struct bio *bio, int error) } else d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (!bio->bi_error) set_bit(R10BIO_Uptodate, &r10_bio->state); else /* The write handler will notice the lack of @@ -1977,9 +1872,8 @@ static void end_sync_request(struct r10bio *r10_bio) } } -static void end_sync_write(struct bio *bio, int error) +static void end_sync_write(struct bio *bio) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r10bio *r10_bio = bio->bi_private; struct mddev *mddev = r10_bio->mddev; struct r10conf *conf = mddev->private; @@ -1996,7 +1890,7 @@ static void end_sync_write(struct bio *bio, int error) else rdev = conf->mirrors[d].rdev; - if (!uptodate) { + if (bio->bi_error) { if (repl) md_error(mddev, rdev); else { @@ -2044,7 +1938,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) /* find the first device with a block */ for (i=0; i<conf->copies; i++) - if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) + if (!r10_bio->devs[i].bio->bi_error) break; if (i == conf->copies) @@ -2064,7 +1958,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) continue; if (i == first) continue; - if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) { + if (!r10_bio->devs[i].bio->bi_error) { /* We know that the bi_io_vec layout is the same for * both 'first' and 'i', so we just compare them. * All vec entries are PAGE_SIZE; @@ -2394,7 +2288,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && - !test_bit(Unmerged, &rdev->flags) && test_bit(In_sync, &rdev->flags) && is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, &first_bad, &bad_sectors) == 0) { @@ -2448,7 +2341,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (!rdev || - test_bit(Unmerged, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) continue; @@ -2580,7 +2472,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) choose_data_offset(r10_bio, rdev) + (sector - r10_bio->sector)); wbio->bi_bdev = rdev->bdev; - if (submit_bio_wait(WRITE, wbio) == 0) + if (submit_bio_wait(WRITE, wbio) < 0) /* Failure! */ ok = rdev_set_badblocks(rdev, sector, sectors, 0) @@ -2706,8 +2598,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev = conf->mirrors[dev].rdev; if (r10_bio->devs[m].bio == NULL) continue; - if (test_bit(BIO_UPTODATE, - &r10_bio->devs[m].bio->bi_flags)) { + if (!r10_bio->devs[m].bio->bi_error) { rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, @@ -2722,8 +2613,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev = conf->mirrors[dev].replacement; if (r10_bio->devs[m].repl_bio == NULL) continue; - if (test_bit(BIO_UPTODATE, - &r10_bio->devs[m].repl_bio->bi_flags)) { + + if (!r10_bio->devs[m].repl_bio->bi_error) { rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, @@ -2738,6 +2629,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) } put_buf(r10_bio); } else { + bool fail = false; for (m = 0; m < conf->copies; m++) { int dev = r10_bio->devs[m].devnum; struct bio *bio = r10_bio->devs[m].bio; @@ -2748,8 +2640,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) r10_bio->devs[m].addr, r10_bio->sectors, 0); rdev_dec_pending(rdev, conf->mddev); - } else if (bio != NULL && - !test_bit(BIO_UPTODATE, &bio->bi_flags)) { + } else if (bio != NULL && bio->bi_error) { + fail = true; if (!narrow_write_error(r10_bio, m)) { md_error(conf->mddev, rdev); set_bit(R10BIO_Degraded, @@ -2767,10 +2659,17 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_dec_pending(rdev, conf->mddev); } } - if (test_bit(R10BIO_WriteError, - &r10_bio->state)) - close_write(r10_bio); - raid_end_bio_io(r10_bio); + if (fail) { + spin_lock_irq(&conf->device_lock); + list_add(&r10_bio->retry_list, &conf->bio_end_io_list); + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(conf->mddev->thread); + } else { + if (test_bit(R10BIO_WriteError, + &r10_bio->state)) + close_write(r10_bio); + raid_end_bio_io(r10_bio); + } } } @@ -2785,6 +2684,29 @@ static void raid10d(struct md_thread *thread) md_check_recovery(mddev); + if (!list_empty_careful(&conf->bio_end_io_list) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + LIST_HEAD(tmp); + spin_lock_irqsave(&conf->device_lock, flags); + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + list_add(&tmp, &conf->bio_end_io_list); + list_del_init(&conf->bio_end_io_list); + } + spin_unlock_irqrestore(&conf->device_lock, flags); + while (!list_empty(&tmp)) { + r10_bio = list_first_entry(&tmp, struct r10bio, + retry_list); + list_del(&r10_bio->retry_list); + if (mddev->degraded) + set_bit(R10BIO_Degraded, &r10_bio->state); + + if (test_bit(R10BIO_WriteError, + &r10_bio->state)) + close_write(r10_bio); + raid_end_bio_io(r10_bio); + } + } + blk_start_plug(&plug); for (;;) { @@ -3227,7 +3149,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, /* resync. Schedule a read for every block at this virt offset */ int count = 0; - bitmap_cond_end_sync(mddev->bitmap, sector_nr); + bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0); if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, mddev->degraded) && @@ -3263,7 +3185,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, bio = r10_bio->devs[i].bio; bio_reset(bio); - clear_bit(BIO_UPTODATE, &bio->bi_flags); + bio->bi_error = -EIO; if (conf->mirrors[d].rdev == NULL || test_bit(Faulty, &conf->mirrors[d].rdev->flags)) continue; @@ -3300,7 +3222,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, /* Need to set up for writing to the replacement */ bio = r10_bio->devs[i].repl_bio; bio_reset(bio); - clear_bit(BIO_UPTODATE, &bio->bi_flags); + bio->bi_error = -EIO; sector = r10_bio->devs[i].addr; atomic_inc(&conf->mirrors[d].rdev->nr_pending); @@ -3357,7 +3279,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, /* remove last page from this bio */ bio2->bi_vcnt--; bio2->bi_iter.bi_size -= len; - __clear_bit(BIO_SEG_VALID, &bio2->bi_flags); + bio_clear_flag(bio2, BIO_SEG_VALID); } goto bio_full; } @@ -3377,7 +3299,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, if (bio->bi_end_io == end_sync_read) { md_sync_acct(bio->bi_bdev, nr_sectors); - set_bit(BIO_UPTODATE, &bio->bi_flags); + bio->bi_error = 0; generic_make_request(bio); } } @@ -3477,7 +3399,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) disks = mddev->raid_disks + mddev->delta_disks; break; } - if (layout >> 18) + if (layout >> 19) return -1; if (chunk < (PAGE_SIZE >> 9) || !is_power_of_2(chunk)) @@ -3489,7 +3411,22 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) geo->near_copies = nc; geo->far_copies = fc; geo->far_offset = fo; - geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks; + switch (layout >> 17) { + case 0: /* original layout. simple but not always optimal */ + geo->far_set_size = disks; + break; + case 1: /* "improved" layout which was buggy. Hopefully no-one is + * actually using this, but leave code here just in case.*/ + geo->far_set_size = disks/fc; + WARN(geo->far_set_size < fc, + "This RAID10 layout does not provide data safety - please backup and create new array\n"); + break; + case 2: /* "improved" layout fixed to match documentation */ + geo->far_set_size = fc * nc; + break; + default: /* Not a valid layout */ + return -1; + } geo->chunk_mask = chunk - 1; geo->chunk_shift = ffz(~chunk); return nc*fc; @@ -3559,6 +3496,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) conf->reshape_safe = conf->reshape_progress; spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); + INIT_LIST_HEAD(&conf->bio_end_io_list); spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); @@ -3575,8 +3513,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", mdname(mddev)); if (conf) { - if (conf->r10bio_pool) - mempool_destroy(conf->r10bio_pool); + mempool_destroy(conf->r10bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf); @@ -3643,8 +3580,6 @@ static int run(struct mddev *mddev) disk->rdev = rdev; } q = bdev_get_queue(rdev->bdev); - if (q->merge_bvec_fn) - mddev->merge_check_needed = 1; diff = (rdev->new_data_offset - rdev->data_offset); if (!mddev->reshape_backwards) diff = -diff; @@ -3773,8 +3708,7 @@ static int run(struct mddev *mddev) out_free_conf: md_unregister_thread(&mddev->thread); - if (conf->r10bio_pool) - mempool_destroy(conf->r10bio_pool); + mempool_destroy(conf->r10bio_pool); safe_put_page(conf->tmppage); kfree(conf->mirrors); kfree(conf); @@ -3787,8 +3721,7 @@ static void raid10_free(struct mddev *mddev, void *priv) { struct r10conf *conf = priv; - if (conf->r10bio_pool) - mempool_destroy(conf->r10bio_pool); + mempool_destroy(conf->r10bio_pool); safe_put_page(conf->tmppage); kfree(conf->mirrors); kfree(conf->mirrors_old); @@ -4215,7 +4148,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, * at a time, possibly less if that exceeds RESYNC_PAGES, * or we hit a bad block or something. * This might mean we pause for normal IO in the middle of - * a chunk, but that is not a problem was mddev->reshape_position + * a chunk, but that is not a problem as mddev->reshape_position * can record any location. * * If we will want to write to a location that isn't @@ -4239,7 +4172,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, * * In all this the minimum difference in data offsets * (conf->offset_diff - always positive) allows a bit of slack, - * so next can be after 'safe', but not by more than offset_disk + * so next can be after 'safe', but not by more than offset_diff * * We need to prepare all the bios here before we start any IO * to ensure the size we choose is acceptable to all devices. @@ -4382,7 +4315,7 @@ read_more: read_bio->bi_end_io = end_sync_read; read_bio->bi_rw = READ; read_bio->bi_flags &= (~0UL << BIO_RESET_BITS); - __set_bit(BIO_UPTODATE, &read_bio->bi_flags); + read_bio->bi_error = 0; read_bio->bi_vcnt = 0; read_bio->bi_iter.bi_size = 0; r10_bio->master_bio = read_bio; @@ -4439,7 +4372,7 @@ read_more: /* Remove last page from this bio */ bio2->bi_vcnt--; bio2->bi_iter.bi_size -= len; - __clear_bit(BIO_SEG_VALID, &bio2->bi_flags); + bio_clear_flag(bio2, BIO_SEG_VALID); } goto bio_full; } @@ -4604,9 +4537,8 @@ static int handle_reshape_read_error(struct mddev *mddev, return 0; } -static void end_reshape_write(struct bio *bio, int error) +static void end_reshape_write(struct bio *bio) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r10bio *r10_bio = bio->bi_private; struct mddev *mddev = r10_bio->mddev; struct r10conf *conf = mddev->private; @@ -4623,7 +4555,7 @@ static void end_reshape_write(struct bio *bio, int error) rdev = conf->mirrors[d].rdev; } - if (!uptodate) { + if (bio->bi_error) { /* FIXME should record badblock */ md_error(mddev, rdev); } @@ -4700,7 +4632,6 @@ static struct md_personality raid10_personality = .start_reshape = raid10_start_reshape, .finish_reshape = raid10_finish_reshape, .congested = raid10_congested, - .mergeable_bvec = raid10_mergeable_bvec, }; static int __init raid_init(void) diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 5ee6473..6fc2c75 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -53,6 +53,12 @@ struct r10conf { sector_t offset_diff; struct list_head retry_list; + /* A separate list of r1bio which just need raid_end_bio_io called. + * This mustn't happen for writes which had any errors if the superblock + * needs to be written. + */ + struct list_head bio_end_io_list; + /* queue pending writes and submit them on unplug */ struct bio_list pending_bio_list; int pending_count; diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c new file mode 100644 index 0000000..b887e04 --- /dev/null +++ b/drivers/md/raid5-cache.c @@ -0,0 +1,1191 @@ +/* + * Copyright (C) 2015 Shaohua Li <shli@fb.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include <linux/kernel.h> +#include <linux/wait.h> +#include <linux/blkdev.h> +#include <linux/slab.h> +#include <linux/raid/md_p.h> +#include <linux/crc32c.h> +#include <linux/random.h> +#include "md.h" +#include "raid5.h" + +/* + * metadata/data stored in disk with 4k size unit (a block) regardless + * underneath hardware sector size. only works with PAGE_SIZE == 4096 + */ +#define BLOCK_SECTORS (8) + +/* + * reclaim runs every 1/4 disk size or 10G reclaimable space. This can prevent + * recovery scans a very long log + */ +#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ +#define RECLAIM_MAX_FREE_SPACE_SHIFT (2) + +struct r5l_log { + struct md_rdev *rdev; + + u32 uuid_checksum; + + sector_t device_size; /* log device size, round to + * BLOCK_SECTORS */ + sector_t max_free_space; /* reclaim run if free space is at + * this size */ + + sector_t last_checkpoint; /* log tail. where recovery scan + * starts from */ + u64 last_cp_seq; /* log tail sequence */ + + sector_t log_start; /* log head. where new data appends */ + u64 seq; /* log head sequence */ + + sector_t next_checkpoint; + u64 next_cp_seq; + + struct mutex io_mutex; + struct r5l_io_unit *current_io; /* current io_unit accepting new data */ + + spinlock_t io_list_lock; + struct list_head running_ios; /* io_units which are still running, + * and have not yet been completely + * written to the log */ + struct list_head io_end_ios; /* io_units which have been completely + * written to the log but not yet written + * to the RAID */ + struct list_head flushing_ios; /* io_units which are waiting for log + * cache flush */ + struct list_head finished_ios; /* io_units which settle down in log disk */ + struct bio flush_bio; + + struct kmem_cache *io_kc; + + struct md_thread *reclaim_thread; + unsigned long reclaim_target; /* number of space that need to be + * reclaimed. if it's 0, reclaim spaces + * used by io_units which are in + * IO_UNIT_STRIPE_END state (eg, reclaim + * dones't wait for specific io_unit + * switching to IO_UNIT_STRIPE_END + * state) */ + wait_queue_head_t iounit_wait; + + struct list_head no_space_stripes; /* pending stripes, log has no space */ + spinlock_t no_space_stripes_lock; + + bool need_cache_flush; + bool in_teardown; +}; + +/* + * an IO range starts from a meta data block and end at the next meta data + * block. The io unit's the meta data block tracks data/parity followed it. io + * unit is written to log disk with normal write, as we always flush log disk + * first and then start move data to raid disks, there is no requirement to + * write io unit with FLUSH/FUA + */ +struct r5l_io_unit { + struct r5l_log *log; + + struct page *meta_page; /* store meta block */ + int meta_offset; /* current offset in meta_page */ + + struct bio *current_bio;/* current_bio accepting new data */ + + atomic_t pending_stripe;/* how many stripes not flushed to raid */ + u64 seq; /* seq number of the metablock */ + sector_t log_start; /* where the io_unit starts */ + sector_t log_end; /* where the io_unit ends */ + struct list_head log_sibling; /* log->running_ios */ + struct list_head stripe_list; /* stripes added to the io_unit */ + + int state; + bool need_split_bio; +}; + +/* r5l_io_unit state */ +enum r5l_io_unit_state { + IO_UNIT_RUNNING = 0, /* accepting new IO */ + IO_UNIT_IO_START = 1, /* io_unit bio start writing to log, + * don't accepting new bio */ + IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */ + IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ +}; + +static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) +{ + start += inc; + if (start >= log->device_size) + start = start - log->device_size; + return start; +} + +static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start, + sector_t end) +{ + if (end >= start) + return end - start; + else + return end + log->device_size - start; +} + +static bool r5l_has_free_space(struct r5l_log *log, sector_t size) +{ + sector_t used_size; + + used_size = r5l_ring_distance(log, log->last_checkpoint, + log->log_start); + + return log->device_size > used_size + size; +} + +static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io) +{ + __free_page(io->meta_page); + kmem_cache_free(log->io_kc, io); +} + +static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to, + enum r5l_io_unit_state state) +{ + struct r5l_io_unit *io; + + while (!list_empty(from)) { + io = list_first_entry(from, struct r5l_io_unit, log_sibling); + /* don't change list order */ + if (io->state >= state) + list_move_tail(&io->log_sibling, to); + else + break; + } +} + +static void __r5l_set_io_unit_state(struct r5l_io_unit *io, + enum r5l_io_unit_state state) +{ + if (WARN_ON(io->state >= state)) + return; + io->state = state; +} + +static void r5l_io_run_stripes(struct r5l_io_unit *io) +{ + struct stripe_head *sh, *next; + + list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { + list_del_init(&sh->log_list); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } +} + +static void r5l_log_run_stripes(struct r5l_log *log) +{ + struct r5l_io_unit *io, *next; + + assert_spin_locked(&log->io_list_lock); + + list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { + /* don't change list order */ + if (io->state < IO_UNIT_IO_END) + break; + + list_move_tail(&io->log_sibling, &log->finished_ios); + r5l_io_run_stripes(io); + } +} + +static void r5l_log_endio(struct bio *bio) +{ + struct r5l_io_unit *io = bio->bi_private; + struct r5l_log *log = io->log; + unsigned long flags; + + if (bio->bi_error) + md_error(log->rdev->mddev, log->rdev); + + bio_put(bio); + + spin_lock_irqsave(&log->io_list_lock, flags); + __r5l_set_io_unit_state(io, IO_UNIT_IO_END); + if (log->need_cache_flush) + r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios, + IO_UNIT_IO_END); + else + r5l_log_run_stripes(log); + spin_unlock_irqrestore(&log->io_list_lock, flags); + + if (log->need_cache_flush) + md_wakeup_thread(log->rdev->mddev->thread); +} + +static void r5l_submit_current_io(struct r5l_log *log) +{ + struct r5l_io_unit *io = log->current_io; + struct r5l_meta_block *block; + unsigned long flags; + u32 crc; + + if (!io) + return; + + block = page_address(io->meta_page); + block->meta_size = cpu_to_le32(io->meta_offset); + crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); + block->checksum = cpu_to_le32(crc); + + log->current_io = NULL; + spin_lock_irqsave(&log->io_list_lock, flags); + __r5l_set_io_unit_state(io, IO_UNIT_IO_START); + spin_unlock_irqrestore(&log->io_list_lock, flags); + + submit_bio(WRITE, io->current_bio); +} + +static struct bio *r5l_bio_alloc(struct r5l_log *log) +{ + struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES); + + bio->bi_rw = WRITE; + bio->bi_bdev = log->rdev->bdev; + bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; + + return bio; +} + +static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) +{ + log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); + + /* + * If we filled up the log device start from the beginning again, + * which will require a new bio. + * + * Note: for this to work properly the log size needs to me a multiple + * of BLOCK_SECTORS. + */ + if (log->log_start == 0) + io->need_split_bio = true; + + io->log_end = log->log_start; +} + +static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) +{ + struct r5l_io_unit *io; + struct r5l_meta_block *block; + + /* We can't handle memory allocate failure so far */ + io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL); + io->log = log; + INIT_LIST_HEAD(&io->log_sibling); + INIT_LIST_HEAD(&io->stripe_list); + io->state = IO_UNIT_RUNNING; + + io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO); + block = page_address(io->meta_page); + block->magic = cpu_to_le32(R5LOG_MAGIC); + block->version = R5LOG_VERSION; + block->seq = cpu_to_le64(log->seq); + block->position = cpu_to_le64(log->log_start); + + io->log_start = log->log_start; + io->meta_offset = sizeof(struct r5l_meta_block); + io->seq = log->seq++; + + io->current_bio = r5l_bio_alloc(log); + io->current_bio->bi_end_io = r5l_log_endio; + io->current_bio->bi_private = io; + bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); + + r5_reserve_log_entry(log, io); + + spin_lock_irq(&log->io_list_lock); + list_add_tail(&io->log_sibling, &log->running_ios); + spin_unlock_irq(&log->io_list_lock); + + return io; +} + +static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) +{ + if (log->current_io && + log->current_io->meta_offset + payload_size > PAGE_SIZE) + r5l_submit_current_io(log); + + if (!log->current_io) + log->current_io = r5l_new_meta(log); + return 0; +} + +static void r5l_append_payload_meta(struct r5l_log *log, u16 type, + sector_t location, + u32 checksum1, u32 checksum2, + bool checksum2_valid) +{ + struct r5l_io_unit *io = log->current_io; + struct r5l_payload_data_parity *payload; + + payload = page_address(io->meta_page) + io->meta_offset; + payload->header.type = cpu_to_le16(type); + payload->header.flags = cpu_to_le16(0); + payload->size = cpu_to_le32((1 + !!checksum2_valid) << + (PAGE_SHIFT - 9)); + payload->location = cpu_to_le64(location); + payload->checksum[0] = cpu_to_le32(checksum1); + if (checksum2_valid) + payload->checksum[1] = cpu_to_le32(checksum2); + + io->meta_offset += sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * (1 + !!checksum2_valid); +} + +static void r5l_append_payload_page(struct r5l_log *log, struct page *page) +{ + struct r5l_io_unit *io = log->current_io; + + if (io->need_split_bio) { + struct bio *prev = io->current_bio; + + io->current_bio = r5l_bio_alloc(log); + bio_chain(io->current_bio, prev); + + submit_bio(WRITE, prev); + } + + if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) + BUG(); + + r5_reserve_log_entry(log, io); +} + +static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, + int data_pages, int parity_pages) +{ + int i; + int meta_size; + struct r5l_io_unit *io; + + meta_size = + ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) + * data_pages) + + sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * parity_pages; + + r5l_get_meta(log, meta_size); + io = log->current_io; + + for (i = 0; i < sh->disks; i++) { + if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) + continue; + if (i == sh->pd_idx || i == sh->qd_idx) + continue; + r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, + raid5_compute_blocknr(sh, i, 0), + sh->dev[i].log_checksum, 0, false); + r5l_append_payload_page(log, sh->dev[i].page); + } + + if (sh->qd_idx >= 0) { + r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, + sh->sector, sh->dev[sh->pd_idx].log_checksum, + sh->dev[sh->qd_idx].log_checksum, true); + r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); + r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); + } else { + r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, + sh->sector, sh->dev[sh->pd_idx].log_checksum, + 0, false); + r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); + } + + list_add_tail(&sh->log_list, &io->stripe_list); + atomic_inc(&io->pending_stripe); + sh->log_io = io; +} + +static void r5l_wake_reclaim(struct r5l_log *log, sector_t space); +/* + * running in raid5d, where reclaim could wait for raid5d too (when it flushes + * data from log to raid disks), so we shouldn't wait for reclaim here + */ +int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) +{ + int write_disks = 0; + int data_pages, parity_pages; + int meta_size; + int reserve; + int i; + + if (!log) + return -EAGAIN; + /* Don't support stripe batch */ + if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || + test_bit(STRIPE_SYNCING, &sh->state)) { + /* the stripe is written to log, we start writing it to raid */ + clear_bit(STRIPE_LOG_TRAPPED, &sh->state); + return -EAGAIN; + } + + for (i = 0; i < sh->disks; i++) { + void *addr; + + if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) + continue; + write_disks++; + /* checksum is already calculated in last run */ + if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) + continue; + addr = kmap_atomic(sh->dev[i].page); + sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, + addr, PAGE_SIZE); + kunmap_atomic(addr); + } + parity_pages = 1 + !!(sh->qd_idx >= 0); + data_pages = write_disks - parity_pages; + + meta_size = + ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) + * data_pages) + + sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * parity_pages; + /* Doesn't work with very big raid array */ + if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE) + return -EINVAL; + + set_bit(STRIPE_LOG_TRAPPED, &sh->state); + /* + * The stripe must enter state machine again to finish the write, so + * don't delay. + */ + clear_bit(STRIPE_DELAYED, &sh->state); + atomic_inc(&sh->count); + + mutex_lock(&log->io_mutex); + /* meta + data */ + reserve = (1 + write_disks) << (PAGE_SHIFT - 9); + if (r5l_has_free_space(log, reserve)) + r5l_log_stripe(log, sh, data_pages, parity_pages); + else { + spin_lock(&log->no_space_stripes_lock); + list_add_tail(&sh->log_list, &log->no_space_stripes); + spin_unlock(&log->no_space_stripes_lock); + + r5l_wake_reclaim(log, reserve); + } + mutex_unlock(&log->io_mutex); + + return 0; +} + +void r5l_write_stripe_run(struct r5l_log *log) +{ + if (!log) + return; + mutex_lock(&log->io_mutex); + r5l_submit_current_io(log); + mutex_unlock(&log->io_mutex); +} + +int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) +{ + if (!log) + return -ENODEV; + /* + * we flush log disk cache first, then write stripe data to raid disks. + * So if bio is finished, the log disk cache is flushed already. The + * recovery guarantees we can recovery the bio from log disk, so we + * don't need to flush again + */ + if (bio->bi_iter.bi_size == 0) { + bio_endio(bio); + return 0; + } + bio->bi_rw &= ~REQ_FLUSH; + return -EAGAIN; +} + +/* This will run after log space is reclaimed */ +static void r5l_run_no_space_stripes(struct r5l_log *log) +{ + struct stripe_head *sh; + + spin_lock(&log->no_space_stripes_lock); + while (!list_empty(&log->no_space_stripes)) { + sh = list_first_entry(&log->no_space_stripes, + struct stripe_head, log_list); + list_del_init(&sh->log_list); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } + spin_unlock(&log->no_space_stripes_lock); +} + +static sector_t r5l_reclaimable_space(struct r5l_log *log) +{ + return r5l_ring_distance(log, log->last_checkpoint, + log->next_checkpoint); +} + +static bool r5l_complete_finished_ios(struct r5l_log *log) +{ + struct r5l_io_unit *io, *next; + bool found = false; + + assert_spin_locked(&log->io_list_lock); + + list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { + /* don't change list order */ + if (io->state < IO_UNIT_STRIPE_END) + break; + + log->next_checkpoint = io->log_start; + log->next_cp_seq = io->seq; + + list_del(&io->log_sibling); + r5l_free_io_unit(log, io); + + found = true; + } + + return found; +} + +static void __r5l_stripe_write_finished(struct r5l_io_unit *io) +{ + struct r5l_log *log = io->log; + unsigned long flags; + + spin_lock_irqsave(&log->io_list_lock, flags); + __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END); + + if (!r5l_complete_finished_ios(log)) { + spin_unlock_irqrestore(&log->io_list_lock, flags); + return; + } + + if (r5l_reclaimable_space(log) > log->max_free_space) + r5l_wake_reclaim(log, 0); + + spin_unlock_irqrestore(&log->io_list_lock, flags); + wake_up(&log->iounit_wait); +} + +void r5l_stripe_write_finished(struct stripe_head *sh) +{ + struct r5l_io_unit *io; + + io = sh->log_io; + sh->log_io = NULL; + + if (io && atomic_dec_and_test(&io->pending_stripe)) + __r5l_stripe_write_finished(io); +} + +static void r5l_log_flush_endio(struct bio *bio) +{ + struct r5l_log *log = container_of(bio, struct r5l_log, + flush_bio); + unsigned long flags; + struct r5l_io_unit *io; + + if (bio->bi_error) + md_error(log->rdev->mddev, log->rdev); + + spin_lock_irqsave(&log->io_list_lock, flags); + list_for_each_entry(io, &log->flushing_ios, log_sibling) + r5l_io_run_stripes(io); + list_splice_tail_init(&log->flushing_ios, &log->finished_ios); + spin_unlock_irqrestore(&log->io_list_lock, flags); +} + +/* + * Starting dispatch IO to raid. + * io_unit(meta) consists of a log. There is one situation we want to avoid. A + * broken meta in the middle of a log causes recovery can't find meta at the + * head of log. If operations require meta at the head persistent in log, we + * must make sure meta before it persistent in log too. A case is: + * + * stripe data/parity is in log, we start write stripe to raid disks. stripe + * data/parity must be persistent in log before we do the write to raid disks. + * + * The solution is we restrictly maintain io_unit list order. In this case, we + * only write stripes of an io_unit to raid disks till the io_unit is the first + * one whose data/parity is in log. + */ +void r5l_flush_stripe_to_raid(struct r5l_log *log) +{ + bool do_flush; + + if (!log || !log->need_cache_flush) + return; + + spin_lock_irq(&log->io_list_lock); + /* flush bio is running */ + if (!list_empty(&log->flushing_ios)) { + spin_unlock_irq(&log->io_list_lock); + return; + } + list_splice_tail_init(&log->io_end_ios, &log->flushing_ios); + do_flush = !list_empty(&log->flushing_ios); + spin_unlock_irq(&log->io_list_lock); + + if (!do_flush) + return; + bio_reset(&log->flush_bio); + log->flush_bio.bi_bdev = log->rdev->bdev; + log->flush_bio.bi_end_io = r5l_log_flush_endio; + submit_bio(WRITE_FLUSH, &log->flush_bio); +} + +static void r5l_write_super(struct r5l_log *log, sector_t cp); +static void r5l_write_super_and_discard_space(struct r5l_log *log, + sector_t end) +{ + struct block_device *bdev = log->rdev->bdev; + struct mddev *mddev; + + r5l_write_super(log, end); + + if (!blk_queue_discard(bdev_get_queue(bdev))) + return; + + mddev = log->rdev->mddev; + /* + * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and + * wait for this thread to finish. This thread waits for + * MD_CHANGE_PENDING clear, which is supposed to be done in + * md_check_recovery(). md_check_recovery() tries to get + * reconfig_mutex. Since r5l_quiesce already holds the mutex, + * md_check_recovery() fails, so the PENDING never get cleared. The + * in_teardown check workaround this issue. + */ + if (!log->in_teardown) { + set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_PENDING, &mddev->flags) || + log->in_teardown); + /* + * r5l_quiesce could run after in_teardown check and hold + * mutex first. Superblock might get updated twice. + */ + if (log->in_teardown) + md_update_sb(mddev, 1); + } else { + WARN_ON(!mddev_is_locked(mddev)); + md_update_sb(mddev, 1); + } + + /* discard IO error really doesn't matter, ignore it */ + if (log->last_checkpoint < end) { + blkdev_issue_discard(bdev, + log->last_checkpoint + log->rdev->data_offset, + end - log->last_checkpoint, GFP_NOIO, 0); + } else { + blkdev_issue_discard(bdev, + log->last_checkpoint + log->rdev->data_offset, + log->device_size - log->last_checkpoint, + GFP_NOIO, 0); + blkdev_issue_discard(bdev, log->rdev->data_offset, end, + GFP_NOIO, 0); + } +} + + +static void r5l_do_reclaim(struct r5l_log *log) +{ + sector_t reclaim_target = xchg(&log->reclaim_target, 0); + sector_t reclaimable; + sector_t next_checkpoint; + u64 next_cp_seq; + + spin_lock_irq(&log->io_list_lock); + /* + * move proper io_unit to reclaim list. We should not change the order. + * reclaimable/unreclaimable io_unit can be mixed in the list, we + * shouldn't reuse space of an unreclaimable io_unit + */ + while (1) { + reclaimable = r5l_reclaimable_space(log); + if (reclaimable >= reclaim_target || + (list_empty(&log->running_ios) && + list_empty(&log->io_end_ios) && + list_empty(&log->flushing_ios) && + list_empty(&log->finished_ios))) + break; + + md_wakeup_thread(log->rdev->mddev->thread); + wait_event_lock_irq(log->iounit_wait, + r5l_reclaimable_space(log) > reclaimable, + log->io_list_lock); + } + + next_checkpoint = log->next_checkpoint; + next_cp_seq = log->next_cp_seq; + spin_unlock_irq(&log->io_list_lock); + + BUG_ON(reclaimable < 0); + if (reclaimable == 0) + return; + + /* + * write_super will flush cache of each raid disk. We must write super + * here, because the log area might be reused soon and we don't want to + * confuse recovery + */ + r5l_write_super_and_discard_space(log, next_checkpoint); + + mutex_lock(&log->io_mutex); + log->last_checkpoint = next_checkpoint; + log->last_cp_seq = next_cp_seq; + mutex_unlock(&log->io_mutex); + + r5l_run_no_space_stripes(log); +} + +static void r5l_reclaim_thread(struct md_thread *thread) +{ + struct mddev *mddev = thread->mddev; + struct r5conf *conf = mddev->private; + struct r5l_log *log = conf->log; + + if (!log) + return; + r5l_do_reclaim(log); +} + +static void r5l_wake_reclaim(struct r5l_log *log, sector_t space) +{ + unsigned long target; + unsigned long new = (unsigned long)space; /* overflow in theory */ + + do { + target = log->reclaim_target; + if (new < target) + return; + } while (cmpxchg(&log->reclaim_target, target, new) != target); + md_wakeup_thread(log->reclaim_thread); +} + +void r5l_quiesce(struct r5l_log *log, int state) +{ + struct mddev *mddev; + if (!log || state == 2) + return; + if (state == 0) { + log->in_teardown = 0; + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, + log->rdev->mddev, "reclaim"); + } else if (state == 1) { + /* + * at this point all stripes are finished, so io_unit is at + * least in STRIPE_END state + */ + log->in_teardown = 1; + /* make sure r5l_write_super_and_discard_space exits */ + mddev = log->rdev->mddev; + wake_up(&mddev->sb_wait); + r5l_wake_reclaim(log, -1L); + md_unregister_thread(&log->reclaim_thread); + r5l_do_reclaim(log); + } +} + +bool r5l_log_disk_error(struct r5conf *conf) +{ + /* don't allow write if journal disk is missing */ + if (!conf->log) + return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); + return test_bit(Faulty, &conf->log->rdev->flags); +} + +struct r5l_recovery_ctx { + struct page *meta_page; /* current meta */ + sector_t meta_total_blocks; /* total size of current meta and data */ + sector_t pos; /* recovery position */ + u64 seq; /* recovery position seq */ +}; + +static int r5l_read_meta_block(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct page *page = ctx->meta_page; + struct r5l_meta_block *mb; + u32 crc, stored_crc; + + if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false)) + return -EIO; + + mb = page_address(page); + stored_crc = le32_to_cpu(mb->checksum); + mb->checksum = 0; + + if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || + le64_to_cpu(mb->seq) != ctx->seq || + mb->version != R5LOG_VERSION || + le64_to_cpu(mb->position) != ctx->pos) + return -EINVAL; + + crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); + if (stored_crc != crc) + return -EINVAL; + + if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) + return -EINVAL; + + ctx->meta_total_blocks = BLOCK_SECTORS; + + return 0; +} + +static int r5l_recovery_flush_one_stripe(struct r5l_log *log, + struct r5l_recovery_ctx *ctx, + sector_t stripe_sect, + int *offset, sector_t *log_offset) +{ + struct r5conf *conf = log->rdev->mddev->private; + struct stripe_head *sh; + struct r5l_payload_data_parity *payload; + int disk_index; + + sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0); + while (1) { + payload = page_address(ctx->meta_page) + *offset; + + if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { + raid5_compute_sector(conf, + le64_to_cpu(payload->location), 0, + &disk_index, sh); + + sync_page_io(log->rdev, *log_offset, PAGE_SIZE, + sh->dev[disk_index].page, READ, false); + sh->dev[disk_index].log_checksum = + le32_to_cpu(payload->checksum[0]); + set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); + ctx->meta_total_blocks += BLOCK_SECTORS; + } else { + disk_index = sh->pd_idx; + sync_page_io(log->rdev, *log_offset, PAGE_SIZE, + sh->dev[disk_index].page, READ, false); + sh->dev[disk_index].log_checksum = + le32_to_cpu(payload->checksum[0]); + set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); + + if (sh->qd_idx >= 0) { + disk_index = sh->qd_idx; + sync_page_io(log->rdev, + r5l_ring_add(log, *log_offset, BLOCK_SECTORS), + PAGE_SIZE, sh->dev[disk_index].page, + READ, false); + sh->dev[disk_index].log_checksum = + le32_to_cpu(payload->checksum[1]); + set_bit(R5_Wantwrite, + &sh->dev[disk_index].flags); + } + ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; + } + + *log_offset = r5l_ring_add(log, *log_offset, + le32_to_cpu(payload->size)); + *offset += sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) + break; + } + + for (disk_index = 0; disk_index < sh->disks; disk_index++) { + void *addr; + u32 checksum; + + if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) + continue; + addr = kmap_atomic(sh->dev[disk_index].page); + checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); + kunmap_atomic(addr); + if (checksum != sh->dev[disk_index].log_checksum) + goto error; + } + + for (disk_index = 0; disk_index < sh->disks; disk_index++) { + struct md_rdev *rdev, *rrdev; + + if (!test_and_clear_bit(R5_Wantwrite, + &sh->dev[disk_index].flags)) + continue; + + /* in case device is broken */ + rdev = rcu_dereference(conf->disks[disk_index].rdev); + if (rdev) + sync_page_io(rdev, stripe_sect, PAGE_SIZE, + sh->dev[disk_index].page, WRITE, false); + rrdev = rcu_dereference(conf->disks[disk_index].replacement); + if (rrdev) + sync_page_io(rrdev, stripe_sect, PAGE_SIZE, + sh->dev[disk_index].page, WRITE, false); + } + raid5_release_stripe(sh); + return 0; + +error: + for (disk_index = 0; disk_index < sh->disks; disk_index++) + sh->dev[disk_index].flags = 0; + raid5_release_stripe(sh); + return -EINVAL; +} + +static int r5l_recovery_flush_one_meta(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct r5conf *conf = log->rdev->mddev->private; + struct r5l_payload_data_parity *payload; + struct r5l_meta_block *mb; + int offset; + sector_t log_offset; + sector_t stripe_sector; + + mb = page_address(ctx->meta_page); + offset = sizeof(struct r5l_meta_block); + log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); + + while (offset < le32_to_cpu(mb->meta_size)) { + int dd; + + payload = (void *)mb + offset; + stripe_sector = raid5_compute_sector(conf, + le64_to_cpu(payload->location), 0, &dd, NULL); + if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector, + &offset, &log_offset)) + return -EINVAL; + } + return 0; +} + +/* copy data/parity from log to raid disks */ +static void r5l_recovery_flush_log(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + while (1) { + if (r5l_read_meta_block(log, ctx)) + return; + if (r5l_recovery_flush_one_meta(log, ctx)) + return; + ctx->seq++; + ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); + } +} + +static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, + u64 seq) +{ + struct page *page; + struct r5l_meta_block *mb; + u32 crc; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + return -ENOMEM; + mb = page_address(page); + mb->magic = cpu_to_le32(R5LOG_MAGIC); + mb->version = R5LOG_VERSION; + mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); + mb->seq = cpu_to_le64(seq); + mb->position = cpu_to_le64(pos); + crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); + mb->checksum = cpu_to_le32(crc); + + if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) { + __free_page(page); + return -EIO; + } + __free_page(page); + return 0; +} + +static int r5l_recovery_log(struct r5l_log *log) +{ + struct r5l_recovery_ctx ctx; + + ctx.pos = log->last_checkpoint; + ctx.seq = log->last_cp_seq; + ctx.meta_page = alloc_page(GFP_KERNEL); + if (!ctx.meta_page) + return -ENOMEM; + + r5l_recovery_flush_log(log, &ctx); + __free_page(ctx.meta_page); + + /* + * we did a recovery. Now ctx.pos points to an invalid meta block. New + * log will start here. but we can't let superblock point to last valid + * meta block. The log might looks like: + * | meta 1| meta 2| meta 3| + * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If + * superblock points to meta 1, we write a new valid meta 2n. if crash + * happens again, new recovery will start from meta 1. Since meta 2n is + * valid now, recovery will think meta 3 is valid, which is wrong. + * The solution is we create a new meta in meta2 with its seq == meta + * 1's seq + 10 and let superblock points to meta2. The same recovery will + * not think meta 3 is a valid meta, because its seq doesn't match + */ + if (ctx.seq > log->last_cp_seq + 1) { + int ret; + + ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10); + if (ret) + return ret; + log->seq = ctx.seq + 11; + log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); + r5l_write_super(log, ctx.pos); + } else { + log->log_start = ctx.pos; + log->seq = ctx.seq; + } + return 0; +} + +static void r5l_write_super(struct r5l_log *log, sector_t cp) +{ + struct mddev *mddev = log->rdev->mddev; + + log->rdev->journal_tail = cp; + set_bit(MD_CHANGE_DEVS, &mddev->flags); +} + +static int r5l_load_log(struct r5l_log *log) +{ + struct md_rdev *rdev = log->rdev; + struct page *page; + struct r5l_meta_block *mb; + sector_t cp = log->rdev->journal_tail; + u32 stored_crc, expected_crc; + bool create_super = false; + int ret; + + /* Make sure it's valid */ + if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) + cp = 0; + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) { + ret = -EIO; + goto ioerr; + } + mb = page_address(page); + + if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || + mb->version != R5LOG_VERSION) { + create_super = true; + goto create; + } + stored_crc = le32_to_cpu(mb->checksum); + mb->checksum = 0; + expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); + if (stored_crc != expected_crc) { + create_super = true; + goto create; + } + if (le64_to_cpu(mb->position) != cp) { + create_super = true; + goto create; + } +create: + if (create_super) { + log->last_cp_seq = prandom_u32(); + cp = 0; + /* + * Make sure super points to correct address. Log might have + * data very soon. If super hasn't correct log tail address, + * recovery can't find the log + */ + r5l_write_super(log, cp); + } else + log->last_cp_seq = le64_to_cpu(mb->seq); + + log->device_size = round_down(rdev->sectors, BLOCK_SECTORS); + log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT; + if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) + log->max_free_space = RECLAIM_MAX_FREE_SPACE; + log->last_checkpoint = cp; + + __free_page(page); + + return r5l_recovery_log(log); +ioerr: + __free_page(page); + return ret; +} + +int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) +{ + struct r5l_log *log; + + if (PAGE_SIZE != 4096) + return -EINVAL; + log = kzalloc(sizeof(*log), GFP_KERNEL); + if (!log) + return -ENOMEM; + log->rdev = rdev; + + log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0); + + log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, + sizeof(rdev->mddev->uuid)); + + mutex_init(&log->io_mutex); + + spin_lock_init(&log->io_list_lock); + INIT_LIST_HEAD(&log->running_ios); + INIT_LIST_HEAD(&log->io_end_ios); + INIT_LIST_HEAD(&log->flushing_ios); + INIT_LIST_HEAD(&log->finished_ios); + bio_init(&log->flush_bio); + + log->io_kc = KMEM_CACHE(r5l_io_unit, 0); + if (!log->io_kc) + goto io_kc; + + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, + log->rdev->mddev, "reclaim"); + if (!log->reclaim_thread) + goto reclaim_thread; + init_waitqueue_head(&log->iounit_wait); + + INIT_LIST_HEAD(&log->no_space_stripes); + spin_lock_init(&log->no_space_stripes_lock); + + if (r5l_load_log(log)) + goto error; + + conf->log = log; + return 0; +error: + md_unregister_thread(&log->reclaim_thread); +reclaim_thread: + kmem_cache_destroy(log->io_kc); +io_kc: + kfree(log); + return -EINVAL; +} + +void r5l_exit_log(struct r5l_log *log) +{ + md_unregister_thread(&log->reclaim_thread); + kmem_cache_destroy(log->io_kc); + kfree(log); +} diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 643d217..704ef7f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -223,18 +223,14 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh, return slot; } -static void return_io(struct bio *return_bi) +static void return_io(struct bio_list *return_bi) { - struct bio *bi = return_bi; - while (bi) { - - return_bi = bi->bi_next; - bi->bi_next = NULL; + struct bio *bi; + while ((bi = bio_list_pop(return_bi)) != NULL) { bi->bi_iter.bi_size = 0; trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), bi, 0); - bio_endio(bi, 0); - bi = return_bi; + bio_endio(bi); } } @@ -357,7 +353,7 @@ static void release_inactive_stripe_list(struct r5conf *conf, struct list_head *list = &temp_inactive_list[size - 1]; /* - * We don't hold any lock here yet, get_active_stripe() might + * We don't hold any lock here yet, raid5_get_active_stripe() might * remove stripes from the list */ if (!list_empty_careful(list)) { @@ -417,7 +413,7 @@ static int release_stripe_list(struct r5conf *conf, return count; } -static void release_stripe(struct stripe_head *sh) +void raid5_release_stripe(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; unsigned long flags; @@ -662,9 +658,9 @@ static int has_failed(struct r5conf *conf) return 0; } -static struct stripe_head * -get_active_stripe(struct r5conf *conf, sector_t sector, - int previous, int noblock, int noquiesce) +struct stripe_head * +raid5_get_active_stripe(struct r5conf *conf, sector_t sector, + int previous, int noblock, int noquiesce) { struct stripe_head *sh; int hash = stripe_hash_locks_hash(sector); @@ -759,6 +755,10 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) /* Only freshly new full stripe normal write stripe can be added to a batch list */ static bool stripe_can_batch(struct stripe_head *sh) { + struct r5conf *conf = sh->raid_conf; + + if (conf->log) + return false; return test_bit(STRIPE_BATCH_READY, &sh->state) && !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && is_full_stripe_write(sh); @@ -862,7 +862,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh unlock_out: unlock_two_stripes(head, sh); out: - release_stripe(head); + raid5_release_stripe(head); } /* Determine if 'data_offset' or 'new_data_offset' should be used @@ -887,9 +887,9 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) } static void -raid5_end_read_request(struct bio *bi, int error); +raid5_end_read_request(struct bio *bi); static void -raid5_end_write_request(struct bio *bi, int error); +raid5_end_write_request(struct bio *bi); static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { @@ -899,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) might_sleep(); + if (r5l_write_stripe(conf->log, sh) == 0) + return; for (i = disks; i--; ) { int rw; int replace_only = 0; @@ -1177,7 +1179,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, static void ops_complete_biofill(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - struct bio *return_bi = NULL; + struct bio_list return_bi = BIO_EMPTY_LIST; int i; pr_debug("%s: stripe %llu\n", __func__, @@ -1201,20 +1203,18 @@ static void ops_complete_biofill(void *stripe_head_ref) while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - if (!raid5_dec_bi_active_stripes(rbi)) { - rbi->bi_next = return_bi; - return_bi = rbi; - } + if (!raid5_dec_bi_active_stripes(rbi)) + bio_list_add(&return_bi, rbi); rbi = rbi2; } } } clear_bit(STRIPE_BIOFILL_RUN, &sh->state); - return_io(return_bi); + return_io(&return_bi); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } static void ops_run_biofill(struct stripe_head *sh) @@ -1277,7 +1277,7 @@ static void ops_complete_compute(void *stripe_head_ref) if (sh->check_state == check_state_compute_run) sh->check_state = check_state_compute_result; set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } /* return a pointer to the address conversion region of the scribble buffer */ @@ -1703,7 +1703,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) } set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } static void @@ -1861,7 +1861,7 @@ static void ops_complete_check(void *stripe_head_ref) sh->check_state = check_state_check_result; set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) @@ -2023,7 +2023,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) /* we just created an active stripe so... */ atomic_inc(&conf->active_stripes); - release_stripe(sh); + raid5_release_stripe(sh); conf->max_nr_stripes++; return 1; } @@ -2242,7 +2242,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) if (!p) err = -ENOMEM; } - release_stripe(nsh); + raid5_release_stripe(nsh); } /* critical section pass, GFP_NOIO no longer needed */ @@ -2256,7 +2256,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) static int drop_one_stripe(struct r5conf *conf) { struct stripe_head *sh; - int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; + int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; spin_lock_irq(conf->hash_locks + hash); sh = get_free_stripe(conf, hash); @@ -2277,17 +2277,15 @@ static void shrink_stripes(struct r5conf *conf) drop_one_stripe(conf)) ; - if (conf->slab_cache) - kmem_cache_destroy(conf->slab_cache); + kmem_cache_destroy(conf->slab_cache); conf->slab_cache = NULL; } -static void raid5_end_read_request(struct bio * bi, int error) +static void raid5_end_read_request(struct bio * bi) { struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; - int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); char b[BDEVNAME_SIZE]; struct md_rdev *rdev = NULL; sector_t s; @@ -2296,9 +2294,9 @@ static void raid5_end_read_request(struct bio * bi, int error) if (bi == &sh->dev[i].req) break; - pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", + pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), - uptodate); + bi->bi_error); if (i == disks) { BUG(); return; @@ -2317,7 +2315,7 @@ static void raid5_end_read_request(struct bio * bi, int error) s = sh->sector + rdev->new_data_offset; else s = sh->sector + rdev->data_offset; - if (uptodate) { + if (!bi->bi_error) { set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { /* Note that this cannot happen on a @@ -2402,16 +2400,15 @@ static void raid5_end_read_request(struct bio * bi, int error) rdev_dec_pending(rdev, conf->mddev); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } -static void raid5_end_write_request(struct bio *bi, int error) +static void raid5_end_write_request(struct bio *bi) { struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; struct md_rdev *uninitialized_var(rdev); - int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); sector_t first_bad; int bad_sectors; int replacement = 0; @@ -2434,23 +2431,23 @@ static void raid5_end_write_request(struct bio *bi, int error) break; } } - pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", + pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), - uptodate); + bi->bi_error); if (i == disks) { BUG(); return; } if (replacement) { - if (!uptodate) + if (bi->bi_error) md_error(conf->mddev, rdev); else if (is_badblock(rdev, sh->sector, STRIPE_SECTORS, &first_bad, &bad_sectors)) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { - if (!uptodate) { + if (bi->bi_error) { set_bit(STRIPE_DEGRADED, &sh->state); set_bit(WriteErrorSeen, &rdev->flags); set_bit(R5_WriteError, &sh->dev[i].flags); @@ -2471,20 +2468,18 @@ static void raid5_end_write_request(struct bio *bi, int error) } rdev_dec_pending(rdev, conf->mddev); - if (sh->batch_head && !uptodate && !replacement) + if (sh->batch_head && bi->bi_error && !replacement) set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); if (sh->batch_head && sh != sh->batch_head) - release_stripe(sh->batch_head); + raid5_release_stripe(sh->batch_head); } -static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); - static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; @@ -2500,7 +2495,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) dev->rreq.bi_private = sh; dev->flags = 0; - dev->sector = compute_blocknr(sh, i, previous); + dev->sector = raid5_compute_blocknr(sh, i, previous); } static void error(struct mddev *mddev, struct md_rdev *rdev) @@ -2519,6 +2514,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); printk(KERN_ALERT "md/raid:%s: Disk failure on %s, disabling device.\n" "md/raid:%s: Operation continuing on %d devices.\n", @@ -2532,9 +2528,9 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) * Input: a 'big' sector number, * Output: index of the data and parity disk, and the sector # in them. */ -static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, - int previous, int *dd_idx, - struct stripe_head *sh) +sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, + int previous, int *dd_idx, + struct stripe_head *sh) { sector_t stripe, stripe2; sector_t chunk_number; @@ -2734,7 +2730,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, return new_sector; } -static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) +sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) { struct r5conf *conf = sh->raid_conf; int raid_disks = sh->disks; @@ -3071,7 +3067,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, static void handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks, - struct bio **return_bi) + struct bio_list *return_bi) { int i; BUG_ON(sh->batch_head); @@ -3106,17 +3102,19 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (bi) bitmap_end = 1; + r5l_stripe_write_finished(sh); + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); + + bi->bi_error = -EIO; if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); - bi->bi_next = *return_bi; - *return_bi = bi; + bio_list_add(return_bi, bi); } bi = nextbi; } @@ -3136,11 +3134,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); + + bi->bi_error = -EIO; if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); - bi->bi_next = *return_bi; - *return_bi = bi; + bio_list_add(return_bi, bi); } bi = bi2; } @@ -3149,6 +3147,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, * the data has not reached the cache yet. */ if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && + s->failed > conf->max_degraded && (!test_bit(R5_Insync, &sh->dev[i].flags) || test_bit(R5_ReadError, &sh->dev[i].flags))) { spin_lock_irq(&sh->stripe_lock); @@ -3157,15 +3156,16 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); + if (bi) + s->to_read--; while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (!raid5_dec_bi_active_stripes(bi)) { - bi->bi_next = *return_bi; - *return_bi = bi; - } + + bi->bi_error = -EIO; + if (!raid5_dec_bi_active_stripes(bi)) + bio_list_add(return_bi, bi); bi = nextbi; } } @@ -3177,6 +3177,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, */ clear_bit(R5_LOCKED, &sh->dev[i].flags); } + s->to_write = 0; + s->written = 0; if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) if (atomic_dec_and_test(&conf->pending_full_writes)) @@ -3308,7 +3310,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, */ return 0; - for (i = 0; i < s->failed; i++) { + for (i = 0; i < s->failed && i < 2; i++) { if (fdev[i]->towrite && !test_bit(R5_UPTODATE, &fdev[i]->flags) && !test_bit(R5_OVERWRITE, &fdev[i]->flags)) @@ -3332,7 +3334,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, sh->sector < sh->raid_conf->mddev->recovery_cp) /* reconstruct-write isn't being forced */ return 0; - for (i = 0; i < s->failed; i++) { + for (i = 0; i < s->failed && i < 2; i++) { if (s->failed_num[i] != sh->pd_idx && s->failed_num[i] != sh->qd_idx && !test_bit(R5_UPTODATE, &fdev[i]->flags) && @@ -3444,7 +3446,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, * never LOCKED, so we don't need to test 'failed' directly. */ static void handle_stripe_clean_event(struct r5conf *conf, - struct stripe_head *sh, int disks, struct bio **return_bi) + struct stripe_head *sh, int disks, struct bio_list *return_bi) { int i; struct r5dev *dev; @@ -3478,8 +3480,7 @@ returnbi: wbi2 = r5_next_bio(wbi, dev->sector); if (!raid5_dec_bi_active_stripes(wbi)) { md_write_end(conf->mddev); - wbi->bi_next = *return_bi; - *return_bi = wbi; + bio_list_add(return_bi, wbi); } wbi = wbi2; } @@ -3503,8 +3504,12 @@ returnbi: WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); WARN_ON(dev->page != dev->orig_page); } + + r5l_stripe_write_finished(sh); + if (!discard_pending && test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { + int hash; clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); if (sh->qd_idx >= 0) { @@ -3518,16 +3523,17 @@ returnbi: * no updated data, so remove it from hash list and the stripe * will be reinitialized */ - spin_lock_irq(&conf->device_lock); unhash: + hash = sh->hash_lock_index; + spin_lock_irq(conf->hash_locks + hash); remove_hash(sh); + spin_unlock_irq(conf->hash_locks + hash); if (head_sh->batch_head) { sh = list_first_entry(&sh->batch_list, struct stripe_head, batch_list); if (sh != head_sh) goto unhash; } - spin_unlock_irq(&conf->device_lock); sh = head_sh; if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) @@ -3943,10 +3949,10 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) struct stripe_head *sh2; struct async_submit_ctl submit; - sector_t bn = compute_blocknr(sh, i, 1); + sector_t bn = raid5_compute_blocknr(sh, i, 1); sector_t s = raid5_compute_sector(conf, bn, 0, &dd_idx, NULL); - sh2 = get_active_stripe(conf, s, 0, 1, 1); + sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe * have been requested. When later blocks @@ -3956,7 +3962,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) if (!test_bit(STRIPE_EXPANDING, &sh2->state) || test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { /* must have already done this block */ - release_stripe(sh2); + raid5_release_stripe(sh2); continue; } @@ -3977,7 +3983,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) set_bit(STRIPE_EXPAND_READY, &sh2->state); set_bit(STRIPE_HANDLE, &sh2->state); } - release_stripe(sh2); + raid5_release_stripe(sh2); } /* done submitting copies, wait for them to complete */ @@ -4012,6 +4018,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; s->failed_num[0] = -1; s->failed_num[1] = -1; + s->log_failed = r5l_log_disk_error(conf); /* Now to look around and see what can be done */ rcu_read_lock(); @@ -4263,7 +4270,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, if (handle_flags == 0 || sh->state & handle_flags) set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } spin_lock_irq(&head_sh->stripe_lock); head_sh->batch_head = NULL; @@ -4324,6 +4331,9 @@ static void handle_stripe(struct stripe_head *sh) analyse_stripe(sh, &s); + if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) + goto finish; + if (s.handle_bad_blocks) { set_bit(STRIPE_HANDLE, &sh->state); goto finish; @@ -4352,7 +4362,7 @@ static void handle_stripe(struct stripe_head *sh) /* check if the array has lost more than max_degraded devices and, * if so, some requests might need to be failed. */ - if (s.failed > conf->max_degraded) { + if (s.failed > conf->max_degraded || s.log_failed) { sh->check_state = 0; sh->reconstruct_state = 0; break_stripe_batch_list(sh, 0); @@ -4510,7 +4520,7 @@ static void handle_stripe(struct stripe_head *sh) /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { struct stripe_head *sh_src - = get_active_stripe(conf, sh->sector, 1, 1, 1); + = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { /* sh cannot be written until sh_src has been read. * so arrange for sh to be delayed a little @@ -4520,11 +4530,11 @@ static void handle_stripe(struct stripe_head *sh) if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh_src->state)) atomic_inc(&conf->preread_active_stripes); - release_stripe(sh_src); + raid5_release_stripe(sh_src); goto finish; } if (sh_src) - release_stripe(sh_src); + raid5_release_stripe(sh_src); sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); @@ -4612,7 +4622,15 @@ finish: md_wakeup_thread(conf->mddev->thread); } - return_io(s.return_bi); + if (!bio_list_empty(&s.return_bi)) { + if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->return_bi, &s.return_bi); + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(conf->mddev->thread); + } else + return_io(&s.return_bi); + } clear_bit_unlock(STRIPE_ACTIVE, &sh->state); } @@ -4669,43 +4687,14 @@ static int raid5_congested(struct mddev *mddev, int bits) return 0; } -/* We want read requests to align with chunks where possible, - * but write requests don't need to. - */ -static int raid5_mergeable_bvec(struct mddev *mddev, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); - int max; - unsigned int chunk_sectors = mddev->chunk_sectors; - unsigned int bio_sectors = bvm->bi_size >> 9; - - /* - * always allow writes to be mergeable, read as well if array - * is degraded as we'll go through stripe cache anyway. - */ - if ((bvm->bi_rw & 1) == WRITE || mddev->degraded) - return biovec->bv_len; - - if (mddev->new_chunk_sectors < mddev->chunk_sectors) - chunk_sectors = mddev->new_chunk_sectors; - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; - if (max < 0) max = 0; - if (max <= biovec->bv_len && bio_sectors == 0) - return biovec->bv_len; - else - return max; -} - static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) { + struct r5conf *conf = mddev->private; sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); - unsigned int chunk_sectors = mddev->chunk_sectors; + unsigned int chunk_sectors; unsigned int bio_sectors = bio_sectors(bio); - if (mddev->new_chunk_sectors < mddev->chunk_sectors) - chunk_sectors = mddev->new_chunk_sectors; + chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); } @@ -4756,13 +4745,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) * first). * If the read failed.. */ -static void raid5_align_endio(struct bio *bi, int error) +static void raid5_align_endio(struct bio *bi) { struct bio* raid_bi = bi->bi_private; struct mddev *mddev; struct r5conf *conf; - int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); struct md_rdev *rdev; + int error = bi->bi_error; bio_put(bi); @@ -4773,10 +4762,10 @@ static void raid5_align_endio(struct bio *bi, int error) rdev_dec_pending(rdev, conf->mddev); - if (!error && uptodate) { + if (!error) { trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), raid_bi, 0); - bio_endio(raid_bi, 0); + bio_endio(raid_bi); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); return; @@ -4787,26 +4776,7 @@ static void raid5_align_endio(struct bio *bi, int error) add_bio_to_retry(raid_bi, conf); } -static int bio_fits_rdev(struct bio *bi) -{ - struct request_queue *q = bdev_get_queue(bi->bi_bdev); - - if (bio_sectors(bi) > queue_max_sectors(q)) - return 0; - blk_recount_segments(q, bi); - if (bi->bi_phys_segments > queue_max_segments(q)) - return 0; - - if (q->merge_bvec_fn) - /* it's too hard to apply the merge_bvec_fn at this stage, - * just just give up - */ - return 0; - - return 1; -} - -static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) +static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) { struct r5conf *conf = mddev->private; int dd_idx; @@ -4815,7 +4785,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) sector_t end_sector; if (!in_chunk_boundary(mddev, raid_bio)) { - pr_debug("chunk_aligned_read : non aligned\n"); + pr_debug("%s: non aligned\n", __func__); return 0; } /* @@ -4857,13 +4827,11 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) rcu_read_unlock(); raid_bio->bi_next = (void*)rdev; align_bi->bi_bdev = rdev->bdev; - __clear_bit(BIO_SEG_VALID, &align_bi->bi_flags); + bio_clear_flag(align_bi, BIO_SEG_VALID); - if (!bio_fits_rdev(align_bi) || - is_badblock(rdev, align_bi->bi_iter.bi_sector, + if (is_badblock(rdev, align_bi->bi_iter.bi_sector, bio_sectors(align_bi), &first_bad, &bad_sectors)) { - /* too big in some way, or has a known bad block */ bio_put(align_bi); rdev_dec_pending(rdev, mddev); return 0; @@ -4892,6 +4860,31 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) } } +static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) +{ + struct bio *split; + + do { + sector_t sector = raid_bio->bi_iter.bi_sector; + unsigned chunk_sects = mddev->chunk_sectors; + unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); + + if (sectors < bio_sectors(raid_bio)) { + split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set); + bio_chain(split, raid_bio); + } else + split = raid_bio; + + if (!raid5_read_one_chunk(mddev, split)) { + if (split != raid_bio) + generic_make_request(raid_bio); + return split; + } + } while (split != raid_bio); + + return NULL; +} + /* __get_priority_stripe - get the next stripe to process * * Full stripe writes are allowed to pass preread active stripes up until @@ -5033,7 +5026,7 @@ static void release_stripe_plug(struct mddev *mddev, struct raid5_plug_cb *cb; if (!blk_cb) { - release_stripe(sh); + raid5_release_stripe(sh); return; } @@ -5049,7 +5042,7 @@ static void release_stripe_plug(struct mddev *mddev, if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) list_add_tail(&sh->lru, &cb->list); else - release_stripe(sh); + raid5_release_stripe(sh); } static void make_discard_request(struct mddev *mddev, struct bio *bi) @@ -5084,12 +5077,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) DEFINE_WAIT(w); int d; again: - sh = get_active_stripe(conf, logical_sector, 0, 0, 0); + sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); if (test_bit(STRIPE_SYNCING, &sh->state)) { - release_stripe(sh); + raid5_release_stripe(sh); schedule(); goto again; } @@ -5101,7 +5094,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) if (sh->dev[d].towrite || sh->dev[d].toread) { set_bit(R5_Overlap, &sh->dev[d].flags); spin_unlock_irq(&sh->stripe_lock); - release_stripe(sh); + raid5_release_stripe(sh); schedule(); goto again; } @@ -5140,7 +5133,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) remaining = raid5_dec_bi_active_stripes(bi); if (remaining == 0) { md_write_end(mddev); - bio_endio(bi, 0); + bio_endio(bi); } } @@ -5157,8 +5150,15 @@ static void make_request(struct mddev *mddev, struct bio * bi) bool do_prepare; if (unlikely(bi->bi_rw & REQ_FLUSH)) { - md_flush_request(mddev, bi); - return; + int ret = r5l_handle_flush_request(conf->log, bi); + + if (ret == 0) + return; + if (ret == -ENODEV) { + md_flush_request(mddev, bi); + return; + } + /* ret == -EAGAIN, fallback */ } md_write_start(mddev, bi); @@ -5169,9 +5169,11 @@ static void make_request(struct mddev *mddev, struct bio * bi) * data on failed drives. */ if (rw == READ && mddev->degraded == 0 && - mddev->reshape_position == MaxSector && - chunk_aligned_read(mddev,bi)) - return; + mddev->reshape_position == MaxSector) { + bi = chunk_aligned_read(mddev, bi); + if (!bi) + return; + } if (unlikely(bi->bi_rw & REQ_DISCARD)) { make_discard_request(mddev, bi); @@ -5229,7 +5231,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) (unsigned long long)new_sector, (unsigned long long)logical_sector); - sh = get_active_stripe(conf, new_sector, previous, + sh = raid5_get_active_stripe(conf, new_sector, previous, (bi->bi_rw&RWA_MASK), 0); if (sh) { if (unlikely(previous)) { @@ -5250,7 +5252,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) must_retry = 1; spin_unlock_irq(&conf->device_lock); if (must_retry) { - release_stripe(sh); + raid5_release_stripe(sh); schedule(); do_prepare = true; goto retry; @@ -5260,14 +5262,14 @@ static void make_request(struct mddev *mddev, struct bio * bi) /* Might have got the wrong stripe_head * by accident */ - release_stripe(sh); + raid5_release_stripe(sh); goto retry; } if (rw == WRITE && logical_sector >= mddev->suspend_lo && logical_sector < mddev->suspend_hi) { - release_stripe(sh); + raid5_release_stripe(sh); /* As the suspend_* range is controlled by * userspace, we want an interruptible * wait. @@ -5290,7 +5292,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) * and wait a while */ md_wakeup_thread(mddev->thread); - release_stripe(sh); + raid5_release_stripe(sh); schedule(); do_prepare = true; goto retry; @@ -5304,7 +5306,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) release_stripe_plug(mddev, sh); } else { /* cannot get stripe for read-ahead, just give-up */ - clear_bit(BIO_UPTODATE, &bi->bi_flags); + bi->bi_error = -EIO; break; } } @@ -5318,7 +5320,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), bi, 0); - bio_endio(bi, 0); + bio_endio(bi); } } @@ -5347,6 +5349,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sector_t stripe_addr; int reshape_sectors; struct list_head stripes; + sector_t retn; if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ @@ -5354,6 +5357,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk conf->reshape_progress < raid5_size(mddev, 0, 0)) { sector_nr = raid5_size(mddev, 0, 0) - conf->reshape_progress; + } else if (mddev->reshape_backwards && + conf->reshape_progress == MaxSector) { + /* shouldn't happen, but just in case, finish up.*/ + sector_nr = MaxSector; } else if (!mddev->reshape_backwards && conf->reshape_progress > 0) sector_nr = conf->reshape_progress; @@ -5362,7 +5369,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk mddev->curr_resync_completed = sector_nr; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); *skipped = 1; - return sector_nr; + retn = sector_nr; + goto finish; } } @@ -5370,10 +5378,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk * If old and new chunk sizes differ, we need to process the * largest of these */ - if (mddev->new_chunk_sectors > mddev->chunk_sectors) - reshape_sectors = mddev->new_chunk_sectors; - else - reshape_sectors = mddev->chunk_sectors; + + reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); /* We update the metadata at least every 10 seconds, or when * the data about to be copied would over-write the source of @@ -5388,11 +5394,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->reshape_backwards) { - writepos -= min_t(sector_t, reshape_sectors, writepos); + BUG_ON(writepos < reshape_sectors); + writepos -= reshape_sectors; readpos += reshape_sectors; safepos += reshape_sectors; } else { writepos += reshape_sectors; + /* readpos and safepos are worst-case calculations. + * A negative number is overly pessimistic, and causes + * obvious problems for unsigned storage. So clip to 0. + */ readpos -= min_t(sector_t, reshape_sectors, readpos); safepos -= min_t(sector_t, reshape_sectors, safepos); } @@ -5468,7 +5479,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; int skipped_disk = 0; - sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); + sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -5481,7 +5492,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk if (conf->level == 6 && j == sh->qd_idx) continue; - s = compute_blocknr(sh, j, 0); + s = raid5_compute_blocknr(sh, j, 0); if (s < raid5_size(mddev, 0, 0)) { skipped_disk = 1; continue; @@ -5517,10 +5528,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { - sh = get_active_stripe(conf, first_sector, 1, 0, 1); + sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); first_sector += STRIPE_SECTORS; } /* Now that the sources are clearly marked, we can release @@ -5529,13 +5540,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk while (!list_empty(&stripes)) { sh = list_entry(stripes.next, struct stripe_head, lru); list_del_init(&sh->lru); - release_stripe(sh); + raid5_release_stripe(sh); } /* If this takes us to the resync_max point where we have to pause, * then we need to write out the superblock. */ sector_nr += reshape_sectors; - if ((sector_nr - mddev->curr_resync_completed) * 2 + retn = reshape_sectors; +finish: + if (mddev->curr_resync_completed > mddev->resync_max || + (sector_nr - mddev->curr_resync_completed) * 2 >= mddev->resync_max - mddev->curr_resync_completed) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, @@ -5560,7 +5574,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } ret: - return reshape_sectors; + return retn; } static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) @@ -5622,11 +5636,11 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } - bitmap_cond_end_sync(mddev->bitmap, sector_nr); + bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); - sh = get_active_stripe(conf, sector_nr, 0, 1, 0); + sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, 0, 0, 0); + sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -5650,7 +5664,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int set_bit(STRIPE_SYNC_REQUESTED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); return STRIPE_SECTORS; } @@ -5689,7 +5703,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, 0, 1, 1); + sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); if (!sh) { /* failed to get a stripe - must wait */ @@ -5699,7 +5713,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) } if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { - release_stripe(sh); + raid5_release_stripe(sh); raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; @@ -5707,14 +5721,14 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); handle_stripe(sh); - release_stripe(sh); + raid5_release_stripe(sh); handled++; } remaining = raid5_dec_bi_active_stripes(raid_bio); if (remaining == 0) { trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), raid_bio, 0); - bio_endio(raid_bio, 0); + bio_endio(raid_bio); } if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); @@ -5737,8 +5751,12 @@ static int handle_active_stripes(struct r5conf *conf, int group, for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) if (!list_empty(temp_inactive_list + i)) break; - if (i == NR_STRIPE_HASH_LOCKS) + if (i == NR_STRIPE_HASH_LOCKS) { + spin_unlock_irq(&conf->device_lock); + r5l_flush_stripe_to_raid(conf->log); + spin_lock_irq(&conf->device_lock); return batch_size; + } release_inactive = true; } spin_unlock_irq(&conf->device_lock); @@ -5746,6 +5764,7 @@ static int handle_active_stripes(struct r5conf *conf, int group, release_inactive_stripe_list(conf, temp_inactive_list, NR_STRIPE_HASH_LOCKS); + r5l_flush_stripe_to_raid(conf->log); if (release_inactive) { spin_lock_irq(&conf->device_lock); return 0; @@ -5753,6 +5772,7 @@ static int handle_active_stripes(struct r5conf *conf, int group, for (i = 0; i < batch_size; i++) handle_stripe(batch[i]); + r5l_write_stripe_run(conf->log); cond_resched(); @@ -5816,6 +5836,18 @@ static void raid5d(struct md_thread *thread) md_check_recovery(mddev); + if (!bio_list_empty(&conf->return_bi) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + struct bio_list tmp = BIO_EMPTY_LIST; + spin_lock_irq(&conf->device_lock); + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + bio_list_merge(&tmp, &conf->return_bi); + bio_list_init(&conf->return_bi); + } + spin_unlock_irq(&conf->device_lock); + return_io(&tmp); + } + blk_start_plug(&plug); handled = 0; spin_lock_irq(&conf->device_lock); @@ -5874,6 +5906,8 @@ static void raid5d(struct md_thread *thread) mutex_unlock(&conf->cache_size_mutex); } + r5l_flush_stripe_to_raid(conf->log); + async_tx_issue_pending_all(); blk_finish_plug(&plug); @@ -6256,8 +6290,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) /* size is defined by the smallest of previous and new size */ raid_disks = min(conf->raid_disks, conf->previous_raid_disks); - sectors &= ~((sector_t)mddev->chunk_sectors - 1); - sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); + sectors &= ~((sector_t)conf->chunk_sectors - 1); + sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); return sectors * (raid_disks - conf->max_degraded); } @@ -6311,8 +6345,11 @@ static void raid5_free_percpu(struct r5conf *conf) static void free_conf(struct r5conf *conf) { + if (conf->log) + r5l_exit_log(conf->log); if (conf->shrinker.seeks) unregister_shrinker(&conf->shrinker); + free_thread_groups(conf); shrink_stripes(conf); raid5_free_percpu(conf); @@ -6388,7 +6425,8 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink, if (mutex_trylock(&conf->cache_size_mutex)) { ret= 0; - while (ret < sc->nr_to_scan) { + while (ret < sc->nr_to_scan && + conf->max_nr_stripes > conf->min_nr_stripes) { if (drop_one_stripe(conf) == 0) { ret = SHRINK_STOP; break; @@ -6474,6 +6512,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); + bio_list_init(&conf->return_bi); init_llist_head(&conf->released_stripes); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -6523,7 +6562,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) rdev_for_each(rdev, mddev) { raid_disk = rdev->raid_disk; if (raid_disk >= max_disks - || raid_disk < 0) + || raid_disk < 0 || test_bit(Journal, &rdev->flags)) continue; disk = conf->disks + raid_disk; @@ -6563,6 +6602,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (conf->reshape_progress != MaxSector) { conf->prev_chunk_sectors = mddev->chunk_sectors; conf->prev_algo = mddev->layout; + } else { + conf->prev_chunk_sectors = conf->chunk_sectors; + conf->prev_algo = conf->algorithm; } conf->min_nr_stripes = NR_STRIPES; @@ -6640,6 +6682,7 @@ static int run(struct mddev *mddev) int working_disks = 0; int dirty_parity_disks = 0; struct md_rdev *rdev; + struct md_rdev *journal_dev = NULL; sector_t reshape_offset = 0; int i; long long min_offset_diff = 0; @@ -6652,6 +6695,11 @@ static int run(struct mddev *mddev) rdev_for_each(rdev, mddev) { long long diff; + + if (test_bit(Journal, &rdev->flags)) { + journal_dev = rdev; + continue; + } if (rdev->raid_disk < 0) continue; diff = (rdev->new_data_offset - rdev->data_offset); @@ -6682,6 +6730,14 @@ static int run(struct mddev *mddev) sector_t here_new, here_old; int old_disks; int max_degraded = (mddev->level == 6 ? 2 : 1); + int chunk_sectors; + int new_data_disks; + + if (journal_dev) { + printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n", + mdname(mddev)); + return -EINVAL; + } if (mddev->new_level != mddev->level) { printk(KERN_ERR "md/raid:%s: unsupported reshape " @@ -6693,28 +6749,25 @@ static int run(struct mddev *mddev) /* reshape_position must be on a new-stripe boundary, and one * further up in new geometry must map after here in old * geometry. + * If the chunk sizes are different, then as we perform reshape + * in units of the largest of the two, reshape_position needs + * be a multiple of the largest chunk size times new data disks. */ here_new = mddev->reshape_position; - if (sector_div(here_new, mddev->new_chunk_sectors * - (mddev->raid_disks - max_degraded))) { + chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); + new_data_disks = mddev->raid_disks - max_degraded; + if (sector_div(here_new, chunk_sectors * new_data_disks)) { printk(KERN_ERR "md/raid:%s: reshape_position not " "on a stripe boundary\n", mdname(mddev)); return -EINVAL; } - reshape_offset = here_new * mddev->new_chunk_sectors; + reshape_offset = here_new * chunk_sectors; /* here_new is the stripe we will write to */ here_old = mddev->reshape_position; - sector_div(here_old, mddev->chunk_sectors * - (old_disks-max_degraded)); + sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); /* here_old is the first stripe that we might need to read * from */ if (mddev->delta_disks == 0) { - if ((here_new * mddev->new_chunk_sectors != - here_old * mddev->chunk_sectors)) { - printk(KERN_ERR "md/raid:%s: reshape position is" - " confused - aborting\n", mdname(mddev)); - return -EINVAL; - } /* We cannot be sure it is safe to start an in-place * reshape. It is only safe if user-space is monitoring * and taking constant backups. @@ -6733,10 +6786,10 @@ static int run(struct mddev *mddev) return -EINVAL; } } else if (mddev->reshape_backwards - ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= - here_old * mddev->chunk_sectors) - : (here_new * mddev->new_chunk_sectors >= - here_old * mddev->chunk_sectors + (-min_offset_diff))) { + ? (here_new * chunk_sectors + min_offset_diff <= + here_old * chunk_sectors) + : (here_new * chunk_sectors >= + here_old * chunk_sectors + (-min_offset_diff))) { /* Reading from the same stripe as writing to - bad */ printk(KERN_ERR "md/raid:%s: reshape_position too early for " "auto-recovery - aborting.\n", @@ -6761,6 +6814,13 @@ static int run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !journal_dev) { + printk(KERN_ERR "md/raid:%s: journal disk is missing, force array readonly\n", + mdname(mddev)); + mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); + } + conf->min_offset_diff = min_offset_diff; mddev->thread = conf->thread; conf->thread = NULL; @@ -6964,6 +7024,14 @@ static int run(struct mddev *mddev) mddev->queue); } + if (journal_dev) { + char b[BDEVNAME_SIZE]; + + printk(KERN_INFO"md/raid:%s: using device %s as journal\n", + mdname(mddev), bdevname(journal_dev->bdev, b)); + r5l_init_log(conf, journal_dev); + } + return 0; abort: md_unregister_thread(&mddev->thread); @@ -6988,7 +7056,7 @@ static void status(struct seq_file *seq, struct mddev *mddev) int i; seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, - mddev->chunk_sectors / 2, mddev->layout); + conf->chunk_sectors / 2, mddev->layout); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", @@ -7073,6 +7141,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct disk_info *p = conf->disks + number; print_raid5_conf(conf); + if (test_bit(Journal, &rdev->flags)) { + /* + * journal disk is not removable, but we need give a chance to + * update superblock of other disks. Otherwise journal disk + * will be considered as 'fresh' + */ + set_bit(MD_CHANGE_DEVS, &mddev->flags); + return -EINVAL; + } if (rdev == p->rdev) rdevp = &p->rdev; else if (rdev == p->replacement) @@ -7135,6 +7212,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) int first = 0; int last = conf->raid_disks - 1; + if (test_bit(Journal, &rdev->flags)) + return -EINVAL; if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; @@ -7194,7 +7273,11 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) * worth it. */ sector_t newsize; - sectors &= ~((sector_t)mddev->chunk_sectors - 1); + struct r5conf *conf = mddev->private; + + if (conf->log) + return -EINVAL; + sectors &= ~((sector_t)conf->chunk_sectors - 1); newsize = raid5_size(mddev, sectors, mddev->raid_disks); if (mddev->external_size && mddev->array_sectors > newsize) @@ -7245,6 +7328,8 @@ static int check_reshape(struct mddev *mddev) { struct r5conf *conf = mddev->private; + if (conf->log) + return -EINVAL; if (mddev->delta_disks == 0 && mddev->new_layout == mddev->layout && mddev->new_chunk_sectors == mddev->chunk_sectors) @@ -7433,6 +7518,7 @@ static void end_reshape(struct r5conf *conf) rdev->data_offset = rdev->new_data_offset; smp_wmb(); conf->reshape_progress = MaxSector; + conf->mddev->reshape_position = MaxSector; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); @@ -7520,6 +7606,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) unlock_all_device_hash_locks_irq(conf); break; } + r5l_quiesce(conf->log, state); } static void *raid45_takeover_raid0(struct mddev *mddev, int level) @@ -7778,7 +7865,6 @@ static struct md_personality raid6_personality = .quiesce = raid5_quiesce, .takeover = raid6_takeover, .congested = raid5_congested, - .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid5_personality = { @@ -7802,7 +7888,6 @@ static struct md_personality raid5_personality = .quiesce = raid5_quiesce, .takeover = raid5_takeover, .congested = raid5_congested, - .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid4_personality = @@ -7827,7 +7912,6 @@ static struct md_personality raid4_personality = .quiesce = raid5_quiesce, .takeover = raid4_takeover, .congested = raid5_congested, - .mergeable_bvec = raid5_mergeable_bvec, }; static int __init raid5_init(void) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index d051442..a415e1c 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -223,6 +223,9 @@ struct stripe_head { struct stripe_head *batch_head; /* protected by stripe lock */ spinlock_t batch_lock; /* only header's lock is useful */ struct list_head batch_list; /* protected by head's batch lock*/ + + struct r5l_io_unit *log_io; + struct list_head log_list; /** * struct stripe_operations * @target - STRIPE_OP_COMPUTE_BLK target @@ -244,6 +247,7 @@ struct stripe_head { struct bio *toread, *read, *towrite, *written; sector_t sector; /* sector of this page */ unsigned long flags; + u32 log_checksum; } dev[1]; /* allocated with extra space depending of RAID geometry */ }; @@ -265,9 +269,10 @@ struct stripe_head_state { int dec_preread_active; unsigned long ops_request; - struct bio *return_bi; + struct bio_list return_bi; struct md_rdev *blocked_rdev; int handle_bad_blocks; + int log_failed; }; /* Flags for struct r5dev.flags */ @@ -340,6 +345,7 @@ enum { STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add * to batch yet. */ + STRIPE_LOG_TRAPPED, /* trapped into log */ }; #define STRIPE_EXPAND_SYNC_FLAGS \ @@ -476,6 +482,9 @@ struct r5conf { int skip_copy; /* Don't copy data from bio to stripe cache */ struct list_head *last_hold; /* detect hold_list promotions */ + /* bios to have bi_end_io called after metadata is synced */ + struct bio_list return_bi; + atomic_t reshape_stripes; /* stripes with pending writes for reshape */ /* unfortunately we need two cache names as we temporarily have * two caches. @@ -540,6 +549,7 @@ struct r5conf { struct r5worker_group *worker_groups; int group_cnt; int worker_cnt_per_group; + struct r5l_log *log; }; @@ -606,4 +616,21 @@ static inline int algorithm_is_DDF(int layout) extern void md_raid5_kick_device(struct r5conf *conf); extern int raid5_set_cache_size(struct mddev *mddev, int size); +extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); +extern void raid5_release_stripe(struct stripe_head *sh); +extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, + int previous, int *dd_idx, + struct stripe_head *sh); +extern struct stripe_head * +raid5_get_active_stripe(struct r5conf *conf, sector_t sector, + int previous, int noblock, int noquiesce); +extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); +extern void r5l_exit_log(struct r5l_log *log); +extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); +extern void r5l_write_stripe_run(struct r5l_log *log); +extern void r5l_flush_stripe_to_raid(struct r5l_log *log); +extern void r5l_stripe_write_finished(struct stripe_head *sh); +extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); +extern void r5l_quiesce(struct r5l_log *log, int state); +extern bool r5l_log_disk_error(struct r5conf *conf); #endif |