Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig | 2
-rw-r--r--  drivers/md/Makefile | 2
-rw-r--r--  drivers/md/bcache/bcache.h | 18
-rw-r--r--  drivers/md/bcache/btree.c | 10
-rw-r--r--  drivers/md/bcache/closure.h | 2
-rw-r--r--  drivers/md/bcache/io.c | 101
-rw-r--r--  drivers/md/bcache/journal.c | 12
-rw-r--r--  drivers/md/bcache/movinggc.c | 8
-rw-r--r--  drivers/md/bcache/request.c | 43
-rw-r--r--  drivers/md/bcache/super.c | 48
-rw-r--r--  drivers/md/bcache/util.h | 5
-rw-r--r--  drivers/md/bcache/writeback.c | 14
-rw-r--r--  drivers/md/bitmap.c | 17
-rw-r--r--  drivers/md/bitmap.h | 4
-rw-r--r--  drivers/md/dm-bio-prison.c | 6
-rw-r--r--  drivers/md/dm-bufio.c | 44
-rw-r--r--  drivers/md/dm-cache-metadata.c | 10
-rw-r--r--  drivers/md/dm-cache-policy-cleaner.c | 4
-rw-r--r--  drivers/md/dm-cache-policy-mq.c | 4
-rw-r--r--  drivers/md/dm-cache-policy-smq.c | 114
-rw-r--r--  drivers/md/dm-cache-target.c | 87
-rw-r--r--  drivers/md/dm-crypt.c | 53
-rw-r--r--  drivers/md/dm-delay.c | 23
-rw-r--r--  drivers/md/dm-era-target.c | 30
-rw-r--r--  drivers/md/dm-exception-store.c | 8
-rw-r--r--  drivers/md/dm-exception-store.h | 5
-rw-r--r--  drivers/md/dm-flakey.c | 40
-rw-r--r--  drivers/md/dm-io.c | 11
-rw-r--r--  drivers/md/dm-ioctl.c | 4
-rw-r--r--  drivers/md/dm-linear.c | 43
-rw-r--r--  drivers/md/dm-log-userspace-base.c | 3
-rw-r--r--  drivers/md/dm-log-writes.c | 51
-rw-r--r--  drivers/md/dm-mpath.c | 61
-rw-r--r--  drivers/md/dm-raid.c | 22
-rw-r--r--  drivers/md/dm-raid1.c | 32
-rw-r--r--  drivers/md/dm-region-hash.c | 6
-rw-r--r--  drivers/md/dm-snap-persistent.c | 34
-rw-r--r--  drivers/md/dm-snap-transient.c | 3
-rw-r--r--  drivers/md/dm-snap.c | 49
-rw-r--r--  drivers/md/dm-stats.c | 14
-rw-r--r--  drivers/md/dm-stripe.c | 31
-rw-r--r--  drivers/md/dm-switch.c | 25
-rw-r--r--  drivers/md/dm-table.c | 109
-rw-r--r--  drivers/md/dm-thin-metadata.c | 20
-rw-r--r--  drivers/md/dm-thin.c | 167
-rw-r--r--  drivers/md/dm-verity.c | 57
-rw-r--r--  drivers/md/dm-zero.c | 2
-rw-r--r--  drivers/md/dm.c | 356
-rw-r--r--  drivers/md/dm.h | 2
-rw-r--r--  drivers/md/faulty.c | 4
-rw-r--r--  drivers/md/linear.c | 45
-rw-r--r--  drivers/md/md-cluster.c | 363
-rw-r--r--  drivers/md/md-cluster.h | 12
-rw-r--r--  drivers/md/md.c | 666
-rw-r--r--  drivers/md/md.h | 29
-rw-r--r--  drivers/md/multipath.c | 38
-rw-r--r--  drivers/md/persistent-data/dm-array.c | 4
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c | 12
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.h | 2
-rw-r--r--  drivers/md/persistent-data/dm-btree-internal.h | 8
-rw-r--r--  drivers/md/persistent-data/dm-btree-remove.c | 92
-rw-r--r--  drivers/md/persistent-data/dm-btree-spine.c | 57
-rw-r--r--  drivers/md/persistent-data/dm-btree.c | 19
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c | 32
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.c | 4
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.h | 2
-rw-r--r--  drivers/md/raid0.c | 133
-rw-r--r--  drivers/md/raid0.h | 2
-rw-r--r--  drivers/md/raid1.c | 218
-rw-r--r--  drivers/md/raid1.h | 12
-rw-r--r--  drivers/md/raid10.c | 265
-rw-r--r--  drivers/md/raid10.h | 6
-rw-r--r--  drivers/md/raid5-cache.c | 1191
-rw-r--r--  drivers/md/raid5.c | 498
-rw-r--r--  drivers/md/raid5.h | 29
75 files changed, 3328 insertions, 2231 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index d5415eed..3e01e6f 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -393,7 +393,7 @@ config DM_MULTIPATH
# of SCSI_DH if the latter isn't defined but if
# it is, DM_MULTIPATH must depend on it. We get a build
# error if SCSI_DH=m and DM_MULTIPATH=y
- depends on SCSI_DH || !SCSI_DH
+ depends on !SCSI_DH || SCSI
---help---
Allow volume managers to support multipath hardware.
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 462f443..f34979c 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -17,7 +17,7 @@ dm-cache-smq-y += dm-cache-policy-smq.o
dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
md-mod-y += md.o bitmap.o
-raid456-y += raid5.o
+raid456-y += raid5.o raid5-cache.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 04f7bc2..6b420a5 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -243,19 +243,6 @@ struct keybuf {
DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
};
-struct bio_split_pool {
- struct bio_set *bio_split;
- mempool_t *bio_split_hook;
-};
-
-struct bio_split_hook {
- struct closure cl;
- struct bio_split_pool *p;
- struct bio *bio;
- bio_end_io_t *bi_end_io;
- void *bi_private;
-};
-
struct bcache_device {
struct closure cl;
@@ -288,8 +275,6 @@ struct bcache_device {
int (*cache_miss)(struct btree *, struct search *,
struct bio *, unsigned);
int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long);
-
- struct bio_split_pool bio_split_hook;
};
struct io {
@@ -454,8 +439,6 @@ struct cache {
atomic_long_t meta_sectors_written;
atomic_long_t btree_sectors_written;
atomic_long_t sectors_written;
-
- struct bio_split_pool bio_split_hook;
};
struct gc_stat {
@@ -873,7 +856,6 @@ void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *);
void bch_bbio_free(struct bio *, struct cache_set *);
struct bio *bch_bbio_alloc(struct cache_set *);
-void bch_generic_make_request(struct bio *, struct bio_split_pool *);
void __bch_submit_bbio(struct bio *, struct cache_set *);
void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 00cde40..83392f8 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -278,7 +278,7 @@ err:
goto out;
}
-static void btree_node_read_endio(struct bio *bio, int error)
+static void btree_node_read_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
closure_put(cl);
@@ -305,7 +305,7 @@ static void bch_btree_node_read(struct btree *b)
bch_submit_bbio(bio, b->c, &b->key, 0);
closure_sync(&cl);
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ if (bio->bi_error)
set_btree_node_io_error(b);
bch_bbio_free(bio, b->c);
@@ -371,15 +371,15 @@ static void btree_node_write_done(struct closure *cl)
__btree_node_write_done(cl);
}
-static void btree_node_write_endio(struct bio *bio, int error)
+static void btree_node_write_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
struct btree *b = container_of(cl, struct btree, io);
- if (error)
+ if (bio->bi_error)
set_btree_node_io_error(b);
- bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
+ bch_bbio_count_io_errors(b->c, bio, bio->bi_error, "writing btree");
closure_put(cl);
}
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 79a6d63..782cc2c 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -38,7 +38,7 @@
* they are running owned by the thread that is running them. Otherwise, suppose
* you submit some bios and wish to have a function run when they all complete:
*
- * foo_endio(struct bio *bio, int error)
+ * foo_endio(struct bio *bio)
* {
* closure_put(cl);
* }
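
The bcache and dm hunks throughout this diff track the same block-layer API change: bio completion callbacks lose their error argument and read the status from bio->bi_error instead. A minimal sketch of the old and new handler shape, with foo_endio and handle_error as purely illustrative names not taken from the patch:

    /* old convention: completion status passed as a second argument */
    static void foo_endio_old(struct bio *bio, int error)
    {
            struct closure *cl = bio->bi_private;

            if (error)
                    handle_error(error);
            closure_put(cl);
    }

    /* new convention: completion status lives in bio->bi_error */
    static void foo_endio(struct bio *bio)
    {
            struct closure *cl = bio->bi_private;

            if (bio->bi_error)
                    handle_error(bio->bi_error);
            closure_put(cl);
    }

Code that completes a bio with an error now sets bio->bi_error before calling bio_endio(bio), which is the pattern the dm-bio-prison and dm-cache hunks below follow.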
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index bf6a9ca..86a0bb8 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -11,105 +11,6 @@
#include <linux/blkdev.h>
-static unsigned bch_bio_max_sectors(struct bio *bio)
-{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- struct bio_vec bv;
- struct bvec_iter iter;
- unsigned ret = 0, seg = 0;
-
- if (bio->bi_rw & REQ_DISCARD)
- return min(bio_sectors(bio), q->limits.max_discard_sectors);
-
- bio_for_each_segment(bv, bio, iter) {
- struct bvec_merge_data bvm = {
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_iter.bi_sector,
- .bi_size = ret << 9,
- .bi_rw = bio->bi_rw,
- };
-
- if (seg == min_t(unsigned, BIO_MAX_PAGES,
- queue_max_segments(q)))
- break;
-
- if (q->merge_bvec_fn &&
- q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len)
- break;
-
- seg++;
- ret += bv.bv_len >> 9;
- }
-
- ret = min(ret, queue_max_sectors(q));
-
- WARN_ON(!ret);
- ret = max_t(int, ret, bio_iovec(bio).bv_len >> 9);
-
- return ret;
-}
-
-static void bch_bio_submit_split_done(struct closure *cl)
-{
- struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
-
- s->bio->bi_end_io = s->bi_end_io;
- s->bio->bi_private = s->bi_private;
- bio_endio(s->bio, 0);
-
- closure_debug_destroy(&s->cl);
- mempool_free(s, s->p->bio_split_hook);
-}
-
-static void bch_bio_submit_split_endio(struct bio *bio, int error)
-{
- struct closure *cl = bio->bi_private;
- struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
-
- if (error)
- clear_bit(BIO_UPTODATE, &s->bio->bi_flags);
-
- bio_put(bio);
- closure_put(cl);
-}
-
-void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
-{
- struct bio_split_hook *s;
- struct bio *n;
-
- if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
- goto submit;
-
- if (bio_sectors(bio) <= bch_bio_max_sectors(bio))
- goto submit;
-
- s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
- closure_init(&s->cl, NULL);
-
- s->bio = bio;
- s->p = p;
- s->bi_end_io = bio->bi_end_io;
- s->bi_private = bio->bi_private;
- bio_get(bio);
-
- do {
- n = bio_next_split(bio, bch_bio_max_sectors(bio),
- GFP_NOIO, s->p->bio_split);
-
- n->bi_end_io = bch_bio_submit_split_endio;
- n->bi_private = &s->cl;
-
- closure_get(&s->cl);
- generic_make_request(n);
- } while (n != bio);
-
- continue_at(&s->cl, bch_bio_submit_split_done, NULL);
- return;
-submit:
- generic_make_request(bio);
-}
-
/* Bios with headers */
void bch_bbio_free(struct bio *bio, struct cache_set *c)
@@ -139,7 +40,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev;
b->submit_time_us = local_clock_us();
- closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0));
+ closure_bio_submit(bio, bio->bi_private);
}
void bch_submit_bbio(struct bio *bio, struct cache_set *c,
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 418607a..29eba72 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -24,7 +24,7 @@
* bit.
*/
-static void journal_read_endio(struct bio *bio, int error)
+static void journal_read_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
closure_put(cl);
@@ -61,7 +61,7 @@ reread: left = ca->sb.bucket_size - offset;
bio->bi_private = &cl;
bch_bio_map(bio, data);
- closure_bio_submit(bio, &cl, ca);
+ closure_bio_submit(bio, &cl);
closure_sync(&cl);
/* This function could be simpler now since we no longer write
@@ -401,7 +401,7 @@ retry:
#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
-static void journal_discard_endio(struct bio *bio, int error)
+static void journal_discard_endio(struct bio *bio)
{
struct journal_device *ja =
container_of(bio, struct journal_device, discard_bio);
@@ -547,11 +547,11 @@ void bch_journal_next(struct journal *j)
pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
}
-static void journal_write_endio(struct bio *bio, int error)
+static void journal_write_endio(struct bio *bio)
{
struct journal_write *w = bio->bi_private;
- cache_set_err_on(error, w->c, "journal io error");
+ cache_set_err_on(bio->bi_error, w->c, "journal io error");
closure_put(&w->c->journal.io);
}
@@ -648,7 +648,7 @@ static void journal_write_unlocked(struct closure *cl)
spin_unlock(&c->journal.lock);
while ((bio = bio_list_pop(&list)))
- closure_bio_submit(bio, cl, c->cache[0]);
+ closure_bio_submit(bio, cl);
continue_at(cl, journal_write_done, NULL);
}
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index cd74903..b929fc9 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -60,20 +60,20 @@ static void write_moving_finish(struct closure *cl)
closure_return_with_destructor(cl, moving_io_destructor);
}
-static void read_moving_endio(struct bio *bio, int error)
+static void read_moving_endio(struct bio *bio)
{
struct bbio *b = container_of(bio, struct bbio, bio);
struct moving_io *io = container_of(bio->bi_private,
struct moving_io, cl);
- if (error)
- io->op.error = error;
+ if (bio->bi_error)
+ io->op.error = bio->bi_error;
else if (!KEY_DIRTY(&b->key) &&
ptr_stale(io->op.c, &b->key, 0)) {
io->op.error = -EINTR;
}
- bch_bbio_endio(io->op.c, bio, error, "reading data to move");
+ bch_bbio_endio(io->op.c, bio, bio->bi_error, "reading data to move");
}
static void moving_init(struct moving_io *io)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index f292790..8e9877b 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -173,22 +173,22 @@ static void bch_data_insert_error(struct closure *cl)
bch_data_insert_keys(cl);
}
-static void bch_data_insert_endio(struct bio *bio, int error)
+static void bch_data_insert_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
- if (error) {
+ if (bio->bi_error) {
/* TODO: We could try to recover from this. */
if (op->writeback)
- op->error = error;
+ op->error = bio->bi_error;
else if (!op->replace)
set_closure_fn(cl, bch_data_insert_error, op->wq);
else
set_closure_fn(cl, NULL, NULL);
}
- bch_bbio_endio(op->c, bio, error, "writing data to cache");
+ bch_bbio_endio(op->c, bio, bio->bi_error, "writing data to cache");
}
static void bch_data_insert_start(struct closure *cl)
@@ -477,7 +477,7 @@ struct search {
struct data_insert_op iop;
};
-static void bch_cache_read_endio(struct bio *bio, int error)
+static void bch_cache_read_endio(struct bio *bio)
{
struct bbio *b = container_of(bio, struct bbio, bio);
struct closure *cl = bio->bi_private;
@@ -490,15 +490,15 @@ static void bch_cache_read_endio(struct bio *bio, int error)
* from the backing device.
*/
- if (error)
- s->iop.error = error;
+ if (bio->bi_error)
+ s->iop.error = bio->bi_error;
else if (!KEY_DIRTY(&b->key) &&
ptr_stale(s->iop.c, &b->key, 0)) {
atomic_long_inc(&s->iop.c->cache_read_races);
s->iop.error = -EINTR;
}
- bch_bbio_endio(s->iop.c, bio, error, "reading from cache");
+ bch_bbio_endio(s->iop.c, bio, bio->bi_error, "reading from cache");
}
/*
@@ -591,13 +591,13 @@ static void cache_lookup(struct closure *cl)
/* Common code for the make_request functions */
-static void request_endio(struct bio *bio, int error)
+static void request_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
- if (error) {
+ if (bio->bi_error) {
struct search *s = container_of(cl, struct search, cl);
- s->iop.error = error;
+ s->iop.error = bio->bi_error;
/* Only cache read errors are recoverable */
s->recoverable = false;
}
@@ -613,7 +613,8 @@ static void bio_complete(struct search *s)
&s->d->disk->part0, s->start_time);
trace_bcache_request_end(s->d, s->orig_bio);
- bio_endio(s->orig_bio, s->iop.error);
+ s->orig_bio->bi_error = s->iop.error;
+ bio_endio(s->orig_bio);
s->orig_bio = NULL;
}
}
@@ -718,7 +719,7 @@ static void cached_dev_read_error(struct closure *cl)
/* XXX: invalidate cache */
- closure_bio_submit(bio, cl, s->d);
+ closure_bio_submit(bio, cl);
}
continue_at(cl, cached_dev_cache_miss_done, NULL);
@@ -841,7 +842,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
s->cache_miss = miss;
s->iop.bio = cache_bio;
bio_get(cache_bio);
- closure_bio_submit(cache_bio, &s->cl, s->d);
+ closure_bio_submit(cache_bio, &s->cl);
return ret;
out_put:
@@ -849,7 +850,7 @@ out_put:
out_submit:
miss->bi_end_io = request_endio;
miss->bi_private = &s->cl;
- closure_bio_submit(miss, &s->cl, s->d);
+ closure_bio_submit(miss, &s->cl);
return ret;
}
@@ -914,7 +915,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
if (!(bio->bi_rw & REQ_DISCARD) ||
blk_queue_discard(bdev_get_queue(dc->bdev)))
- closure_bio_submit(bio, cl, s->d);
+ closure_bio_submit(bio, cl);
} else if (s->iop.writeback) {
bch_writeback_add(dc);
s->iop.bio = bio;
@@ -929,12 +930,12 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
flush->bi_end_io = request_endio;
flush->bi_private = cl;
- closure_bio_submit(flush, cl, s->d);
+ closure_bio_submit(flush, cl);
}
} else {
s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
- closure_bio_submit(bio, cl, s->d);
+ closure_bio_submit(bio, cl);
}
closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
@@ -950,7 +951,7 @@ static void cached_dev_nodata(struct closure *cl)
bch_journal_meta(s->iop.c, cl);
/* If it's a flush, we send the flush to the backing device too */
- closure_bio_submit(bio, cl, s->d);
+ closure_bio_submit(bio, cl);
continue_at(cl, cached_dev_bio_complete, NULL);
}
@@ -992,9 +993,9 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
} else {
if ((bio->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(dc->bdev)))
- bio_endio(bio, 0);
+ bio_endio(bio);
else
- bch_generic_make_request(bio, &d->bio_split_hook);
+ generic_make_request(bio);
}
}
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 94980bf..679a093 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -59,29 +59,6 @@ struct workqueue_struct *bcache_wq;
#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
-static void bio_split_pool_free(struct bio_split_pool *p)
-{
- if (p->bio_split_hook)
- mempool_destroy(p->bio_split_hook);
-
- if (p->bio_split)
- bioset_free(p->bio_split);
-}
-
-static int bio_split_pool_init(struct bio_split_pool *p)
-{
- p->bio_split = bioset_create(4, 0);
- if (!p->bio_split)
- return -ENOMEM;
-
- p->bio_split_hook = mempool_create_kmalloc_pool(4,
- sizeof(struct bio_split_hook));
- if (!p->bio_split_hook)
- return -ENOMEM;
-
- return 0;
-}
-
/* Superblock */
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
@@ -221,7 +198,7 @@ err:
return err;
}
-static void write_bdev_super_endio(struct bio *bio, int error)
+static void write_bdev_super_endio(struct bio *bio)
{
struct cached_dev *dc = bio->bi_private;
/* XXX: error checking */
@@ -290,11 +267,11 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}
-static void write_super_endio(struct bio *bio, int error)
+static void write_super_endio(struct bio *bio)
{
struct cache *ca = bio->bi_private;
- bch_count_io_errors(ca, error, "writing superblock");
+ bch_count_io_errors(ca, bio->bi_error, "writing superblock");
closure_put(&ca->set->sb_write);
}
@@ -339,12 +316,12 @@ void bcache_write_super(struct cache_set *c)
/* UUID io */
-static void uuid_endio(struct bio *bio, int error)
+static void uuid_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
- cache_set_err_on(error, c, "accessing uuids");
+ cache_set_err_on(bio->bi_error, c, "accessing uuids");
bch_bbio_free(bio, c);
closure_put(cl);
}
@@ -512,11 +489,11 @@ static struct uuid_entry *uuid_find_empty(struct cache_set *c)
* disk.
*/
-static void prio_endio(struct bio *bio, int error)
+static void prio_endio(struct bio *bio)
{
struct cache *ca = bio->bi_private;
- cache_set_err_on(error, ca->set, "accessing priorities");
+ cache_set_err_on(bio->bi_error, ca->set, "accessing priorities");
bch_bbio_free(bio, ca->set);
closure_put(&ca->prio);
}
@@ -537,7 +514,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
bio->bi_private = ca;
bch_bio_map(bio, ca->disk_buckets);
- closure_bio_submit(bio, &ca->prio, ca);
+ closure_bio_submit(bio, &ca->prio);
closure_sync(cl);
}
@@ -757,7 +734,6 @@ static void bcache_device_free(struct bcache_device *d)
put_disk(d->disk);
}
- bio_split_pool_free(&d->bio_split_hook);
if (d->bio_split)
bioset_free(d->bio_split);
kvfree(d->full_dirty_stripes);
@@ -804,7 +780,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
return minor;
if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
- bio_split_pool_init(&d->bio_split_hook) ||
!(d->disk = alloc_disk(1))) {
ida_simple_remove(&bcache_minor, minor);
return -ENOMEM;
@@ -830,7 +805,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
q->limits.max_sectors = UINT_MAX;
q->limits.max_segment_size = UINT_MAX;
q->limits.max_segments = BIO_MAX_PAGES;
- q->limits.max_discard_sectors = UINT_MAX;
+ blk_queue_max_discard_sectors(q, UINT_MAX);
q->limits.discard_granularity = 512;
q->limits.io_min = block_size;
q->limits.logical_block_size = block_size;
@@ -1793,8 +1768,6 @@ void bch_cache_release(struct kobject *kobj)
ca->set->cache[ca->sb.nr_this_dev] = NULL;
}
- bio_split_pool_free(&ca->bio_split_hook);
-
free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
kfree(ca->prio_buckets);
vfree(ca->buckets);
@@ -1839,8 +1812,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
ca->sb.nbuckets)) ||
!(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
2, GFP_KERNEL)) ||
- !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
- bio_split_pool_init(&ca->bio_split_hook))
+ !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)))
return -ENOMEM;
ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 1d04c48..cf2cbc2 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -4,6 +4,7 @@
#include <linux/blkdev.h>
#include <linux/errno.h>
+#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/llist.h>
#include <linux/ratelimit.h>
@@ -570,10 +571,10 @@ static inline sector_t bdev_sectors(struct block_device *bdev)
return bdev->bd_inode->i_size >> 9;
}
-#define closure_bio_submit(bio, cl, dev) \
+#define closure_bio_submit(bio, cl) \
do { \
closure_get(cl); \
- bch_generic_make_request(bio, &(dev)->bio_split_hook); \
+ generic_make_request(bio); \
} while (0)
uint64_t bch_crc64_update(uint64_t, const void *, size_t);
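
With the driver-private splitting removed, closure_bio_submit() is just a closure_get() followed by generic_make_request(); oversized bios are left to the block layer to split. The reference taken here is expected to be dropped in the bio's completion handler, roughly as in this illustrative sketch (my_read_endio is not a name from the patch):

    static void my_read_endio(struct bio *bio)
    {
            struct closure *cl = bio->bi_private;

            closure_put(cl);        /* balances the closure_get() in closure_bio_submit() */
    }

    ...
    bio->bi_end_io  = my_read_endio;
    bio->bi_private = &cl;
    closure_bio_submit(bio, &cl);
    closure_sync(&cl);              /* wait until the endio has run */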
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index f1986bc..b23f88d 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -166,12 +166,12 @@ static void write_dirty_finish(struct closure *cl)
closure_return_with_destructor(cl, dirty_io_destructor);
}
-static void dirty_endio(struct bio *bio, int error)
+static void dirty_endio(struct bio *bio)
{
struct keybuf_key *w = bio->bi_private;
struct dirty_io *io = w->private;
- if (error)
+ if (bio->bi_error)
SET_KEY_DIRTY(&w->key, false);
closure_put(&io->cl);
@@ -188,27 +188,27 @@ static void write_dirty(struct closure *cl)
io->bio.bi_bdev = io->dc->bdev;
io->bio.bi_end_io = dirty_endio;
- closure_bio_submit(&io->bio, cl, &io->dc->disk);
+ closure_bio_submit(&io->bio, cl);
continue_at(cl, write_dirty_finish, system_wq);
}
-static void read_dirty_endio(struct bio *bio, int error)
+static void read_dirty_endio(struct bio *bio)
{
struct keybuf_key *w = bio->bi_private;
struct dirty_io *io = w->private;
bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
- error, "reading dirty data from cache");
+ bio->bi_error, "reading dirty data from cache");
- dirty_endio(bio, error);
+ dirty_endio(bio);
}
static void read_dirty_submit(struct closure *cl)
{
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
- closure_bio_submit(&io->bio, cl, &io->dc->disk);
+ closure_bio_submit(&io->bio, cl);
continue_at(cl, write_dirty, system_wq);
}
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index e51de52..4f22e91 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -613,12 +613,10 @@ re_read:
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
write_behind = le32_to_cpu(sb->write_behind);
sectors_reserved = le32_to_cpu(sb->sectors_reserved);
- /* XXX: This is a hack to ensure that we don't use clustering
- * in case:
- * - dm-raid is in use and
- * - the nodes written in bitmap_sb is erroneous.
+ /* Setup nodes/clustername only if bitmap version is
+ * cluster-compatible
*/
- if (!bitmap->mddev->sync_super) {
+ if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
nodes = le32_to_cpu(sb->nodes);
strlcpy(bitmap->mddev->bitmap_info.cluster_name,
sb->cluster_name, 64);
@@ -628,7 +626,7 @@ re_read:
if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
reason = "bad magic";
else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
- le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
+ le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED)
reason = "unrecognized superblock version";
else if (chunksize < 512)
reason = "bitmap chunksize too small";
@@ -1572,7 +1570,7 @@ void bitmap_close_sync(struct bitmap *bitmap)
}
EXPORT_SYMBOL(bitmap_close_sync);
-void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
{
sector_t s = 0;
sector_t blocks;
@@ -1583,7 +1581,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
bitmap->last_end_sync = jiffies;
return;
}
- if (time_before(jiffies, (bitmap->last_end_sync
+ if (!force && time_before(jiffies, (bitmap->last_end_sync
+ bitmap->mddev->bitmap_info.daemon_sleep)))
return;
wait_event(bitmap->mddev->recovery_wait,
@@ -1997,7 +1995,8 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
ret = bitmap_storage_alloc(&store, chunks,
!bitmap->mddev->bitmap_info.external,
- bitmap->cluster_slot);
+ mddev_is_clustered(bitmap->mddev)
+ ? bitmap->cluster_slot : 0);
if (ret)
goto err;
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index f1f4dd0..7d5c3a6 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -9,8 +9,10 @@
#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
* with version 3, it is host-endian which is non-portable
+ * Version 5 is currently set only for clustered devices
*/
#define BITMAP_MAJOR_HI 4
+#define BITMAP_MAJOR_CLUSTERED 5
#define BITMAP_MAJOR_HOSTENDIAN 3
/*
@@ -255,7 +257,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
void bitmap_close_sync(struct bitmap *bitmap);
-void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
void bitmap_unplug(struct bitmap *bitmap);
void bitmap_daemon_work(struct mddev *mddev);
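
bitmap_cond_end_sync() gains a force flag: with force == false the update stays rate-limited by daemon_sleep as before, while force == true flushes the sync-completion bookkeeping immediately. An illustrative caller (not copied from this patch) would look like:

    /* normal resync progress: keep the old rate-limited behaviour */
    bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);

    /* a point where the bitmap must be brought up to date right away */
    bitmap_cond_end_sync(mddev->bitmap, sector_nr, true);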
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index cd6d1d2..03af174 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -236,8 +236,10 @@ void dm_cell_error(struct dm_bio_prison *prison,
bio_list_init(&bios);
dm_cell_release(prison, cell, &bios);
- while ((bio = bio_list_pop(&bios)))
- bio_endio(bio, error);
+ while ((bio = bio_list_pop(&bios))) {
+ bio->bi_error = error;
+ bio_endio(bio);
+ }
}
EXPORT_SYMBOL_GPL(dm_cell_error);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 86dbbc7..2dd3308 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -545,7 +545,8 @@ static void dmio_complete(unsigned long error, void *context)
{
struct dm_buffer *b = context;
- b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
+ b->bio.bi_error = error ? -EIO : 0;
+ b->bio.bi_end_io(&b->bio);
}
static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
@@ -575,13 +576,16 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
b->bio.bi_end_io = end_io;
r = dm_io(&io_req, 1, &region, NULL);
- if (r)
- end_io(&b->bio, r);
+ if (r) {
+ b->bio.bi_error = r;
+ end_io(&b->bio);
+ }
}
-static void inline_endio(struct bio *bio, int error)
+static void inline_endio(struct bio *bio)
{
bio_end_io_t *end_fn = bio->bi_private;
+ int error = bio->bi_error;
/*
* Reset the bio to free any attached resources
@@ -589,7 +593,8 @@ static void inline_endio(struct bio *bio, int error)
*/
bio_reset(bio);
- end_fn(bio, error);
+ bio->bi_error = error;
+ end_fn(bio);
}
static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
@@ -661,13 +666,14 @@ static void submit_io(struct dm_buffer *b, int rw, sector_t block,
* Set the error, clear B_WRITING bit and wake anyone who was waiting on
* it.
*/
-static void write_endio(struct bio *bio, int error)
+static void write_endio(struct bio *bio)
{
struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
- b->write_error = error;
- if (unlikely(error)) {
+ b->write_error = bio->bi_error;
+ if (unlikely(bio->bi_error)) {
struct dm_bufio_client *c = b->c;
+ int error = bio->bi_error;
(void)cmpxchg(&c->async_write_error, 0, error);
}
@@ -1026,11 +1032,11 @@ found_buffer:
* The endio routine for reading: set the error, clear the bit and wake up
* anyone waiting on the buffer.
*/
-static void read_endio(struct bio *bio, int error)
+static void read_endio(struct bio *bio)
{
struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
- b->read_error = error;
+ b->read_error = bio->bi_error;
BUG_ON(!test_bit(B_READING, &b->state));
@@ -1592,11 +1598,11 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
c->bdev = bdev;
c->block_size = block_size;
- c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
- c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
- ffs(block_size) - 1 - PAGE_SHIFT : 0;
- c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
- PAGE_SHIFT - (ffs(block_size) - 1) : 0);
+ c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
+ c->pages_per_block_bits = (__ffs(block_size) >= PAGE_SHIFT) ?
+ __ffs(block_size) - PAGE_SHIFT : 0;
+ c->blocks_per_page_bits = (__ffs(block_size) < PAGE_SHIFT ?
+ PAGE_SHIFT - __ffs(block_size) : 0);
c->aux_size = aux_size;
c->alloc_callback = alloc_callback;
@@ -1855,12 +1861,8 @@ static void __exit dm_bufio_exit(void)
cancel_delayed_work_sync(&dm_bufio_work);
destroy_workqueue(dm_bufio_wq);
- for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
- struct kmem_cache *kc = dm_bufio_caches[i];
-
- if (kc)
- kmem_cache_destroy(kc);
- }
+ for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++)
+ kmem_cache_destroy(dm_bufio_caches[i]);
for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
kfree(dm_bufio_cache_names[i]);
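
The ffs(x) - 1 to __ffs(x) conversions here (and in the dm-cache policies and dm-exception-store below) are equivalent for the power-of-two sizes involved: __ffs() returns the zero-based position of the lowest set bit directly, while ffs() is one-based. For example, with a 4096-byte block size:

    ffs(4096) - 1 == 12
    __ffs(4096)   == 12
    /* sectors_per_block_bits = 12 - SECTOR_SHIFT = 12 - 9 = 3 */

Unlike ffs(), __ffs(0) is undefined, which is acceptable here because the block, chunk and hash-table sizes involved are non-zero powers of two.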
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 20cc36b..f6543f3 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -260,7 +260,9 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
}
}
- return dm_bm_unlock(b);
+ dm_bm_unlock(b);
+
+ return 0;
}
static void __setup_mapping_info(struct dm_cache_metadata *cmd)
@@ -465,7 +467,9 @@ static int __open_metadata(struct dm_cache_metadata *cmd)
dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
sb_flags = le32_to_cpu(disk_super->flags);
cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags);
- return dm_bm_unlock(sblock);
+ dm_bm_unlock(sblock);
+
+ return 0;
bad:
dm_bm_unlock(sblock);
@@ -634,10 +638,10 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
disk_super = dm_block_data(sblock);
+ disk_super->flags = cpu_to_le32(cmd->flags);
if (mutator)
update_flags(disk_super, mutator);
- disk_super->flags = cpu_to_le32(cmd->flags);
disk_super->mapping_root = cpu_to_le64(cmd->root);
disk_super->hint_root = cpu_to_le64(cmd->hint_root);
disk_super->discard_root = cpu_to_le64(cmd->discard_root);
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
index 240c9f0..14aaaf0 100644
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -83,7 +83,7 @@ static struct list_head *list_pop(struct list_head *q)
static int alloc_hash(struct hash *hash, unsigned elts)
{
hash->nr_buckets = next_power(elts >> 4, 16);
- hash->hash_bits = ffs(hash->nr_buckets) - 1;
+ hash->hash_bits = __ffs(hash->nr_buckets);
hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets);
return hash->table ? 0 : -ENOMEM;
@@ -436,7 +436,7 @@ static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
static struct dm_cache_policy_type wb_policy_type = {
.name = "cleaner",
.version = {1, 0, 0},
- .hint_size = 0,
+ .hint_size = 4,
.owner = THIS_MODULE,
.create = wb_create
};
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 3281437..ddb2698 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -1410,7 +1410,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
- mq->hash_bits = ffs(mq->nr_buckets) - 1;
+ mq->hash_bits = __ffs(mq->nr_buckets);
mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
if (!mq->table)
goto bad_alloc_table;
@@ -1471,5 +1471,3 @@ module_exit(mq_exit);
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("mq cache policy");
-
-MODULE_ALIAS("dm-cache-default");
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 48a4a82..28d4586 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -566,7 +566,7 @@ static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_ent
ht->es = es;
nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
- ht->hash_bits = ffs(nr_buckets) - 1;
+ ht->hash_bits = __ffs(nr_buckets);
ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
if (!ht->buckets)
@@ -772,7 +772,7 @@ struct smq_policy {
struct dm_cache_policy policy;
/* protects everything */
- struct mutex lock;
+ spinlock_t lock;
dm_cblock_t cache_size;
sector_t cache_block_size;
@@ -807,13 +807,7 @@ struct smq_policy {
/*
* Keeps track of time, incremented by the core. We use this to
* avoid attributing multiple hits within the same tick.
- *
- * Access to tick_protected should be done with the spin lock held.
- * It's copied to tick at the start of the map function (within the
- * mutex).
*/
- spinlock_t tick_lock;
- unsigned tick_protected;
unsigned tick;
/*
@@ -1296,46 +1290,20 @@ static void smq_destroy(struct dm_cache_policy *p)
kfree(mq);
}
-static void copy_tick(struct smq_policy *mq)
-{
- unsigned long flags, tick;
-
- spin_lock_irqsave(&mq->tick_lock, flags);
- tick = mq->tick_protected;
- if (tick != mq->tick) {
- update_sentinels(mq);
- end_hotspot_period(mq);
- end_cache_period(mq);
- mq->tick = tick;
- }
- spin_unlock_irqrestore(&mq->tick_lock, flags);
-}
-
-static bool maybe_lock(struct smq_policy *mq, bool can_block)
-{
- if (can_block) {
- mutex_lock(&mq->lock);
- return true;
- } else
- return mutex_trylock(&mq->lock);
-}
-
static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
bool can_block, bool can_migrate, bool fast_promote,
struct bio *bio, struct policy_locker *locker,
struct policy_result *result)
{
int r;
+ unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
result->op = POLICY_MISS;
- if (!maybe_lock(mq, can_block))
- return -EWOULDBLOCK;
-
- copy_tick(mq);
+ spin_lock_irqsave(&mq->lock, flags);
r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
- mutex_unlock(&mq->lock);
+ spin_unlock_irqrestore(&mq->lock, flags);
return r;
}
@@ -1343,20 +1311,18 @@ static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
{
int r;
+ unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
struct entry *e;
- if (!mutex_trylock(&mq->lock))
- return -EWOULDBLOCK;
-
+ spin_lock_irqsave(&mq->lock, flags);
e = h_lookup(&mq->table, oblock);
if (e) {
*cblock = infer_cblock(mq, e);
r = 0;
} else
r = -ENOENT;
-
- mutex_unlock(&mq->lock);
+ spin_unlock_irqrestore(&mq->lock, flags);
return r;
}
@@ -1375,20 +1341,22 @@ static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, boo
static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
{
+ unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
- mutex_lock(&mq->lock);
+ spin_lock_irqsave(&mq->lock, flags);
__smq_set_clear_dirty(mq, oblock, true);
- mutex_unlock(&mq->lock);
+ spin_unlock_irqrestore(&mq->lock, flags);
}
static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
{
struct smq_policy *mq = to_smq_policy(p);
+ unsigned long flags;
- mutex_lock(&mq->lock);
+ spin_lock_irqsave(&mq->lock, flags);
__smq_set_clear_dirty(mq, oblock, false);
- mutex_unlock(&mq->lock);
+ spin_unlock_irqrestore(&mq->lock, flags);
}
static int smq_load_mapping(struct dm_cache_policy *p,
@@ -1433,14 +1401,14 @@ static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
struct smq_policy *mq = to_smq_policy(p);
int r = 0;
- mutex_lock(&mq->lock);
-
+ /*
+ * We don't need to lock here since this method is only called once
+ * the IO has stopped.
+ */
r = smq_save_hints(mq, &mq->clean, fn, context);
if (!r)
r = smq_save_hints(mq, &mq->dirty, fn, context);
- mutex_unlock(&mq->lock);
-
return r;
}
@@ -1458,10 +1426,11 @@ static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
{
struct smq_policy *mq = to_smq_policy(p);
+ unsigned long flags;
- mutex_lock(&mq->lock);
+ spin_lock_irqsave(&mq->lock, flags);
__remove_mapping(mq, oblock);
- mutex_unlock(&mq->lock);
+ spin_unlock_irqrestore(&mq->lock, flags);
}
static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
@@ -1480,11 +1449,12 @@ static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
{
int r;
+ unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
- mutex_lock(&mq->lock);
+ spin_lock_irqsave(&mq->lock, flags);
r = __remove_cblock(mq, cblock);
- mutex_unlock(&mq->lock);
+ spin_unlock_irqrestore(&mq->lock, flags);
return r;
}
@@ -1537,11 +1507,12 @@ static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
dm_cblock_t *cblock, bool critical_only)
{
int r;
+ unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
- mutex_lock(&mq->lock);
+ spin_lock_irqsave(&mq->lock, flags);
r = __smq_writeback_work(mq, oblock, cblock, critical_only);
- mutex_unlock(&mq->lock);
+ spin_unlock_irqrestore(&mq->lock, flags);
return r;
}
@@ -1562,21 +1533,23 @@ static void __force_mapping(struct smq_policy *mq,
static void smq_force_mapping(struct dm_cache_policy *p,
dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
+ unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
- mutex_lock(&mq->lock);
+ spin_lock_irqsave(&mq->lock, flags);
__force_mapping(mq, current_oblock, new_oblock);
- mutex_unlock(&mq->lock);
+ spin_unlock_irqrestore(&mq->lock, flags);
}
static dm_cblock_t smq_residency(struct dm_cache_policy *p)
{
dm_cblock_t r;
+ unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
- mutex_lock(&mq->lock);
+ spin_lock_irqsave(&mq->lock, flags);
r = to_cblock(mq->cache_alloc.nr_allocated);
- mutex_unlock(&mq->lock);
+ spin_unlock_irqrestore(&mq->lock, flags);
return r;
}
@@ -1586,15 +1559,12 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block)
struct smq_policy *mq = to_smq_policy(p);
unsigned long flags;
- spin_lock_irqsave(&mq->tick_lock, flags);
- mq->tick_protected++;
- spin_unlock_irqrestore(&mq->tick_lock, flags);
-
- if (can_block) {
- mutex_lock(&mq->lock);
- copy_tick(mq);
- mutex_unlock(&mq->lock);
- }
+ spin_lock_irqsave(&mq->lock, flags);
+ mq->tick++;
+ update_sentinels(mq);
+ end_hotspot_period(mq);
+ end_cache_period(mq);
+ spin_unlock_irqrestore(&mq->lock, flags);
}
/* Init the policy plugin interface function pointers. */
@@ -1694,10 +1664,8 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
} else
mq->cache_hit_bits = NULL;
- mq->tick_protected = 0;
mq->tick = 0;
- mutex_init(&mq->lock);
- spin_lock_init(&mq->tick_lock);
+ spin_lock_init(&mq->lock);
q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
mq->hotspot.nr_top_levels = 8;
@@ -1789,3 +1757,5 @@ module_exit(smq_exit);
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("smq cache policy");
+
+MODULE_ALIAS("dm-cache-default");
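
The smq policy drops its mutex and the separate tick_lock in favour of a single irq-safe spinlock, so the map path no longer needs the can_block/-EWOULDBLOCK handling. All converted methods follow the same shape; an illustrative skeleton (smq_example_op and __smq_example_op are made-up names):

    static int smq_example_op(struct dm_cache_policy *p, dm_oblock_t oblock)
    {
            struct smq_policy *mq = to_smq_policy(p);
            unsigned long flags;
            int r;

            spin_lock_irqsave(&mq->lock, flags);
            r = __smq_example_op(mq, oblock);       /* must not sleep while the lock is held */
            spin_unlock_irqrestore(&mq->lock, flags);

            return r;
    }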
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1fe93cf..2fd4c82 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -424,7 +424,6 @@ static void free_migration(struct dm_cache_migration *mg)
wake_up(&cache->migration_wait);
mempool_free(mg, cache->migration_pool);
- wake_worker(cache);
}
static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
@@ -919,14 +918,14 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
wake_worker(cache);
}
-static void writethrough_endio(struct bio *bio, int err)
+static void writethrough_endio(struct bio *bio)
{
struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
dm_unhook_bio(&pb->hook_info, bio);
- if (err) {
- bio_endio(bio, err);
+ if (bio->bi_error) {
+ bio_endio(bio);
return;
}
@@ -1064,14 +1063,6 @@ static void dec_io_migrations(struct cache *cache)
atomic_dec(&cache->nr_io_migrations);
}
-static void __cell_release(struct cache *cache, struct dm_bio_prison_cell *cell,
- bool holder, struct bio_list *bios)
-{
- (holder ? dm_cell_release : dm_cell_release_no_holder)
- (cache->prison, cell, bios);
- free_prison_cell(cache, cell);
-}
-
static bool discard_or_flush(struct bio *bio)
{
return bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD);
@@ -1079,14 +1070,13 @@ static bool discard_or_flush(struct bio *bio)
static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
{
- if (discard_or_flush(cell->holder))
+ if (discard_or_flush(cell->holder)) {
/*
- * We have to handle these bios
- * individually.
+ * We have to handle these bios individually.
*/
- __cell_release(cache, cell, true, &cache->deferred_bios);
-
- else
+ dm_cell_release(cache->prison, cell, &cache->deferred_bios);
+ free_prison_cell(cache, cell);
+ } else
list_add_tail(&cell->user_list, &cache->deferred_cells);
}
@@ -1113,7 +1103,7 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, boo
static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
{
dm_cell_error(cache->prison, cell, err);
- dm_bio_prison_free_cell(cache->prison, cell);
+ free_prison_cell(cache, cell);
}
static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
@@ -1123,8 +1113,11 @@ static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
static void free_io_migration(struct dm_cache_migration *mg)
{
- dec_io_migrations(mg->cache);
+ struct cache *cache = mg->cache;
+
+ dec_io_migrations(cache);
free_migration(mg);
+ wake_worker(cache);
}
static void migration_failure(struct dm_cache_migration *mg)
@@ -1231,7 +1224,7 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
* The block was promoted via an overwrite, so it's dirty.
*/
set_dirty(cache, mg->new_oblock, mg->cblock);
- bio_endio(mg->new_ocell->holder, 0);
+ bio_endio(mg->new_ocell->holder);
cell_defer(cache, mg->new_ocell, false);
}
free_io_migration(mg);
@@ -1284,7 +1277,7 @@ static void issue_copy(struct dm_cache_migration *mg)
}
}
-static void overwrite_endio(struct bio *bio, int err)
+static void overwrite_endio(struct bio *bio)
{
struct dm_cache_migration *mg = bio->bi_private;
struct cache *cache = mg->cache;
@@ -1294,7 +1287,7 @@ static void overwrite_endio(struct bio *bio, int err)
dm_unhook_bio(&pb->hook_info, bio);
- if (err)
+ if (bio->bi_error)
mg->err = true;
mg->requeue_holder = false;
@@ -1351,16 +1344,18 @@ static void issue_discard(struct dm_cache_migration *mg)
{
dm_dblock_t b, e;
struct bio *bio = mg->new_ocell->holder;
+ struct cache *cache = mg->cache;
- calc_discard_block_range(mg->cache, bio, &b, &e);
+ calc_discard_block_range(cache, bio, &b, &e);
while (b != e) {
- set_discard(mg->cache, b);
+ set_discard(cache, b);
b = to_dblock(from_dblock(b) + 1);
}
- bio_endio(bio, 0);
- cell_defer(mg->cache, mg->new_ocell, false);
+ bio_endio(bio);
+ cell_defer(cache, mg->new_ocell, false);
free_migration(mg);
+ wake_worker(cache);
}
static void issue_copy_or_discard(struct dm_cache_migration *mg)
@@ -1631,7 +1626,7 @@ static void process_discard_bio(struct cache *cache, struct prealloc *structs,
calc_discard_block_range(cache, bio, &b, &e);
if (b == e) {
- bio_endio(bio, 0);
+ bio_endio(bio);
return;
}
@@ -1729,6 +1724,8 @@ static void remap_cell_to_origin_clear_discard(struct cache *cache,
remap_to_origin(cache, bio);
issue(cache, bio);
}
+
+ free_prison_cell(cache, cell);
}
static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
@@ -1763,6 +1760,8 @@ static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_
remap_to_cache(cache, bio, cblock);
issue(cache, bio);
}
+
+ free_prison_cell(cache, cell);
}
/*----------------------------------------------------------------*/
@@ -2217,8 +2216,10 @@ static void requeue_deferred_bios(struct cache *cache)
bio_list_merge(&bios, &cache->deferred_bios);
bio_list_init(&cache->deferred_bios);
- while ((bio = bio_list_pop(&bios)))
- bio_endio(bio, DM_ENDIO_REQUEUE);
+ while ((bio = bio_list_pop(&bios))) {
+ bio->bi_error = DM_ENDIO_REQUEUE;
+ bio_endio(bio);
+ }
}
static int more_work(struct cache *cache)
@@ -2308,8 +2309,7 @@ static void destroy(struct cache *cache)
{
unsigned i;
- if (cache->migration_pool)
- mempool_destroy(cache->migration_pool);
+ mempool_destroy(cache->migration_pool);
if (cache->all_io_ds)
dm_deferred_set_destroy(cache->all_io_ds);
@@ -3123,7 +3123,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
* This is a duplicate writethrough io that is no
* longer needed because the block has been demoted.
*/
- bio_endio(bio, 0);
+ bio_endio(bio);
// FIXME: remap everything as a miss
cell_defer(cache, cell, false);
r = DM_MAPIO_SUBMITTED;
@@ -3778,26 +3778,6 @@ static int cache_iterate_devices(struct dm_target *ti,
return r;
}
-/*
- * We assume I/O is going to the origin (which is the volume
- * more likely to have restrictions e.g. by being striped).
- * (Looking up the exact location of the data would be expensive
- * and could always be out of date by the time the bio is submitted.)
- */
-static int cache_bvec_merge(struct dm_target *ti,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec, int max_size)
-{
- struct cache *cache = ti->private;
- struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
-
- if (!q->merge_bvec_fn)
- return max_size;
-
- bvm->bi_bdev = cache->origin_dev->bdev;
- return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
{
/*
@@ -3841,7 +3821,6 @@ static struct target_type cache_target = {
.status = cache_status,
.message = cache_message,
.iterate_devices = cache_iterate_devices,
- .merge = cache_bvec_merge,
.io_hints = cache_io_hints,
};
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0f48fed..3729b39 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -968,7 +968,8 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone);
/*
* Generate a new unfragmented bio with the given size
- * This should never violate the device limitations
+ * This should never violate the device limitations (but only because
+ * max_segment_size is being constrained to PAGE_SIZE).
*
* This function may be called concurrently. If we allocate from the mempool
* concurrently, there is a possibility of deadlock. For example, if we have
@@ -1076,7 +1077,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
if (io->ctx.req)
crypt_free_req(cc, io->ctx.req, base_bio);
- bio_endio(base_bio, error);
+ base_bio->bi_error = error;
+ bio_endio(base_bio);
}
/*
@@ -1096,14 +1098,12 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
* The work is done per CPU global for all dm-crypt instances.
* They should not depend on each other and do not block.
*/
-static void crypt_endio(struct bio *clone, int error)
+static void crypt_endio(struct bio *clone)
{
struct dm_crypt_io *io = clone->bi_private;
struct crypt_config *cc = io->cc;
unsigned rw = bio_data_dir(clone);
-
- if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error))
- error = -EIO;
+ int error;
/*
* free the processed pages
@@ -1111,6 +1111,7 @@ static void crypt_endio(struct bio *clone, int error)
if (rw == WRITE)
crypt_free_buffer_pages(cc, clone);
+ error = clone->bi_error;
bio_put(clone);
if (rw == READ && !error) {
@@ -1543,10 +1544,8 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->bs)
bioset_free(cc->bs);
- if (cc->page_pool)
- mempool_destroy(cc->page_pool);
- if (cc->req_pool)
- mempool_destroy(cc->req_pool);
+ mempool_destroy(cc->page_pool);
+ mempool_destroy(cc->req_pool);
if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
cc->iv_gen_ops->dtr(cc);
@@ -1811,11 +1810,13 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
cc->iv_offset = tmpll;
- if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev)) {
+ ret = dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev);
+ if (ret) {
ti->error = "Device lookup failed";
goto bad;
}
+ ret = -EINVAL;
if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid device sector";
goto bad;
@@ -2035,21 +2036,6 @@ error:
return -EINVAL;
}
-static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
- struct bio_vec *biovec, int max_size)
-{
- struct crypt_config *cc = ti->private;
- struct request_queue *q = bdev_get_queue(cc->dev->bdev);
-
- if (!q->merge_bvec_fn)
- return max_size;
-
- bvm->bi_bdev = cc->dev->bdev;
- bvm->bi_sector = cc->start + dm_target_offset(ti, bvm->bi_sector);
-
- return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
static int crypt_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
@@ -2058,9 +2044,20 @@ static int crypt_iterate_devices(struct dm_target *ti,
return fn(ti, cc->dev, cc->start, ti->len, data);
}
+static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ /*
+ * Unfortunate constraint that is required to avoid the potential
+ * for exceeding underlying device's max_segments limits -- due to
+ * crypt_alloc_buffer() possibly allocating pages for the encryption
+ * bio that are not as physically contiguous as the original bio.
+ */
+ limits->max_segment_size = PAGE_SIZE;
+}
+
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 14, 0},
+ .version = {1, 14, 1},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
@@ -2070,8 +2067,8 @@ static struct target_type crypt_target = {
.preresume = crypt_preresume,
.resume = crypt_resume,
.message = crypt_message,
- .merge = crypt_merge,
.iterate_devices = crypt_iterate_devices,
+ .io_hints = crypt_io_hints,
};
static int __init dm_crypt_init(void)
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 57b6a19..b4c356a 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -122,6 +122,7 @@ static void flush_expired_bios(struct work_struct *work)
* <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
*
* With separate write parameters, the first set is only used for reads.
+ * Offsets are specified in sectors.
* Delays are specified in milliseconds.
*/
static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
@@ -129,9 +130,10 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
struct delay_c *dc;
unsigned long long tmpll;
char dummy;
+ int ret;
if (argc != 3 && argc != 6) {
- ti->error = "requires exactly 3 or 6 arguments";
+ ti->error = "Requires exactly 3 or 6 arguments";
return -EINVAL;
}
@@ -143,6 +145,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
dc->reads = dc->writes = 0;
+ ret = -EINVAL;
if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid device sector";
goto bad;
@@ -154,12 +157,14 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
- &dc->dev_read)) {
+ ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
+ &dc->dev_read);
+ if (ret) {
ti->error = "Device lookup failed";
goto bad;
}
+ ret = -EINVAL;
dc->dev_write = NULL;
if (argc == 3)
goto out;
@@ -175,13 +180,15 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_dev_read;
}
- if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table),
- &dc->dev_write)) {
+ ret = dm_get_device(ti, argv[3], dm_table_get_mode(ti->table),
+ &dc->dev_write);
+ if (ret) {
ti->error = "Write device lookup failed";
goto bad_dev_read;
}
out:
+ ret = -EINVAL;
dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
if (!dc->kdelayd_wq) {
DMERR("Couldn't start kdelayd");
@@ -208,7 +215,7 @@ bad_dev_read:
dm_put_device(ti, dc->dev_read);
bad:
kfree(dc);
- return -EINVAL;
+ return ret;
}
static void delay_dtr(struct dm_target *ti)
@@ -231,7 +238,7 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
unsigned long expires = 0;
if (!delay || !atomic_read(&dc->may_delay))
- return 1;
+ return DM_MAPIO_REMAPPED;
delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
@@ -251,7 +258,7 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
queue_timeout(dc, expires);
- return 0;
+ return DM_MAPIO_SUBMITTED;
}
static void delay_presuspend(struct dm_target *ti)
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index ad913cd..665bf32 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -343,7 +343,9 @@ static int superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
}
}
- return dm_bm_unlock(b);
+ dm_bm_unlock(b);
+
+ return 0;
}
/*----------------------------------------------------------------*/
@@ -582,7 +584,9 @@ static int open_metadata(struct era_metadata *md)
md->metadata_snap = le64_to_cpu(disk->metadata_snap);
md->archived_writesets = true;
- return dm_bm_unlock(sblock);
+ dm_bm_unlock(sblock);
+
+ return 0;
bad:
dm_bm_unlock(sblock);
@@ -1046,12 +1050,7 @@ static int metadata_take_snap(struct era_metadata *md)
md->metadata_snap = dm_block_location(clone);
- r = dm_tm_unlock(md->tm, clone);
- if (r) {
- DMERR("%s: couldn't unlock clone", __func__);
- md->metadata_snap = SUPERBLOCK_LOCATION;
- return r;
- }
+ dm_tm_unlock(md->tm, clone);
return 0;
}
@@ -1673,20 +1672,6 @@ static int era_iterate_devices(struct dm_target *ti,
return fn(ti, era->origin_dev, 0, get_dev_size(era->origin_dev), data);
}
-static int era_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
- struct bio_vec *biovec, int max_size)
-{
- struct era *era = ti->private;
- struct request_queue *q = bdev_get_queue(era->origin_dev->bdev);
-
- if (!q->merge_bvec_fn)
- return max_size;
-
- bvm->bi_bdev = era->origin_dev->bdev;
-
- return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
static void era_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct era *era = ti->private;
@@ -1717,7 +1702,6 @@ static struct target_type era_target = {
.status = era_status,
.message = era_message,
.iterate_devices = era_iterate_devices,
- .merge = era_merge,
.io_hints = era_io_hints
};
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index ebaa4f8..3997f34 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -183,7 +183,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
store->chunk_size = chunk_size;
store->chunk_mask = chunk_size - 1;
- store->chunk_shift = ffs(chunk_size) - 1;
+ store->chunk_shift = __ffs(chunk_size);
return 0;
}
@@ -203,7 +203,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
return -EINVAL;
}
- tmp_store = kmalloc(sizeof(*tmp_store), GFP_KERNEL);
+ tmp_store = kzalloc(sizeof(*tmp_store), GFP_KERNEL);
if (!tmp_store) {
ti->error = "Exception store allocation failed";
return -ENOMEM;
@@ -215,7 +215,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
else if (persistent == 'N')
type = get_type("N");
else {
- ti->error = "Persistent flag is not P or N";
+ ti->error = "Exception store type is not P or N";
r = -EINVAL;
goto bad_type;
}
@@ -233,7 +233,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
if (r)
goto bad;
- r = type->ctr(tmp_store, 0, NULL);
+ r = type->ctr(tmp_store, (strlen(argv[0]) > 1 ? &argv[0][1] : NULL));
if (r) {
ti->error = "Exception store type constructor failed";
goto bad;
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 0b25362..fae34e7 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -42,8 +42,7 @@ struct dm_exception_store_type {
const char *name;
struct module *module;
- int (*ctr) (struct dm_exception_store *store,
- unsigned argc, char **argv);
+ int (*ctr) (struct dm_exception_store *store, char *options);
/*
* Destroys this object when you've finished with it.
@@ -123,6 +122,8 @@ struct dm_exception_store {
unsigned chunk_shift;
void *context;
+
+ bool userspace_supports_overflow;
};
/*
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index b257e46..09e2afc 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -183,6 +183,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
devname = dm_shift_arg(&as);
+ r = -EINVAL;
if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid device sector";
goto bad;
@@ -211,7 +212,8 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto bad;
- if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) {
+ r = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev);
+ if (r) {
ti->error = "Device lookup failed";
goto bad;
}
@@ -224,7 +226,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
bad:
kfree(fc);
- return -EINVAL;
+ return r;
}
static void flakey_dtr(struct dm_target *ti)
@@ -296,7 +298,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
* Drop writes?
*/
if (test_bit(DROP_WRITES, &fc->flags)) {
- bio_endio(bio, 0);
+ bio_endio(bio);
return DM_MAPIO_SUBMITTED;
}
@@ -371,35 +373,20 @@ static void flakey_status(struct dm_target *ti, status_type_t type,
}
}
-static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
+static int flakey_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode)
{
struct flakey_c *fc = ti->private;
- struct dm_dev *dev = fc->dev;
- int r = 0;
+
+ *bdev = fc->dev->bdev;
/*
* Only pass ioctls through if the device sizes match exactly.
*/
if (fc->start ||
- ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
- r = scsi_verify_blk_ioctl(NULL, cmd);
-
- return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
-}
-
-static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
- struct bio_vec *biovec, int max_size)
-{
- struct flakey_c *fc = ti->private;
- struct request_queue *q = bdev_get_queue(fc->dev->bdev);
-
- if (!q->merge_bvec_fn)
- return max_size;
-
- bvm->bi_bdev = fc->dev->bdev;
- bvm->bi_sector = flakey_map_sector(ti, bvm->bi_sector);
-
- return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+ ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
+ return 1;
+ return 0;
}
static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
@@ -418,8 +405,7 @@ static struct target_type flakey_target = {
.map = flakey_map,
.end_io = flakey_end_io,
.status = flakey_status,
- .ioctl = flakey_ioctl,
- .merge = flakey_merge,
+ .prepare_ioctl = flakey_prepare_ioctl,
.iterate_devices = flakey_iterate_devices,
};
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 74adcd2..81c5e1a 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -65,8 +65,7 @@ struct dm_io_client *dm_io_client_create(void)
return client;
bad:
- if (client->pool)
- mempool_destroy(client->pool);
+ mempool_destroy(client->pool);
kfree(client);
return ERR_PTR(-ENOMEM);
}
@@ -134,12 +133,13 @@ static void dec_count(struct io *io, unsigned int region, int error)
complete_io(io);
}
-static void endio(struct bio *bio, int error)
+static void endio(struct bio *bio)
{
struct io *io;
unsigned region;
+ int error;
- if (error && bio_data_dir(bio) == READ)
+ if (bio->bi_error && bio_data_dir(bio) == READ)
zero_fill_bio(bio);
/*
@@ -147,6 +147,7 @@ static void endio(struct bio *bio, int error)
*/
retrieve_io_and_region_from_bio(bio, &io, &region);
+ error = bio->bi_error;
bio_put(bio);
dec_count(io, region, error);
@@ -314,7 +315,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
if ((rw & REQ_DISCARD) || (rw & REQ_WRITE_SAME))
num_bvecs = 1;
else
- num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev),
+ num_bvecs = min_t(int, BIO_MAX_PAGES,
dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 720ceeb..80a4395 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1919,9 +1919,7 @@ int __init dm_interface_init(void)
void dm_interface_exit(void)
{
- if (misc_deregister(&_dm_misc) < 0)
- DMERR("misc_deregister failed for control device");
-
+ misc_deregister(&_dm_misc);
dm_hash_exit();
}
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 53e848c..05c35aa 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -30,6 +30,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
struct linear_c *lc;
unsigned long long tmp;
char dummy;
+ int ret;
if (argc != 2) {
ti->error = "Invalid argument count";
@@ -38,18 +39,20 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
lc = kmalloc(sizeof(*lc), GFP_KERNEL);
if (lc == NULL) {
- ti->error = "dm-linear: Cannot allocate linear context";
+ ti->error = "Cannot allocate linear context";
return -ENOMEM;
}
+ ret = -EINVAL;
if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) {
- ti->error = "dm-linear: Invalid device sector";
+ ti->error = "Invalid device sector";
goto bad;
}
lc->start = tmp;
- if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev)) {
- ti->error = "dm-linear: Device lookup failed";
+ ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev);
+ if (ret) {
+ ti->error = "Device lookup failed";
goto bad;
}
@@ -61,7 +64,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
bad:
kfree(lc);
- return -EINVAL;
+ return ret;
}
static void linear_dtr(struct dm_target *ti)
@@ -113,36 +116,21 @@ static void linear_status(struct dm_target *ti, status_type_t type,
}
}
-static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
- unsigned long arg)
+static int linear_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode)
{
struct linear_c *lc = (struct linear_c *) ti->private;
struct dm_dev *dev = lc->dev;
- int r = 0;
+
+ *bdev = dev->bdev;
/*
* Only pass ioctls through if the device sizes match exactly.
*/
if (lc->start ||
ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
- r = scsi_verify_blk_ioctl(NULL, cmd);
-
- return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
-}
-
-static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
- struct bio_vec *biovec, int max_size)
-{
- struct linear_c *lc = ti->private;
- struct request_queue *q = bdev_get_queue(lc->dev->bdev);
-
- if (!q->merge_bvec_fn)
- return max_size;
-
- bvm->bi_bdev = lc->dev->bdev;
- bvm->bi_sector = linear_map_sector(ti, bvm->bi_sector);
-
- return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+ return 1;
+ return 0;
}
static int linear_iterate_devices(struct dm_target *ti,
@@ -161,8 +149,7 @@ static struct target_type linear_target = {
.dtr = linear_dtr,
.map = linear_map,
.status = linear_status,
- .ioctl = linear_ioctl,
- .merge = linear_merge,
+ .prepare_ioctl = linear_prepare_ioctl,
.iterate_devices = linear_iterate_devices,
};
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 058256d..53b7b06d 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -313,8 +313,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
out:
kfree(devices_rdata);
if (r) {
- if (lc->flush_entry_pool)
- mempool_destroy(lc->flush_entry_pool);
+ mempool_destroy(lc->flush_entry_pool);
kfree(lc);
kfree(ctr_str);
} else {
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index ad1b049..624589d 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -146,16 +146,16 @@ static void put_io_block(struct log_writes_c *lc)
}
}
-static void log_end_io(struct bio *bio, int err)
+static void log_end_io(struct bio *bio)
{
struct log_writes_c *lc = bio->bi_private;
struct bio_vec *bvec;
int i;
- if (err) {
+ if (bio->bi_error) {
unsigned long flags;
- DMERR("Error writing log block, error=%d", err);
+ DMERR("Error writing log block, error=%d", bio->bi_error);
spin_lock_irqsave(&lc->blocks_lock, flags);
lc->logging_enabled = false;
spin_unlock_irqrestore(&lc->blocks_lock, flags);
@@ -205,7 +205,6 @@ static int write_metadata(struct log_writes_c *lc, void *entry,
bio->bi_bdev = lc->logdev->bdev;
bio->bi_end_io = log_end_io;
bio->bi_private = lc;
- set_bit(BIO_UPTODATE, &bio->bi_flags);
page = alloc_page(GFP_KERNEL);
if (!page) {
@@ -270,7 +269,6 @@ static int log_one_block(struct log_writes_c *lc,
bio->bi_bdev = lc->logdev->bdev;
bio->bi_end_io = log_end_io;
bio->bi_private = lc;
- set_bit(BIO_UPTODATE, &bio->bi_flags);
for (i = 0; i < block->vec_cnt; i++) {
/*
@@ -292,7 +290,6 @@ static int log_one_block(struct log_writes_c *lc,
bio->bi_bdev = lc->logdev->bdev;
bio->bi_end_io = log_end_io;
bio->bi_private = lc;
- set_bit(BIO_UPTODATE, &bio->bi_flags);
ret = bio_add_page(bio, block->vecs[i].bv_page,
block->vecs[i].bv_len, 0);
@@ -420,6 +417,7 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
struct log_writes_c *lc;
struct dm_arg_set as;
const char *devname, *logdevname;
+ int ret;
as.argc = argc;
as.argv = argv;
@@ -443,18 +441,22 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
atomic_set(&lc->pending_blocks, 0);
devname = dm_shift_arg(&as);
- if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev)) {
+ ret = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev);
+ if (ret) {
ti->error = "Device lookup failed";
goto bad;
}
logdevname = dm_shift_arg(&as);
- if (dm_get_device(ti, logdevname, dm_table_get_mode(ti->table), &lc->logdev)) {
+ ret = dm_get_device(ti, logdevname, dm_table_get_mode(ti->table),
+ &lc->logdev);
+ if (ret) {
ti->error = "Log device lookup failed";
dm_put_device(ti, lc->dev);
goto bad;
}
+ ret = -EINVAL;
lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
if (!lc->log_kthread) {
ti->error = "Couldn't alloc kthread";
@@ -479,7 +481,7 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
bad:
kfree(lc);
- return -EINVAL;
+ return ret;
}
static int log_mark(struct log_writes_c *lc, char *data)
@@ -606,7 +608,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio)
WARN_ON(flush_bio || fua_bio);
if (lc->device_supports_discard)
goto map_bio;
- bio_endio(bio, 0);
+ bio_endio(bio);
return DM_MAPIO_SUBMITTED;
}
@@ -712,35 +714,19 @@ static void log_writes_status(struct dm_target *ti, status_type_t type,
}
}
-static int log_writes_ioctl(struct dm_target *ti, unsigned int cmd,
- unsigned long arg)
+static int log_writes_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode)
{
struct log_writes_c *lc = ti->private;
struct dm_dev *dev = lc->dev;
- int r = 0;
+ *bdev = dev->bdev;
/*
* Only pass ioctls through if the device sizes match exactly.
*/
if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
- r = scsi_verify_blk_ioctl(NULL, cmd);
-
- return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
-}
-
-static int log_writes_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
- struct bio_vec *biovec, int max_size)
-{
- struct log_writes_c *lc = ti->private;
- struct request_queue *q = bdev_get_queue(lc->dev->bdev);
-
- if (!q->merge_bvec_fn)
- return max_size;
-
- bvm->bi_bdev = lc->dev->bdev;
- bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
-
- return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+ return 1;
+ return 0;
}
static int log_writes_iterate_devices(struct dm_target *ti,
@@ -795,8 +781,7 @@ static struct target_type log_writes_target = {
.map = log_writes_map,
.end_io = normal_end_io,
.status = log_writes_status,
- .ioctl = log_writes_ioctl,
- .merge = log_writes_merge,
+ .prepare_ioctl = log_writes_prepare_ioctl,
.message = log_writes_message,
.iterate_devices = log_writes_iterate_devices,
.io_hints = log_writes_io_hints,
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index eff7bdd..aaa6caa 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -159,12 +159,9 @@ static struct priority_group *alloc_priority_group(void)
static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
{
struct pgpath *pgpath, *tmp;
- struct multipath *m = ti->private;
list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
list_del(&pgpath->list);
- if (m->hw_handler_name)
- scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
dm_put_device(ti, pgpath->path.dev);
free_pgpath(pgpath);
}
@@ -580,6 +577,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
q = bdev_get_queue(p->path.dev->bdev);
if (m->retain_attached_hw_handler) {
+retain:
attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
if (attached_handler_name) {
/*
@@ -599,20 +597,14 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
}
if (m->hw_handler_name) {
- /*
- * Increments scsi_dh reference, even when using an
- * already-attached handler.
- */
r = scsi_dh_attach(q, m->hw_handler_name);
if (r == -EBUSY) {
- /*
- * Already attached to different hw_handler:
- * try to reattach with correct one.
- */
- scsi_dh_detach(q);
- r = scsi_dh_attach(q, m->hw_handler_name);
- }
+ char b[BDEVNAME_SIZE];
+ printk(KERN_INFO "dm-mpath: retaining handler on device %s\n",
+ bdevname(p->path.dev->bdev, b));
+ goto retain;
+ }
if (r < 0) {
ti->error = "error attaching hardware handler";
dm_put_device(ti, p->path.dev);
@@ -624,7 +616,6 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
if (r < 0) {
ti->error = "unable to set hardware "
"handler parameters";
- scsi_dh_detach(q);
dm_put_device(ti, p->path.dev);
goto bad;
}
@@ -734,12 +725,6 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
return 0;
m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
- if (!try_then_request_module(scsi_dh_handler_exist(m->hw_handler_name),
- "scsi_dh_%s", m->hw_handler_name)) {
- ti->error = "unknown hardware handler type";
- ret = -EINVAL;
- goto fail;
- }
if (hw_argc > 1) {
char *p;
@@ -1548,18 +1533,14 @@ out:
return r;
}
-static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
- unsigned long arg)
+static int multipath_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode)
{
struct multipath *m = ti->private;
struct pgpath *pgpath;
- struct block_device *bdev;
- fmode_t mode;
unsigned long flags;
int r;
- bdev = NULL;
- mode = 0;
r = 0;
spin_lock_irqsave(&m->lock, flags);
@@ -1570,26 +1551,17 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
pgpath = m->current_pgpath;
if (pgpath) {
- bdev = pgpath->path.dev->bdev;
- mode = pgpath->path.dev->mode;
+ *bdev = pgpath->path.dev->bdev;
+ *mode = pgpath->path.dev->mode;
}
if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
r = -ENOTCONN;
- else if (!bdev)
+ else if (!*bdev)
r = -EIO;
spin_unlock_irqrestore(&m->lock, flags);
- /*
- * Only pass ioctls through if the device sizes match exactly.
- */
- if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) {
- int err = scsi_verify_blk_ioctl(NULL, cmd);
- if (err)
- r = err;
- }
-
if (r == -ENOTCONN && !fatal_signal_pending(current)) {
spin_lock_irqsave(&m->lock, flags);
if (!m->current_pg) {
@@ -1602,7 +1574,12 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
dm_table_run_md_queue_async(m->ti->table);
}
- return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
+ return 1;
+ return r;
}
static int multipath_iterate_devices(struct dm_target *ti,
@@ -1705,7 +1682,7 @@ out:
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 9, 0},
+ .version = {1, 10, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
@@ -1718,7 +1695,7 @@ static struct target_type multipath_target = {
.resume = multipath_resume,
.status = multipath_status,
.message = multipath_message,
- .ioctl = multipath_ioctl,
+ .prepare_ioctl = multipath_prepare_ioctl,
.iterate_devices = multipath_iterate_devices,
.busy = multipath_busy,
};
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 2daa677..a090121 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -329,8 +329,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
*/
if (min_region_size > (1 << 13)) {
/* If not a power of 2, make it the next power of 2 */
- if (min_region_size & (min_region_size - 1))
- region_size = 1 << fls(region_size);
+ region_size = roundup_pow_of_two(min_region_size);
DMINFO("Choosing default region size of %lu sectors",
region_size);
} else {
@@ -1717,24 +1716,6 @@ static void raid_resume(struct dm_target *ti)
mddev_resume(&rs->md);
}
-static int raid_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
- struct bio_vec *biovec, int max_size)
-{
- struct raid_set *rs = ti->private;
- struct md_personality *pers = rs->md.pers;
-
- if (pers && pers->mergeable_bvec)
- return min(max_size, pers->mergeable_bvec(&rs->md, bvm, biovec));
-
- /*
- * In case we can't request the personality because
- * the raid set is not running yet
- *
- * -> return safe minimum
- */
- return rs->md.chunk_sectors;
-}
-
static struct target_type raid_target = {
.name = "raid",
.version = {1, 7, 0},
@@ -1749,7 +1730,6 @@ static struct target_type raid_target = {
.presuspend = raid_presuspend,
.postsuspend = raid_postsuspend,
.resume = raid_resume,
- .merge = raid_merge,
};
static int __init dm_raid_init(void)
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d83696b..f2a363a 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -490,9 +490,11 @@ static void hold_bio(struct mirror_set *ms, struct bio *bio)
* If device is suspended, complete the bio.
*/
if (dm_noflush_suspending(ms->ti))
- bio_endio(bio, DM_ENDIO_REQUEUE);
+ bio->bi_error = DM_ENDIO_REQUEUE;
else
- bio_endio(bio, -EIO);
+ bio->bi_error = -EIO;
+
+ bio_endio(bio);
return;
}
@@ -515,7 +517,7 @@ static void read_callback(unsigned long error, void *context)
bio_set_m(bio, NULL);
if (likely(!error)) {
- bio_endio(bio, 0);
+ bio_endio(bio);
return;
}
@@ -531,7 +533,7 @@ static void read_callback(unsigned long error, void *context)
DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.",
m->dev->name);
- bio_endio(bio, -EIO);
+ bio_io_error(bio);
}
/* Asynchronous read. */
@@ -580,7 +582,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
if (likely(m))
read_async_bio(m, bio);
else
- bio_endio(bio, -EIO);
+ bio_io_error(bio);
}
}
@@ -598,7 +600,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
static void write_callback(unsigned long error, void *context)
{
- unsigned i, ret = 0;
+ unsigned i;
struct bio *bio = (struct bio *) context;
struct mirror_set *ms;
int should_wake = 0;
@@ -614,7 +616,7 @@ static void write_callback(unsigned long error, void *context)
* regions with the same code.
*/
if (likely(!error)) {
- bio_endio(bio, ret);
+ bio_endio(bio);
return;
}
@@ -623,7 +625,8 @@ static void write_callback(unsigned long error, void *context)
* degrade the array.
*/
if (bio->bi_rw & REQ_DISCARD) {
- bio_endio(bio, -EOPNOTSUPP);
+ bio->bi_error = -EOPNOTSUPP;
+ bio_endio(bio);
return;
}
@@ -828,13 +831,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
* be wrong if the failed leg returned after reboot and
* got replicated back to the good legs.)
*/
-
if (unlikely(!get_valid_mirror(ms) || (keep_log(ms) && ms->log_failure)))
- bio_endio(bio, -EIO);
+ bio_io_error(bio);
else if (errors_handled(ms) && !keep_log(ms))
hold_bio(ms, bio);
else
- bio_endio(bio, 0);
+ bio_endio(bio);
}
}
@@ -943,16 +945,18 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
{
unsigned long long offset;
char dummy;
+ int ret;
if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) {
ti->error = "Invalid offset";
return -EINVAL;
}
- if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
- &ms->mirror[mirror].dev)) {
+ ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
+ &ms->mirror[mirror].dev);
+ if (ret) {
ti->error = "Device lookup failure";
- return -ENXIO;
+ return ret;
}
ms->mirror[mirror].ms = ms;
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index b929fd5..74cb7b99 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -193,7 +193,7 @@ struct dm_region_hash *dm_region_hash_create(
rh->max_recovery = max_recovery;
rh->log = log;
rh->region_size = region_size;
- rh->region_shift = ffs(region_size) - 1;
+ rh->region_shift = __ffs(region_size);
rwlock_init(&rh->hash_lock);
rh->mask = nr_buckets - 1;
rh->nr_buckets = nr_buckets;
@@ -249,9 +249,7 @@ void dm_region_hash_destroy(struct dm_region_hash *rh)
if (rh->log)
dm_dirty_log_destroy(rh->log);
- if (rh->region_pool)
- mempool_destroy(rh->region_pool);
-
+ mempool_destroy(rh->region_pool);
vfree(rh->buckets);
kfree(rh);
}
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 808b841..3164b8b 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -7,6 +7,7 @@
#include "dm-exception-store.h"
+#include <linux/ctype.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
@@ -321,7 +322,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
bdev_logical_block_size(dm_snap_cow(ps->store->snap)->
bdev) >> 9);
ps->store->chunk_mask = ps->store->chunk_size - 1;
- ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
+ ps->store->chunk_shift = __ffs(ps->store->chunk_size);
chunk_size_supplied = 0;
}
@@ -533,7 +534,7 @@ static int read_exceptions(struct pstore *ps,
chunk = area_location(ps, ps->current_area);
area = dm_bufio_read(client, chunk, &bp);
- if (unlikely(IS_ERR(area))) {
+ if (IS_ERR(area)) {
r = PTR_ERR(area);
goto ret_destroy_bufio;
}
@@ -843,10 +844,10 @@ static void persistent_drop_snapshot(struct dm_exception_store *store)
DMWARN("write header failed");
}
-static int persistent_ctr(struct dm_exception_store *store,
- unsigned argc, char **argv)
+static int persistent_ctr(struct dm_exception_store *store, char *options)
{
struct pstore *ps;
+ int r;
/* allocate the pstore */
ps = kzalloc(sizeof(*ps), GFP_KERNEL);
@@ -868,14 +869,32 @@ static int persistent_ctr(struct dm_exception_store *store,
ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0);
if (!ps->metadata_wq) {
- kfree(ps);
DMERR("couldn't start header metadata update thread");
- return -ENOMEM;
+ r = -ENOMEM;
+ goto err_workqueue;
+ }
+
+ if (options) {
+ char overflow = toupper(options[0]);
+ if (overflow == 'O')
+ store->userspace_supports_overflow = true;
+ else {
+ DMERR("Unsupported persistent store option: %s", options);
+ r = -EINVAL;
+ goto err_options;
+ }
}
store->context = ps;
return 0;
+
+err_options:
+ destroy_workqueue(ps->metadata_wq);
+err_workqueue:
+ kfree(ps);
+
+ return r;
}
static unsigned persistent_status(struct dm_exception_store *store,
@@ -888,7 +907,8 @@ static unsigned persistent_status(struct dm_exception_store *store,
case STATUSTYPE_INFO:
break;
case STATUSTYPE_TABLE:
- DMEMIT(" P %llu", (unsigned long long)store->chunk_size);
+ DMEMIT(" %s %llu", store->userspace_supports_overflow ? "PO" : "P",
+ (unsigned long long)store->chunk_size);
}
return sz;
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index 1ce9a25..9b7c8c8 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -70,8 +70,7 @@ static void transient_usage(struct dm_exception_store *store,
*metadata_sectors = 0;
}
-static int transient_ctr(struct dm_exception_store *store,
- unsigned argc, char **argv)
+static int transient_ctr(struct dm_exception_store *store, char *options)
{
struct transient_c *tc;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 7c82d3c..c06b74e 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -63,6 +63,13 @@ struct dm_snapshot {
*/
int valid;
+ /*
+ * The snapshot overflowed because of a write to the snapshot device.
+ * We don't have to invalidate the snapshot in this case, but we need
+ * to prevent further writes.
+ */
+ int snapshot_overflowed;
+
/* Origin writes don't trigger exceptions until this is set */
int active;
@@ -1091,7 +1098,7 @@ static void stop_merge(struct dm_snapshot *s)
}
/*
- * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
+ * Construct a snapshot mapping: <origin_dev> <COW-dev> <p|po|n> <chunk-size>
*/
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
@@ -1152,6 +1159,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
s->ti = ti;
s->valid = 1;
+ s->snapshot_overflowed = 0;
s->active = 0;
atomic_set(&s->pending_exceptions_count, 0);
s->exception_start_sequence = 0;
@@ -1294,6 +1302,7 @@ static void __handover_exceptions(struct dm_snapshot *snap_src,
u.store_swap = snap_dest->store;
snap_dest->store = snap_src->store;
+ snap_dest->store->userspace_supports_overflow = u.store_swap->userspace_supports_overflow;
snap_src->store = u.store_swap;
snap_dest->store->snap = snap_dest;
@@ -1301,6 +1310,7 @@ static void __handover_exceptions(struct dm_snapshot *snap_src,
snap_dest->ti->max_io_len = snap_dest->store->chunk_size;
snap_dest->valid = snap_src->valid;
+ snap_dest->snapshot_overflowed = snap_src->snapshot_overflowed;
/*
* Set source invalid to ensure it receives no further I/O.
@@ -1490,7 +1500,7 @@ out:
error_bios(snapshot_bios);
} else {
if (full_bio)
- bio_endio(full_bio, 0);
+ bio_endio(full_bio);
flush_bios(snapshot_bios);
}
@@ -1580,11 +1590,11 @@ static void start_copy(struct dm_snap_pending_exception *pe)
dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
}
-static void full_bio_end_io(struct bio *bio, int error)
+static void full_bio_end_io(struct bio *bio)
{
void *callback_data = bio->bi_private;
- dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0);
+ dm_kcopyd_do_callback(callback_data, 0, bio->bi_error ? 1 : 0);
}
static void start_full_bio(struct dm_snap_pending_exception *pe,
@@ -1691,7 +1701,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
* to copy an exception */
down_write(&s->lock);
- if (!s->valid) {
+ if (!s->valid || (unlikely(s->snapshot_overflowed) && bio_rw(bio) == WRITE)) {
r = -EIO;
goto out_unlock;
}
@@ -1715,7 +1725,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
pe = alloc_pending_exception(s);
down_write(&s->lock);
- if (!s->valid) {
+ if (!s->valid || s->snapshot_overflowed) {
free_pending_exception(pe);
r = -EIO;
goto out_unlock;
@@ -1730,7 +1740,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
pe = __find_pending_exception(s, pe, chunk);
if (!pe) {
- __invalidate_snapshot(s, -ENOMEM);
+ if (s->store->userspace_supports_overflow) {
+ s->snapshot_overflowed = 1;
+ DMERR("Snapshot overflowed: Unable to allocate exception.");
+ } else
+ __invalidate_snapshot(s, -ENOMEM);
r = -EIO;
goto out_unlock;
}
@@ -1990,6 +2004,8 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
DMEMIT("Invalid");
else if (snap->merge_failed)
DMEMIT("Merge failed");
+ else if (snap->snapshot_overflowed)
+ DMEMIT("Overflow");
else {
if (snap->store->type->usage) {
sector_t total_sectors, sectors_allocated,
@@ -2330,20 +2346,6 @@ static void origin_status(struct dm_target *ti, status_type_t type,
}
}
-static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
- struct bio_vec *biovec, int max_size)
-{
- struct dm_origin *o = ti->private;
- struct request_queue *q = bdev_get_queue(o->dev->bdev);
-
- if (!q->merge_bvec_fn)
- return max_size;
-
- bvm->bi_bdev = o->dev->bdev;
-
- return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
static int origin_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
@@ -2362,13 +2364,12 @@ static struct target_type origin_target = {
.resume = origin_resume,
.postsuspend = origin_postsuspend,
.status = origin_status,
- .merge = origin_merge,
.iterate_devices = origin_iterate_devices,
};
static struct target_type snapshot_target = {
.name = "snapshot",
- .version = {1, 13, 0},
+ .version = {1, 15, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
@@ -2382,7 +2383,7 @@ static struct target_type snapshot_target = {
static struct target_type merge_target = {
.name = dm_snapshot_merge_target_name,
- .version = {1, 3, 0},
+ .version = {1, 4, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 8a8b48f..8289804 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -457,12 +457,24 @@ static int dm_stats_list(struct dm_stats *stats, const char *program,
list_for_each_entry(s, &stats->list, list_entry) {
if (!program || !strcmp(program, s->program_id)) {
len = s->end - s->start;
- DMEMIT("%d: %llu+%llu %llu %s %s\n", s->id,
+ DMEMIT("%d: %llu+%llu %llu %s %s", s->id,
(unsigned long long)s->start,
(unsigned long long)len,
(unsigned long long)s->step,
s->program_id,
s->aux_data);
+ if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
+ DMEMIT(" precise_timestamps");
+ if (s->n_histogram_entries) {
+ unsigned i;
+ DMEMIT(" histogram:");
+ for (i = 0; i < s->n_histogram_entries; i++) {
+ if (i)
+ DMEMIT(",");
+ DMEMIT("%llu", s->histogram_boundaries[i]);
+ }
+ }
+ DMEMIT("\n");
}
}
mutex_unlock(&stats->mutex);
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index a672a15..797ddb9 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -75,13 +75,15 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
{
unsigned long long start;
char dummy;
+ int ret;
if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1)
return -EINVAL;
- if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
- &sc->stripe[stripe].dev))
- return -ENXIO;
+ ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
+ &sc->stripe[stripe].dev);
+ if (ret)
+ return ret;
sc->stripe[stripe].physical_start = start;
@@ -273,7 +275,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio,
return DM_MAPIO_REMAPPED;
} else {
/* The range doesn't map to the target stripe */
- bio_endio(bio, 0);
+ bio_endio(bio);
return DM_MAPIO_SUBMITTED;
}
}
@@ -412,26 +414,6 @@ static void stripe_io_hints(struct dm_target *ti,
blk_limits_io_opt(limits, chunk_size * sc->stripes);
}
-static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
- struct bio_vec *biovec, int max_size)
-{
- struct stripe_c *sc = ti->private;
- sector_t bvm_sector = bvm->bi_sector;
- uint32_t stripe;
- struct request_queue *q;
-
- stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector);
-
- q = bdev_get_queue(sc->stripe[stripe].dev->bdev);
- if (!q->merge_bvec_fn)
- return max_size;
-
- bvm->bi_bdev = sc->stripe[stripe].dev->bdev;
- bvm->bi_sector = sc->stripe[stripe].physical_start + bvm_sector;
-
- return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
static struct target_type stripe_target = {
.name = "striped",
.version = {1, 5, 1},
@@ -443,7 +425,6 @@ static struct target_type stripe_target = {
.status = stripe_status,
.iterate_devices = stripe_iterate_devices,
.io_hints = stripe_io_hints,
- .merge = stripe_merge,
};
int __init dm_stripe_init(void)
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index 50fca46..871c18f 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -99,11 +99,11 @@ static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
if (sector_div(nr_regions, sctx->region_size))
nr_regions++;
- sctx->nr_regions = nr_regions;
- if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) {
+ if (nr_regions >= ULONG_MAX) {
ti->error = "Region table too large";
return -EINVAL;
}
+ sctx->nr_regions = nr_regions;
nr_slots = nr_regions;
if (sector_div(nr_slots, sctx->region_entries_per_slot))
@@ -511,27 +511,24 @@ static void switch_status(struct dm_target *ti, status_type_t type,
*
* Passthrough all ioctls to the path for sector 0
*/
-static int switch_ioctl(struct dm_target *ti, unsigned cmd,
- unsigned long arg)
+static int switch_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode)
{
struct switch_ctx *sctx = ti->private;
- struct block_device *bdev;
- fmode_t mode;
unsigned path_nr;
- int r = 0;
path_nr = switch_get_path_nr(sctx, 0);
- bdev = sctx->path_list[path_nr].dmdev->bdev;
- mode = sctx->path_list[path_nr].dmdev->mode;
+ *bdev = sctx->path_list[path_nr].dmdev->bdev;
+ *mode = sctx->path_list[path_nr].dmdev->mode;
/*
* Only pass ioctls through if the device sizes match exactly.
*/
- if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
- r = scsi_verify_blk_ioctl(NULL, cmd);
-
- return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+ if (ti->len + sctx->path_list[path_nr].start !=
+ i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
+ return 1;
+ return 0;
}
static int switch_iterate_devices(struct dm_target *ti,
@@ -560,7 +557,7 @@ static struct target_type switch_target = {
.map = switch_map,
.message = switch_message,
.status = switch_status,
- .ioctl = switch_ioctl,
+ .prepare_ioctl = switch_prepare_ioctl,
.iterate_devices = switch_iterate_devices,
};
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 16ba55a..061152a 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -440,14 +440,6 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
q->limits.alignment_offset,
(unsigned long long) start << SECTOR_SHIFT);
- /*
- * Check if merge fn is supported.
- * If not we'll force DM to use PAGE_SIZE or
- * smaller I/O, just to be safe.
- */
- if (dm_queue_merge_is_compulsory(q) && !ti->type->merge)
- blk_limits_max_hw_sectors(limits,
- (unsigned int) (PAGE_SIZE >> 9));
return 0;
}
@@ -1022,15 +1014,16 @@ static int dm_table_build_index(struct dm_table *t)
return r;
}
+static bool integrity_profile_exists(struct gendisk *disk)
+{
+ return !!blk_get_integrity(disk);
+}
+
/*
* Get a disk whose integrity profile reflects the table's profile.
- * If %match_all is true, all devices' profiles must match.
- * If %match_all is false, all devices must at least have an
- * allocated integrity profile; but uninitialized is ok.
* Returns NULL if integrity support was inconsistent or unavailable.
*/
-static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
- bool match_all)
+static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t)
{
struct list_head *devices = dm_table_get_devices(t);
struct dm_dev_internal *dd = NULL;
@@ -1038,10 +1031,8 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
list_for_each_entry(dd, devices, list) {
template_disk = dd->dm_dev->bdev->bd_disk;
- if (!blk_get_integrity(template_disk))
+ if (!integrity_profile_exists(template_disk))
goto no_integrity;
- if (!match_all && !blk_integrity_is_initialized(template_disk))
- continue; /* skip uninitialized profiles */
else if (prev_disk &&
blk_integrity_compare(prev_disk, template_disk) < 0)
goto no_integrity;
@@ -1060,34 +1051,40 @@ no_integrity:
}
/*
- * Register the mapped device for blk_integrity support if
- * the underlying devices have an integrity profile. But all devices
- * may not have matching profiles (checking all devices isn't reliable
+ * Register the mapped device for blk_integrity support if the
+ * underlying devices have an integrity profile. But all devices may
+ * not have matching profiles (checking all devices isn't reliable
* during table load because this table may use other DM device(s) which
- * must be resumed before they will have an initialized integity profile).
- * Stacked DM devices force a 2 stage integrity profile validation:
- * 1 - during load, validate all initialized integrity profiles match
- * 2 - during resume, validate all integrity profiles match
+ * must be resumed before they will have an initialized integrity
+ * profile). Consequently, stacked DM devices force a 2 stage integrity
+ * profile validation: First pass during table load, final pass during
+ * resume.
*/
-static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md)
+static int dm_table_register_integrity(struct dm_table *t)
{
+ struct mapped_device *md = t->md;
struct gendisk *template_disk = NULL;
- template_disk = dm_table_get_integrity_disk(t, false);
+ template_disk = dm_table_get_integrity_disk(t);
if (!template_disk)
return 0;
- if (!blk_integrity_is_initialized(dm_disk(md))) {
+ if (!integrity_profile_exists(dm_disk(md))) {
t->integrity_supported = 1;
- return blk_integrity_register(dm_disk(md), NULL);
+ /*
+ * Register integrity profile during table load; we can do
+ * this because the final profile must match during resume.
+ */
+ blk_integrity_register(dm_disk(md),
+ blk_get_integrity(template_disk));
+ return 0;
}
/*
- * If DM device already has an initalized integrity
+ * If DM device already has an initialized integrity
* profile the new profile should not conflict.
*/
- if (blk_integrity_is_initialized(template_disk) &&
- blk_integrity_compare(dm_disk(md), template_disk) < 0) {
+ if (blk_integrity_compare(dm_disk(md), template_disk) < 0) {
DMWARN("%s: conflict with existing integrity profile: "
"%s profile mismatch",
dm_device_name(t->md),
@@ -1095,7 +1092,7 @@ static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device
return 1;
}
- /* Preserve existing initialized integrity profile */
+ /* Preserve existing integrity profile */
t->integrity_supported = 1;
return 0;
}
@@ -1120,7 +1117,7 @@ int dm_table_complete(struct dm_table *t)
return r;
}
- r = dm_table_prealloc_integrity(t, t->md);
+ r = dm_table_register_integrity(t);
if (r) {
DMERR("could not register integrity profile.");
return r;
@@ -1286,29 +1283,30 @@ combine_limits:
}
/*
- * Set the integrity profile for this device if all devices used have
- * matching profiles. We're quite deep in the resume path but still
- * don't know if all devices (particularly DM devices this device
- * may be stacked on) have matching profiles. Even if the profiles
- * don't match we have no way to fail (to resume) at this point.
+ * Verify that all devices have an integrity profile that matches the
+ * DM device's registered integrity profile. If the profiles don't
+ * match then unregister the DM device's integrity profile.
*/
-static void dm_table_set_integrity(struct dm_table *t)
+static void dm_table_verify_integrity(struct dm_table *t)
{
struct gendisk *template_disk = NULL;
- if (!blk_get_integrity(dm_disk(t->md)))
- return;
+ if (t->integrity_supported) {
+ /*
+ * Verify that the original integrity profile
+ * matches all the devices in this table.
+ */
+ template_disk = dm_table_get_integrity_disk(t);
+ if (template_disk &&
+ blk_integrity_compare(dm_disk(t->md), template_disk) >= 0)
+ return;
+ }
- template_disk = dm_table_get_integrity_disk(t, true);
- if (template_disk)
- blk_integrity_register(dm_disk(t->md),
- blk_get_integrity(template_disk));
- else if (blk_integrity_is_initialized(dm_disk(t->md)))
- DMWARN("%s: device no longer has a valid integrity profile",
- dm_device_name(t->md));
- else
+ if (integrity_profile_exists(dm_disk(t->md))) {
DMWARN("%s: unable to establish an integrity profile",
dm_device_name(t->md));
+ blk_integrity_unregister(dm_disk(t->md));
+ }
}
static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
@@ -1388,14 +1386,6 @@ static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
}
-static int queue_supports_sg_gaps(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
-{
- struct request_queue *q = bdev_get_queue(dev->bdev);
-
- return q && !test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags);
-}
-
static bool dm_table_all_devices_attribute(struct dm_table *t,
iterate_devices_callout_fn func)
{
@@ -1516,12 +1506,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
else
queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
- if (dm_table_all_devices_attribute(t, queue_supports_sg_gaps))
- queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, q);
- else
- queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, q);
-
- dm_table_set_integrity(t);
+ dm_table_verify_integrity(t);
/*
* Determine whether or not this queue's I/O timings contribute
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 48dfe3c..1fa4569 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -396,7 +396,9 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
}
}
- return dm_bm_unlock(b);
+ dm_bm_unlock(b);
+
+ return 0;
}
static void __setup_btree_details(struct dm_pool_metadata *pmd)
@@ -650,7 +652,9 @@ static int __open_metadata(struct dm_pool_metadata *pmd)
}
__setup_btree_details(pmd);
- return dm_bm_unlock(sblock);
+ dm_bm_unlock(sblock);
+
+ return 0;
bad_cleanup_data_sm:
dm_sm_destroy(pmd->data_sm);
@@ -1293,11 +1297,13 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd)
return r;
disk_super = dm_block_data(copy);
- dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root));
- dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root));
+ dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root));
+ dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
dm_sm_dec_block(pmd->metadata_sm, held_root);
- return dm_tm_unlock(pmd->tm, copy);
+ dm_tm_unlock(pmd->tm, copy);
+
+ return 0;
}
int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
@@ -1327,7 +1333,9 @@ static int __get_metadata_snap(struct dm_pool_metadata *pmd,
disk_super = dm_block_data(sblock);
*result = le64_to_cpu(disk_super->held_root);
- return dm_bm_unlock(sblock);
+ dm_bm_unlock(sblock);
+
+ return 0;
}
int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index d2bbe8c..3897b90 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -332,9 +332,6 @@ struct thin_c {
*
* Description:
* Asynchronously issue a discard request for the sectors in question.
- * NOTE: this variant of blk-core's blkdev_issue_discard() is a stop-gap
- * that is being kept local to DM thinp until the block changes to allow
- * late bio splitting land upstream.
*/
static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned long flags,
@@ -342,91 +339,36 @@ static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sect
{
struct request_queue *q = bdev_get_queue(bdev);
int type = REQ_WRITE | REQ_DISCARD;
- unsigned int max_discard_sectors, granularity;
- int alignment;
struct bio *bio;
- int ret = 0;
- struct blk_plug plug;
- if (!q)
+ if (!q || !nr_sects)
return -ENXIO;
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
- /* Zero-sector (unknown) and one-sector granularities are the same. */
- granularity = max(q->limits.discard_granularity >> 9, 1U);
- alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
-
- /*
- * Ensure that max_discard_sectors is of the proper
- * granularity, so that requests stay aligned after a split.
- */
- max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
- max_discard_sectors -= max_discard_sectors % granularity;
- if (unlikely(!max_discard_sectors)) {
- /* Avoid infinite loop below. Being cautious never hurts. */
- return -EOPNOTSUPP;
- }
-
if (flags & BLKDEV_DISCARD_SECURE) {
if (!blk_queue_secdiscard(q))
return -EOPNOTSUPP;
type |= REQ_SECURE;
}
- blk_start_plug(&plug);
- while (nr_sects) {
- unsigned int req_sects;
- sector_t end_sect, tmp;
-
- /*
- * Required bio_put occurs in bio_endio thanks to bio_chain below
- */
- bio = bio_alloc(gfp_mask, 1);
- if (!bio) {
- ret = -ENOMEM;
- break;
- }
-
- req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
-
- /*
- * If splitting a request, and the next starting sector would be
- * misaligned, stop the discard at the previous aligned sector.
- */
- end_sect = sector + req_sects;
- tmp = end_sect;
- if (req_sects < nr_sects &&
- sector_div(tmp, granularity) != alignment) {
- end_sect = end_sect - alignment;
- sector_div(end_sect, granularity);
- end_sect = end_sect * granularity + alignment;
- req_sects = end_sect - sector;
- }
-
- bio_chain(bio, parent_bio);
+ /*
+ * Required bio_put occurs in bio_endio thanks to bio_chain below
+ */
+ bio = bio_alloc(gfp_mask, 1);
+ if (!bio)
+ return -ENOMEM;
- bio->bi_iter.bi_sector = sector;
- bio->bi_bdev = bdev;
+ bio_chain(bio, parent_bio);
- bio->bi_iter.bi_size = req_sects << 9;
- nr_sects -= req_sects;
- sector = end_sect;
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_size = nr_sects << 9;
- submit_bio(type, bio);
+ submit_bio(type, bio);
- /*
- * We can loop for a long time in here, if someone does
- * full device discards (like mkfs). Be nice and allow
- * us to schedule out to avoid softlocking if preempt
- * is disabled.
- */
- cond_resched();
- }
- blk_finish_plug(&plug);
-
- return ret;
+ return 0;
}
static bool block_size_is_power_of_two(struct pool *pool)
@@ -615,8 +557,10 @@ static void error_bio_list(struct bio_list *bios, int error)
{
struct bio *bio;
- while ((bio = bio_list_pop(bios)))
- bio_endio(bio, error);
+ while ((bio = bio_list_pop(bios))) {
+ bio->bi_error = error;
+ bio_endio(bio);
+ }
}
static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
@@ -870,14 +814,14 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
complete_mapping_preparation(m);
}
-static void overwrite_endio(struct bio *bio, int err)
+static void overwrite_endio(struct bio *bio)
{
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct dm_thin_new_mapping *m = h->overwrite_mapping;
bio->bi_end_io = m->saved_bi_end_io;
- m->err = err;
+ m->err = bio->bi_error;
complete_mapping_preparation(m);
}
@@ -1002,7 +946,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
*/
if (bio) {
inc_remap_and_issue_cell(tc, m->cell, m->data_block);
- bio_endio(bio, 0);
+ bio_endio(bio);
} else {
inc_all_io_entry(tc->pool, m->cell->holder);
remap_and_issue(tc, m->cell->holder, m->data_block);
@@ -1032,7 +976,7 @@ static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
{
- bio_endio(m->bio, 0);
+ bio_endio(m->bio);
free_discard_mapping(m);
}
@@ -1046,7 +990,7 @@ static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
bio_io_error(m->bio);
} else
- bio_endio(m->bio, 0);
+ bio_endio(m->bio);
cell_defer_no_holder(tc, m->cell);
mempool_free(m, tc->pool->mapping_pool);
@@ -1117,7 +1061,8 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
* Even if r is set, there could be sub discards in flight that we
* need to wait for.
*/
- bio_endio(m->bio, r);
+ m->bio->bi_error = r;
+ bio_endio(m->bio);
cell_defer_no_holder(tc, m->cell);
mempool_free(m, pool->mapping_pool);
}
@@ -1493,9 +1438,10 @@ static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
{
int error = should_error_unserviceable_bio(pool);
- if (error)
- bio_endio(bio, error);
- else
+ if (error) {
+ bio->bi_error = error;
+ bio_endio(bio);
+ } else
retry_on_resume(bio);
}
@@ -1539,9 +1485,8 @@ static void process_discard_cell_no_passdown(struct thin_c *tc,
}
/*
- * FIXME: DM local hack to defer parent bios's end_io until we
- * _know_ all chained sub range discard bios have completed.
- * Will go away once late bio splitting lands upstream!
+ * __bio_inc_remaining() is used to defer parent bios' end_io until
+ * we _know_ all chained sub range discard bios have completed.
*/
static inline void __bio_inc_remaining(struct bio *bio)
{
@@ -1631,7 +1576,7 @@ static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_priso
* will prevent completion until the sub range discards have
* completed.
*/
- bio_endio(bio, 0);
+ bio_endio(bio);
}
static void process_discard_bio(struct thin_c *tc, struct bio *bio)
@@ -1645,7 +1590,7 @@ static void process_discard_bio(struct thin_c *tc, struct bio *bio)
/*
* The discard covers less than a block.
*/
- bio_endio(bio, 0);
+ bio_endio(bio);
return;
}
@@ -1790,7 +1735,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
if (bio_data_dir(bio) == READ) {
zero_fill_bio(bio);
cell_defer_no_holder(tc, cell);
- bio_endio(bio, 0);
+ bio_endio(bio);
return;
}
@@ -1855,7 +1800,7 @@ static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
} else {
zero_fill_bio(bio);
- bio_endio(bio, 0);
+ bio_endio(bio);
}
} else
provision_block(tc, bio, block, cell);
@@ -1926,7 +1871,7 @@ static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
}
zero_fill_bio(bio);
- bio_endio(bio, 0);
+ bio_endio(bio);
break;
default:
@@ -1951,7 +1896,7 @@ static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell
static void process_bio_success(struct thin_c *tc, struct bio *bio)
{
- bio_endio(bio, 0);
+ bio_endio(bio);
}
static void process_bio_fail(struct thin_c *tc, struct bio *bio)
@@ -2600,7 +2545,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
thin_hook_bio(tc, bio);
if (tc->requeue_mode) {
- bio_endio(bio, DM_ENDIO_REQUEUE);
+ bio->bi_error = DM_ENDIO_REQUEUE;
+ bio_endio(bio);
return DM_MAPIO_SUBMITTED;
}
@@ -3255,7 +3201,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
metadata_low_callback,
pool);
if (r)
- goto out_free_pt;
+ goto out_flags_changed;
pt->callbacks.congested_fn = pool_is_congested;
dm_table_add_target_callbacks(ti->table, &pt->callbacks);
@@ -3875,20 +3821,6 @@ static int pool_iterate_devices(struct dm_target *ti,
return fn(ti, pt->data_dev, 0, ti->len, data);
}
-static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
- struct bio_vec *biovec, int max_size)
-{
- struct pool_c *pt = ti->private;
- struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
-
- if (!q->merge_bvec_fn)
- return max_size;
-
- bvm->bi_bdev = pt->data_dev->bdev;
-
- return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct pool_c *pt = ti->private;
@@ -3965,7 +3897,6 @@ static struct target_type pool_target = {
.resume = pool_resume,
.message = pool_message,
.status = pool_status,
- .merge = pool_merge,
.iterate_devices = pool_iterate_devices,
.io_hints = pool_io_hints,
};
@@ -4292,21 +4223,6 @@ err:
DMEMIT("Error");
}
-static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
- struct bio_vec *biovec, int max_size)
-{
- struct thin_c *tc = ti->private;
- struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev);
-
- if (!q->merge_bvec_fn)
- return max_size;
-
- bvm->bi_bdev = tc->pool_dev->bdev;
- bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
-
- return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
static int thin_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
@@ -4333,6 +4249,10 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct thin_c *tc = ti->private;
struct pool *pool = tc->pool;
+ struct queue_limits *pool_limits = dm_get_queue_limits(pool->pool_md);
+
+ if (!pool_limits->discard_granularity)
+ return; /* pool's discard support is disabled */
limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
@@ -4350,7 +4270,6 @@ static struct target_type thin_target = {
.presuspend = thin_presuspend,
.postsuspend = thin_postsuspend,
.status = thin_status,
- .merge = thin_merge,
.iterate_devices = thin_iterate_devices,
.io_hints = thin_io_hints,
};
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index bb9c6a0..ccf4188 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -26,8 +26,6 @@
#define DM_VERITY_ENV_LENGTH 42
#define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR"
-#define DM_VERITY_IO_VEC_INLINE 16
-#define DM_VERITY_MEMPOOL_SIZE 4
#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
#define DM_VERITY_MAX_LEVELS 63
@@ -76,8 +74,6 @@ struct dm_verity {
enum verity_mode mode; /* mode for handling verification errors */
unsigned corrupted_errs;/* Number of errors for corrupted blocks */
- mempool_t *vec_mempool; /* mempool of bio vector */
-
struct workqueue_struct *verify_wq;
/* starting blocks for each tree level. 0 is the lowest level. */
@@ -271,7 +267,7 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block,
verity_hash_at_level(v, block, level, &hash_block, &offset);
data = dm_bufio_read(v->bufio, hash_block, &buf);
- if (unlikely(IS_ERR(data)))
+ if (IS_ERR(data))
return PTR_ERR(data);
aux = dm_bufio_get_aux_data(buf);
@@ -458,8 +454,9 @@ static void verity_finish_io(struct dm_verity_io *io, int error)
bio->bi_end_io = io->orig_bi_end_io;
bio->bi_private = io->orig_bi_private;
+ bio->bi_error = error;
- bio_endio(bio, error);
+ bio_endio(bio);
}
static void verity_work(struct work_struct *w)
@@ -469,12 +466,12 @@ static void verity_work(struct work_struct *w)
verity_finish_io(io, verity_verify_io(io));
}
-static void verity_end_io(struct bio *bio, int error)
+static void verity_end_io(struct bio *bio)
{
struct dm_verity_io *io = bio->bi_private;
- if (error) {
- verity_finish_io(io, error);
+ if (bio->bi_error) {
+ verity_finish_io(io, bio->bi_error);
return;
}
@@ -634,33 +631,17 @@ static void verity_status(struct dm_target *ti, status_type_t type,
}
}
-static int verity_ioctl(struct dm_target *ti, unsigned cmd,
- unsigned long arg)
+static int verity_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode)
{
struct dm_verity *v = ti->private;
- int r = 0;
+
+ *bdev = v->data_dev->bdev;
if (v->data_start ||
ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT)
- r = scsi_verify_blk_ioctl(NULL, cmd);
-
- return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode,
- cmd, arg);
-}
-
-static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
- struct bio_vec *biovec, int max_size)
-{
- struct dm_verity *v = ti->private;
- struct request_queue *q = bdev_get_queue(v->data_dev->bdev);
-
- if (!q->merge_bvec_fn)
- return max_size;
-
- bvm->bi_bdev = v->data_dev->bdev;
- bvm->bi_sector = verity_map_sector(v, bvm->bi_sector);
-
- return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+ return 1;
+ return 0;
}
static int verity_iterate_devices(struct dm_target *ti,
@@ -691,9 +672,6 @@ static void verity_dtr(struct dm_target *ti)
if (v->verify_wq)
destroy_workqueue(v->verify_wq);
- if (v->vec_mempool)
- mempool_destroy(v->vec_mempool);
-
if (v->bufio)
dm_bufio_client_destroy(v->bufio);
@@ -962,14 +940,6 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io));
- v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
- BIO_MAX_PAGES * sizeof(struct bio_vec));
- if (!v->vec_mempool) {
- ti->error = "Cannot allocate vector mempool";
- r = -ENOMEM;
- goto bad;
- }
-
/* WQ_UNBOUND greatly improves performance when running on ramdisk */
v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
if (!v->verify_wq) {
@@ -994,8 +964,7 @@ static struct target_type verity_target = {
.dtr = verity_dtr,
.map = verity_map,
.status = verity_status,
- .ioctl = verity_ioctl,
- .merge = verity_merge,
+ .prepare_ioctl = verity_prepare_ioctl,
.iterate_devices = verity_iterate_devices,
.io_hints = verity_io_hints,
};
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index b9a64bb..766bc93 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -47,7 +47,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio)
break;
}
- bio_endio(bio, 0);
+ bio_endio(bio);
/* accepted bio, don't make new request */
return DM_MAPIO_SUBMITTED;
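Every completion path touched by this patch follows the same conversion: bio_endio() loses its error argument and the status travels in the bio itself. A minimal sketch of the pattern, with an illustrative function name:

/* Complete a bio under the new single-argument bio_endio(). */
static void my_complete_bio(struct bio *bio, int error)
{
	bio->bi_error = error;	/* 0 on success, negative errno on failure */
	bio_endio(bio);		/* previously bio_endio(bio, error) */
}

For the common failure case, bio_io_error(bio) is the shorthand used below in faulty.c; it records -EIO and completes the bio.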
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ab37ae1..32440ad 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -24,6 +24,7 @@
#include <linux/ktime.h>
#include <linux/elevator.h> /* for rq_end_sector() */
#include <linux/blk-mq.h>
+#include <linux/pr.h>
#include <trace/events/block.h>
@@ -124,9 +125,8 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
-#define DMF_MERGE_IS_OPTIONAL 6
-#define DMF_DEFERRED_REMOVE 7
-#define DMF_SUSPENDED_INTERNALLY 8
+#define DMF_DEFERRED_REMOVE 6
+#define DMF_SUSPENDED_INTERNALLY 7
/*
* A dummy definition to make RCU happy.
@@ -556,18 +556,16 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return dm_get_geometry(md, geo);
}
-static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
+static int dm_get_live_table_for_ioctl(struct mapped_device *md,
+ struct dm_target **tgt, struct block_device **bdev,
+ fmode_t *mode, int *srcu_idx)
{
- struct mapped_device *md = bdev->bd_disk->private_data;
- int srcu_idx;
struct dm_table *map;
- struct dm_target *tgt;
- int r = -ENOTTY;
+ int r;
retry:
- map = dm_get_live_table(md, &srcu_idx);
-
+ r = -ENOTTY;
+ map = dm_get_live_table(md, srcu_idx);
if (!map || !dm_table_get_size(map))
goto out;
@@ -575,8 +573,9 @@ retry:
if (dm_table_get_num_targets(map) != 1)
goto out;
- tgt = dm_table_get_target(map, 0);
- if (!tgt->type->ioctl)
+ *tgt = dm_table_get_target(map, 0);
+
+ if (!(*tgt)->type->prepare_ioctl)
goto out;
if (dm_suspended_md(md)) {
@@ -584,16 +583,46 @@ retry:
goto out;
}
- r = tgt->type->ioctl(tgt, cmd, arg);
+ r = (*tgt)->type->prepare_ioctl(*tgt, bdev, mode);
+ if (r < 0)
+ goto out;
+
+ return r;
out:
- dm_put_live_table(md, srcu_idx);
-
+ dm_put_live_table(md, *srcu_idx);
if (r == -ENOTCONN) {
msleep(10);
goto retry;
}
+ return r;
+}
+static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ struct dm_target *tgt;
+ int srcu_idx, r;
+
+ r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ if (r < 0)
+ return r;
+
+ if (r > 0) {
+ /*
+ * Target determined this ioctl is being issued against
+ * a logical partition of the parent bdev; so extra
+ * validation is needed.
+ */
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+ if (r)
+ goto out;
+ }
+
+ r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+out:
+ dm_put_live_table(md, srcu_idx);
return r;
}
@@ -944,7 +973,8 @@ static void dec_pending(struct dm_io *io, int error)
} else {
/* done with normal IO or empty flush */
trace_block_bio_complete(md->queue, bio, io_error);
- bio_endio(bio, io_error);
+ bio->bi_error = io_error;
+ bio_endio(bio);
}
}
}
@@ -957,17 +987,15 @@ static void disable_write_same(struct mapped_device *md)
limits->max_write_same_sectors = 0;
}
-static void clone_endio(struct bio *bio, int error)
+static void clone_endio(struct bio *bio)
{
+ int error = bio->bi_error;
int r = error;
struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
struct dm_io *io = tio->io;
struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
- if (!bio_flagged(bio, BIO_UPTODATE) && !error)
- error = -EIO;
-
if (endio) {
r = endio(tio->ti, bio, error);
if (r < 0 || r == DM_ENDIO_REQUEUE)
@@ -996,13 +1024,14 @@ static void clone_endio(struct bio *bio, int error)
/*
* Partial completion handling for request-based dm
*/
-static void end_clone_bio(struct bio *clone, int error)
+static void end_clone_bio(struct bio *clone)
{
struct dm_rq_clone_bio_info *info =
container_of(clone, struct dm_rq_clone_bio_info, clone);
struct dm_rq_target_io *tio = info->tio;
struct bio *bio = info->orig;
unsigned int nr_bytes = info->orig->bi_iter.bi_size;
+ int error = clone->bi_error;
bio_put(clone);
@@ -1466,7 +1495,7 @@ static void __map_bio(struct dm_target_io *tio)
md = tio->io->md;
dec_pending(tio->io, r);
free_tio(md, tio);
- } else if (r) {
+ } else if (r != DM_MAPIO_SUBMITTED) {
DMWARN("unimplemented target map return value: %d", r);
BUG();
}
@@ -1722,67 +1751,6 @@ static void __split_and_process_bio(struct mapped_device *md,
* CRUD END
*---------------------------------------------------------------*/
-static int dm_merge_bvec(struct request_queue *q,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- struct mapped_device *md = q->queuedata;
- struct dm_table *map = dm_get_live_table_fast(md);
- struct dm_target *ti;
- sector_t max_sectors, max_size = 0;
-
- if (unlikely(!map))
- goto out;
-
- ti = dm_table_find_target(map, bvm->bi_sector);
- if (!dm_target_is_valid(ti))
- goto out;
-
- /*
- * Find maximum amount of I/O that won't need splitting
- */
- max_sectors = min(max_io_len(bvm->bi_sector, ti),
- (sector_t) queue_max_sectors(q));
- max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
-
- /*
- * FIXME: this stop-gap fix _must_ be cleaned up (by passing a sector_t
- * to the targets' merge function since it holds sectors not bytes).
- * Just doing this as an interim fix for stable@ because the more
- * comprehensive cleanup of switching to sector_t will impact every
- * DM target that implements a ->merge hook.
- */
- if (max_size > INT_MAX)
- max_size = INT_MAX;
-
- /*
- * merge_bvec_fn() returns number of bytes
- * it can accept at this offset
- * max is precomputed maximal io size
- */
- if (max_size && ti->type->merge)
- max_size = ti->type->merge(ti, bvm, biovec, (int) max_size);
- /*
- * If the target doesn't support merge method and some of the devices
- * provided their merge_bvec method (we know this by looking for the
- * max_hw_sectors that dm_set_device_limits may set), then we can't
- * allow bios with multiple vector entries. So always set max_size
- * to 0, and the code below allows just one page.
- */
- else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
- max_size = 0;
-
-out:
- dm_put_live_table_fast(md);
- /*
- * Always allow an entire first page
- */
- if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
- max_size = biovec->bv_len;
-
- return max_size;
-}
-
/*
* The request function that just remaps the bio built up by
* dm_merge_bvec.
@@ -2258,6 +2226,13 @@ static void dm_init_md_queue(struct mapped_device *md)
* This queue is new, so no concurrency on the queue_flags.
*/
queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
+
+ /*
+ * Initialize data that will only be used by a non-blk-mq DM queue
+ * - must do so here (in alloc_dev callchain) before queue is used
+ */
+ md->queue->queuedata = md;
+ md->queue->backing_dev_info.congested_data = md;
}
static void dm_init_old_md_queue(struct mapped_device *md)
@@ -2268,10 +2243,7 @@ static void dm_init_old_md_queue(struct mapped_device *md)
/*
* Initialize aspects of queue that aren't relevant for blk-mq
*/
- md->queue->queuedata = md;
md->queue->backing_dev_info.congested_fn = dm_any_congested;
- md->queue->backing_dev_info.congested_data = md;
-
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
}
@@ -2281,10 +2253,8 @@ static void cleanup_mapped_device(struct mapped_device *md)
destroy_workqueue(md->wq);
if (md->kworker_task)
kthread_stop(md->kworker_task);
- if (md->io_pool)
- mempool_destroy(md->io_pool);
- if (md->rq_pool)
- mempool_destroy(md->rq_pool);
+ mempool_destroy(md->io_pool);
+ mempool_destroy(md->rq_pool);
if (md->bs)
bioset_free(md->bs);
@@ -2294,8 +2264,6 @@ static void cleanup_mapped_device(struct mapped_device *md)
spin_lock(&_minor_lock);
md->disk->private_data = NULL;
spin_unlock(&_minor_lock);
- if (blk_get_integrity(md->disk))
- blk_integrity_unregister(md->disk);
del_gendisk(md->disk);
put_disk(md->disk);
}
@@ -2503,59 +2471,6 @@ static void __set_size(struct mapped_device *md, sector_t size)
}
/*
- * Return 1 if the queue has a compulsory merge_bvec_fn function.
- *
- * If this function returns 0, then the device is either a non-dm
- * device without a merge_bvec_fn, or it is a dm device that is
- * able to split any bios it receives that are too big.
- */
-int dm_queue_merge_is_compulsory(struct request_queue *q)
-{
- struct mapped_device *dev_md;
-
- if (!q->merge_bvec_fn)
- return 0;
-
- if (q->make_request_fn == dm_make_request) {
- dev_md = q->queuedata;
- if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
- return 0;
- }
-
- return 1;
-}
-
-static int dm_device_merge_is_compulsory(struct dm_target *ti,
- struct dm_dev *dev, sector_t start,
- sector_t len, void *data)
-{
- struct block_device *bdev = dev->bdev;
- struct request_queue *q = bdev_get_queue(bdev);
-
- return dm_queue_merge_is_compulsory(q);
-}
-
-/*
- * Return 1 if it is acceptable to ignore merge_bvec_fn based
- * on the properties of the underlying devices.
- */
-static int dm_table_merge_is_optional(struct dm_table *table)
-{
- unsigned i = 0;
- struct dm_target *ti;
-
- while (i < dm_table_get_num_targets(table)) {
- ti = dm_table_get_target(table, i++);
-
- if (ti->type->iterate_devices &&
- ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
- return 0;
- }
-
- return 1;
-}
-
-/*
* Returns old map, which caller must destroy.
*/
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
@@ -2564,7 +2479,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
struct dm_table *old_map;
struct request_queue *q = md->queue;
sector_t size;
- int merge_is_optional;
size = dm_table_get_size(t);
@@ -2590,17 +2504,11 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
__bind_mempools(md, t);
- merge_is_optional = dm_table_merge_is_optional(t);
-
old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
rcu_assign_pointer(md->map, t);
md->immutable_target_type = dm_table_get_immutable_target_type(t);
dm_table_set_restrictions(t, q, limits);
- if (merge_is_optional)
- set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
- else
- clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
if (old_map)
dm_sync_table(md);
@@ -2881,7 +2789,12 @@ int dm_setup_md_queue(struct mapped_device *md)
case DM_TYPE_BIO_BASED:
dm_init_old_md_queue(md);
blk_queue_make_request(md->queue, dm_make_request);
- blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+ /*
+ * DM handles splitting bios as needed. Free the bio_split bioset
+ * since it won't be used (saves 1 process per bio-based DM device).
+ */
+ bioset_free(md->queue->bio_split);
+ md->queue->bio_split = NULL;
break;
}
@@ -2959,8 +2872,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
might_sleep();
- map = dm_get_live_table(md, &srcu_idx);
-
spin_lock(&_minor_lock);
idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
set_bit(DMF_FREEING, &md->flags);
@@ -2974,14 +2885,14 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
* do not race with internal suspend.
*/
mutex_lock(&md->suspend_lock);
+ map = dm_get_live_table(md, &srcu_idx);
if (!dm_suspended_md(md)) {
dm_table_presuspend_targets(map);
dm_table_postsuspend_targets(map);
}
- mutex_unlock(&md->suspend_lock);
-
/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
dm_put_live_table(md, srcu_idx);
+ mutex_unlock(&md->suspend_lock);
/*
* Rare, but there may be I/O requests still going to complete,
@@ -3630,11 +3541,8 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
if (!pools)
return;
- if (pools->io_pool)
- mempool_destroy(pools->io_pool);
-
- if (pools->rq_pool)
- mempool_destroy(pools->rq_pool);
+ mempool_destroy(pools->io_pool);
+ mempool_destroy(pools->rq_pool);
if (pools->bs)
bioset_free(pools->bs);
@@ -3642,11 +3550,133 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
kfree(pools);
}
+static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
+ u32 flags)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ const struct pr_ops *ops;
+ struct dm_target *tgt;
+ fmode_t mode;
+ int srcu_idx, r;
+
+ r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ if (r < 0)
+ return r;
+
+ ops = bdev->bd_disk->fops->pr_ops;
+ if (ops && ops->pr_register)
+ r = ops->pr_register(bdev, old_key, new_key, flags);
+ else
+ r = -EOPNOTSUPP;
+
+ dm_put_live_table(md, srcu_idx);
+ return r;
+}
+
+static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
+ u32 flags)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ const struct pr_ops *ops;
+ struct dm_target *tgt;
+ fmode_t mode;
+ int srcu_idx, r;
+
+ r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ if (r < 0)
+ return r;
+
+ ops = bdev->bd_disk->fops->pr_ops;
+ if (ops && ops->pr_reserve)
+ r = ops->pr_reserve(bdev, key, type, flags);
+ else
+ r = -EOPNOTSUPP;
+
+ dm_put_live_table(md, srcu_idx);
+ return r;
+}
+
+static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ const struct pr_ops *ops;
+ struct dm_target *tgt;
+ fmode_t mode;
+ int srcu_idx, r;
+
+ r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ if (r < 0)
+ return r;
+
+ ops = bdev->bd_disk->fops->pr_ops;
+ if (ops && ops->pr_release)
+ r = ops->pr_release(bdev, key, type);
+ else
+ r = -EOPNOTSUPP;
+
+ dm_put_live_table(md, srcu_idx);
+ return r;
+}
+
+static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
+ enum pr_type type, bool abort)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ const struct pr_ops *ops;
+ struct dm_target *tgt;
+ fmode_t mode;
+ int srcu_idx, r;
+
+ r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ if (r < 0)
+ return r;
+
+ ops = bdev->bd_disk->fops->pr_ops;
+ if (ops && ops->pr_preempt)
+ r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
+ else
+ r = -EOPNOTSUPP;
+
+ dm_put_live_table(md, srcu_idx);
+ return r;
+}
+
+static int dm_pr_clear(struct block_device *bdev, u64 key)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ const struct pr_ops *ops;
+ struct dm_target *tgt;
+ fmode_t mode;
+ int srcu_idx, r;
+
+ r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ if (r < 0)
+ return r;
+
+ ops = bdev->bd_disk->fops->pr_ops;
+ if (ops && ops->pr_clear)
+ r = ops->pr_clear(bdev, key);
+ else
+ r = -EOPNOTSUPP;
+
+ dm_put_live_table(md, srcu_idx);
+ return r;
+}
+
+static const struct pr_ops dm_pr_ops = {
+ .pr_register = dm_pr_register,
+ .pr_reserve = dm_pr_reserve,
+ .pr_release = dm_pr_release,
+ .pr_preempt = dm_pr_preempt,
+ .pr_clear = dm_pr_clear,
+};
+
static const struct block_device_operations dm_blk_dops = {
.open = dm_blk_open,
.release = dm_blk_close,
.ioctl = dm_blk_ioctl,
.getgeo = dm_blk_getgeo,
+ .pr_ops = &dm_pr_ops,
.owner = THIS_MODULE
};
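The new dm_pr_ops table routes persistent-reservation requests issued on the DM device node through dm_get_live_table_for_ioctl() to the single underlying device's pr_ops. A minimal userspace sketch of driving that path, assuming a single-target DM device at /dev/dm-0 and an illustrative reservation key; neither is part of this patch:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/pr.h>

int main(void)
{
	int fd = open("/dev/dm-0", O_RDWR);	/* illustrative device path */
	struct pr_registration reg = {
		.old_key = 0,			/* no prior registration */
		.new_key = 0x1234,		/* illustrative key */
		.flags	 = 0,
	};

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Handled by dm_pr_register(), which forwards to the target's bdev. */
	if (ioctl(fd, IOC_PR_REGISTER, &reg) < 0)
		perror("IOC_PR_REGISTER");
	close(fd);
	return 0;
}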
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 4e98499..7edcf97 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -78,8 +78,6 @@ bool dm_table_mq_request_based(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
-int dm_queue_merge_is_compulsory(struct request_queue *q);
-
void dm_lock_md_type(struct mapped_device *md);
void dm_unlock_md_type(struct mapped_device *md);
void dm_set_md_type(struct mapped_device *md, unsigned type);
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 1277eb2..4a8e150 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -70,7 +70,7 @@
#include <linux/seq_file.h>
-static void faulty_fail(struct bio *bio, int error)
+static void faulty_fail(struct bio *bio)
{
struct bio *b = bio->bi_private;
@@ -181,7 +181,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
/* special case - don't decrement, don't generic_make_request,
* just fail immediately
*/
- bio_endio(bio, -EIO);
+ bio_io_error(bio);
return;
}
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index fa7d577..b7fe7e9 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -52,48 +52,6 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
return conf->disks + lo;
}
-/**
- * linear_mergeable_bvec -- tell bio layer if two requests can be merged
- * @q: request queue
- * @bvm: properties of new bio
- * @biovec: the request that could be merged to it.
- *
- * Return amount of bytes we can take at this offset
- */
-static int linear_mergeable_bvec(struct mddev *mddev,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- struct dev_info *dev0;
- unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- int maxbytes = biovec->bv_len;
- struct request_queue *subq;
-
- dev0 = which_dev(mddev, sector);
- maxsectors = dev0->end_sector - sector;
- subq = bdev_get_queue(dev0->rdev->bdev);
- if (subq->merge_bvec_fn) {
- bvm->bi_bdev = dev0->rdev->bdev;
- bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors;
- maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
- biovec));
- }
-
- if (maxsectors < bio_sectors)
- maxsectors = 0;
- else
- maxsectors -= bio_sectors;
-
- if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
- return maxbytes;
-
- if (maxsectors > (maxbytes >> 9))
- return maxbytes;
- else
- return maxsectors << 9;
-}
-
static int linear_congested(struct mddev *mddev, int bits)
{
struct linear_conf *conf;
@@ -297,7 +255,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
if (unlikely((split->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
/* Just ignore it */
- bio_endio(split, 0);
+ bio_endio(split);
} else
generic_make_request(split);
} while (split != bio);
@@ -338,7 +296,6 @@ static struct md_personality linear_personality =
.size = linear_size,
.quiesce = linear_quiesce,
.congested = linear_congested,
- .mergeable_bvec = linear_mergeable_bvec,
};
static int __init linear_init (void)
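linear_mergeable_bvec() and the ->mergeable_bvec personality hook can go away because the block layer now splits oversized bios late, in the driver's make_request function, instead of asking drivers up front how much may be merged. A minimal sketch of the replacement model; the function name is illustrative, and md's own call site is the blk_queue_split() addition in md.c below:

static void my_make_request(struct request_queue *q, struct bio *bio)
{
	/*
	 * Split the bio against the queue limits; the remainder is
	 * resubmitted by the block layer, so the driver only ever sees
	 * bios it can handle.
	 */
	blk_queue_split(q, &bio, q->bio_split);

	/* ... remap the (possibly shortened) bio ... */
	generic_make_request(bio);
}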
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 0072190..d6a1126 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -28,6 +28,7 @@ struct dlm_lock_resource {
struct completion completion; /* completion for synchronized locking */
void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
struct mddev *mddev; /* pointing back to mddev. */
+ int mode;
};
struct suspend_info {
@@ -45,6 +46,7 @@ struct resync_info {
/* md_cluster_info flags */
#define MD_CLUSTER_WAITING_FOR_NEWDISK 1
#define MD_CLUSTER_SUSPEND_READ_BALANCING 2
+#define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3
struct md_cluster_info {
@@ -52,9 +54,8 @@ struct md_cluster_info {
dlm_lockspace_t *lockspace;
int slot_number;
struct completion completion;
- struct dlm_lock_resource *sb_lock;
- struct mutex sb_mutex;
struct dlm_lock_resource *bitmap_lockres;
+ struct dlm_lock_resource *resync_lockres;
struct list_head suspend_list;
spinlock_t suspend_lock;
struct md_thread *recovery_thread;
@@ -75,23 +76,24 @@ enum msg_type {
NEWDISK,
REMOVE,
RE_ADD,
+ BITMAP_NEEDS_SYNC,
};
struct cluster_msg {
- int type;
- int slot;
+ __le32 type;
+ __le32 slot;
/* TODO: Unionize this for smaller footprint */
- sector_t low;
- sector_t high;
+ __le64 low;
+ __le64 high;
char uuid[16];
- int raid_slot;
+ __le32 raid_slot;
};
static void sync_ast(void *arg)
{
struct dlm_lock_resource *res;
- res = (struct dlm_lock_resource *) arg;
+ res = arg;
complete(&res->completion);
}
@@ -99,13 +101,14 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
{
int ret = 0;
- init_completion(&res->completion);
ret = dlm_lock(res->ls, mode, &res->lksb,
res->flags, res->name, strlen(res->name),
0, sync_ast, res, res->bast);
if (ret)
return ret;
wait_for_completion(&res->completion);
+ if (res->lksb.sb_status == 0)
+ res->mode = mode;
return res->lksb.sb_status;
}
@@ -124,8 +127,10 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
if (!res)
return NULL;
+ init_completion(&res->completion);
res->ls = cinfo->lockspace;
res->mddev = mddev;
+ res->mode = DLM_LOCK_IV;
namelen = strlen(name);
res->name = kzalloc(namelen + 1, GFP_KERNEL);
if (!res->name) {
@@ -165,11 +170,24 @@ out_err:
static void lockres_free(struct dlm_lock_resource *res)
{
+ int ret;
+
if (!res)
return;
- init_completion(&res->completion);
- dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
+ /* cancel a lock request or a conversion request that is blocked */
+ res->flags |= DLM_LKF_CANCEL;
+retry:
+ ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
+ if (unlikely(ret != 0)) {
+ pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret);
+
+ /* if a lock conversion is cancelled, the lock is put back on
+ * the grant queue, so we still need to ensure it is unlocked */
+ if (ret == -DLM_ECANCEL)
+ goto retry;
+ }
+ res->flags &= ~DLM_LKF_CANCEL;
wait_for_completion(&res->completion);
kfree(res->name);
@@ -177,20 +195,8 @@ static void lockres_free(struct dlm_lock_resource *res)
kfree(res);
}
-static char *pretty_uuid(char *dest, char *src)
-{
- int i, len = 0;
-
- for (i = 0; i < 16; i++) {
- if (i == 4 || i == 6 || i == 8 || i == 10)
- len += sprintf(dest + len, "-");
- len += sprintf(dest + len, "%02x", (__u8)src[i]);
- }
- return dest;
-}
-
-static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
- sector_t lo, sector_t hi)
+static void add_resync_info(struct dlm_lock_resource *lockres,
+ sector_t lo, sector_t hi)
{
struct resync_info *ri;
@@ -208,7 +214,7 @@ static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_loc
dlm_lock_sync(lockres, DLM_LOCK_CR);
memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
hi = le64_to_cpu(ri.hi);
- if (ri.hi > 0) {
+ if (hi > 0) {
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
if (!s)
goto out;
@@ -281,16 +287,11 @@ static void recover_prep(void *arg)
set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}
-static void recover_slot(void *arg, struct dlm_slot *slot)
+static void __recover_slot(struct mddev *mddev, int slot)
{
- struct mddev *mddev = arg;
struct md_cluster_info *cinfo = mddev->cluster_info;
- pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
- mddev->bitmap_info.cluster_name,
- slot->nodeid, slot->slot,
- cinfo->slot_number);
- set_bit(slot->slot - 1, &cinfo->recovery_map);
+ set_bit(slot, &cinfo->recovery_map);
if (!cinfo->recovery_thread) {
cinfo->recovery_thread = md_register_thread(recover_bitmaps,
mddev, "recover");
@@ -302,6 +303,20 @@ static void recover_slot(void *arg, struct dlm_slot *slot)
md_wakeup_thread(cinfo->recovery_thread);
}
+static void recover_slot(void *arg, struct dlm_slot *slot)
+{
+ struct mddev *mddev = arg;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
+ mddev->bitmap_info.cluster_name,
+ slot->nodeid, slot->slot,
+ cinfo->slot_number);
+ /* subtract one: dlm slot numbers start at one, while md-cluster
+ * slot numbers begin at 0 */
+ __recover_slot(mddev, slot->slot - 1);
+}
+
static void recover_done(void *arg, struct dlm_slot *slots,
int num_slots, int our_slot,
uint32_t generation)
@@ -310,10 +325,17 @@ static void recover_done(void *arg, struct dlm_slot *slots,
struct md_cluster_info *cinfo = mddev->cluster_info;
cinfo->slot_number = our_slot;
- complete(&cinfo->completion);
+ /* the completion only needs to be completed when a node joins the
+ * cluster; it doesn't need to run during another node's failure */
+ if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
+ complete(&cinfo->completion);
+ clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
+ }
clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}
+/* these ops are called when a node joins the cluster, and perform lock
+ * recovery if a node failure occurs */
static const struct dlm_lockspace_ops md_ls_ops = {
.recover_prep = recover_prep,
.recover_slot = recover_slot,
@@ -327,7 +349,7 @@ static const struct dlm_lockspace_ops md_ls_ops = {
*/
static void ack_bast(void *arg, int mode)
{
- struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg;
+ struct dlm_lock_resource *res = arg;
struct md_cluster_info *cinfo = res->mddev->cluster_info;
if (mode == DLM_LOCK_EX)
@@ -340,29 +362,32 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
if (slot == s->slot) {
- pr_info("%s:%d Deleting suspend_info: %d\n",
- __func__, __LINE__, slot);
list_del(&s->list);
kfree(s);
break;
}
}
-static void remove_suspend_info(struct md_cluster_info *cinfo, int slot)
+static void remove_suspend_info(struct mddev *mddev, int slot)
{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
spin_lock_irq(&cinfo->suspend_lock);
__remove_suspend_info(cinfo, slot);
spin_unlock_irq(&cinfo->suspend_lock);
+ mddev->pers->quiesce(mddev, 2);
}
-static void process_suspend_info(struct md_cluster_info *cinfo,
+static void process_suspend_info(struct mddev *mddev,
int slot, sector_t lo, sector_t hi)
{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
struct suspend_info *s;
if (!hi) {
- remove_suspend_info(cinfo, slot);
+ remove_suspend_info(mddev, slot);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
return;
}
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
@@ -371,11 +396,14 @@ static void process_suspend_info(struct md_cluster_info *cinfo,
s->slot = slot;
s->lo = lo;
s->hi = hi;
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
spin_lock_irq(&cinfo->suspend_lock);
/* Remove existing entry (if exists) before adding */
__remove_suspend_info(cinfo, slot);
list_add(&s->list, &cinfo->suspend_list);
spin_unlock_irq(&cinfo->suspend_lock);
+ mddev->pers->quiesce(mddev, 2);
}
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
@@ -388,8 +416,8 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
int len;
len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
- pretty_uuid(disk_uuid + len, cmsg->uuid);
- snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
+ sprintf(disk_uuid + len, "%pU", cmsg->uuid);
+ snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot));
pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
init_completion(&cinfo->newdisk_completion);
set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
@@ -403,60 +431,60 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
-
- md_reload_sb(mddev);
+ md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
}
static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
{
- struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
+ struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
+ le32_to_cpu(msg->raid_slot));
if (rdev)
md_kick_rdev_from_array(rdev);
else
- pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
+ pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
+ __func__, __LINE__, le32_to_cpu(msg->raid_slot));
}
static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
{
- struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
+ struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
+ le32_to_cpu(msg->raid_slot));
if (rdev && test_bit(Faulty, &rdev->flags))
clear_bit(Faulty, &rdev->flags);
else
- pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot);
+ pr_warn("%s: %d Could not find disk(%d) which is faulty",
+ __func__, __LINE__, le32_to_cpu(msg->raid_slot));
}
static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
{
- switch (msg->type) {
+ if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
+ "node %d received it's own msg\n", le32_to_cpu(msg->slot)))
+ return;
+ switch (le32_to_cpu(msg->type)) {
case METADATA_UPDATED:
- pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
- __func__, __LINE__, msg->slot);
process_metadata_update(mddev, msg);
break;
case RESYNCING:
- pr_info("%s: %d Received message: RESYNCING from %d\n",
- __func__, __LINE__, msg->slot);
- process_suspend_info(mddev->cluster_info, msg->slot,
- msg->low, msg->high);
+ process_suspend_info(mddev, le32_to_cpu(msg->slot),
+ le64_to_cpu(msg->low),
+ le64_to_cpu(msg->high));
break;
case NEWDISK:
- pr_info("%s: %d Received message: NEWDISK from %d\n",
- __func__, __LINE__, msg->slot);
process_add_new_disk(mddev, msg);
break;
case REMOVE:
- pr_info("%s: %d Received REMOVE from %d\n",
- __func__, __LINE__, msg->slot);
process_remove_disk(mddev, msg);
break;
case RE_ADD:
- pr_info("%s: %d Received RE_ADD from %d\n",
- __func__, __LINE__, msg->slot);
process_readd_disk(mddev, msg);
break;
+ case BITMAP_NEEDS_SYNC:
+ __recover_slot(mddev, le32_to_cpu(msg->slot));
+ break;
default:
pr_warn("%s:%d Received unknown message from %d\n",
__func__, __LINE__, msg->slot);
@@ -472,6 +500,7 @@ static void recv_daemon(struct md_thread *thread)
struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
struct cluster_msg msg;
+ int ret;
/*get CR on Message*/
if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
@@ -484,23 +513,37 @@ static void recv_daemon(struct md_thread *thread)
process_recvd_msg(thread->mddev, &msg);
/*release CR on ack_lockres*/
- dlm_unlock_sync(ack_lockres);
- /*up-convert to EX on message_lockres*/
- dlm_lock_sync(message_lockres, DLM_LOCK_EX);
+ ret = dlm_unlock_sync(ack_lockres);
+ if (unlikely(ret != 0))
+ pr_info("unlock ack failed return %d\n", ret);
+ /*up-convert to PR on message_lockres*/
+ ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
+ if (unlikely(ret != 0))
+ pr_info("lock PR on msg failed return %d\n", ret);
/*get CR on ack_lockres again*/
- dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
+ ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
+ if (unlikely(ret != 0))
+ pr_info("lock CR on ack failed return %d\n", ret);
/*release CR on message_lockres*/
- dlm_unlock_sync(message_lockres);
+ ret = dlm_unlock_sync(message_lockres);
+ if (unlikely(ret != 0))
+ pr_info("unlock msg failed return %d\n", ret);
}
/* lock_comm()
* Takes the lock on the TOKEN lock resource so no other
* node can communicate while the operation is underway.
+ * If called again while the TOKEN lock is already held in EX mode,
+ * return success. However, care must be taken that unlock_comm()
+ * is called only once.
*/
static int lock_comm(struct md_cluster_info *cinfo)
{
int error;
+ if (cinfo->token_lockres->mode == DLM_LOCK_EX)
+ return 0;
+
error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
if (error)
pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
@@ -510,6 +553,7 @@ static int lock_comm(struct md_cluster_info *cinfo)
static void unlock_comm(struct md_cluster_info *cinfo)
{
+ WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
dlm_unlock_sync(cinfo->token_lockres);
}
@@ -519,7 +563,7 @@ static void unlock_comm(struct md_cluster_info *cinfo)
* The function:
* 1. Grabs the message lockresource in EX mode
* 2. Copies the message to the message LVB
- * 3. Downconverts message lockresource to CR
+ * 3. Downconverts message lockresource to CW
* 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
* and the other nodes read the message. The thread will wait here until all other
* nodes have released ack lock resource.
@@ -540,12 +584,12 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
sizeof(struct cluster_msg));
- /*down-convert EX to CR on Message*/
- error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR);
+ /*down-convert EX to CW on Message*/
+ error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
if (error) {
- pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n",
+ pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
error);
- goto failed_message;
+ goto failed_ack;
}
/*up-convert CR to EX on Ack*/
@@ -565,7 +609,13 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
}
failed_ack:
- dlm_unlock_sync(cinfo->message_lockres);
+ error = dlm_unlock_sync(cinfo->message_lockres);
+ if (unlikely(error != 0)) {
+ pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
+ error);
+ /* in case the message can't be released for some reason */
+ goto failed_ack;
+ }
failed_message:
return error;
}
@@ -587,6 +637,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
struct dlm_lock_resource *bm_lockres;
struct suspend_info *s;
char str[64];
+ sector_t lo, hi;
for (i = 0; i < total_slots; i++) {
@@ -617,9 +668,24 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
lockres_free(bm_lockres);
continue;
}
- if (ret)
+ if (ret) {
+ lockres_free(bm_lockres);
goto out;
- /* TODO: Read the disk bitmap sb and check if it needs recovery */
+ }
+
+ /* Read the disk bitmap sb and check if it needs recovery */
+ ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
+ if (ret) {
+ pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
+ lockres_free(bm_lockres);
+ continue;
+ }
+ if ((hi > 0) && (lo < mddev->recovery_cp)) {
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ mddev->recovery_cp = lo;
+ md_check_recovery(mddev);
+ }
+
dlm_unlock_sync(bm_lockres);
lockres_free(bm_lockres);
}
@@ -633,20 +699,19 @@ static int join(struct mddev *mddev, int nodes)
int ret, ops_rv;
char str[64];
- if (!try_module_get(THIS_MODULE))
- return -ENOENT;
-
cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
if (!cinfo)
return -ENOMEM;
+ INIT_LIST_HEAD(&cinfo->suspend_list);
+ spin_lock_init(&cinfo->suspend_lock);
init_completion(&cinfo->completion);
+ set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
- mutex_init(&cinfo->sb_mutex);
mddev->cluster_info = cinfo;
memset(str, 0, 64);
- pretty_uuid(str, mddev->uuid);
+ sprintf(str, "%pU", mddev->uuid);
ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
DLM_LSFL_FS, LVB_SIZE,
&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
@@ -659,12 +724,6 @@ static int join(struct mddev *mddev, int nodes)
ret = -ERANGE;
goto err;
}
- cinfo->sb_lock = lockres_init(mddev, "cmd-super",
- NULL, 0);
- if (!cinfo->sb_lock) {
- ret = -ENOMEM;
- goto err;
- }
/* Initiate the communication resources */
ret = -ENOMEM;
cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
@@ -705,8 +764,9 @@ static int join(struct mddev *mddev, int nodes)
goto err;
}
- INIT_LIST_HEAD(&cinfo->suspend_list);
- spin_lock_init(&cinfo->suspend_lock);
+ cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0);
+ if (!cinfo->resync_lockres)
+ goto err;
ret = gather_all_resync_info(mddev, nodes);
if (ret)
@@ -718,29 +778,47 @@ err:
lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres);
+ lockres_free(cinfo->resync_lockres);
lockres_free(cinfo->bitmap_lockres);
- lockres_free(cinfo->sb_lock);
if (cinfo->lockspace)
dlm_release_lockspace(cinfo->lockspace, 2);
mddev->cluster_info = NULL;
kfree(cinfo);
- module_put(THIS_MODULE);
return ret;
}
+static void resync_bitmap(struct mddev *mddev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg = {0};
+ int err;
+
+ cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
+ err = sendmsg(cinfo, &cmsg);
+ if (err)
+ pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
+ __func__, __LINE__, err);
+}
+
static int leave(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
if (!cinfo)
return 0;
+
+ /* a BITMAP_NEEDS_SYNC message should be sent when a node is
+ * leaving the cluster with a dirty bitmap; it can only be
+ * delivered while the dlm connection is available */
+ if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
+ resync_bitmap(mddev);
+
md_unregister_thread(&cinfo->recovery_thread);
md_unregister_thread(&cinfo->recv_thread);
lockres_free(cinfo->message_lockres);
lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres);
- lockres_free(cinfo->sb_lock);
lockres_free(cinfo->bitmap_lockres);
dlm_release_lockspace(cinfo->lockspace, 2);
return 0;
@@ -757,15 +835,6 @@ static int slot_number(struct mddev *mddev)
return cinfo->slot_number - 1;
}
-static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
-{
- struct md_cluster_info *cinfo = mddev->cluster_info;
-
- add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
- /* Re-acquire the lock to refresh LVB */
- dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
-}
-
static int metadata_update_start(struct mddev *mddev)
{
return lock_comm(mddev->cluster_info);
@@ -775,50 +844,62 @@ static int metadata_update_finish(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
- int ret;
+ struct md_rdev *rdev;
+ int ret = 0;
+ int raid_slot = -1;
memset(&cmsg, 0, sizeof(cmsg));
cmsg.type = cpu_to_le32(METADATA_UPDATED);
- ret = __sendmsg(cinfo, &cmsg);
+ /* Pick up a good active device number to send.
+ */
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
+ raid_slot = rdev->desc_nr;
+ break;
+ }
+ if (raid_slot >= 0) {
+ cmsg.raid_slot = cpu_to_le32(raid_slot);
+ ret = __sendmsg(cinfo, &cmsg);
+ } else
+ pr_warn("md-cluster: No good device id found to send\n");
unlock_comm(cinfo);
return ret;
}
-static int metadata_update_cancel(struct mddev *mddev)
+static void metadata_update_cancel(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
+ unlock_comm(cinfo);
+}
- return dlm_unlock_sync(cinfo->token_lockres);
+static int resync_start(struct mddev *mddev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE;
+ return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX);
}
-static int resync_send(struct mddev *mddev, enum msg_type type,
- sector_t lo, sector_t hi)
+static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
- struct cluster_msg cmsg;
- int slot = cinfo->slot_number - 1;
+ struct cluster_msg cmsg = {0};
- pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__,
- (unsigned long long)lo,
- (unsigned long long)hi);
- resync_info_update(mddev, lo, hi);
- cmsg.type = cpu_to_le32(type);
- cmsg.slot = cpu_to_le32(slot);
+ add_resync_info(cinfo->bitmap_lockres, lo, hi);
+ /* Re-acquire the lock to refresh LVB */
+ dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
+ cmsg.type = cpu_to_le32(RESYNCING);
cmsg.low = cpu_to_le64(lo);
cmsg.high = cpu_to_le64(hi);
- return sendmsg(cinfo, &cmsg);
-}
-static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
-{
- pr_info("%s:%d\n", __func__, __LINE__);
- return resync_send(mddev, RESYNCING, lo, hi);
+ return sendmsg(cinfo, &cmsg);
}
-static void resync_finish(struct mddev *mddev)
+static int resync_finish(struct mddev *mddev)
{
- pr_info("%s:%d\n", __func__, __LINE__);
- resync_send(mddev, RESYNCING, 0, 0);
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE;
+ dlm_unlock_sync(cinfo->resync_lockres);
+ return resync_info_update(mddev, 0, 0);
}
static int area_resyncing(struct mddev *mddev, int direction,
@@ -845,7 +926,11 @@ out:
return ret;
}
-static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
+/* add_new_disk() - initiates a disk add
+ * However, if this fails before writing md_update_sb(),
+ * add_new_disk_cancel() must be called to release the token lock
+ */
+static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
@@ -856,7 +941,7 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
memset(&cmsg, 0, sizeof(cmsg));
cmsg.type = cpu_to_le32(NEWDISK);
memcpy(cmsg.uuid, uuid, 16);
- cmsg.raid_slot = rdev->desc_nr;
+ cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
lock_comm(cinfo);
ret = __sendmsg(cinfo, &cmsg);
if (ret)
@@ -867,22 +952,17 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
/* Some node does not "see" the device */
if (ret == -EAGAIN)
ret = -ENOENT;
+ if (ret)
+ unlock_comm(cinfo);
else
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
return ret;
}
-static int add_new_disk_finish(struct mddev *mddev)
+static void add_new_disk_cancel(struct mddev *mddev)
{
- struct cluster_msg cmsg;
struct md_cluster_info *cinfo = mddev->cluster_info;
- int ret;
- /* Write sb and inform others */
- md_update_sb(mddev, 1);
- cmsg.type = METADATA_UPDATED;
- ret = __sendmsg(cinfo, &cmsg);
unlock_comm(cinfo);
- return ret;
}
static int new_disk_ack(struct mddev *mddev, bool ack)
@@ -902,10 +982,10 @@ static int new_disk_ack(struct mddev *mddev, bool ack)
static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
- struct cluster_msg cmsg;
+ struct cluster_msg cmsg = {0};
struct md_cluster_info *cinfo = mddev->cluster_info;
- cmsg.type = REMOVE;
- cmsg.raid_slot = rdev->desc_nr;
+ cmsg.type = cpu_to_le32(REMOVE);
+ cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
return __sendmsg(cinfo, &cmsg);
}
@@ -913,12 +993,12 @@ static int gather_bitmaps(struct md_rdev *rdev)
{
int sn, err;
sector_t lo, hi;
- struct cluster_msg cmsg;
+ struct cluster_msg cmsg = {0};
struct mddev *mddev = rdev->mddev;
struct md_cluster_info *cinfo = mddev->cluster_info;
- cmsg.type = RE_ADD;
- cmsg.raid_slot = rdev->desc_nr;
+ cmsg.type = cpu_to_le32(RE_ADD);
+ cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
err = sendmsg(cinfo, &cmsg);
if (err)
goto out;
@@ -942,15 +1022,15 @@ static struct md_cluster_operations cluster_ops = {
.join = join,
.leave = leave,
.slot_number = slot_number,
- .resync_info_update = resync_info_update,
.resync_start = resync_start,
.resync_finish = resync_finish,
+ .resync_info_update = resync_info_update,
.metadata_update_start = metadata_update_start,
.metadata_update_finish = metadata_update_finish,
.metadata_update_cancel = metadata_update_cancel,
.area_resyncing = area_resyncing,
- .add_new_disk_start = add_new_disk_start,
- .add_new_disk_finish = add_new_disk_finish,
+ .add_new_disk = add_new_disk,
+ .add_new_disk_cancel = add_new_disk_cancel,
.new_disk_ack = new_disk_ack,
.remove_disk = remove_disk,
.gather_bitmaps = gather_bitmaps,
@@ -971,5 +1051,6 @@ static void cluster_exit(void)
module_init(cluster_init);
module_exit(cluster_exit);
+MODULE_AUTHOR("SUSE");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Clustering support for MD");
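Because struct cluster_msg now travels between nodes through the DLM LVB, its fields are declared __le32/__le64 and converted at the boundaries, as the hunks above do with cpu_to_le32()/le64_to_cpu(). A minimal sketch of that convention, with illustrative helper names:

static void fill_resync_msg(struct cluster_msg *cmsg, int slot,
			    sector_t lo, sector_t hi)
{
	memset(cmsg, 0, sizeof(*cmsg));
	cmsg->type = cpu_to_le32(RESYNCING);	/* encode before sending */
	cmsg->slot = cpu_to_le32(slot);
	cmsg->low  = cpu_to_le64(lo);
	cmsg->high = cpu_to_le64(hi);
}

static void parse_resync_msg(const struct cluster_msg *cmsg,
			     sector_t *lo, sector_t *hi)
{
	*lo = le64_to_cpu(cmsg->low);		/* decode after receiving */
	*hi = le64_to_cpu(cmsg->high);
}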
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 00defe2..e75ea26 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -12,15 +12,15 @@ struct md_cluster_operations {
int (*join)(struct mddev *mddev, int nodes);
int (*leave)(struct mddev *mddev);
int (*slot_number)(struct mddev *mddev);
- void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
- int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi);
- void (*resync_finish)(struct mddev *mddev);
+ int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
int (*metadata_update_start)(struct mddev *mddev);
int (*metadata_update_finish)(struct mddev *mddev);
- int (*metadata_update_cancel)(struct mddev *mddev);
+ void (*metadata_update_cancel)(struct mddev *mddev);
+ int (*resync_start)(struct mddev *mddev);
+ int (*resync_finish)(struct mddev *mddev);
int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
- int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
- int (*add_new_disk_finish)(struct mddev *mddev);
+ int (*add_new_disk)(struct mddev *mddev, struct md_rdev *rdev);
+ void (*add_new_disk_cancel)(struct mddev *mddev);
int (*new_disk_ack)(struct mddev *mddev, bool ack);
int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
int (*gather_bitmaps)(struct md_rdev *rdev);
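With the reworked interface, a resync pass takes the cluster-wide "resync" lock, advertises the range being rebuilt, and clears it when done. A minimal caller-side sketch of the intended sequence; the wrapper function and its error handling are illustrative, not from this patch:

static int cluster_resync_range(struct mddev *mddev, sector_t lo, sector_t hi)
{
	int err;

	/* Takes the "resync" lockres in EX (with DLM_LKF_NOQUEUE set). */
	err = md_cluster_ops->resync_start(mddev);
	if (err)
		return err;

	/* Publish [lo, hi) so other nodes suspend I/O to that range. */
	err = md_cluster_ops->resync_info_update(mddev, lo, hi);
	if (err)
		return err;

	/* ... resync the range ... */

	/* Sends lo = hi = 0 and releases the "resync" lockres. */
	return md_cluster_ops->resync_finish(mddev);
}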
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0c2a4e8..3f9a514 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -257,13 +257,17 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
unsigned int sectors;
int cpu;
+ blk_queue_split(q, &bio, q->bio_split);
+
if (mddev == NULL || mddev->pers == NULL
|| !mddev->ready) {
bio_io_error(bio);
return;
}
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
- bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
+ if (bio_sectors(bio) != 0)
+ bio->bi_error = -EROFS;
+ bio_endio(bio);
return;
}
smp_rmb(); /* Ensure implications of 'active' are visible */
@@ -350,34 +354,11 @@ static int md_congested(void *data, int bits)
return mddev_congested(mddev, bits);
}
-static int md_mergeable_bvec(struct request_queue *q,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- struct mddev *mddev = q->queuedata;
- int ret;
- rcu_read_lock();
- if (mddev->suspended) {
- /* Must always allow one vec */
- if (bvm->bi_size == 0)
- ret = biovec->bv_len;
- else
- ret = 0;
- } else {
- struct md_personality *pers = mddev->pers;
- if (pers && pers->mergeable_bvec)
- ret = pers->mergeable_bvec(mddev, bvm, biovec);
- else
- ret = biovec->bv_len;
- }
- rcu_read_unlock();
- return ret;
-}
/*
* Generic flush handling for md
*/
-static void md_end_flush(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio)
{
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;
@@ -433,7 +414,7 @@ static void md_submit_flush_data(struct work_struct *ws)
if (bio->bi_iter.bi_size == 0)
/* an empty barrier - all done */
- bio_endio(bio, 0);
+ bio_endio(bio);
else {
bio->bi_rw &= ~REQ_FLUSH;
mddev->pers->make_request(mddev, bio);
@@ -502,6 +483,8 @@ static void mddev_put(struct mddev *mddev)
bioset_free(bs);
}
+static void md_safemode_timeout(unsigned long data);
+
void mddev_init(struct mddev *mddev)
{
mutex_init(&mddev->open_mutex);
@@ -509,7 +492,8 @@ void mddev_init(struct mddev *mddev)
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
- init_timer(&mddev->safemode_timer);
+ setup_timer(&mddev->safemode_timer, md_safemode_timeout,
+ (unsigned long) mddev);
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
@@ -728,15 +712,13 @@ void md_rdev_clear(struct md_rdev *rdev)
}
EXPORT_SYMBOL_GPL(md_rdev_clear);
-static void super_written(struct bio *bio, int error)
+static void super_written(struct bio *bio)
{
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;
- if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
- printk("md: super_written gets error=%d, uptodate=%d\n",
- error, test_bit(BIO_UPTODATE, &bio->bi_flags));
- WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
+ if (bio->bi_error) {
+ printk("md: super_written gets error=%d\n", bio->bi_error);
md_error(mddev, rdev);
}
@@ -791,7 +773,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
bio_add_page(bio, page, size, 0);
submit_bio_wait(rw, bio);
- ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ ret = !bio->bi_error;
bio_put(bio);
return ret;
}
@@ -1626,7 +1608,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
++ev1;
if (rdev->desc_nr >= 0 &&
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
- le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
+ (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
+ le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
if (ev1 < mddev->events)
return -EINVAL;
} else if (mddev->bitmap) {
@@ -1646,16 +1629,29 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
int role;
if (rdev->desc_nr < 0 ||
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
- role = 0xffff;
+ role = MD_DISK_ROLE_SPARE;
rdev->desc_nr = -1;
} else
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
switch(role) {
- case 0xffff: /* spare */
+ case MD_DISK_ROLE_SPARE: /* spare */
break;
- case 0xfffe: /* faulty */
+ case MD_DISK_ROLE_FAULTY: /* faulty */
set_bit(Faulty, &rdev->flags);
break;
+ case MD_DISK_ROLE_JOURNAL: /* journal device */
+ if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
+ /* journal device without journal feature */
+ printk(KERN_WARNING
+ "md: journal device provided without journal feature, ignoring the device\n");
+ return -EINVAL;
+ }
+ set_bit(Journal, &rdev->flags);
+ rdev->journal_tail = le64_to_cpu(sb->journal_tail);
+ if (mddev->recovery_cp == MaxSector)
+ set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
+ rdev->raid_disk = mddev->raid_disks;
+ break;
default:
rdev->saved_raid_disk = role;
if ((le32_to_cpu(sb->feature_map) &
@@ -1673,6 +1669,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
set_bit(WriteMostly, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags);
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
+ set_bit(MD_HAS_JOURNAL, &mddev->flags);
} else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags);
@@ -1697,6 +1695,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->events = cpu_to_le64(mddev->events);
if (mddev->in_sync)
sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+ else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
+ sb->resync_offset = cpu_to_le64(MaxSector);
else
sb->resync_offset = cpu_to_le64(0);
@@ -1720,7 +1720,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
}
- if (rdev->raid_disk >= 0 &&
+ if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags)) {
sb->feature_map |=
cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
@@ -1730,6 +1730,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->feature_map |=
cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
}
+ /* Note: recovery_offset and journal_tail share space */
+ if (test_bit(Journal, &rdev->flags))
+ sb->journal_tail = cpu_to_le64(rdev->journal_tail);
if (test_bit(Replacement, &rdev->flags))
sb->feature_map |=
cpu_to_le32(MD_FEATURE_REPLACEMENT);
@@ -1753,6 +1756,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
}
}
+ if (mddev_is_clustered(mddev))
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
+
if (rdev->badblocks.count == 0)
/* Nothing to do for bad blocks*/ ;
else if (sb->bblog_offset == 0)
@@ -1803,18 +1809,23 @@ retry:
max_dev = le32_to_cpu(sb->max_dev);
for (i=0; i<max_dev;i++)
- sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
+
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
- sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
else if (test_bit(In_sync, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+ else if (test_bit(Journal, &rdev2->flags))
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
else if (rdev2->raid_disk >= 0)
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
else
- sb->dev_roles[i] = cpu_to_le16(0xffff);
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
}
sb->sb_csum = calc_sb_1_csum(sb);
@@ -1930,13 +1941,23 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
struct md_rdev *rdev, *rdev2;
rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev1)
- rdev_for_each_rcu(rdev2, mddev2)
+ rdev_for_each_rcu(rdev, mddev1) {
+ if (test_bit(Faulty, &rdev->flags) ||
+ test_bit(Journal, &rdev->flags) ||
+ rdev->raid_disk == -1)
+ continue;
+ rdev_for_each_rcu(rdev2, mddev2) {
+ if (test_bit(Faulty, &rdev2->flags) ||
+ test_bit(Journal, &rdev2->flags) ||
+ rdev2->raid_disk == -1)
+ continue;
if (rdev->bdev->bd_contains ==
rdev2->bdev->bd_contains) {
rcu_read_unlock();
return 1;
}
+ }
+ }
rcu_read_unlock();
return 0;
}
@@ -1980,12 +2001,9 @@ int md_integrity_register(struct mddev *mddev)
* All component devices are integrity capable and have matching
* profiles, register the common profile for the md device.
*/
- if (blk_integrity_register(mddev->gendisk,
- bdev_get_integrity(reference->bdev)) != 0) {
- printk(KERN_ERR "md: failed to register integrity for %s\n",
- mdname(mddev));
- return -EINVAL;
- }
+ blk_integrity_register(mddev->gendisk,
+ bdev_get_integrity(reference->bdev));
+
printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
printk(KERN_ERR "md: failed to create integrity pool for %s\n",
@@ -2015,6 +2033,7 @@ void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
if (bi_rdev && blk_integrity_compare(mddev->gendisk,
rdev->bdev->bd_disk) >= 0)
return;
+ WARN_ON_ONCE(!mddev->suspended);
printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
blk_integrity_unregister(mddev->gendisk);
}
@@ -2214,23 +2233,77 @@ static void sync_sbs(struct mddev *mddev, int nospares)
}
}
+static bool does_sb_need_changing(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ struct mdp_superblock_1 *sb;
+ int role;
+
+ /* Find a good rdev */
+ rdev_for_each(rdev, mddev)
+ if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
+ break;
+
+ /* No good device found. */
+ if (!rdev)
+ return false;
+
+ sb = page_address(rdev->sb_page);
+ /* Check if a device has become faulty or a spare has become active */
+ rdev_for_each(rdev, mddev) {
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ /* Device activated? */
+ if (role == 0xffff && rdev->raid_disk >=0 &&
+ !test_bit(Faulty, &rdev->flags))
+ return true;
+ /* Device turned faulty? */
+ if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
+ return true;
+ }
+
+ /* Check if any mddev parameters have changed */
+ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
+ (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
+ (mddev->layout != le64_to_cpu(sb->layout)) ||
+ (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
+ (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
+ return true;
+
+ return false;
+}
+
void md_update_sb(struct mddev *mddev, int force_change)
{
struct md_rdev *rdev;
int sync_req;
int nospares = 0;
int any_badblocks_changed = 0;
+ int ret = -1;
if (mddev->ro) {
if (force_change)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
return;
}
+
+ if (mddev_is_clustered(mddev)) {
+ if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
+ force_change = 1;
+ ret = md_cluster_ops->metadata_update_start(mddev);
+ /* Has someone else updated the sb? */
+ if (!does_sb_need_changing(mddev)) {
+ if (ret == 0)
+ md_cluster_ops->metadata_update_cancel(mddev);
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ return;
+ }
+ }
repeat:
/* First make sure individual recovery_offsets are correct */
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
mddev->curr_resync_completed > rdev->recovery_offset)
rdev->recovery_offset = mddev->curr_resync_completed;
@@ -2374,6 +2447,9 @@ repeat:
clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
}
+
+ if (mddev_is_clustered(mddev) && ret == 0)
+ md_cluster_ops->metadata_update_finish(mddev);
}
EXPORT_SYMBOL(md_update_sb);
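For clustered arrays the token handling is now folded into md_update_sb() itself: metadata_update_start() grabs the token, metadata_update_finish() broadcasts METADATA_UPDATED and releases it, and metadata_update_cancel() simply drops the token when the superblock turns out not to need rewriting. A condensed sketch of that control flow as implemented above; the wrapper name is illustrative:

static void clustered_update_sb(struct mddev *mddev)
{
	int ret = md_cluster_ops->metadata_update_start(mddev);	/* take token */

	if (!does_sb_need_changing(mddev)) {
		if (ret == 0)		/* we hold the token: drop it, send nothing */
			md_cluster_ops->metadata_update_cancel(mddev);
		return;
	}

	/* ... write the superblocks out to every rdev ... */

	if (ret == 0)			/* broadcast METADATA_UPDATED, release token */
		md_cluster_ops->metadata_update_finish(mddev);
}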
@@ -2449,6 +2525,10 @@ state_show(struct md_rdev *rdev, char *page)
len += sprintf(page+len, "%sin_sync",sep);
sep = ",";
}
+ if (test_bit(Journal, &flags)) {
+ len += sprintf(page+len, "%sjournal",sep);
+ sep = ",";
+ }
if (test_bit(WriteMostly, &flags)) {
len += sprintf(page+len, "%swrite_mostly",sep);
sep = ",";
@@ -2460,6 +2540,7 @@ state_show(struct md_rdev *rdev, char *page)
sep = ",";
}
if (!test_bit(Faulty, &flags) &&
+ !test_bit(Journal, &flags) &&
!test_bit(In_sync, &flags)) {
len += sprintf(page+len, "%sspare", sep);
sep = ",";
@@ -2508,17 +2589,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
err = -EBUSY;
else {
struct mddev *mddev = rdev->mddev;
- if (mddev_is_clustered(mddev))
- md_cluster_ops->remove_disk(mddev, rdev);
- md_kick_rdev_from_array(rdev);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
- if (mddev->pers)
- md_update_sb(mddev, 1);
- md_new_event(mddev);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
err = 0;
+ if (mddev_is_clustered(mddev))
+ err = md_cluster_ops->remove_disk(mddev, rdev);
+
+ if (err == 0) {
+ md_kick_rdev_from_array(rdev);
+ if (mddev->pers)
+ md_update_sb(mddev, 1);
+ md_new_event(mddev);
+ }
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
@@ -2547,7 +2627,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
set_bit(In_sync, &rdev->flags);
err = 0;
- } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
+ } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags)) {
if (rdev->mddev->pers == NULL) {
clear_bit(In_sync, &rdev->flags);
rdev->saved_raid_disk = rdev->raid_disk;
@@ -2566,6 +2647,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
* check if recovery is needed.
*/
if (rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(Replacement, &rdev->flags))
set_bit(WantReplacement, &rdev->flags);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
@@ -2643,7 +2725,9 @@ __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
static ssize_t
slot_show(struct md_rdev *rdev, char *page)
{
- if (rdev->raid_disk < 0)
+ if (test_bit(Journal, &rdev->flags))
+ return sprintf(page, "journal\n");
+ else if (rdev->raid_disk < 0)
return sprintf(page, "none\n");
else
return sprintf(page, "%d\n", rdev->raid_disk);
@@ -2655,6 +2739,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
int slot;
int err;
+ if (test_bit(Journal, &rdev->flags))
+ return -EBUSY;
if (strncmp(buf, "none", 4)==0)
slot = -1;
else {
@@ -2706,15 +2792,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
rdev->saved_raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
clear_bit(Bitmap_sync, &rdev->flags);
- err = rdev->mddev->pers->
- hot_add_disk(rdev->mddev, rdev);
- if (err) {
- rdev->raid_disk = -1;
- return err;
- } else
- sysfs_notify_dirent_safe(rdev->sysfs_state);
- if (sysfs_link_rdev(rdev->mddev, rdev))
- /* failure here is OK */;
+ remove_and_add_spares(rdev->mddev, rdev);
+ if (rdev->raid_disk == -1)
+ return -EBUSY;
/* don't wakeup anyone, leave that to userspace. */
} else {
if (slot >= rdev->mddev->raid_disks &&
@@ -2859,6 +2939,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
sector_t oldsectors = rdev->sectors;
sector_t sectors;
+ if (test_bit(Journal, &rdev->flags))
+ return -EBUSY;
if (strict_blocks_to_sectors(buf, &sectors) < 0)
return -EINVAL;
if (rdev->data_offset != rdev->new_data_offset)
@@ -3216,20 +3298,14 @@ static void analyze_sbs(struct mddev *mddev)
md_kick_rdev_from_array(rdev);
continue;
}
- /* No device should have a Candidate flag
- * when reading devices
- */
- if (test_bit(Candidate, &rdev->flags)) {
- pr_info("md: kicking Cluster Candidate %s from array!\n",
- bdevname(rdev->bdev, b));
- md_kick_rdev_from_array(rdev);
- }
}
if (mddev->level == LEVEL_MULTIPATH) {
rdev->desc_nr = i++;
rdev->raid_disk = rdev->desc_nr;
set_bit(In_sync, &rdev->flags);
- } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
+ } else if (rdev->raid_disk >=
+ (mddev->raid_disks - min(0, mddev->delta_disks)) &&
+ !test_bit(Journal, &rdev->flags)) {
rdev->raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
}
@@ -3276,8 +3352,6 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
return 0;
}
-static void md_safemode_timeout(unsigned long data);
-
static ssize_t
safe_delay_show(struct mddev *mddev, char *page)
{
@@ -3289,6 +3363,11 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
{
unsigned long msec;
+ if (mddev_is_clustered(mddev)) {
+ pr_info("md: Safemode is disabled for clustered mode\n");
+ return -EINVAL;
+ }
+
if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
return -EINVAL;
if (msec == 0)
@@ -3889,7 +3968,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break;
case clean:
if (mddev->pers) {
- restart_array(mddev);
+ err = restart_array(mddev);
+ if (err)
+ break;
spin_lock(&mddev->lock);
if (atomic_read(&mddev->writes_pending) == 0) {
if (mddev->in_sync == 0) {
@@ -3907,7 +3988,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break;
case active:
if (mddev->pers) {
- restart_array(mddev);
+ err = restart_array(mddev);
+ if (err)
+ break;
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
wake_up(&mddev->sb_wait);
err = 0;
@@ -4086,12 +4169,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
if (err)
return err;
if (mddev->pers) {
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
err = update_size(mddev, sectors);
md_update_sb(mddev, 1);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
} else {
if (mddev->dev_sectors == 0 ||
mddev->dev_sectors > sectors)
@@ -4210,6 +4289,8 @@ action_show(struct mddev *mddev, char *page)
type = "repair";
} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
type = "recover";
+ else if (mddev->reshape_position != MaxSector)
+ type = "reshape";
}
return sprintf(page, "%s\n", type);
}
@@ -5186,7 +5267,6 @@ int md_run(struct mddev *mddev)
if (mddev->queue) {
mddev->queue->backing_dev_info.congested_data = mddev;
mddev->queue->backing_dev_info.congested_fn = md_congested;
- blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec);
}
if (pers->sync_request) {
if (mddev->kobj.sd &&
@@ -5202,9 +5282,10 @@ int md_run(struct mddev *mddev)
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0;
- mddev->safemode_timer.function = md_safemode_timeout;
- mddev->safemode_timer.data = (unsigned long) mddev;
- mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
+ if (mddev_is_clustered(mddev))
+ mddev->safemode_delay = 0;
+ else
+ mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
smp_wmb();
spin_lock(&mddev->lock);
@@ -5216,6 +5297,11 @@ int md_run(struct mddev *mddev)
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
+ if (mddev->degraded && !mddev->ro)
+ /* This ensures that recovering status is reported immediately
+ * via sysfs - until a lack of spares is confirmed.
+ */
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
if (mddev->flags & MD_UPDATE_SB_FLAGS)
@@ -5242,6 +5328,9 @@ static int do_md_run(struct mddev *mddev)
goto out;
}
+ if (mddev_is_clustered(mddev))
+ md_allow_write(mddev);
+
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
@@ -5264,6 +5353,25 @@ static int restart_array(struct mddev *mddev)
return -EINVAL;
if (!mddev->ro)
return -EBUSY;
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+ struct md_rdev *rdev;
+ bool has_journal = false;
+
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (test_bit(Journal, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags)) {
+ has_journal = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ /* Don't restart rw with journal missing/faulty */
+ if (!has_journal)
+ return -EINVAL;
+ }
+
mddev->safemode = 0;
mddev->ro = 0;
set_disk_ro(disk, 0);
@@ -5315,7 +5423,6 @@ static void md_clean(struct mddev *mddev)
mddev->degraded = 0;
mddev->safemode = 0;
mddev->private = NULL;
- mddev->merge_check_needed = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
mddev->bitmap_info.default_space = 0;
@@ -5326,8 +5433,6 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
@@ -5341,13 +5446,13 @@ static void __md_stop_writes(struct mddev *mddev)
md_super_wait(mddev);
if (mddev->ro == 0 &&
- (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) {
+ ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
+ (mddev->flags & MD_UPDATE_SB_FLAGS))) {
/* mark array as shutdown cleanly */
- mddev->in_sync = 1;
+ if (!mddev_is_clustered(mddev))
+ mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
}
void md_stop_writes(struct mddev *mddev)
@@ -5426,9 +5531,13 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
* which will now never happen */
wake_up_process(mddev->sync_thread->tsk);
+ if (mddev->external && test_bit(MD_CHANGE_PENDING, &mddev->flags))
+ return -EBUSY;
mddev_unlock(mddev);
wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
&mddev->recovery));
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags));
mddev_lock_nointr(mddev);
mutex_lock(&mddev->open_mutex);
@@ -5514,7 +5623,6 @@ static int do_md_stop(struct mddev *mddev, int mode,
__md_stop_writes(mddev);
__md_stop(mddev);
- mddev->queue->merge_bvec_fn = NULL;
mddev->queue->backing_dev_info.congested_fn = NULL;
/* tell userspace to handle 'inactive' */
@@ -5556,7 +5664,6 @@ static int do_md_stop(struct mddev *mddev, int mode,
if (mddev->hold_active == UNTIL_STOP)
mddev->hold_active = 0;
}
- blk_integrity_unregister(disk);
md_new_event(mddev);
sysfs_notify_dirent_safe(mddev->sysfs_state);
return 0;
@@ -5759,22 +5866,22 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg)
char *ptr;
int err;
- file = kmalloc(sizeof(*file), GFP_NOIO);
+ file = kzalloc(sizeof(*file), GFP_NOIO);
if (!file)
return -ENOMEM;
err = 0;
spin_lock(&mddev->lock);
- /* bitmap disabled, zero the first byte and copy out */
- if (!mddev->bitmap_info.file)
- file->pathname[0] = '\0';
- else if ((ptr = file_path(mddev->bitmap_info.file,
- file->pathname, sizeof(file->pathname))),
- IS_ERR(ptr))
- err = PTR_ERR(ptr);
- else
- memmove(file->pathname, ptr,
- sizeof(file->pathname)-(ptr-file->pathname));
+ /* bitmap enabled */
+ if (mddev->bitmap_info.file) {
+ ptr = file_path(mddev->bitmap_info.file, file->pathname,
+ sizeof(file->pathname));
+ if (IS_ERR(ptr))
+ err = PTR_ERR(ptr);
+ else
+ memmove(file->pathname, ptr,
+ sizeof(file->pathname)-(ptr-file->pathname));
+ }
spin_unlock(&mddev->lock);
if (err == 0 &&
@@ -5806,6 +5913,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
info.state |= (1<<MD_DISK_ACTIVE);
info.state |= (1<<MD_DISK_SYNC);
}
+ if (test_bit(Journal, &rdev->flags))
+ info.state |= (1<<MD_DISK_JOURNAL);
if (test_bit(WriteMostly, &rdev->flags))
info.state |= (1<<MD_DISK_WRITEMOSTLY);
} else {
@@ -5920,23 +6029,18 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
else
clear_bit(WriteMostly, &rdev->flags);
+ if (info->state & (1<<MD_DISK_JOURNAL))
+ set_bit(Journal, &rdev->flags);
/*
* check whether the device shows up in other nodes
*/
if (mddev_is_clustered(mddev)) {
- if (info->state & (1 << MD_DISK_CANDIDATE)) {
- /* Through --cluster-confirm */
+ if (info->state & (1 << MD_DISK_CANDIDATE))
set_bit(Candidate, &rdev->flags);
- err = md_cluster_ops->new_disk_ack(mddev, true);
- if (err) {
- export_rdev(rdev);
- return err;
- }
- } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
+ else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
/* --add initiated by this node */
- err = md_cluster_ops->add_new_disk_start(mddev, rdev);
+ err = md_cluster_ops->add_new_disk(mddev, rdev);
if (err) {
- md_cluster_ops->add_new_disk_finish(mddev);
export_rdev(rdev);
return err;
}
@@ -5945,13 +6049,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
+
if (err)
export_rdev(rdev);
- else
+
+ if (mddev_is_clustered(mddev)) {
+ if (info->state & (1 << MD_DISK_CANDIDATE))
+ md_cluster_ops->new_disk_ack(mddev, (err == 0));
+ else {
+ if (err)
+ md_cluster_ops->add_new_disk_cancel(mddev);
+ else
+ err = add_bound_rdev(rdev);
+ }
+
+ } else if (!err)
err = add_bound_rdev(rdev);
- if (mddev_is_clustered(mddev) &&
- (info->state & (1 << MD_DISK_CLUSTER_ADD)))
- md_cluster_ops->add_new_disk_finish(mddev);
+
return err;
}
@@ -6007,13 +6121,17 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
{
char b[BDEVNAME_SIZE];
struct md_rdev *rdev;
+ int ret = -1;
rdev = find_rdev(mddev, dev);
if (!rdev)
return -ENXIO;
if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
+ ret = md_cluster_ops->metadata_update_start(mddev);
+
+ if (rdev->raid_disk < 0)
+ goto kick_rdev;
clear_bit(Blocked, &rdev->flags);
remove_and_add_spares(mddev, rdev);
@@ -6021,20 +6139,19 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
if (rdev->raid_disk >= 0)
goto busy;
- if (mddev_is_clustered(mddev))
+kick_rdev:
+ if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->remove_disk(mddev, rdev);
md_kick_rdev_from_array(rdev);
md_update_sb(mddev, 1);
md_new_event(mddev);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
-
return 0;
busy:
- if (mddev_is_clustered(mddev))
+ if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->metadata_update_cancel(mddev);
+
printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
bdevname(rdev->bdev,b), mdname(mddev));
return -EBUSY;
@@ -6085,14 +6202,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
goto abort_export;
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
clear_bit(In_sync, &rdev->flags);
rdev->desc_nr = -1;
rdev->saved_raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
if (err)
- goto abort_clustered;
+ goto abort_export;
/*
* The rest should better be atomic, we can have disk failures
@@ -6102,9 +6217,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
rdev->raid_disk = -1;
md_update_sb(mddev, 1);
-
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
/*
* Kick recovery, maybe this spare has to be added to the
* array immediately.
@@ -6114,9 +6226,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
md_new_event(mddev);
return 0;
-abort_clustered:
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_cancel(mddev);
abort_export:
export_rdev(rdev);
return err;
@@ -6434,8 +6543,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
return rv;
}
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
rv = update_size(mddev, (sector_t)info->size * 2);
@@ -6493,12 +6600,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
}
}
md_update_sb(mddev, 1);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
return rv;
err:
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_cancel(mddev);
return rv;
}
@@ -7093,7 +7196,7 @@ static void status_unused(struct seq_file *seq)
seq_printf(seq, "\n");
}
-static void status_resync(struct seq_file *seq, struct mddev *mddev)
+static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
unsigned long dt, db;
@@ -7101,18 +7204,32 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev)
int scale;
unsigned int per_milli;
- if (mddev->curr_resync <= 3)
- resync = 0;
- else
- resync = mddev->curr_resync
- - atomic_read(&mddev->recovery_active);
-
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->resync_max_sectors;
else
max_sectors = mddev->dev_sectors;
+ resync = mddev->curr_resync;
+ if (resync <= 3) {
+ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ /* Still cleaning up */
+ resync = max_sectors;
+ } else
+ resync -= atomic_read(&mddev->recovery_active);
+
+ if (resync == 0) {
+ if (mddev->recovery_cp < MaxSector) {
+ seq_printf(seq, "\tresync=PENDING");
+ return 1;
+ }
+ return 0;
+ }
+ if (resync < 3) {
+ seq_printf(seq, "\tresync=DELAYED");
+ return 1;
+ }
+
WARN_ON(max_sectors == 0);
/* Pick 'scale' such that (resync>>scale)*1000 will fit
* in a sector_t, and (max_sectors>>scale) will fit in a
@@ -7177,6 +7294,7 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev)
((unsigned long)rt % 60)/6);
seq_printf(seq, " speed=%ldK/sec", db/2/dt);
+ return 1;
}
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
@@ -7284,6 +7402,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
bdevname(rdev->bdev,b), rdev->desc_nr);
if (test_bit(WriteMostly, &rdev->flags))
seq_printf(seq, "(W)");
+ if (test_bit(Journal, &rdev->flags))
+ seq_printf(seq, "(J)");
if (test_bit(Faulty, &rdev->flags)) {
seq_printf(seq, "(F)");
continue;
@@ -7322,13 +7442,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
mddev->pers->status(seq, mddev);
seq_printf(seq, "\n ");
if (mddev->pers->sync_request) {
- if (mddev->curr_resync > 2) {
- status_resync(seq, mddev);
+ if (status_resync(seq, mddev))
seq_printf(seq, "\n ");
- } else if (mddev->curr_resync >= 1)
- seq_printf(seq, "\tresync=DELAYED\n ");
- else if (mddev->recovery_cp < MaxSector)
- seq_printf(seq, "\tresync=PENDING\n ");
}
} else
seq_printf(seq, "\n ");
@@ -7411,15 +7526,19 @@ int unregister_md_personality(struct md_personality *p)
}
EXPORT_SYMBOL(unregister_md_personality);
-int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module)
+int register_md_cluster_operations(struct md_cluster_operations *ops,
+ struct module *module)
{
- if (md_cluster_ops != NULL)
- return -EALREADY;
+ int ret = 0;
spin_lock(&pers_lock);
- md_cluster_ops = ops;
- md_cluster_mod = module;
+ if (md_cluster_ops != NULL)
+ ret = -EALREADY;
+ else {
+ md_cluster_ops = ops;
+ md_cluster_mod = module;
+ }
spin_unlock(&pers_lock);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(register_md_cluster_operations);
@@ -7597,11 +7716,7 @@ int md_allow_write(struct mddev *mddev)
mddev->safemode == 0)
mddev->safemode = 1;
spin_unlock(&mddev->lock);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
md_update_sb(mddev, 0);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
sysfs_notify_dirent_safe(mddev->sysfs_state);
} else
spin_unlock(&mddev->lock);
@@ -7633,6 +7748,7 @@ void md_do_sync(struct md_thread *thread)
struct md_rdev *rdev;
char *desc, *action = NULL;
struct blk_plug plug;
+ bool cluster_resync_finished = false;
/* just in case thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -7742,6 +7858,7 @@ void md_do_sync(struct md_thread *thread)
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < j)
@@ -7802,9 +7919,6 @@ void md_do_sync(struct md_thread *thread)
md_new_event(mddev);
update_time = jiffies;
- if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_start(mddev, j, max_sectors);
-
blk_start_plug(&plug);
while (j < max_sectors) {
sector_t sectors;
@@ -7817,7 +7931,8 @@ void md_do_sync(struct md_thread *thread)
> (max_sectors >> 4)) ||
time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
(j - mddev->curr_resync_completed)*2
- >= mddev->resync_max - mddev->curr_resync_completed
+ >= mddev->resync_max - mddev->curr_resync_completed ||
+ mddev->curr_resync_completed > mddev->resync_max
)) {
/* time to update curr_resync_completed */
wait_event(mddev->recovery_wait,
@@ -7862,10 +7977,11 @@ void md_do_sync(struct md_thread *thread)
break;
j += sectors;
+ if (j > max_sectors)
+ /* when skipping, extra large numbers can be returned. */
+ j = max_sectors;
if (j > 2)
mddev->curr_resync = j;
- if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_info_update(mddev, j, max_sectors);
mddev->curr_mark_cnt = io_sectors;
if (last_check == 0)
/* this is the earliest that rebuild will be
@@ -7930,11 +8046,18 @@ void md_do_sync(struct md_thread *thread)
blk_finish_plug(&plug);
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
- /* tell personality that we are finished */
- mddev->pers->sync_request(mddev, max_sectors, &skipped);
-
- if (mddev_is_clustered(mddev))
+ if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ mddev->curr_resync > 2) {
+ mddev->curr_resync_completed = mddev->curr_resync;
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ }
+ /* tell personality and other nodes that we are finished */
+ if (mddev_is_clustered(mddev)) {
md_cluster_ops->resync_finish(mddev);
+ cluster_resync_finished = true;
+ }
+ mddev->pers->sync_request(mddev, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > 2) {
@@ -7961,6 +8084,7 @@ void md_do_sync(struct md_thread *thread)
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < mddev->curr_resync)
@@ -7971,6 +8095,11 @@ void md_do_sync(struct md_thread *thread)
skip:
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ if (mddev_is_clustered(mddev) &&
+ test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ !cluster_resync_finished)
+ md_cluster_ops->resync_finish(mddev);
+
spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
@@ -7979,11 +8108,11 @@ void md_do_sync(struct md_thread *thread)
mddev->resync_max = MaxSector;
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
+ set_bit(MD_RECOVERY_DONE, &mddev->recovery);
mddev->curr_resync = 0;
spin_unlock(&mddev->lock);
wake_up(&resync_wait);
- set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return;
}
@@ -8001,7 +8130,8 @@ static int remove_and_add_spares(struct mddev *mddev,
rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
(test_bit(Faulty, &rdev->flags) ||
- ! test_bit(In_sync, &rdev->flags)) &&
+ (!test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags))) &&
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
mddev, rdev) == 0) {
@@ -8013,25 +8143,31 @@ static int remove_and_add_spares(struct mddev *mddev,
if (removed && mddev->kobj.sd)
sysfs_notify(&mddev->kobj, NULL, "degraded");
- if (this)
+ if (this && removed)
goto no_add;
rdev_for_each(rdev, mddev) {
+ if (this && this != rdev)
+ continue;
+ if (test_bit(Candidate, &rdev->flags))
+ continue;
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
spares++;
if (rdev->raid_disk >= 0)
continue;
if (test_bit(Faulty, &rdev->flags))
continue;
+ if (test_bit(Journal, &rdev->flags))
+ continue;
if (mddev->ro &&
! (rdev->saved_raid_disk >= 0 &&
!test_bit(Bitmap_sync, &rdev->flags)))
continue;
- if (rdev->saved_raid_disk < 0)
- rdev->recovery_offset = 0;
+ rdev->recovery_offset = 0;
if (mddev->pers->
hot_add_disk(mddev, rdev) == 0) {
if (sysfs_link_rdev(mddev, rdev))
@@ -8050,14 +8186,25 @@ no_add:
static void md_start_sync(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, del_work);
+ int ret = 0;
+
+ if (mddev_is_clustered(mddev)) {
+ ret = md_cluster_ops->resync_start(mddev);
+ if (ret) {
+ mddev->sync_thread = NULL;
+ goto out;
+ }
+ }
mddev->sync_thread = md_register_thread(md_do_sync,
mddev,
"resync");
+out:
if (!mddev->sync_thread) {
- printk(KERN_ERR "%s: could not start resync"
- " thread...\n",
- mdname(mddev));
+ if (!(mddev_is_clustered(mddev) && ret == -EAGAIN))
+ printk(KERN_ERR "%s: could not start resync"
+ " thread...\n",
+ mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -8152,7 +8299,9 @@ void md_check_recovery(struct mddev *mddev)
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
goto unlock;
}
@@ -8174,13 +8323,8 @@ void md_check_recovery(struct mddev *mddev)
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
- if (mddev->flags & MD_UPDATE_SB_FLAGS) {
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
+ if (mddev->flags & MD_UPDATE_SB_FLAGS)
md_update_sb(mddev, 0);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
- }
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -8278,8 +8422,6 @@ void md_reap_sync_thread(struct mddev *mddev)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
mddev->pers->finish_reshape)
mddev->pers->finish_reshape(mddev);
@@ -8292,8 +8434,6 @@ void md_reap_sync_thread(struct mddev *mddev)
rdev->saved_raid_disk = -1;
md_update_sb(mddev, 1);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -8598,6 +8738,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
/* Make sure they get written out promptly */
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
md_wakeup_thread(rdev->mddev->thread);
}
return rv;
@@ -8915,25 +9056,128 @@ err_wq:
return ret;
}
-void md_reload_sb(struct mddev *mddev)
+static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
- struct md_rdev *rdev, *tmp;
+ struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
+ struct md_rdev *rdev2;
+ int role, ret;
+ char b[BDEVNAME_SIZE];
- rdev_for_each_safe(rdev, tmp, mddev) {
- rdev->sb_loaded = 0;
- ClearPageUptodate(rdev->sb_page);
+ /* Check for change of roles in the active devices */
+ rdev_for_each(rdev2, mddev) {
+ if (test_bit(Faulty, &rdev2->flags))
+ continue;
+
+ /* Check if the roles changed */
+ role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
+
+ if (test_bit(Candidate, &rdev2->flags)) {
+ if (role == 0xfffe) {
+ pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
+ md_kick_rdev_from_array(rdev2);
+ continue;
+ }
+ else
+ clear_bit(Candidate, &rdev2->flags);
+ }
+
+ if (role != rdev2->raid_disk) {
+ /* got activated */
+ if (rdev2->raid_disk == -1 && role != 0xffff) {
+ rdev2->saved_raid_disk = role;
+ ret = remove_and_add_spares(mddev, rdev2);
+ pr_info("Activated spare: %s\n",
+ bdevname(rdev2->bdev,b));
+ continue;
+ }
+ /* device faulty
+ * We just want to do the minimum to mark the disk
+ * as faulty. The recovery is performed by the
+ * one who initiated the error.
+ */
+ if ((role == 0xfffe) || (role == 0xfffd)) {
+ md_error(mddev, rdev2);
+ clear_bit(Blocked, &rdev2->flags);
+ }
+ }
}
- mddev->raid_disks = 0;
- analyze_sbs(mddev);
- rdev_for_each_safe(rdev, tmp, mddev) {
- struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
- /* since we don't write to faulty devices, we figure out if the
- * disk is faulty by comparing events
- */
- if (mddev->events > sb->events)
- set_bit(Faulty, &rdev->flags);
+
+ if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
+ update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
+
+ /* Finally set the event to be up to date */
+ mddev->events = le64_to_cpu(sb->events);
+}
+
+static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
+{
+ int err;
+ struct page *swapout = rdev->sb_page;
+ struct mdp_superblock_1 *sb;
+
+ /* Store the sb page of the rdev in the swapout temporary
+ * variable in case we err in the future
+ */
+ rdev->sb_page = NULL;
+ alloc_disk_sb(rdev);
+ ClearPageUptodate(rdev->sb_page);
+ rdev->sb_loaded = 0;
+ err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version);
+
+ if (err < 0) {
+ pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
+ __func__, __LINE__, rdev->desc_nr, err);
+ put_page(rdev->sb_page);
+ rdev->sb_page = swapout;
+ rdev->sb_loaded = 1;
+ return err;
+ }
+
+ sb = page_address(rdev->sb_page);
+ /* Update recovery_offset only when the superblock actually
+ * advertises MD_FEATURE_RECOVERY_OFFSET
+ */
+
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
+ rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
+
+ /* The other node finished recovery; call spare_active to mark
+ * the device In_sync and update mddev->degraded
+ */
+ if (rdev->recovery_offset == MaxSector &&
+ !test_bit(In_sync, &rdev->flags) &&
+ mddev->pers->spare_active(mddev))
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
+
+ put_page(swapout);
+ return 0;
+}
+
+void md_reload_sb(struct mddev *mddev, int nr)
+{
+ struct md_rdev *rdev;
+ int err;
+
+ /* Find the rdev */
+ rdev_for_each_rcu(rdev, mddev) {
+ if (rdev->desc_nr == nr)
+ break;
}
+ if (!rdev || rdev->desc_nr != nr) {
+ pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
+ return;
+ }
+
+ err = read_rdev(mddev, rdev);
+ if (err < 0)
+ return;
+
+ check_sb_changes(mddev, rdev);
+
+ /* Read all rdevs to update recovery_offset */
+ rdev_for_each_rcu(rdev, mddev)
+ read_rdev(mddev, rdev);
}
EXPORT_SYMBOL(md_reload_sb);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7da6e9c..2bea51e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -87,10 +87,16 @@ struct md_rdev {
* array and could again if we did a partial
* resync from the bitmap
*/
- sector_t recovery_offset;/* If this device has been partially
+ union {
+ sector_t recovery_offset;/* If this device has been partially
* recovered, this is where we were
* up to.
*/
+ sector_t journal_tail; /* If this device is a journal device,
+ * this is the journal tail (journal
+ * recovery start point)
+ */
+ };
atomic_t nr_pending; /* number of pending requests.
* only maintained for arrays that
@@ -134,10 +140,6 @@ enum flag_bits {
Bitmap_sync, /* ..actually, not quite In_sync. Need a
* bitmap-based recovery to get fully in sync
*/
- Unmerged, /* device is being added to array and should
- * be considerred for bvec_merge_fn but not
- * yet for actual IO
- */
WriteMostly, /* Avoid reading if at all possible */
AutoDetected, /* added by auto-detect */
Blocked, /* An error occurred but has not yet
@@ -176,6 +178,11 @@ enum flag_bits {
* This device is seen locally but not
* by the whole cluster
*/
+ Journal, /* This device is used as journal for
+ * raid-5/6.
+ * Usually, this device should be faster
+ * than other devices in the array
+ */
};
#define BB_LEN_MASK (0x00000000000001FFULL)
@@ -225,6 +232,8 @@ struct mddev {
#define MD_STILL_CLOSED 4 /* If set, then array has not been opened since
* md_ioctl checked on it.
*/
+#define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */
+#define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */
int suspended;
atomic_t active_io;
@@ -374,10 +383,6 @@ struct mddev {
int degraded; /* whether md should consider
* adding a spare
*/
- int merge_check_needed; /* at least one
- * member device
- * has a
- * merge_bvec_fn */
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
@@ -532,10 +537,6 @@ struct md_personality
/* congested implements bdi.congested_fn().
* Will not be called while array is 'suspended' */
int (*congested)(struct mddev *mddev, int bits);
- /* mergeable_bvec is use to implement ->merge_bvec_fn */
- int (*mergeable_bvec)(struct mddev *mddev,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec);
};
struct md_sysfs_entry {
@@ -670,7 +671,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev);
extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
-extern void md_reload_sb(struct mddev *mddev);
+extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index ac3ede2..7331a80 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -77,18 +77,18 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
struct bio *bio = mp_bh->master_bio;
struct mpconf *conf = mp_bh->mddev->private;
- bio_endio(bio, err);
+ bio->bi_error = err;
+ bio_endio(bio);
mempool_free(mp_bh, conf->pool);
}
-static void multipath_end_request(struct bio *bio, int error)
+static void multipath_end_request(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct multipath_bh *mp_bh = bio->bi_private;
struct mpconf *conf = mp_bh->mddev->private;
struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev;
- if (uptodate)
+ if (!bio->bi_error)
multipath_end_bh_io(mp_bh, 0);
else if (!(bio->bi_rw & REQ_RAHEAD)) {
/*
@@ -101,7 +101,7 @@ static void multipath_end_request(struct bio *bio, int error)
(unsigned long long)bio->bi_iter.bi_sector);
multipath_reschedule_retry(mp_bh);
} else
- multipath_end_bh_io(mp_bh, error);
+ multipath_end_bh_io(mp_bh, bio->bi_error);
rdev_dec_pending(rdev, conf->mddev);
}
@@ -123,7 +123,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
mp_bh->path = multipath_map(conf);
if (mp_bh->path < 0) {
- bio_endio(bio, -EIO);
+ bio_io_error(bio);
mempool_free(mp_bh, conf->pool);
return;
}
@@ -257,18 +257,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit ->max_segments to one, lying
- * within a single page.
- * (Note: it is very unlikely that a device with
- * merge_bvec_fn will be involved in multipath.)
- */
- if (q->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
-
spin_lock_irq(&conf->device_lock);
mddev->degraded--;
rdev->raid_disk = path;
@@ -276,7 +264,9 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
spin_unlock_irq(&conf->device_lock);
rcu_assign_pointer(p->rdev, rdev);
err = 0;
+ mddev_suspend(mddev);
md_integrity_add_rdev(rdev, mddev);
+ mddev_resume(mddev);
break;
}
@@ -432,15 +422,6 @@ static int multipath_run (struct mddev *mddev)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, not that we ever expect a device with
- * a merge_bvec_fn to be involved in multipath */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
-
if (!test_bit(Faulty, &rdev->flags))
working_disks++;
}
@@ -491,8 +472,7 @@ static int multipath_run (struct mddev *mddev)
return 0;
out_free_conf:
- if (conf->pool)
- mempool_destroy(conf->pool);
+ mempool_destroy(conf->pool);
kfree(conf->multipaths);
kfree(conf);
mddev->private = NULL;
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index e64b61ad..431a030 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -233,9 +233,9 @@ static int get_ablock(struct dm_array_info *info, dm_block_t b,
/*
* Unlocks an array block.
*/
-static int unlock_ablock(struct dm_array_info *info, struct dm_block *block)
+static void unlock_ablock(struct dm_array_info *info, struct dm_block *block)
{
- return dm_tm_unlock(info->btree_info.tm, block);
+ dm_tm_unlock(info->btree_info.tm, block);
}
/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 4d6c9b6..f2393ba 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -454,7 +454,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
int r;
p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
- if (unlikely(IS_ERR(p)))
+ if (IS_ERR(p))
return PTR_ERR(p);
aux = dm_bufio_get_aux_data(to_buffer(*result));
@@ -490,7 +490,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm,
return -EPERM;
p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
- if (unlikely(IS_ERR(p)))
+ if (IS_ERR(p))
return PTR_ERR(p);
aux = dm_bufio_get_aux_data(to_buffer(*result));
@@ -523,7 +523,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm,
int r;
p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
- if (unlikely(IS_ERR(p)))
+ if (IS_ERR(p))
return PTR_ERR(p);
if (unlikely(!p))
return -EWOULDBLOCK;
@@ -559,7 +559,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
return -EPERM;
p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
- if (unlikely(IS_ERR(p)))
+ if (IS_ERR(p))
return PTR_ERR(p);
memset(p, 0, dm_bm_block_size(bm));
@@ -578,7 +578,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
}
EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero);
-int dm_bm_unlock(struct dm_block *b)
+void dm_bm_unlock(struct dm_block *b)
{
struct buffer_aux *aux;
aux = dm_bufio_get_aux_data(to_buffer(b));
@@ -590,8 +590,6 @@ int dm_bm_unlock(struct dm_block *b)
bl_up_read(&aux->lock);
dm_bufio_release(to_buffer(b));
-
- return 0;
}
EXPORT_SYMBOL_GPL(dm_bm_unlock);
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 84330f5..3627d1b 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -94,7 +94,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b,
struct dm_block_validator *v,
struct dm_block **result);
-int dm_bm_unlock(struct dm_block *b);
+void dm_bm_unlock(struct dm_block *b);
/*
* It's a common idiom to have a superblock that should be committed last.
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index bf2b80d..a240990 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -52,7 +52,7 @@ void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
struct dm_btree_value_type *vt);
int new_block(struct dm_btree_info *info, struct dm_block **result);
-int unlock_block(struct dm_btree_info *info, struct dm_block *b);
+void unlock_block(struct dm_btree_info *info, struct dm_block *b);
/*
* Spines keep track of the rolling locks. There are 2 variants, read-only
@@ -138,4 +138,10 @@ int lower_bound(struct btree_node *n, uint64_t key);
extern struct dm_block_validator btree_node_validator;
+/*
+ * Value type for upper levels of multi-level btrees.
+ */
+extern void init_le64_type(struct dm_transaction_manager *tm,
+ struct dm_btree_value_type *vt);
+
#endif /* DM_BTREE_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 9836c0a..21ea537 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -165,9 +165,9 @@ static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt
return 0;
}
-static int exit_child(struct dm_btree_info *info, struct child *c)
+static void exit_child(struct dm_btree_info *info, struct child *c)
{
- return dm_tm_unlock(info->tm, c->block);
+ dm_tm_unlock(info->tm, c->block);
}
static void shift(struct btree_node *left, struct btree_node *right, int count)
@@ -249,13 +249,10 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
__rebalance2(info, parent, &left, &right);
- r = exit_child(info, &left);
- if (r) {
- exit_child(info, &right);
- return r;
- }
+ exit_child(info, &left);
+ exit_child(info, &right);
- return exit_child(info, &right);
+ return 0;
}
/*
@@ -301,11 +298,16 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
{
int s;
uint32_t max_entries = le32_to_cpu(left->header.max_entries);
- unsigned target = (nr_left + nr_center + nr_right) / 3;
- BUG_ON(target > max_entries);
+ unsigned total = nr_left + nr_center + nr_right;
+ unsigned target_right = total / 3;
+ unsigned remainder = (target_right * 3) != total;
+ unsigned target_left = target_right + remainder;
+
+ BUG_ON(target_left > max_entries);
+ BUG_ON(target_right > max_entries);
if (nr_left < nr_right) {
- s = nr_left - target;
+ s = nr_left - target_left;
if (s < 0 && nr_center < -s) {
/* not enough in central node */
@@ -316,10 +318,10 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
} else
shift(left, center, s);
- shift(center, right, target - nr_right);
+ shift(center, right, target_right - nr_right);
} else {
- s = target - nr_right;
+ s = target_right - nr_right;
if (s > 0 && nr_center < s) {
/* not enough in central node */
shift(center, right, nr_center);
@@ -329,7 +331,7 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
} else
shift(center, right, s);
- shift(left, center, nr_left - target);
+ shift(left, center, nr_left - target_left);
}
*key_ptr(parent, c->index) = center->keys[0];
@@ -389,49 +391,18 @@ static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
__rebalance3(info, parent, &left, &center, &right);
- r = exit_child(info, &left);
- if (r) {
- exit_child(info, &center);
- exit_child(info, &right);
- return r;
- }
-
- r = exit_child(info, &center);
- if (r) {
- exit_child(info, &right);
- return r;
- }
-
- r = exit_child(info, &right);
- if (r)
- return r;
+ exit_child(info, &left);
+ exit_child(info, &center);
+ exit_child(info, &right);
return 0;
}
-static int get_nr_entries(struct dm_transaction_manager *tm,
- dm_block_t b, uint32_t *result)
-{
- int r;
- struct dm_block *block;
- struct btree_node *n;
-
- r = dm_tm_read_lock(tm, b, &btree_node_validator, &block);
- if (r)
- return r;
-
- n = dm_block_data(block);
- *result = le32_to_cpu(n->header.nr_entries);
-
- return dm_tm_unlock(tm, block);
-}
-
static int rebalance_children(struct shadow_spine *s,
struct dm_btree_info *info,
struct dm_btree_value_type *vt, uint64_t key)
{
int i, r, has_left_sibling, has_right_sibling;
- uint32_t child_entries;
struct btree_node *n;
n = dm_block_data(shadow_current(s));
@@ -446,9 +417,7 @@ static int rebalance_children(struct shadow_spine *s,
memcpy(n, dm_block_data(child),
dm_bm_block_size(dm_tm_get_bm(info->tm)));
- r = dm_tm_unlock(info->tm, child);
- if (r)
- return r;
+ dm_tm_unlock(info->tm, child);
dm_tm_dec(info->tm, dm_block_location(child));
return 0;
@@ -458,10 +427,6 @@ static int rebalance_children(struct shadow_spine *s,
if (i < 0)
return -ENODATA;
- r = get_nr_entries(info->tm, value64(n, i), &child_entries);
- if (r)
- return r;
-
has_left_sibling = i > 0;
has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
@@ -544,14 +509,6 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
return r;
}
-static struct dm_btree_value_type le64_type = {
- .context = NULL,
- .size = sizeof(__le64),
- .inc = NULL,
- .dec = NULL,
- .equal = NULL
-};
-
int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, dm_block_t *new_root)
{
@@ -559,12 +516,14 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
int index = 0, r = 0;
struct shadow_spine spine;
struct btree_node *n;
+ struct dm_btree_value_type le64_vt;
+ init_le64_type(info->tm, &le64_vt);
init_shadow_spine(&spine, info);
for (level = 0; level < info->levels; level++) {
r = remove_raw(&spine, info,
(level == last_level ?
- &info->value_type : &le64_type),
+ &info->value_type : &le64_vt),
root, keys[level], (unsigned *)&index);
if (r < 0)
break;
@@ -654,11 +613,13 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root,
int index = 0, r = 0;
struct shadow_spine spine;
struct btree_node *n;
+ struct dm_btree_value_type le64_vt;
uint64_t k;
+ init_le64_type(info->tm, &le64_vt);
init_shadow_spine(&spine, info);
for (level = 0; level < last_level; level++) {
- r = remove_raw(&spine, info, &le64_type,
+ r = remove_raw(&spine, info, &le64_vt,
root, keys[level], (unsigned *) &index);
if (r < 0)
goto out;
@@ -689,6 +650,7 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root,
value_ptr(n, index));
delete_at(n, index);
+ keys[last_level] = k + 1ull;
} else
r = -ENODATA;
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index 1b5e13e..b27b8091 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -117,9 +117,9 @@ int new_block(struct dm_btree_info *info, struct dm_block **result)
return dm_tm_new_block(info->tm, &btree_node_validator, result);
}
-int unlock_block(struct dm_btree_info *info, struct dm_block *b)
+void unlock_block(struct dm_btree_info *info, struct dm_block *b)
{
- return dm_tm_unlock(info->tm, b);
+ dm_tm_unlock(info->tm, b);
}
/*----------------------------------------------------------------*/
@@ -137,9 +137,7 @@ int exit_ro_spine(struct ro_spine *s)
int r = 0, i;
for (i = 0; i < s->count; i++) {
- int r2 = unlock_block(s->info, s->nodes[i]);
- if (r2 < 0)
- r = r2;
+ unlock_block(s->info, s->nodes[i]);
}
return r;
@@ -150,9 +148,7 @@ int ro_step(struct ro_spine *s, dm_block_t new_child)
int r;
if (s->count == 2) {
- r = unlock_block(s->info, s->nodes[0]);
- if (r < 0)
- return r;
+ unlock_block(s->info, s->nodes[0]);
s->nodes[0] = s->nodes[1];
s->count--;
}
@@ -194,9 +190,7 @@ int exit_shadow_spine(struct shadow_spine *s)
int r = 0, i;
for (i = 0; i < s->count; i++) {
- int r2 = unlock_block(s->info, s->nodes[i]);
- if (r2 < 0)
- r = r2;
+ unlock_block(s->info, s->nodes[i]);
}
return r;
@@ -208,9 +202,7 @@ int shadow_step(struct shadow_spine *s, dm_block_t b,
int r;
if (s->count == 2) {
- r = unlock_block(s->info, s->nodes[0]);
- if (r < 0)
- return r;
+ unlock_block(s->info, s->nodes[0]);
s->nodes[0] = s->nodes[1];
s->count--;
}
@@ -249,3 +241,40 @@ int shadow_root(struct shadow_spine *s)
{
return s->root;
}
+
+static void le64_inc(void *context, const void *value_le)
+{
+ struct dm_transaction_manager *tm = context;
+ __le64 v_le;
+
+ memcpy(&v_le, value_le, sizeof(v_le));
+ dm_tm_inc(tm, le64_to_cpu(v_le));
+}
+
+static void le64_dec(void *context, const void *value_le)
+{
+ struct dm_transaction_manager *tm = context;
+ __le64 v_le;
+
+ memcpy(&v_le, value_le, sizeof(v_le));
+ dm_tm_dec(tm, le64_to_cpu(v_le));
+}
+
+static int le64_equal(void *context, const void *value1_le, const void *value2_le)
+{
+ __le64 v1_le, v2_le;
+
+ memcpy(&v1_le, value1_le, sizeof(v1_le));
+ memcpy(&v2_le, value2_le, sizeof(v2_le));
+ return v1_le == v2_le;
+}
+
+void init_le64_type(struct dm_transaction_manager *tm,
+ struct dm_btree_value_type *vt)
+{
+ vt->context = tm;
+ vt->size = sizeof(__le64);
+ vt->inc = le64_inc;
+ vt->dec = le64_dec;
+ vt->equal = le64_equal;
+}
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index fdd3793..c573402 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -141,7 +141,9 @@ int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
n->header.value_size = cpu_to_le32(info->value_type.size);
*root = dm_block_location(b);
- return unlock_block(info, b);
+ unlock_block(info, b);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
@@ -420,8 +422,8 @@ EXPORT_SYMBOL_GPL(dm_btree_lookup);
*
* Where A* is a shadow of A.
*/
-static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
- unsigned parent_index, uint64_t key)
+static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
+ uint64_t key)
{
int r;
size_t size;
@@ -523,7 +525,7 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
r = new_block(s->info, &right);
if (r < 0) {
- /* FIXME: put left */
+ unlock_block(s->info, left);
return r;
}
@@ -625,7 +627,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
if (top)
r = btree_split_beneath(s, key);
else
- r = btree_split_sibling(s, root, i, key);
+ r = btree_split_sibling(s, i, key);
if (r < 0)
return r;
@@ -667,12 +669,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
struct btree_node *n;
struct dm_btree_value_type le64_type;
- le64_type.context = NULL;
- le64_type.size = sizeof(__le64);
- le64_type.inc = NULL;
- le64_type.dec = NULL;
- le64_type.equal = NULL;
-
+ init_le64_type(info->tm, &le64_type);
init_shadow_spine(&spine, info);
for (level = 0; level < (info->levels - 1); level++) {
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index aacbe70..306d2e4 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -259,9 +259,7 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
idx.blocknr = cpu_to_le64(dm_block_location(b));
- r = dm_tm_unlock(ll->tm, b);
- if (r < 0)
- return r;
+ dm_tm_unlock(ll->tm, b);
idx.nr_free = cpu_to_le32(ll->entries_per_block);
idx.none_free_before = 0;
@@ -293,7 +291,9 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result)
*result = sm_lookup_bitmap(dm_bitmap_data(blk), b);
- return dm_tm_unlock(ll->tm, blk);
+ dm_tm_unlock(ll->tm, blk);
+
+ return 0;
}
static int sm_ll_lookup_big_ref_count(struct ll_disk *ll, dm_block_t b,
@@ -373,9 +373,7 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
return r;
}
- r = dm_tm_unlock(ll->tm, blk);
- if (r < 0)
- return r;
+ dm_tm_unlock(ll->tm, blk);
*result = i * ll->entries_per_block + (dm_block_t) position;
return 0;
@@ -429,9 +427,7 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
if (ref_count <= 2) {
sm_set_bitmap(bm_le, bit, ref_count);
- r = dm_tm_unlock(ll->tm, nb);
- if (r < 0)
- return r;
+ dm_tm_unlock(ll->tm, nb);
if (old > 2) {
r = dm_btree_remove(&ll->ref_count_info,
@@ -445,9 +441,7 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
__le32 le_rc = cpu_to_le32(ref_count);
sm_set_bitmap(bm_le, bit, 3);
- r = dm_tm_unlock(ll->tm, nb);
- if (r < 0)
- return r;
+ dm_tm_unlock(ll->tm, nb);
__dm_bless_for_disk(&le_rc);
r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root,
@@ -556,7 +550,9 @@ static int metadata_ll_init_index(struct ll_disk *ll)
memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
ll->bitmap_root = dm_block_location(b);
- return dm_tm_unlock(ll->tm, b);
+ dm_tm_unlock(ll->tm, b);
+
+ return 0;
}
static int metadata_ll_open(struct ll_disk *ll)
@@ -570,7 +566,9 @@ static int metadata_ll_open(struct ll_disk *ll)
return r;
memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le));
- return dm_tm_unlock(ll->tm, block);
+ dm_tm_unlock(ll->tm, block);
+
+ return 0;
}
static dm_block_t metadata_ll_max_entries(struct ll_disk *ll)
@@ -590,7 +588,9 @@ static int metadata_ll_commit(struct ll_disk *ll)
memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
ll->bitmap_root = dm_block_location(b);
- return dm_tm_unlock(ll->tm, b);
+ dm_tm_unlock(ll->tm, b);
+
+ return 0;
}
int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm)
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index 9cb797d..abe2c5d 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -342,9 +342,9 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
}
EXPORT_SYMBOL_GPL(dm_tm_read_lock);
-int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b)
+void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b)
{
- return dm_bm_unlock(b);
+ dm_bm_unlock(b);
}
EXPORT_SYMBOL_GPL(dm_tm_unlock);
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index 2e0d4d6..f3a18be 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -94,7 +94,7 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
struct dm_block_validator *v,
struct dm_block **result);
-int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b);
+void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b);
/*
* Functions for altering the reference count of a block directly.
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index efb654e..f8e5db0 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -83,7 +83,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
char b[BDEVNAME_SIZE];
char b2[BDEVNAME_SIZE];
struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
- bool discard_supported = false;
+ unsigned short blksize = 512;
if (!conf)
return -ENOMEM;
@@ -98,6 +98,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
sector_div(sectors, mddev->chunk_sectors);
rdev1->sectors = sectors * mddev->chunk_sectors;
+ blksize = max(blksize, queue_logical_block_size(
+ rdev1->bdev->bd_disk->queue));
+
rdev_for_each(rdev2, mddev) {
pr_debug("md/raid0:%s: comparing %s(%llu)"
" with %s(%llu)\n",
@@ -134,6 +137,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
pr_debug("md/raid0:%s: FINAL %d zones\n",
mdname(mddev), conf->nr_strip_zones);
+ /*
+ * now since we have the hard sector sizes, we can make sure
+ * chunk size is a multiple of that sector size
+ */
+ if ((mddev->chunk_sectors << 9) % blksize) {
+ printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
+ mdname(mddev),
+ mddev->chunk_sectors << 9, blksize);
+ err = -EINVAL;
+ goto abort;
+ }
+
err = -ENOMEM;
conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
conf->nr_strip_zones, GFP_KERNEL);
@@ -188,19 +203,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
dev[j] = rdev1;
- if (mddev->queue)
- disk_stack_limits(mddev->gendisk, rdev1->bdev,
- rdev1->data_offset << 9);
-
- if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
- conf->has_merge_bvec = 1;
-
if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1;
cnt++;
-
- if (blk_queue_discard(bdev_get_queue(rdev1->bdev)))
- discard_supported = true;
}
if (cnt != mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
@@ -261,28 +266,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
(unsigned long long)smallest->sectors);
}
- /*
- * now since we have the hard sector sizes, we can make sure
- * chunk size is a multiple of that sector size
- */
- if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
- printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n",
- mdname(mddev),
- mddev->chunk_sectors << 9);
- goto abort;
- }
-
- if (mddev->queue) {
- blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
- blk_queue_io_opt(mddev->queue,
- (mddev->chunk_sectors << 9) * mddev->raid_disks);
-
- if (!discard_supported)
- queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
- else
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
- }
-
pr_debug("md/raid0:%s: done.\n", mdname(mddev));
*private_conf = conf;
@@ -351,58 +334,6 @@ static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
+ sector_div(sector, zone->nb_dev)];
}
-/**
- * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
- * @mddev: the md device
- * @bvm: properties of new bio
- * @biovec: the request that could be merged to it.
- *
- * Return amount of bytes we can accept at this offset
- */
-static int raid0_mergeable_bvec(struct mddev *mddev,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- struct r0conf *conf = mddev->private;
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- sector_t sector_offset = sector;
- int max;
- unsigned int chunk_sectors = mddev->chunk_sectors;
- unsigned int bio_sectors = bvm->bi_size >> 9;
- struct strip_zone *zone;
- struct md_rdev *rdev;
- struct request_queue *subq;
-
- if (is_power_of_2(chunk_sectors))
- max = (chunk_sectors - ((sector & (chunk_sectors-1))
- + bio_sectors)) << 9;
- else
- max = (chunk_sectors - (sector_div(sector, chunk_sectors)
- + bio_sectors)) << 9;
- if (max < 0)
- max = 0; /* bio_add cannot handle a negative return */
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- if (max < biovec->bv_len)
- /* too small already, no need to check further */
- return max;
- if (!conf->has_merge_bvec)
- return max;
-
- /* May need to check subordinate device */
- sector = sector_offset;
- zone = find_zone(mddev->private, &sector_offset);
- rdev = map_sector(mddev, zone, sector, &sector_offset);
- subq = bdev_get_queue(rdev->bdev);
- if (subq->merge_bvec_fn) {
- bvm->bi_bdev = rdev->bdev;
- bvm->bi_sector = sector_offset + zone->dev_start +
- rdev->data_offset;
- return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
- } else
- return max;
-}
-
static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
sector_t array_sectors = 0;
@@ -433,12 +364,6 @@ static int raid0_run(struct mddev *mddev)
if (md_check_no_bitmap(mddev))
return -EINVAL;
- if (mddev->queue) {
- blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
- blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
- blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
- }
-
/* if private is not null, we are here after takeover */
if (mddev->private == NULL) {
ret = create_strip_zones(mddev, &conf);
@@ -447,6 +372,29 @@ static int raid0_run(struct mddev *mddev)
mddev->private = conf;
}
conf = mddev->private;
+ if (mddev->queue) {
+ struct md_rdev *rdev;
+ bool discard_supported = false;
+
+ blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+
+ blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+ blk_queue_io_opt(mddev->queue,
+ (mddev->chunk_sectors << 9) * mddev->raid_disks);
+
+ rdev_for_each(rdev, mddev) {
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ discard_supported = true;
+ }
+ if (!discard_supported)
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ }
/* calculate array device size */
md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
@@ -543,7 +491,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
if (unlikely((split->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
/* Just ignore it */
- bio_endio(split, 0);
+ bio_endio(split);
} else
generic_make_request(split);
} while (split != bio);
@@ -727,7 +675,6 @@ static struct md_personality raid0_personality=
.takeover = raid0_takeover,
.quiesce = raid0_quiesce,
.congested = raid0_congested,
- .mergeable_bvec = raid0_mergeable_bvec,
};
static int __init raid0_init (void)
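The check moved up in create_strip_zones() above boils down to simple arithmetic: the chunk size in bytes must be an exact multiple of the largest logical block size reported by the member devices. A standalone model of that test (helper and variable names are illustrative only, not kernel API):

#include <stdio.h>

/* Model of the raid0 sanity check: the chunk size in bytes must be an
 * exact multiple of the members' logical block size, otherwise the
 * array is rejected (-EINVAL in the driver). */
static int chunk_size_valid(unsigned int chunk_sectors, unsigned int blksize)
{
	unsigned int chunk_bytes = chunk_sectors << 9;	/* 512-byte sectors */

	return (chunk_bytes % blksize) == 0;
}

int main(void)
{
	printf("%d\n", chunk_size_valid(512, 4096));	/* 256KiB chunk, 4KiB blocks: accepted */
	printf("%d\n", chunk_size_valid(3, 4096));	/* 1.5KiB chunk, 4KiB blocks: rejected */
	return 0;
}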
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 05539d9..7127a62 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -12,8 +12,6 @@ struct r0conf {
struct md_rdev **devlist; /* lists of rdevs, pointed to
* by strip_zone->dev */
int nr_strip_zones;
- int has_merge_bvec; /* at least one member has
- * a merge_bvec_fn */
};
#endif
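The queue limits that raid0_run() now sets once the zones are known follow directly from the geometry: minimum I/O is one chunk, optimal I/O is one full stripe across all members. A small standalone sketch of that arithmetic (struct and function names are local to this example, not block-layer API):

#include <stdio.h>

struct limits {
	unsigned int io_min;	/* bytes */
	unsigned int io_opt;	/* bytes */
};

/* io_min = one chunk, io_opt = one chunk per member (a full stripe) */
static struct limits raid0_limits(unsigned int chunk_sectors, int raid_disks)
{
	struct limits l;

	l.io_min = chunk_sectors << 9;
	l.io_opt = (chunk_sectors << 9) * raid_disks;
	return l;
}

int main(void)
{
	/* 512-sector (256KiB) chunks across 4 members */
	struct limits l = raid0_limits(512, 4);

	printf("io_min=%u io_opt=%u\n", l.io_min, l.io_opt);
	return 0;
}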
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 94f5b55..e2169ff 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -90,6 +90,8 @@ static void r1bio_pool_free(void *r1_bio, void *data)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
+#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
+#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
@@ -255,9 +257,10 @@ static void call_bio_endio(struct r1bio *r1_bio)
done = 1;
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio->bi_error = -EIO;
+
if (done) {
- bio_endio(bio, 0);
+ bio_endio(bio);
/*
* Wake up any possible resync thread that waits for the device
* to go idle.
@@ -312,9 +315,9 @@ static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
return mirror;
}
-static void raid1_end_read_request(struct bio *bio, int error)
+static void raid1_end_read_request(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ int uptodate = !bio->bi_error;
struct r1bio *r1_bio = bio->bi_private;
int mirror;
struct r1conf *conf = r1_bio->mddev->private;
@@ -397,9 +400,8 @@ static void r1_bio_write_done(struct r1bio *r1_bio)
}
}
-static void raid1_end_write_request(struct bio *bio, int error)
+static void raid1_end_write_request(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r1bio *r1_bio = bio->bi_private;
int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
struct r1conf *conf = r1_bio->mddev->private;
@@ -410,7 +412,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
/*
* 'one mirror IO has finished' event handler:
*/
- if (!uptodate) {
+ if (bio->bi_error) {
set_bit(WriteErrorSeen,
&conf->mirrors[mirror].rdev->flags);
if (!test_and_set_bit(WantReplacement,
@@ -557,7 +559,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL
- || test_bit(Unmerged, &rdev->flags)
|| test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
@@ -708,38 +709,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk;
}
-static int raid1_mergeable_bvec(struct mddev *mddev,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- struct r1conf *conf = mddev->private;
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- int max = biovec->bv_len;
-
- if (mddev->merge_check_needed) {
- int disk;
- rcu_read_lock();
- for (disk = 0; disk < conf->raid_disks * 2; disk++) {
- struct md_rdev *rdev = rcu_dereference(
- conf->mirrors[disk].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- struct request_queue *q =
- bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn) {
- bvm->bi_sector = sector +
- rdev->data_offset;
- bvm->bi_bdev = rdev->bdev;
- max = min(max, q->merge_bvec_fn(
- q, bvm, biovec));
- }
- }
- }
- rcu_read_unlock();
- }
- return max;
-
-}
-
static int raid1_congested(struct mddev *mddev, int bits)
{
struct r1conf *conf = mddev->private;
@@ -793,7 +762,7 @@ static void flush_pending_writes(struct r1conf *conf)
if (unlikely((bio->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
/* Just ignore it */
- bio_endio(bio, 0);
+ bio_endio(bio);
else
generic_make_request(bio);
bio = next;
@@ -914,8 +883,7 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
}
if (bio && bio_data_dir(bio) == WRITE) {
- if (bio->bi_iter.bi_sector >=
- conf->mddev->curr_resync_completed) {
+ if (bio->bi_iter.bi_sector >= conf->next_resync) {
if (conf->start_next_window == MaxSector)
conf->start_next_window =
conf->next_resync +
@@ -1068,7 +1036,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
if (unlikely((bio->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
/* Just ignore it */
- bio_endio(bio, 0);
+ bio_endio(bio);
else
generic_make_request(bio);
bio = next;
@@ -1158,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
* non-zero, then it is the number of not-completed requests.
*/
bio->bi_phys_segments = 0;
- clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+ bio_clear_flag(bio, BIO_SEG_VALID);
if (rw == READ) {
/*
@@ -1269,8 +1237,7 @@ read_again:
break;
}
r1_bio->bios[i] = NULL;
- if (!rdev || test_bit(Faulty, &rdev->flags)
- || test_bit(Unmerged, &rdev->flags)) {
+ if (!rdev || test_bit(Faulty, &rdev->flags)) {
if (i < conf->raid_disks)
set_bit(R1BIO_Degraded, &r1_bio->state);
continue;
@@ -1476,6 +1443,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
{
char b[BDEVNAME_SIZE];
struct r1conf *conf = mddev->private;
+ unsigned long flags;
/*
* If it is not operational, then we have already marked it as dead
@@ -1495,19 +1463,19 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
return;
}
set_bit(Blocked, &rdev->flags);
+ spin_lock_irqsave(&conf->device_lock, flags);
if (test_and_clear_bit(In_sync, &rdev->flags)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
set_bit(Faulty, &rdev->flags);
- spin_unlock_irqrestore(&conf->device_lock, flags);
} else
set_bit(Faulty, &rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery is running, make sure it aborts.
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
printk(KERN_ALERT
"md/raid1:%s: Disk failure on %s, disabling device.\n"
"md/raid1:%s: Operation continuing on %d devices.\n",
@@ -1549,7 +1517,7 @@ static void close_sync(struct r1conf *conf)
conf->r1buf_pool = NULL;
spin_lock_irq(&conf->resync_lock);
- conf->next_resync = 0;
+ conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE;
conf->start_next_window = MaxSector;
conf->current_window_requests +=
conf->next_window_requests;
@@ -1568,7 +1536,10 @@ static int raid1_spare_active(struct mddev *mddev)
* Find all failed disks within the RAID1 configuration
* and mark them readable.
* Called under mddev lock, so rcu protection not needed.
+ * device_lock used to avoid races with raid1_end_read_request
+ * which expects 'In_sync' flags and ->degraded to be consistent.
*/
+ spin_lock_irqsave(&conf->device_lock, flags);
for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = conf->mirrors[i].rdev;
struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
@@ -1599,7 +1570,6 @@ static int raid1_spare_active(struct mddev *mddev)
sysfs_notify_dirent_safe(rdev->sysfs_state);
}
}
- spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded -= count;
spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -1615,7 +1585,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
struct raid1_info *p;
int first = 0;
int last = conf->raid_disks - 1;
- struct request_queue *q = bdev_get_queue(rdev->bdev);
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
@@ -1623,10 +1592,14 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
- if (q->merge_bvec_fn) {
- set_bit(Unmerged, &rdev->flags);
- mddev->merge_check_needed = 1;
- }
+ /*
+ * find the disk ... but prefer rdev->saved_raid_disk
+ * if possible.
+ */
+ if (rdev->saved_raid_disk >= 0 &&
+ rdev->saved_raid_disk >= first &&
+ conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
+ first = last = rdev->saved_raid_disk;
for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors+mirror;
@@ -1659,20 +1632,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
break;
}
}
- if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
- /* Some requests might not have seen this new
- * merge_bvec_fn. We must wait for them to complete
- * before merging the device fully.
- * First we make sure any code which has tested
- * our function has submitted the request, then
- * we wait for all outstanding requests to complete.
- */
- synchronize_sched();
- freeze_array(conf, 0);
- unfreeze_array(conf);
- clear_bit(Unmerged, &rdev->flags);
- }
+ mddev_suspend(mddev);
md_integrity_add_rdev(rdev, mddev);
+ mddev_resume(mddev);
if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
print_conf(conf);
@@ -1735,7 +1697,7 @@ abort:
return err;
}
-static void end_sync_read(struct bio *bio, int error)
+static void end_sync_read(struct bio *bio)
{
struct r1bio *r1_bio = bio->bi_private;
@@ -1746,16 +1708,16 @@ static void end_sync_read(struct bio *bio, int error)
* or re-read if the read failed.
* We don't do much here, just schedule handling by raid1d
*/
- if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ if (!bio->bi_error)
set_bit(R1BIO_Uptodate, &r1_bio->state);
if (atomic_dec_and_test(&r1_bio->remaining))
reschedule_retry(r1_bio);
}
-static void end_sync_write(struct bio *bio, int error)
+static void end_sync_write(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ int uptodate = !bio->bi_error;
struct r1bio *r1_bio = bio->bi_private;
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
@@ -1942,7 +1904,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
idx ++;
}
set_bit(R1BIO_Uptodate, &r1_bio->state);
- set_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio->bi_error = 0;
return 1;
}
@@ -1966,15 +1928,14 @@ static void process_checks(struct r1bio *r1_bio)
for (i = 0; i < conf->raid_disks * 2; i++) {
int j;
int size;
- int uptodate;
+ int error;
struct bio *b = r1_bio->bios[i];
if (b->bi_end_io != end_sync_read)
continue;
- /* fixup the bio for reuse, but preserve BIO_UPTODATE */
- uptodate = test_bit(BIO_UPTODATE, &b->bi_flags);
+ /* fixup the bio for reuse, but preserve errno */
+ error = b->bi_error;
bio_reset(b);
- if (!uptodate)
- clear_bit(BIO_UPTODATE, &b->bi_flags);
+ b->bi_error = error;
b->bi_vcnt = vcnt;
b->bi_iter.bi_size = r1_bio->sectors << 9;
b->bi_iter.bi_sector = r1_bio->sector +
@@ -1997,7 +1958,7 @@ static void process_checks(struct r1bio *r1_bio)
}
for (primary = 0; primary < conf->raid_disks * 2; primary++)
if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
- test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+ !r1_bio->bios[primary]->bi_error) {
r1_bio->bios[primary]->bi_end_io = NULL;
rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
break;
@@ -2007,14 +1968,14 @@ static void process_checks(struct r1bio *r1_bio)
int j;
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
- int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags);
+ int error = sbio->bi_error;
if (sbio->bi_end_io != end_sync_read)
continue;
- /* Now we can 'fixup' the BIO_UPTODATE flag */
- set_bit(BIO_UPTODATE, &sbio->bi_flags);
+ /* Now we can 'fixup' the error value */
+ sbio->bi_error = 0;
- if (uptodate) {
+ if (!error) {
for (j = vcnt; j-- ; ) {
struct page *p, *s;
p = pbio->bi_io_vec[j].bv_page;
@@ -2029,7 +1990,7 @@ static void process_checks(struct r1bio *r1_bio)
if (j >= 0)
atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
- && uptodate)) {
+ && !error)) {
/* No need to write to this device. */
sbio->bi_end_io = NULL;
rdev_dec_pending(conf->mirrors[i].rdev, mddev);
@@ -2247,7 +2208,7 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
bio_trim(wbio, sector - r1_bio->sector, sectors);
wbio->bi_iter.bi_sector += rdev->data_offset;
wbio->bi_bdev = rdev->bdev;
- if (submit_bio_wait(WRITE, wbio) == 0)
+ if (submit_bio_wait(WRITE, wbio) < 0)
/* failure! */
ok = rdev_set_badblocks(rdev, sector,
sectors, 0)
@@ -2270,11 +2231,11 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
struct bio *bio = r1_bio->bios[m];
if (bio->bi_end_io == NULL)
continue;
- if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+ if (!bio->bi_error &&
test_bit(R1BIO_MadeGood, &r1_bio->state)) {
rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
}
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+ if (bio->bi_error &&
test_bit(R1BIO_WriteError, &r1_bio->state)) {
if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
md_error(conf->mddev, rdev);
@@ -2287,6 +2248,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
{
int m;
+ bool fail = false;
for (m = 0; m < conf->raid_disks * 2 ; m++)
if (r1_bio->bios[m] == IO_MADE_GOOD) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2299,6 +2261,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
* narrow down and record precise write
* errors.
*/
+ fail = true;
if (!narrow_write_error(r1_bio, m)) {
md_error(conf->mddev,
conf->mirrors[m].rdev);
@@ -2308,9 +2271,16 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
rdev_dec_pending(conf->mirrors[m].rdev,
conf->mddev);
}
- if (test_bit(R1BIO_WriteError, &r1_bio->state))
- close_write(r1_bio);
- raid_end_bio_io(r1_bio);
+ if (fail) {
+ spin_lock_irq(&conf->device_lock);
+ list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else {
+ if (test_bit(R1BIO_WriteError, &r1_bio->state))
+ close_write(r1_bio);
+ raid_end_bio_io(r1_bio);
+ }
}
static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
@@ -2416,6 +2386,27 @@ static void raid1d(struct md_thread *thread)
md_check_recovery(mddev);
+ if (!list_empty_careful(&conf->bio_end_io_list) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ LIST_HEAD(tmp);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ list_add(&tmp, &conf->bio_end_io_list);
+ list_del_init(&conf->bio_end_io_list);
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ while (!list_empty(&tmp)) {
+ r1_bio = list_first_entry(&tmp, struct r1bio,
+ retry_list);
+ list_del(&r1_bio->retry_list);
+ if (mddev->degraded)
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ if (test_bit(R1BIO_WriteError, &r1_bio->state))
+ close_write(r1_bio);
+ raid_end_bio_io(r1_bio);
+ }
+ }
+
blk_start_plug(&plug);
for (;;) {
@@ -2515,6 +2506,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bitmap_close_sync(mddev->bitmap);
close_sync(conf);
+
+ if (mddev_is_clustered(mddev)) {
+ conf->cluster_sync_low = 0;
+ conf->cluster_sync_high = 0;
+ }
return 0;
}
@@ -2535,7 +2531,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
return sync_blocks;
}
- bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+ /* we are incrementing sector_nr below. To be safe, we check against
+ * sector_nr + two times RESYNC_SECTORS
+ */
+
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr,
+ mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
raise_barrier(conf, sector_nr);
@@ -2713,7 +2714,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
/* remove last page from this bio */
bio->bi_vcnt--;
bio->bi_iter.bi_size -= len;
- __clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+ bio_clear_flag(bio, BIO_SEG_VALID);
}
goto bio_full;
}
@@ -2726,6 +2727,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bio_full:
r1_bio->sectors = nr_sectors;
+ if (mddev_is_clustered(mddev) &&
+ conf->cluster_sync_high < sector_nr + nr_sectors) {
+ conf->cluster_sync_low = mddev->curr_resync_completed;
+ conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
+ /* Send resync message */
+ md_cluster_ops->resync_info_update(mddev,
+ conf->cluster_sync_low,
+ conf->cluster_sync_high);
+ }
+
/* For a user-requested sync, we read all readable devices and do a
* compare
*/
@@ -2808,8 +2819,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
goto abort;
disk->rdev = rdev;
q = bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn)
- mddev->merge_check_needed = 1;
disk->head_position = 0;
disk->seq_start = MaxSector;
@@ -2817,6 +2826,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
INIT_LIST_HEAD(&conf->retry_list);
+ INIT_LIST_HEAD(&conf->bio_end_io_list);
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
@@ -2870,8 +2880,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
abort:
if (conf) {
- if (conf->r1bio_pool)
- mempool_destroy(conf->r1bio_pool);
+ mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
kfree(conf->poolinfo);
@@ -2973,8 +2982,7 @@ static void raid1_free(struct mddev *mddev, void *priv)
{
struct r1conf *conf = priv;
- if (conf->r1bio_pool)
- mempool_destroy(conf->r1bio_pool);
+ mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
kfree(conf->poolinfo);
@@ -3043,9 +3051,11 @@ static int raid1_reshape(struct mddev *mddev)
return -EINVAL;
}
- err = md_allow_write(mddev);
- if (err)
- return err;
+ if (!mddev_is_clustered(mddev)) {
+ err = md_allow_write(mddev);
+ if (err)
+ return err;
+ }
raid_disks = mddev->raid_disks + mddev->delta_disks;
@@ -3111,6 +3121,7 @@ static int raid1_reshape(struct mddev *mddev)
unfreeze_array(conf);
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -3174,7 +3185,6 @@ static struct md_personality raid1_personality =
.quiesce = raid1_quiesce,
.takeover = raid1_takeover,
.congested = raid1_congested,
- .mergeable_bvec = raid1_mergeable_bvec,
};
static int __init raid_init(void)
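A recurring change throughout raid1.c (and the other files in this diff) is the new bio completion convention: the end_io handlers lose their int error argument and read bio->bi_error instead of testing BIO_UPTODATE. A toy userspace model of the new handler shape, with the bio reduced to the single field that matters for the example:

#include <stdio.h>

/* Toy stand-in for a bio: success/failure now travels in one
 * errno-style field rather than an "uptodate" flag bit. */
struct toy_bio {
	int bi_error;		/* 0 on success, negative errno on failure */
};

/* New-style end_io handler: no error argument, state read from the bio */
static void toy_end_read_request(struct toy_bio *bio)
{
	int uptodate = !bio->bi_error;

	if (uptodate)
		printf("read completed\n");
	else
		printf("read failed: %d\n", bio->bi_error);
}

int main(void)
{
	struct toy_bio ok = { .bi_error = 0 };
	struct toy_bio bad = { .bi_error = -5 };	/* -EIO */

	toy_end_read_request(&ok);
	toy_end_read_request(&bad);
	return 0;
}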
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 14ebb28..61c39b3 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -61,6 +61,11 @@ struct r1conf {
* block, or anything else.
*/
struct list_head retry_list;
+ /* A separate list of r1bio which just need raid_end_bio_io called.
+ * This mustn't happen for writes which had any errors if the superblock
+ * needs to be written.
+ */
+ struct list_head bio_end_io_list;
/* queue pending writes to be submitted on unplug */
struct bio_list pending_bio_list;
@@ -106,6 +111,13 @@ struct r1conf {
* the new thread here until we fully activate the array.
*/
struct md_thread *thread;
+
+ /* Keep track of cluster resync window to send to other
+ * nodes.
+ */
+ sector_t cluster_sync_low;
+ sector_t cluster_sync_high;
+
};
/*
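The two cluster_sync_* fields above back the clustered-resync messages added in raid1.c's sync_request(): once the resync position passes cluster_sync_high, the window is re-based on the last completed position and its upper edge moves out by CLUSTER_RESYNC_WINDOW_SECTORS, at which point resync_info_update() is sent to the other nodes. A standalone model of that advance; the window size here is an arbitrary example value, not the driver's constant:

#include <stdio.h>

typedef unsigned long long sector_t;

/* Illustrative window size only; the driver derives it from
 * CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) converted to sectors. */
#define WINDOW_SECTORS	(32768ULL)

struct resync_window {
	sector_t low;
	sector_t high;
};

/* Re-base the window once resync progress runs past its upper edge;
 * this is the point where the driver would send resync_info_update(). */
static int maybe_advance(struct resync_window *w, sector_t completed,
			 sector_t sector_nr, sector_t nr_sectors)
{
	if (w->high >= sector_nr + nr_sectors)
		return 0;		/* still inside the advertised window */
	w->low = completed;
	w->high = w->low + WINDOW_SECTORS;
	return 1;			/* caller would broadcast low/high now */
}

int main(void)
{
	struct resync_window w = { 0, 0 };
	sector_t pos;

	for (pos = 0; pos < 100000; pos += 2048)
		if (maybe_advance(&w, pos, pos, 2048))
			printf("announce window [%llu, %llu)\n", w.low, w.high);
	return 0;
}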
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 38c58e1..41d70bc 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -39,6 +39,7 @@
* far_copies (stored in second byte of layout)
* far_offset (stored in bit 16 of layout )
* use_far_sets (stored in bit 17 of layout )
+ * use_far_sets_bugfixed (stored in bit 18 of layout )
*
* The data to be stored is divided into chunks using chunksize. Each device
* is divided into far_copies sections. In each section, chunks are laid out
@@ -101,7 +102,7 @@ static int _enough(struct r10conf *conf, int previous, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
-static void end_reshape_write(struct bio *bio, int error);
+static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
@@ -307,9 +308,9 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
} else
done = 1;
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio->bi_error = -EIO;
if (done) {
- bio_endio(bio, 0);
+ bio_endio(bio);
/*
* Wake up any possible resync thread that waits for the device
* to go idle.
@@ -358,9 +359,9 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
return r10_bio->devs[slot].devnum;
}
-static void raid10_end_read_request(struct bio *bio, int error)
+static void raid10_end_read_request(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ int uptodate = !bio->bi_error;
struct r10bio *r10_bio = bio->bi_private;
int slot, dev;
struct md_rdev *rdev;
@@ -438,9 +439,8 @@ static void one_write_done(struct r10bio *r10_bio)
}
}
-static void raid10_end_write_request(struct bio *bio, int error)
+static void raid10_end_write_request(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r10bio *r10_bio = bio->bi_private;
int dev;
int dec_rdev = 1;
@@ -460,7 +460,7 @@ static void raid10_end_write_request(struct bio *bio, int error)
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
- if (!uptodate) {
+ if (bio->bi_error) {
if (repl)
/* Never record new bad blocks to replacement,
* just fail it.
@@ -672,93 +672,6 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
return (vchunk << geo->chunk_shift) + offset;
}
-/**
- * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
- * @mddev: the md device
- * @bvm: properties of new bio
- * @biovec: the request that could be merged to it.
- *
- * Return amount of bytes we can accept at this offset
- * This requires checking for end-of-chunk if near_copies != raid_disks,
- * and for subordinate merge_bvec_fns if merge_check_needed.
- */
-static int raid10_mergeable_bvec(struct mddev *mddev,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- struct r10conf *conf = mddev->private;
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- int max;
- unsigned int chunk_sectors;
- unsigned int bio_sectors = bvm->bi_size >> 9;
- struct geom *geo = &conf->geo;
-
- chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
- if (conf->reshape_progress != MaxSector &&
- ((sector >= conf->reshape_progress) !=
- conf->mddev->reshape_backwards))
- geo = &conf->prev;
-
- if (geo->near_copies < geo->raid_disks) {
- max = (chunk_sectors - ((sector & (chunk_sectors - 1))
- + bio_sectors)) << 9;
- if (max < 0)
- /* bio_add cannot handle a negative return */
- max = 0;
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- } else
- max = biovec->bv_len;
-
- if (mddev->merge_check_needed) {
- struct {
- struct r10bio r10_bio;
- struct r10dev devs[conf->copies];
- } on_stack;
- struct r10bio *r10_bio = &on_stack.r10_bio;
- int s;
- if (conf->reshape_progress != MaxSector) {
- /* Cannot give any guidance during reshape */
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- return 0;
- }
- r10_bio->sector = sector;
- raid10_find_phys(conf, r10_bio);
- rcu_read_lock();
- for (s = 0; s < conf->copies; s++) {
- int disk = r10_bio->devs[s].devnum;
- struct md_rdev *rdev = rcu_dereference(
- conf->mirrors[disk].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- struct request_queue *q =
- bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn) {
- bvm->bi_sector = r10_bio->devs[s].addr
- + rdev->data_offset;
- bvm->bi_bdev = rdev->bdev;
- max = min(max, q->merge_bvec_fn(
- q, bvm, biovec));
- }
- }
- rdev = rcu_dereference(conf->mirrors[disk].replacement);
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- struct request_queue *q =
- bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn) {
- bvm->bi_sector = r10_bio->devs[s].addr
- + rdev->data_offset;
- bvm->bi_bdev = rdev->bdev;
- max = min(max, q->merge_bvec_fn(
- q, bvm, biovec));
- }
- }
- }
- rcu_read_unlock();
- }
- return max;
-}
-
/*
* This routine returns the disk from which the requested read should
* be done. There is a per-array 'next expected sequential IO' sector
@@ -821,12 +734,10 @@ retry:
disk = r10_bio->devs[slot].devnum;
rdev = rcu_dereference(conf->mirrors[disk].replacement);
if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
- test_bit(Unmerged, &rdev->flags) ||
r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (rdev == NULL ||
- test_bit(Faulty, &rdev->flags) ||
- test_bit(Unmerged, &rdev->flags))
+ test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
@@ -957,7 +868,7 @@ static void flush_pending_writes(struct r10conf *conf)
if (unlikely((bio->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
/* Just ignore it */
- bio_endio(bio, 0);
+ bio_endio(bio);
else
generic_make_request(bio);
bio = next;
@@ -1133,7 +1044,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
if (unlikely((bio->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
/* Just ignore it */
- bio_endio(bio, 0);
+ bio_endio(bio);
else
generic_make_request(bio);
bio = next;
@@ -1217,7 +1128,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
* non-zero, then it is the number of not-completed requests.
*/
bio->bi_phys_segments = 0;
- clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+ bio_clear_flag(bio, BIO_SEG_VALID);
if (rw == READ) {
/*
@@ -1326,11 +1237,9 @@ retry_write:
blocked_rdev = rrdev;
break;
}
- if (rdev && (test_bit(Faulty, &rdev->flags)
- || test_bit(Unmerged, &rdev->flags)))
+ if (rdev && (test_bit(Faulty, &rdev->flags)))
rdev = NULL;
- if (rrdev && (test_bit(Faulty, &rrdev->flags)
- || test_bit(Unmerged, &rrdev->flags)))
+ if (rrdev && (test_bit(Faulty, &rrdev->flags)))
rrdev = NULL;
r10_bio->devs[i].bio = NULL;
@@ -1589,6 +1498,8 @@ static void status(struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
else
seq_printf(seq, " %d far-copies", conf->geo.far_copies);
+ if (conf->geo.far_set_size != conf->geo.raid_disks)
+ seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
}
seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
conf->geo.raid_disks - mddev->degraded);
@@ -1681,6 +1592,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
printk(KERN_ALERT
"md/raid10:%s: Disk failure on %s, disabling device.\n"
@@ -1777,7 +1689,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int mirror;
int first = 0;
int last = conf->geo.raid_disks - 1;
- struct request_queue *q = bdev_get_queue(rdev->bdev);
if (mddev->recovery_cp < MaxSector)
/* only hot-add to in-sync arrays, as recovery is
@@ -1790,11 +1701,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
- if (q->merge_bvec_fn) {
- set_bit(Unmerged, &rdev->flags);
- mddev->merge_check_needed = 1;
- }
-
if (rdev->saved_raid_disk >= first &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
mirror = rdev->saved_raid_disk;
@@ -1833,20 +1739,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(p->rdev, rdev);
break;
}
- if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
- /* Some requests might not have seen this new
- * merge_bvec_fn. We must wait for them to complete
- * before merging the device fully.
- * First we make sure any code which has tested
- * our function has submitted the request, then
- * we wait for all outstanding requests to complete.
- */
- synchronize_sched();
- freeze_array(conf, 0);
- unfreeze_array(conf);
- clear_bit(Unmerged, &rdev->flags);
- }
+ mddev_suspend(mddev);
md_integrity_add_rdev(rdev, mddev);
+ mddev_resume(mddev);
if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
@@ -1916,7 +1811,7 @@ abort:
return err;
}
-static void end_sync_read(struct bio *bio, int error)
+static void end_sync_read(struct bio *bio)
{
struct r10bio *r10_bio = bio->bi_private;
struct r10conf *conf = r10_bio->mddev->private;
@@ -1928,7 +1823,7 @@ static void end_sync_read(struct bio *bio, int error)
} else
d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
- if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ if (!bio->bi_error)
set_bit(R10BIO_Uptodate, &r10_bio->state);
else
/* The write handler will notice the lack of
@@ -1977,9 +1872,8 @@ static void end_sync_request(struct r10bio *r10_bio)
}
}
-static void end_sync_write(struct bio *bio, int error)
+static void end_sync_write(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r10bio *r10_bio = bio->bi_private;
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
@@ -1996,7 +1890,7 @@ static void end_sync_write(struct bio *bio, int error)
else
rdev = conf->mirrors[d].rdev;
- if (!uptodate) {
+ if (bio->bi_error) {
if (repl)
md_error(mddev, rdev);
else {
@@ -2044,7 +1938,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
/* find the first device with a block */
for (i=0; i<conf->copies; i++)
- if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
+ if (!r10_bio->devs[i].bio->bi_error)
break;
if (i == conf->copies)
@@ -2064,7 +1958,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
continue;
if (i == first)
continue;
- if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
+ if (!r10_bio->devs[i].bio->bi_error) {
/* We know that the bi_io_vec layout is the same for
* both 'first' and 'i', so we just compare them.
* All vec entries are PAGE_SIZE;
@@ -2394,7 +2288,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
d = r10_bio->devs[sl].devnum;
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev &&
- !test_bit(Unmerged, &rdev->flags) &&
test_bit(In_sync, &rdev->flags) &&
is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
&first_bad, &bad_sectors) == 0) {
@@ -2448,7 +2341,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
d = r10_bio->devs[sl].devnum;
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (!rdev ||
- test_bit(Unmerged, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags))
continue;
@@ -2580,7 +2472,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
choose_data_offset(r10_bio, rdev) +
(sector - r10_bio->sector));
wbio->bi_bdev = rdev->bdev;
- if (submit_bio_wait(WRITE, wbio) == 0)
+ if (submit_bio_wait(WRITE, wbio) < 0)
/* Failure! */
ok = rdev_set_badblocks(rdev, sector,
sectors, 0)
@@ -2706,8 +2598,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev = conf->mirrors[dev].rdev;
if (r10_bio->devs[m].bio == NULL)
continue;
- if (test_bit(BIO_UPTODATE,
- &r10_bio->devs[m].bio->bi_flags)) {
+ if (!r10_bio->devs[m].bio->bi_error) {
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
@@ -2722,8 +2613,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev = conf->mirrors[dev].replacement;
if (r10_bio->devs[m].repl_bio == NULL)
continue;
- if (test_bit(BIO_UPTODATE,
- &r10_bio->devs[m].repl_bio->bi_flags)) {
+
+ if (!r10_bio->devs[m].repl_bio->bi_error) {
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
@@ -2738,6 +2629,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
}
put_buf(r10_bio);
} else {
+ bool fail = false;
for (m = 0; m < conf->copies; m++) {
int dev = r10_bio->devs[m].devnum;
struct bio *bio = r10_bio->devs[m].bio;
@@ -2748,8 +2640,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
r10_bio->devs[m].addr,
r10_bio->sectors, 0);
rdev_dec_pending(rdev, conf->mddev);
- } else if (bio != NULL &&
- !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ } else if (bio != NULL && bio->bi_error) {
+ fail = true;
if (!narrow_write_error(r10_bio, m)) {
md_error(conf->mddev, rdev);
set_bit(R10BIO_Degraded,
@@ -2767,10 +2659,17 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev_dec_pending(rdev, conf->mddev);
}
}
- if (test_bit(R10BIO_WriteError,
- &r10_bio->state))
- close_write(r10_bio);
- raid_end_bio_io(r10_bio);
+ if (fail) {
+ spin_lock_irq(&conf->device_lock);
+ list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else {
+ if (test_bit(R10BIO_WriteError,
+ &r10_bio->state))
+ close_write(r10_bio);
+ raid_end_bio_io(r10_bio);
+ }
}
}
@@ -2785,6 +2684,29 @@ static void raid10d(struct md_thread *thread)
md_check_recovery(mddev);
+ if (!list_empty_careful(&conf->bio_end_io_list) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ LIST_HEAD(tmp);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ list_add(&tmp, &conf->bio_end_io_list);
+ list_del_init(&conf->bio_end_io_list);
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ while (!list_empty(&tmp)) {
+ r10_bio = list_first_entry(&tmp, struct r10bio,
+ retry_list);
+ list_del(&r10_bio->retry_list);
+ if (mddev->degraded)
+ set_bit(R10BIO_Degraded, &r10_bio->state);
+
+ if (test_bit(R10BIO_WriteError,
+ &r10_bio->state))
+ close_write(r10_bio);
+ raid_end_bio_io(r10_bio);
+ }
+ }
+
blk_start_plug(&plug);
for (;;) {
@@ -3227,7 +3149,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
/* resync. Schedule a read for every block at this virt offset */
int count = 0;
- bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&
@@ -3263,7 +3185,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
bio = r10_bio->devs[i].bio;
bio_reset(bio);
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio->bi_error = -EIO;
if (conf->mirrors[d].rdev == NULL ||
test_bit(Faulty, &conf->mirrors[d].rdev->flags))
continue;
@@ -3300,7 +3222,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
/* Need to set up for writing to the replacement */
bio = r10_bio->devs[i].repl_bio;
bio_reset(bio);
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio->bi_error = -EIO;
sector = r10_bio->devs[i].addr;
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
@@ -3357,7 +3279,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
/* remove last page from this bio */
bio2->bi_vcnt--;
bio2->bi_iter.bi_size -= len;
- __clear_bit(BIO_SEG_VALID, &bio2->bi_flags);
+ bio_clear_flag(bio2, BIO_SEG_VALID);
}
goto bio_full;
}
@@ -3377,7 +3299,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
if (bio->bi_end_io == end_sync_read) {
md_sync_acct(bio->bi_bdev, nr_sectors);
- set_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio->bi_error = 0;
generic_make_request(bio);
}
}
@@ -3477,7 +3399,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
disks = mddev->raid_disks + mddev->delta_disks;
break;
}
- if (layout >> 18)
+ if (layout >> 19)
return -1;
if (chunk < (PAGE_SIZE >> 9) ||
!is_power_of_2(chunk))
@@ -3489,7 +3411,22 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
geo->near_copies = nc;
geo->far_copies = fc;
geo->far_offset = fo;
- geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
+ switch (layout >> 17) {
+ case 0: /* original layout. simple but not always optimal */
+ geo->far_set_size = disks;
+ break;
+ case 1: /* "improved" layout which was buggy. Hopefully no-one is
+ * actually using this, but leave code here just in case. */
+ geo->far_set_size = disks/fc;
+ WARN(geo->far_set_size < fc,
+ "This RAID10 layout does not provide data safety - please backup and create new array\n");
+ break;
+ case 2: /* "improved" layout fixed to match documentation */
+ geo->far_set_size = fc * nc;
+ break;
+ default: /* Not a valid layout */
+ return -1;
+ }
geo->chunk_mask = chunk - 1;
geo->chunk_shift = ffz(~chunk);
return nc*fc;
@@ -3559,6 +3496,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
conf->reshape_safe = conf->reshape_progress;
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
+ INIT_LIST_HEAD(&conf->bio_end_io_list);
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
@@ -3575,8 +3513,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
mdname(mddev));
if (conf) {
- if (conf->r10bio_pool)
- mempool_destroy(conf->r10bio_pool);
+ mempool_destroy(conf->r10bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
kfree(conf);
@@ -3643,8 +3580,6 @@ static int run(struct mddev *mddev)
disk->rdev = rdev;
}
q = bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn)
- mddev->merge_check_needed = 1;
diff = (rdev->new_data_offset - rdev->data_offset);
if (!mddev->reshape_backwards)
diff = -diff;
@@ -3773,8 +3708,7 @@ static int run(struct mddev *mddev)
out_free_conf:
md_unregister_thread(&mddev->thread);
- if (conf->r10bio_pool)
- mempool_destroy(conf->r10bio_pool);
+ mempool_destroy(conf->r10bio_pool);
safe_put_page(conf->tmppage);
kfree(conf->mirrors);
kfree(conf);
@@ -3787,8 +3721,7 @@ static void raid10_free(struct mddev *mddev, void *priv)
{
struct r10conf *conf = priv;
- if (conf->r10bio_pool)
- mempool_destroy(conf->r10bio_pool);
+ mempool_destroy(conf->r10bio_pool);
safe_put_page(conf->tmppage);
kfree(conf->mirrors);
kfree(conf->mirrors_old);
@@ -4215,7 +4148,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
* at a time, possibly less if that exceeds RESYNC_PAGES,
* or we hit a bad block or something.
* This might mean we pause for normal IO in the middle of
- * a chunk, but that is not a problem was mddev->reshape_position
+ * a chunk, but that is not a problem as mddev->reshape_position
* can record any location.
*
* If we will want to write to a location that isn't
@@ -4239,7 +4172,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
*
* In all this the minimum difference in data offsets
* (conf->offset_diff - always positive) allows a bit of slack,
- * so next can be after 'safe', but not by more than offset_disk
+ * so next can be after 'safe', but not by more than offset_diff
*
* We need to prepare all the bios here before we start any IO
* to ensure the size we choose is acceptable to all devices.
@@ -4382,7 +4315,7 @@ read_more:
read_bio->bi_end_io = end_sync_read;
read_bio->bi_rw = READ;
read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
- __set_bit(BIO_UPTODATE, &read_bio->bi_flags);
+ read_bio->bi_error = 0;
read_bio->bi_vcnt = 0;
read_bio->bi_iter.bi_size = 0;
r10_bio->master_bio = read_bio;
@@ -4439,7 +4372,7 @@ read_more:
/* Remove last page from this bio */
bio2->bi_vcnt--;
bio2->bi_iter.bi_size -= len;
- __clear_bit(BIO_SEG_VALID, &bio2->bi_flags);
+ bio_clear_flag(bio2, BIO_SEG_VALID);
}
goto bio_full;
}
@@ -4604,9 +4537,8 @@ static int handle_reshape_read_error(struct mddev *mddev,
return 0;
}
-static void end_reshape_write(struct bio *bio, int error)
+static void end_reshape_write(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r10bio *r10_bio = bio->bi_private;
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
@@ -4623,7 +4555,7 @@ static void end_reshape_write(struct bio *bio, int error)
rdev = conf->mirrors[d].rdev;
}
- if (!uptodate) {
+ if (bio->bi_error) {
/* FIXME should record badblock */
md_error(mddev, rdev);
}
@@ -4700,7 +4632,6 @@ static struct md_personality raid10_personality =
.start_reshape = raid10_start_reshape,
.finish_reshape = raid10_finish_reshape,
.congested = raid10_congested,
- .mergeable_bvec = raid10_mergeable_bvec,
};
static int __init raid_init(void)
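The extra layout bit documented at the top of raid10.c selects between three far-set arrangements in setup_geo(). A standalone model of that selection; returning -1 mirrors the driver's rejection of layouts with reserved bits set:

#include <stdio.h>

/* Model of the far_set_size choice keyed on layout bits 17-18:
 * 0: original layout, one set spanning all disks;
 * 1: the buggy "improved" layout, kept only for existing arrays;
 * 2: the fixed layout matching the documentation. */
static int far_set_size(int layout, int disks, int near_copies, int far_copies)
{
	if (layout >> 19)
		return -1;			/* reserved bits set: invalid */

	switch (layout >> 17) {
	case 0:
		return disks;
	case 1:
		return disks / far_copies;	/* known-buggy arrangement */
	case 2:
		return far_copies * near_copies;
	default:
		return -1;
	}
}

int main(void)
{
	/* 6 disks, 1 near copy, 2 far copies */
	printf("%d\n", far_set_size(0 << 17, 6, 1, 2));	/* 6 */
	printf("%d\n", far_set_size(1 << 17, 6, 1, 2));	/* 3 */
	printf("%d\n", far_set_size(2 << 17, 6, 1, 2));	/* 2 */
	return 0;
}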
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 5ee6473..6fc2c75 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -53,6 +53,12 @@ struct r10conf {
sector_t offset_diff;
struct list_head retry_list;
+ /* A separate list of r10bio which just need raid_end_bio_io called.
+ * This mustn't happen for writes which had any errors if the superblock
+ * needs to be written.
+ */
+ struct list_head bio_end_io_list;
+
/* queue pending writes and submit them on unplug */
struct bio_list pending_bio_list;
int pending_count;
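bio_end_io_list above implements a two-phase hand-off: the error path parks r10bios on the list under device_lock, and raid10d later splices the whole list to a private head and completes the entries outside the lock, once MD_CHANGE_PENDING has cleared. A reduced userspace model of that splice-then-drain pattern (locking elided, names local to the example):

#include <stdio.h>
#include <stdlib.h>

struct pending_io {
	int id;
	struct pending_io *next;
};

static struct pending_io *bio_end_io_list;	/* filled by the error path */

static void park_failed_write(int id)
{
	struct pending_io *p = malloc(sizeof(*p));

	p->id = id;
	p->next = bio_end_io_list;		/* would be done under device_lock */
	bio_end_io_list = p;
}

/* Daemon side: take the whole list in one go, then complete the entries
 * without holding the lock, mirroring the list_add/list_del_init splice. */
static void drain_end_io_list(int change_pending)
{
	struct pending_io *tmp, *p;

	if (change_pending)
		return;				/* superblock not written yet */
	tmp = bio_end_io_list;			/* splice under the lock ... */
	bio_end_io_list = NULL;
	while (tmp) {				/* ... drain outside it */
		p = tmp;
		tmp = tmp->next;
		printf("completing r10bio %d\n", p->id);
		free(p);
	}
}

int main(void)
{
	park_failed_write(1);
	park_failed_write(2);
	drain_end_io_list(1);			/* deferred: MD_CHANGE_PENDING set */
	drain_end_io_list(0);			/* now the bios are finished */
	return 0;
}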
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
new file mode 100644
index 0000000..b887e04
--- /dev/null
+++ b/drivers/md/raid5-cache.c
@@ -0,0 +1,1191 @@
+/*
+ * Copyright (C) 2015 Shaohua Li <shli@fb.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/raid/md_p.h>
+#include <linux/crc32c.h>
+#include <linux/random.h>
+#include "md.h"
+#include "raid5.h"
+
+/*
+ * metadata/data stored on disk in 4k units (blocks) regardless of the
+ * underlying hardware sector size. Only works with PAGE_SIZE == 4096
+ */
+#define BLOCK_SECTORS (8)
+
+/*
+ * reclaim runs once reclaimable space reaches 1/4 of the disk size or 10G,
+ * whichever is smaller. This prevents recovery from having to scan a very
+ * long log
+ */
+#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
+#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
+
+struct r5l_log {
+ struct md_rdev *rdev;
+
+ u32 uuid_checksum;
+
+ sector_t device_size; /* log device size, rounded to
+ * BLOCK_SECTORS */
+ sector_t max_free_space; /* reclaim runs if free space is at
+ * this size */
+
+ sector_t last_checkpoint; /* log tail. where recovery scan
+ * starts from */
+ u64 last_cp_seq; /* log tail sequence */
+
+ sector_t log_start; /* log head. where new data appends */
+ u64 seq; /* log head sequence */
+
+ sector_t next_checkpoint;
+ u64 next_cp_seq;
+
+ struct mutex io_mutex;
+ struct r5l_io_unit *current_io; /* current io_unit accepting new data */
+
+ spinlock_t io_list_lock;
+ struct list_head running_ios; /* io_units which are still running,
+ * and have not yet been completely
+ * written to the log */
+ struct list_head io_end_ios; /* io_units which have been completely
+ * written to the log but not yet written
+ * to the RAID */
+ struct list_head flushing_ios; /* io_units which are waiting for log
+ * cache flush */
+ struct list_head finished_ios; /* io_units which settle down in log disk */
+ struct bio flush_bio;
+
+ struct kmem_cache *io_kc;
+
+ struct md_thread *reclaim_thread;
+ unsigned long reclaim_target; /* amount of space that needs to be
+ * reclaimed. if it's 0, reclaim spaces
+ * used by io_units which are in
+ * IO_UNIT_STRIPE_END state (e.g., reclaim
+ * doesn't wait for a specific io_unit
+ * switching to IO_UNIT_STRIPE_END
+ * state) */
+ wait_queue_head_t iounit_wait;
+
+ struct list_head no_space_stripes; /* pending stripes, log has no space */
+ spinlock_t no_space_stripes_lock;
+
+ bool need_cache_flush;
+ bool in_teardown;
+};
+
+/*
+ * an IO range starts at a meta data block and ends at the next meta data
+ * block. The io unit's meta data block tracks the data/parity that follows
+ * it. The io unit is written to the log disk with a normal write; as we
+ * always flush the log disk first and only then start moving data to the
+ * raid disks, there is no requirement to write the io unit with FLUSH/FUA
+ */
+struct r5l_io_unit {
+ struct r5l_log *log;
+
+ struct page *meta_page; /* store meta block */
+ int meta_offset; /* current offset in meta_page */
+
+ struct bio *current_bio;/* current_bio accepting new data */
+
+ atomic_t pending_stripe;/* how many stripes not flushed to raid */
+ u64 seq; /* seq number of the metablock */
+ sector_t log_start; /* where the io_unit starts */
+ sector_t log_end; /* where the io_unit ends */
+ struct list_head log_sibling; /* log->running_ios */
+ struct list_head stripe_list; /* stripes added to the io_unit */
+
+ int state;
+ bool need_split_bio;
+};
+
+/* r5l_io_unit state */
+enum r5l_io_unit_state {
+ IO_UNIT_RUNNING = 0, /* accepting new IO */
+ IO_UNIT_IO_START = 1, /* io_unit bio has started writing to the log,
+ * not accepting new bio */
+ IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */
+ IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
+};
+
+static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
+{
+ start += inc;
+ if (start >= log->device_size)
+ start = start - log->device_size;
+ return start;
+}
+
+static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
+ sector_t end)
+{
+ if (end >= start)
+ return end - start;
+ else
+ return end + log->device_size - start;
+}
+
+static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
+{
+ sector_t used_size;
+
+ used_size = r5l_ring_distance(log, log->last_checkpoint,
+ log->log_start);
+
+ return log->device_size > used_size + size;
+}
+
+static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
+{
+ __free_page(io->meta_page);
+ kmem_cache_free(log->io_kc, io);
+}
+
+static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
+ enum r5l_io_unit_state state)
+{
+ struct r5l_io_unit *io;
+
+ while (!list_empty(from)) {
+ io = list_first_entry(from, struct r5l_io_unit, log_sibling);
+ /* don't change list order */
+ if (io->state >= state)
+ list_move_tail(&io->log_sibling, to);
+ else
+ break;
+ }
+}
+
+static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
+ enum r5l_io_unit_state state)
+{
+ if (WARN_ON(io->state >= state))
+ return;
+ io->state = state;
+}
+
+static void r5l_io_run_stripes(struct r5l_io_unit *io)
+{
+ struct stripe_head *sh, *next;
+
+ list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
+ list_del_init(&sh->log_list);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ raid5_release_stripe(sh);
+ }
+}
+
+static void r5l_log_run_stripes(struct r5l_log *log)
+{
+ struct r5l_io_unit *io, *next;
+
+ assert_spin_locked(&log->io_list_lock);
+
+ list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
+ /* don't change list order */
+ if (io->state < IO_UNIT_IO_END)
+ break;
+
+ list_move_tail(&io->log_sibling, &log->finished_ios);
+ r5l_io_run_stripes(io);
+ }
+}
+
+static void r5l_log_endio(struct bio *bio)
+{
+ struct r5l_io_unit *io = bio->bi_private;
+ struct r5l_log *log = io->log;
+ unsigned long flags;
+
+ if (bio->bi_error)
+ md_error(log->rdev->mddev, log->rdev);
+
+ bio_put(bio);
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
+ if (log->need_cache_flush)
+ r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
+ IO_UNIT_IO_END);
+ else
+ r5l_log_run_stripes(log);
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+ if (log->need_cache_flush)
+ md_wakeup_thread(log->rdev->mddev->thread);
+}
+
+static void r5l_submit_current_io(struct r5l_log *log)
+{
+ struct r5l_io_unit *io = log->current_io;
+ struct r5l_meta_block *block;
+ unsigned long flags;
+ u32 crc;
+
+ if (!io)
+ return;
+
+ block = page_address(io->meta_page);
+ block->meta_size = cpu_to_le32(io->meta_offset);
+ crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
+ block->checksum = cpu_to_le32(crc);
+
+ log->current_io = NULL;
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+ submit_bio(WRITE, io->current_bio);
+}
+
+static struct bio *r5l_bio_alloc(struct r5l_log *log)
+{
+ struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
+
+ bio->bi_rw = WRITE;
+ bio->bi_bdev = log->rdev->bdev;
+ bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
+
+ return bio;
+}
+
+static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
+{
+ log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
+
+ /*
+ * If we filled up the log device start from the beginning again,
+ * which will require a new bio.
+ *
+ * Note: for this to work properly the log size needs to be a multiple
+ * of BLOCK_SECTORS.
+ */
+ if (log->log_start == 0)
+ io->need_split_bio = true;
+
+ io->log_end = log->log_start;
+}
+
+static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
+{
+ struct r5l_io_unit *io;
+ struct r5l_meta_block *block;
+
+ /* We can't handle memory allocation failure so far */
+ io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
+ io->log = log;
+ INIT_LIST_HEAD(&io->log_sibling);
+ INIT_LIST_HEAD(&io->stripe_list);
+ io->state = IO_UNIT_RUNNING;
+
+ io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO);
+ block = page_address(io->meta_page);
+ block->magic = cpu_to_le32(R5LOG_MAGIC);
+ block->version = R5LOG_VERSION;
+ block->seq = cpu_to_le64(log->seq);
+ block->position = cpu_to_le64(log->log_start);
+
+ io->log_start = log->log_start;
+ io->meta_offset = sizeof(struct r5l_meta_block);
+ io->seq = log->seq++;
+
+ io->current_bio = r5l_bio_alloc(log);
+ io->current_bio->bi_end_io = r5l_log_endio;
+ io->current_bio->bi_private = io;
+ bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
+
+ r5_reserve_log_entry(log, io);
+
+ spin_lock_irq(&log->io_list_lock);
+ list_add_tail(&io->log_sibling, &log->running_ios);
+ spin_unlock_irq(&log->io_list_lock);
+
+ return io;
+}
+
+static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
+{
+ if (log->current_io &&
+ log->current_io->meta_offset + payload_size > PAGE_SIZE)
+ r5l_submit_current_io(log);
+
+ if (!log->current_io)
+ log->current_io = r5l_new_meta(log);
+ return 0;
+}
+
+static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
+ sector_t location,
+ u32 checksum1, u32 checksum2,
+ bool checksum2_valid)
+{
+ struct r5l_io_unit *io = log->current_io;
+ struct r5l_payload_data_parity *payload;
+
+ payload = page_address(io->meta_page) + io->meta_offset;
+ payload->header.type = cpu_to_le16(type);
+ payload->header.flags = cpu_to_le16(0);
+ payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
+ (PAGE_SHIFT - 9));
+ payload->location = cpu_to_le64(location);
+ payload->checksum[0] = cpu_to_le32(checksum1);
+ if (checksum2_valid)
+ payload->checksum[1] = cpu_to_le32(checksum2);
+
+ io->meta_offset += sizeof(struct r5l_payload_data_parity) +
+ sizeof(__le32) * (1 + !!checksum2_valid);
+}
+
+static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
+{
+ struct r5l_io_unit *io = log->current_io;
+
+ if (io->need_split_bio) {
+ struct bio *prev = io->current_bio;
+
+ io->current_bio = r5l_bio_alloc(log);
+ bio_chain(io->current_bio, prev);
+
+ submit_bio(WRITE, prev);
+ }
+
+ if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
+ BUG();
+
+ r5_reserve_log_entry(log, io);
+}
+
+static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
+ int data_pages, int parity_pages)
+{
+ int i;
+ int meta_size;
+ struct r5l_io_unit *io;
+
+ meta_size =
+ ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
+ * data_pages) +
+ sizeof(struct r5l_payload_data_parity) +
+ sizeof(__le32) * parity_pages;
+
+ r5l_get_meta(log, meta_size);
+ io = log->current_io;
+
+ for (i = 0; i < sh->disks; i++) {
+ if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
+ continue;
+ if (i == sh->pd_idx || i == sh->qd_idx)
+ continue;
+ r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
+ raid5_compute_blocknr(sh, i, 0),
+ sh->dev[i].log_checksum, 0, false);
+ r5l_append_payload_page(log, sh->dev[i].page);
+ }
+
+ if (sh->qd_idx >= 0) {
+ r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
+ sh->sector, sh->dev[sh->pd_idx].log_checksum,
+ sh->dev[sh->qd_idx].log_checksum, true);
+ r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
+ r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
+ } else {
+ r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
+ sh->sector, sh->dev[sh->pd_idx].log_checksum,
+ 0, false);
+ r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
+ }
+
+ list_add_tail(&sh->log_list, &io->stripe_list);
+ atomic_inc(&io->pending_stripe);
+ sh->log_io = io;
+}
+
+static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+/*
+ * running in raid5d, where reclaim could wait for raid5d too (when it flushes
+ * data from log to raid disks), so we shouldn't wait for reclaim here
+ */
+int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+ int write_disks = 0;
+ int data_pages, parity_pages;
+ int meta_size;
+ int reserve;
+ int i;
+
+ if (!log)
+ return -EAGAIN;
+ /* Don't support stripe batch */
+ if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
+ test_bit(STRIPE_SYNCING, &sh->state)) {
+ /* the stripe is written to log, we start writing it to raid */
+ clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
+ return -EAGAIN;
+ }
+
+ for (i = 0; i < sh->disks; i++) {
+ void *addr;
+
+ if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
+ continue;
+ write_disks++;
+ /* checksum is already calculated in last run */
+ if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
+ continue;
+ addr = kmap_atomic(sh->dev[i].page);
+ sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
+ addr, PAGE_SIZE);
+ kunmap_atomic(addr);
+ }
+ parity_pages = 1 + !!(sh->qd_idx >= 0);
+ data_pages = write_disks - parity_pages;
+
+ meta_size =
+ ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
+ * data_pages) +
+ sizeof(struct r5l_payload_data_parity) +
+ sizeof(__le32) * parity_pages;
+ /* Doesn't work with very big raid array */
+ if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
+ return -EINVAL;
+
+ set_bit(STRIPE_LOG_TRAPPED, &sh->state);
+ /*
+ * The stripe must enter state machine again to finish the write, so
+ * don't delay.
+ */
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ atomic_inc(&sh->count);
+
+ mutex_lock(&log->io_mutex);
+ /* meta + data */
+ reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
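+	/*
+	 * Example with 4K pages (enforced in r5l_init_log), so
+	 * PAGE_SHIFT - 9 == 3: a full-stripe write on a 4-drive RAID5 has
+	 * write_disks == 4, so we reserve (1 + 4) << 3 == 40 sectors: one
+	 * page for the meta block plus one page per logged data/parity page.
+	 */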
+ if (r5l_has_free_space(log, reserve))
+ r5l_log_stripe(log, sh, data_pages, parity_pages);
+ else {
+ spin_lock(&log->no_space_stripes_lock);
+ list_add_tail(&sh->log_list, &log->no_space_stripes);
+ spin_unlock(&log->no_space_stripes_lock);
+
+ r5l_wake_reclaim(log, reserve);
+ }
+ mutex_unlock(&log->io_mutex);
+
+ return 0;
+}
+
+void r5l_write_stripe_run(struct r5l_log *log)
+{
+ if (!log)
+ return;
+ mutex_lock(&log->io_mutex);
+ r5l_submit_current_io(log);
+ mutex_unlock(&log->io_mutex);
+}
+
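+/*
+ * Flush contract used by make_request(): returning 0 means the flush was
+ * satisfied here (an empty flush bio completes immediately, because the log
+ * device cache is flushed before data moves to the raid disks), -ENODEV means
+ * there is no log and the caller should fall back to md_flush_request(), and
+ * -EAGAIN means REQ_FLUSH has been stripped and the bio should continue down
+ * the normal write path.
+ */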
+int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
+{
+ if (!log)
+ return -ENODEV;
+	/*
+	 * We flush the log disk cache first, then write stripe data to the
+	 * raid disks. So if this bio has completed, the log disk cache is
+	 * already flushed, and recovery guarantees we can recover the bio
+	 * from the log disk, so there is no need to flush again.
+	 */
+ if (bio->bi_iter.bi_size == 0) {
+ bio_endio(bio);
+ return 0;
+ }
+ bio->bi_rw &= ~REQ_FLUSH;
+ return -EAGAIN;
+}
+
+/* This will run after log space is reclaimed */
+static void r5l_run_no_space_stripes(struct r5l_log *log)
+{
+ struct stripe_head *sh;
+
+ spin_lock(&log->no_space_stripes_lock);
+ while (!list_empty(&log->no_space_stripes)) {
+ sh = list_first_entry(&log->no_space_stripes,
+ struct stripe_head, log_list);
+ list_del_init(&sh->log_list);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ raid5_release_stripe(sh);
+ }
+ spin_unlock(&log->no_space_stripes_lock);
+}
+
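+/*
+ * Reclaimable space is the ring distance from the last checkpoint to the
+ * next one. For example, assuming r5l_ring_distance() wraps modulo the
+ * device size: with device_size == 1024 sectors, last_checkpoint == 1000 and
+ * next_checkpoint == 100, the reclaimable region is the 124 wrapped sectors
+ * [1000, 1024) + [0, 100).
+ */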
+static sector_t r5l_reclaimable_space(struct r5l_log *log)
+{
+ return r5l_ring_distance(log, log->last_checkpoint,
+ log->next_checkpoint);
+}
+
+static bool r5l_complete_finished_ios(struct r5l_log *log)
+{
+ struct r5l_io_unit *io, *next;
+ bool found = false;
+
+ assert_spin_locked(&log->io_list_lock);
+
+ list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
+ /* don't change list order */
+ if (io->state < IO_UNIT_STRIPE_END)
+ break;
+
+ log->next_checkpoint = io->log_start;
+ log->next_cp_seq = io->seq;
+
+ list_del(&io->log_sibling);
+ r5l_free_io_unit(log, io);
+
+ found = true;
+ }
+
+ return found;
+}
+
+static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
+{
+ struct r5l_log *log = io->log;
+ unsigned long flags;
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
+
+ if (!r5l_complete_finished_ios(log)) {
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+ return;
+ }
+
+ if (r5l_reclaimable_space(log) > log->max_free_space)
+ r5l_wake_reclaim(log, 0);
+
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+ wake_up(&log->iounit_wait);
+}
+
+void r5l_stripe_write_finished(struct stripe_head *sh)
+{
+ struct r5l_io_unit *io;
+
+ io = sh->log_io;
+ sh->log_io = NULL;
+
+ if (io && atomic_dec_and_test(&io->pending_stripe))
+ __r5l_stripe_write_finished(io);
+}
+
+static void r5l_log_flush_endio(struct bio *bio)
+{
+ struct r5l_log *log = container_of(bio, struct r5l_log,
+ flush_bio);
+ unsigned long flags;
+ struct r5l_io_unit *io;
+
+ if (bio->bi_error)
+ md_error(log->rdev->mddev, log->rdev);
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ list_for_each_entry(io, &log->flushing_ios, log_sibling)
+ r5l_io_run_stripes(io);
+ list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+}
+
+/*
+ * Start dispatching IO to the raid disks.
+ *
+ * The log consists of io_units, each anchored by its meta block. One
+ * situation we must avoid: a broken meta block in the middle of the log
+ * prevents recovery from finding any meta block after it. So if an operation
+ * requires a meta block to be persistent in the log, every meta block before
+ * it must be persistent too. A case in point:
+ *
+ * Once a stripe's data/parity is in the log, we start writing the stripe to
+ * the raid disks; that data/parity must be persistent in the log before the
+ * write to the raid disks happens.
+ *
+ * The solution is to strictly maintain io_unit list order: we only write the
+ * stripes of an io_unit to the raid disks while that io_unit is the first
+ * one whose data/parity is persistent in the log.
+ */
+void r5l_flush_stripe_to_raid(struct r5l_log *log)
+{
+ bool do_flush;
+
+ if (!log || !log->need_cache_flush)
+ return;
+
+ spin_lock_irq(&log->io_list_lock);
+ /* flush bio is running */
+ if (!list_empty(&log->flushing_ios)) {
+ spin_unlock_irq(&log->io_list_lock);
+ return;
+ }
+ list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
+ do_flush = !list_empty(&log->flushing_ios);
+ spin_unlock_irq(&log->io_list_lock);
+
+ if (!do_flush)
+ return;
+ bio_reset(&log->flush_bio);
+ log->flush_bio.bi_bdev = log->rdev->bdev;
+ log->flush_bio.bi_end_io = r5l_log_flush_endio;
+ submit_bio(WRITE_FLUSH, &log->flush_bio);
+}
+
+static void r5l_write_super(struct r5l_log *log, sector_t cp);
+static void r5l_write_super_and_discard_space(struct r5l_log *log,
+ sector_t end)
+{
+ struct block_device *bdev = log->rdev->bdev;
+ struct mddev *mddev;
+
+ r5l_write_super(log, end);
+
+ if (!blk_queue_discard(bdev_get_queue(bdev)))
+ return;
+
+ mddev = log->rdev->mddev;
+	/*
+	 * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and
+	 * waits for this thread to finish. This thread waits for
+	 * MD_CHANGE_PENDING to clear, which is supposed to happen in
+	 * md_check_recovery(). md_check_recovery() tries to take
+	 * reconfig_mutex, but since r5l_quiesce already holds it,
+	 * md_check_recovery() fails and MD_CHANGE_PENDING never gets
+	 * cleared. The in_teardown check works around this issue.
+	 */
+ if (!log->in_teardown) {
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
+ md_wakeup_thread(mddev->thread);
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
+ log->in_teardown);
+		/*
+		 * r5l_quiesce could run after the in_teardown check and take
+		 * the mutex first, so the superblock might get updated twice.
+		 */
+ if (log->in_teardown)
+ md_update_sb(mddev, 1);
+ } else {
+ WARN_ON(!mddev_is_locked(mddev));
+ md_update_sb(mddev, 1);
+ }
+
+ /* discard IO error really doesn't matter, ignore it */
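+	/*
+	 * The reclaimed region runs from last_checkpoint to end in ring order
+	 * and may wrap: e.g. with device_size == 1024, last_checkpoint == 1000
+	 * and end == 100 we discard [1000, 1024) and [0, 100), both shifted by
+	 * data_offset on the underlying device.
+	 */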
+ if (log->last_checkpoint < end) {
+ blkdev_issue_discard(bdev,
+ log->last_checkpoint + log->rdev->data_offset,
+ end - log->last_checkpoint, GFP_NOIO, 0);
+ } else {
+ blkdev_issue_discard(bdev,
+ log->last_checkpoint + log->rdev->data_offset,
+ log->device_size - log->last_checkpoint,
+ GFP_NOIO, 0);
+ blkdev_issue_discard(bdev, log->rdev->data_offset, end,
+ GFP_NOIO, 0);
+ }
+}
+
+static void r5l_do_reclaim(struct r5l_log *log)
+{
+ sector_t reclaim_target = xchg(&log->reclaim_target, 0);
+ sector_t reclaimable;
+ sector_t next_checkpoint;
+ u64 next_cp_seq;
+
+ spin_lock_irq(&log->io_list_lock);
+	/*
+	 * Move the proper io_units to the reclaim list; we must not change the
+	 * order. Reclaimable and unreclaimable io_units can be mixed in the
+	 * list, and we shouldn't reuse the space of an unreclaimable io_unit.
+	 */
+ while (1) {
+ reclaimable = r5l_reclaimable_space(log);
+ if (reclaimable >= reclaim_target ||
+ (list_empty(&log->running_ios) &&
+ list_empty(&log->io_end_ios) &&
+ list_empty(&log->flushing_ios) &&
+ list_empty(&log->finished_ios)))
+ break;
+
+ md_wakeup_thread(log->rdev->mddev->thread);
+ wait_event_lock_irq(log->iounit_wait,
+ r5l_reclaimable_space(log) > reclaimable,
+ log->io_list_lock);
+ }
+
+ next_checkpoint = log->next_checkpoint;
+ next_cp_seq = log->next_cp_seq;
+ spin_unlock_irq(&log->io_list_lock);
+
+ BUG_ON(reclaimable < 0);
+ if (reclaimable == 0)
+ return;
+
+	/*
+	 * write_super will flush the cache of each raid disk. We must write
+	 * the super here, because the log area might be reused soon and we
+	 * don't want to confuse recovery.
+	 */
+ r5l_write_super_and_discard_space(log, next_checkpoint);
+
+ mutex_lock(&log->io_mutex);
+ log->last_checkpoint = next_checkpoint;
+ log->last_cp_seq = next_cp_seq;
+ mutex_unlock(&log->io_mutex);
+
+ r5l_run_no_space_stripes(log);
+}
+
+static void r5l_reclaim_thread(struct md_thread *thread)
+{
+ struct mddev *mddev = thread->mddev;
+ struct r5conf *conf = mddev->private;
+ struct r5l_log *log = conf->log;
+
+ if (!log)
+ return;
+ r5l_do_reclaim(log);
+}
+
+static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
+{
+ unsigned long target;
+ unsigned long new = (unsigned long)space; /* overflow in theory */
+
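+	/*
+	 * Keep reclaim_target monotonically non-decreasing: the cmpxchg loop
+	 * only ever raises it, so a concurrent caller cannot shrink a larger
+	 * pending request. r5l_quiesce() passes -1L to force reclaiming
+	 * everything.
+	 */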
+ do {
+ target = log->reclaim_target;
+ if (new < target)
+ return;
+ } while (cmpxchg(&log->reclaim_target, target, new) != target);
+ md_wakeup_thread(log->reclaim_thread);
+}
+
+void r5l_quiesce(struct r5l_log *log, int state)
+{
+ struct mddev *mddev;
+ if (!log || state == 2)
+ return;
+ if (state == 0) {
+ log->in_teardown = 0;
+ log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
+ log->rdev->mddev, "reclaim");
+ } else if (state == 1) {
+		/*
+		 * At this point all stripes are finished, so every io_unit
+		 * has reached at least the IO_UNIT_STRIPE_END state.
+		 */
+ log->in_teardown = 1;
+ /* make sure r5l_write_super_and_discard_space exits */
+ mddev = log->rdev->mddev;
+ wake_up(&mddev->sb_wait);
+ r5l_wake_reclaim(log, -1L);
+ md_unregister_thread(&log->reclaim_thread);
+ r5l_do_reclaim(log);
+ }
+}
+
+bool r5l_log_disk_error(struct r5conf *conf)
+{
+ /* don't allow write if journal disk is missing */
+ if (!conf->log)
+ return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
+ return test_bit(Faulty, &conf->log->rdev->flags);
+}
+
+struct r5l_recovery_ctx {
+ struct page *meta_page; /* current meta */
+ sector_t meta_total_blocks; /* total size of current meta and data */
+ sector_t pos; /* recovery position */
+ u64 seq; /* recovery position seq */
+};
+
+static int r5l_read_meta_block(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx)
+{
+ struct page *page = ctx->meta_page;
+ struct r5l_meta_block *mb;
+ u32 crc, stored_crc;
+
+ if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
+ return -EIO;
+
+ mb = page_address(page);
+ stored_crc = le32_to_cpu(mb->checksum);
+ mb->checksum = 0;
+
+ if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
+ le64_to_cpu(mb->seq) != ctx->seq ||
+ mb->version != R5LOG_VERSION ||
+ le64_to_cpu(mb->position) != ctx->pos)
+ return -EINVAL;
+
+ crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+ if (stored_crc != crc)
+ return -EINVAL;
+
+ if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
+ return -EINVAL;
+
+ ctx->meta_total_blocks = BLOCK_SECTORS;
+
+ return 0;
+}
+
+static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx,
+ sector_t stripe_sect,
+ int *offset, sector_t *log_offset)
+{
+ struct r5conf *conf = log->rdev->mddev->private;
+ struct stripe_head *sh;
+ struct r5l_payload_data_parity *payload;
+ int disk_index;
+
+ sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
+ while (1) {
+ payload = page_address(ctx->meta_page) + *offset;
+
+ if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
+ raid5_compute_sector(conf,
+ le64_to_cpu(payload->location), 0,
+ &disk_index, sh);
+
+ sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
+ sh->dev[disk_index].page, READ, false);
+ sh->dev[disk_index].log_checksum =
+ le32_to_cpu(payload->checksum[0]);
+ set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
+ ctx->meta_total_blocks += BLOCK_SECTORS;
+ } else {
+ disk_index = sh->pd_idx;
+ sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
+ sh->dev[disk_index].page, READ, false);
+ sh->dev[disk_index].log_checksum =
+ le32_to_cpu(payload->checksum[0]);
+ set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
+
+ if (sh->qd_idx >= 0) {
+ disk_index = sh->qd_idx;
+ sync_page_io(log->rdev,
+ r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
+ PAGE_SIZE, sh->dev[disk_index].page,
+ READ, false);
+ sh->dev[disk_index].log_checksum =
+ le32_to_cpu(payload->checksum[1]);
+ set_bit(R5_Wantwrite,
+ &sh->dev[disk_index].flags);
+ }
+ ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
+ }
+
+ *log_offset = r5l_ring_add(log, *log_offset,
+ le32_to_cpu(payload->size));
+ *offset += sizeof(struct r5l_payload_data_parity) +
+ sizeof(__le32) *
+ (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+ if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
+ break;
+ }
+
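+	/*
+	 * Verify before replay: every page just read back from the log must
+	 * match the checksum recorded in its payload entry; otherwise the
+	 * whole stripe is dropped (-EINVAL) rather than partially written to
+	 * the member disks below.
+	 */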
+ for (disk_index = 0; disk_index < sh->disks; disk_index++) {
+ void *addr;
+ u32 checksum;
+
+ if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
+ continue;
+ addr = kmap_atomic(sh->dev[disk_index].page);
+ checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
+ kunmap_atomic(addr);
+ if (checksum != sh->dev[disk_index].log_checksum)
+ goto error;
+ }
+
+ for (disk_index = 0; disk_index < sh->disks; disk_index++) {
+ struct md_rdev *rdev, *rrdev;
+
+ if (!test_and_clear_bit(R5_Wantwrite,
+ &sh->dev[disk_index].flags))
+ continue;
+
+ /* in case device is broken */
+ rdev = rcu_dereference(conf->disks[disk_index].rdev);
+ if (rdev)
+ sync_page_io(rdev, stripe_sect, PAGE_SIZE,
+ sh->dev[disk_index].page, WRITE, false);
+ rrdev = rcu_dereference(conf->disks[disk_index].replacement);
+ if (rrdev)
+ sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
+ sh->dev[disk_index].page, WRITE, false);
+ }
+ raid5_release_stripe(sh);
+ return 0;
+
+error:
+ for (disk_index = 0; disk_index < sh->disks; disk_index++)
+ sh->dev[disk_index].flags = 0;
+ raid5_release_stripe(sh);
+ return -EINVAL;
+}
+
+static int r5l_recovery_flush_one_meta(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx)
+{
+ struct r5conf *conf = log->rdev->mddev->private;
+ struct r5l_payload_data_parity *payload;
+ struct r5l_meta_block *mb;
+ int offset;
+ sector_t log_offset;
+ sector_t stripe_sector;
+
+ mb = page_address(ctx->meta_page);
+ offset = sizeof(struct r5l_meta_block);
+ log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+
+ while (offset < le32_to_cpu(mb->meta_size)) {
+ int dd;
+
+ payload = (void *)mb + offset;
+ stripe_sector = raid5_compute_sector(conf,
+ le64_to_cpu(payload->location), 0, &dd, NULL);
+ if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
+ &offset, &log_offset))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* copy data/parity from log to raid disks */
+static void r5l_recovery_flush_log(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx)
+{
+ while (1) {
+ if (r5l_read_meta_block(log, ctx))
+ return;
+ if (r5l_recovery_flush_one_meta(log, ctx))
+ return;
+ ctx->seq++;
+ ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
+ }
+}
+
+static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
+ u64 seq)
+{
+ struct page *page;
+ struct r5l_meta_block *mb;
+ u32 crc;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+ mb = page_address(page);
+ mb->magic = cpu_to_le32(R5LOG_MAGIC);
+ mb->version = R5LOG_VERSION;
+ mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
+ mb->seq = cpu_to_le64(seq);
+ mb->position = cpu_to_le64(pos);
+ crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+ mb->checksum = cpu_to_le32(crc);
+
+ if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
+ __free_page(page);
+ return -EIO;
+ }
+ __free_page(page);
+ return 0;
+}
+
+static int r5l_recovery_log(struct r5l_log *log)
+{
+ struct r5l_recovery_ctx ctx;
+
+ ctx.pos = log->last_checkpoint;
+ ctx.seq = log->last_cp_seq;
+ ctx.meta_page = alloc_page(GFP_KERNEL);
+ if (!ctx.meta_page)
+ return -ENOMEM;
+
+ r5l_recovery_flush_log(log, &ctx);
+ __free_page(ctx.meta_page);
+
+	/*
+	 * We did a recovery. Now ctx.pos points to an invalid meta block, and
+	 * the new log will start there. But we can't let the superblock simply
+	 * point to the last valid meta block. The log might look like:
+	 * | meta 1| meta 2| meta 3|
+	 * meta 1 is valid, meta 2 is invalid and meta 3 could still look
+	 * valid. If the superblock pointed to meta 1 and we wrote a new valid
+	 * meta 2n, then after another crash recovery would start from meta 1,
+	 * accept meta 2n, and then wrongly treat the stale meta 3 as valid.
+	 * The solution: create a new meta block in meta 2's slot with a seq
+	 * bumped well past meta 1's (seq + 10) and point the superblock at it.
+	 * Recovery then rejects meta 3, because its seq doesn't match.
+	 */
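+	/*
+	 * For example, if recovery stops with ctx.seq == 101, the empty meta
+	 * block at ctx.pos is written with seq 111 and log->seq resumes at
+	 * 112, so a stale block further on with seq 102 can no longer chain.
+	 */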
+ if (ctx.seq > log->last_cp_seq + 1) {
+ int ret;
+
+ ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
+ if (ret)
+ return ret;
+ log->seq = ctx.seq + 11;
+ log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
+ r5l_write_super(log, ctx.pos);
+ } else {
+ log->log_start = ctx.pos;
+ log->seq = ctx.seq;
+ }
+ return 0;
+}
+
+static void r5l_write_super(struct r5l_log *log, sector_t cp)
+{
+ struct mddev *mddev = log->rdev->mddev;
+
+ log->rdev->journal_tail = cp;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+}
+
+static int r5l_load_log(struct r5l_log *log)
+{
+ struct md_rdev *rdev = log->rdev;
+ struct page *page;
+ struct r5l_meta_block *mb;
+ sector_t cp = log->rdev->journal_tail;
+ u32 stored_crc, expected_crc;
+ bool create_super = false;
+ int ret;
+
+ /* Make sure it's valid */
+ if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
+ cp = 0;
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
+ ret = -EIO;
+ goto ioerr;
+ }
+ mb = page_address(page);
+
+ if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
+ mb->version != R5LOG_VERSION) {
+ create_super = true;
+ goto create;
+ }
+ stored_crc = le32_to_cpu(mb->checksum);
+ mb->checksum = 0;
+ expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+ if (stored_crc != expected_crc) {
+ create_super = true;
+ goto create;
+ }
+ if (le64_to_cpu(mb->position) != cp) {
+ create_super = true;
+ goto create;
+ }
+create:
+ if (create_super) {
+ log->last_cp_seq = prandom_u32();
+ cp = 0;
+		/*
+		 * Make sure the superblock points to the correct address. The
+		 * log might get data very soon; if the superblock doesn't have
+		 * the correct log tail address, recovery can't find the log.
+		 */
+ r5l_write_super(log, cp);
+ } else
+ log->last_cp_seq = le64_to_cpu(mb->seq);
+
+ log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
+ log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
+ if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
+ log->max_free_space = RECLAIM_MAX_FREE_SPACE;
+ log->last_checkpoint = cp;
+
+ __free_page(page);
+
+ return r5l_recovery_log(log);
+ioerr:
+ __free_page(page);
+ return ret;
+}
+
+int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
+{
+ struct r5l_log *log;
+
+ if (PAGE_SIZE != 4096)
+ return -EINVAL;
+ log = kzalloc(sizeof(*log), GFP_KERNEL);
+ if (!log)
+ return -ENOMEM;
+ log->rdev = rdev;
+
+ log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0);
+
+ log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
+ sizeof(rdev->mddev->uuid));
+
+ mutex_init(&log->io_mutex);
+
+ spin_lock_init(&log->io_list_lock);
+ INIT_LIST_HEAD(&log->running_ios);
+ INIT_LIST_HEAD(&log->io_end_ios);
+ INIT_LIST_HEAD(&log->flushing_ios);
+ INIT_LIST_HEAD(&log->finished_ios);
+ bio_init(&log->flush_bio);
+
+ log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
+ if (!log->io_kc)
+ goto io_kc;
+
+ log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
+ log->rdev->mddev, "reclaim");
+ if (!log->reclaim_thread)
+ goto reclaim_thread;
+ init_waitqueue_head(&log->iounit_wait);
+
+ INIT_LIST_HEAD(&log->no_space_stripes);
+ spin_lock_init(&log->no_space_stripes_lock);
+
+ if (r5l_load_log(log))
+ goto error;
+
+ conf->log = log;
+ return 0;
+error:
+ md_unregister_thread(&log->reclaim_thread);
+reclaim_thread:
+ kmem_cache_destroy(log->io_kc);
+io_kc:
+ kfree(log);
+ return -EINVAL;
+}
+
+void r5l_exit_log(struct r5l_log *log)
+{
+ md_unregister_thread(&log->reclaim_thread);
+ kmem_cache_destroy(log->io_kc);
+ kfree(log);
+}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 643d217..704ef7f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -223,18 +223,14 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
return slot;
}
-static void return_io(struct bio *return_bi)
+static void return_io(struct bio_list *return_bi)
{
- struct bio *bi = return_bi;
- while (bi) {
-
- return_bi = bi->bi_next;
- bi->bi_next = NULL;
+ struct bio *bi;
+ while ((bi = bio_list_pop(return_bi)) != NULL) {
bi->bi_iter.bi_size = 0;
trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
bi, 0);
- bio_endio(bi, 0);
- bi = return_bi;
+ bio_endio(bi);
}
}
@@ -357,7 +353,7 @@ static void release_inactive_stripe_list(struct r5conf *conf,
struct list_head *list = &temp_inactive_list[size - 1];
/*
- * We don't hold any lock here yet, get_active_stripe() might
+ * We don't hold any lock here yet, raid5_get_active_stripe() might
* remove stripes from the list
*/
if (!list_empty_careful(list)) {
@@ -417,7 +413,7 @@ static int release_stripe_list(struct r5conf *conf,
return count;
}
-static void release_stripe(struct stripe_head *sh)
+void raid5_release_stripe(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
unsigned long flags;
@@ -662,9 +658,9 @@ static int has_failed(struct r5conf *conf)
return 0;
}
-static struct stripe_head *
-get_active_stripe(struct r5conf *conf, sector_t sector,
- int previous, int noblock, int noquiesce)
+struct stripe_head *
+raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
+ int previous, int noblock, int noquiesce)
{
struct stripe_head *sh;
int hash = stripe_hash_locks_hash(sector);
@@ -759,6 +755,10 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
/* Only freshly new full stripe normal write stripe can be added to a batch list */
static bool stripe_can_batch(struct stripe_head *sh)
{
+ struct r5conf *conf = sh->raid_conf;
+
+ if (conf->log)
+ return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
is_full_stripe_write(sh);
@@ -862,7 +862,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
unlock_out:
unlock_two_stripes(head, sh);
out:
- release_stripe(head);
+ raid5_release_stripe(head);
}
/* Determine if 'data_offset' or 'new_data_offset' should be used
@@ -887,9 +887,9 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
}
static void
-raid5_end_read_request(struct bio *bi, int error);
+raid5_end_read_request(struct bio *bi);
static void
-raid5_end_write_request(struct bio *bi, int error);
+raid5_end_write_request(struct bio *bi);
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
@@ -899,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
might_sleep();
+ if (r5l_write_stripe(conf->log, sh) == 0)
+ return;
for (i = disks; i--; ) {
int rw;
int replace_only = 0;
@@ -1177,7 +1179,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
static void ops_complete_biofill(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
- struct bio *return_bi = NULL;
+ struct bio_list return_bi = BIO_EMPTY_LIST;
int i;
pr_debug("%s: stripe %llu\n", __func__,
@@ -1201,20 +1203,18 @@ static void ops_complete_biofill(void *stripe_head_ref)
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
- if (!raid5_dec_bi_active_stripes(rbi)) {
- rbi->bi_next = return_bi;
- return_bi = rbi;
- }
+ if (!raid5_dec_bi_active_stripes(rbi))
+ bio_list_add(&return_bi, rbi);
rbi = rbi2;
}
}
}
clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
- return_io(return_bi);
+ return_io(&return_bi);
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
static void ops_run_biofill(struct stripe_head *sh)
@@ -1277,7 +1277,7 @@ static void ops_complete_compute(void *stripe_head_ref)
if (sh->check_state == check_state_compute_run)
sh->check_state = check_state_compute_result;
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
/* return a pointer to the address conversion region of the scribble buffer */
@@ -1703,7 +1703,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
}
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
static void
@@ -1861,7 +1861,7 @@ static void ops_complete_check(void *stripe_head_ref)
sh->check_state = check_state_check_result;
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
@@ -2023,7 +2023,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
/* we just created an active stripe so... */
atomic_inc(&conf->active_stripes);
- release_stripe(sh);
+ raid5_release_stripe(sh);
conf->max_nr_stripes++;
return 1;
}
@@ -2242,7 +2242,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
if (!p)
err = -ENOMEM;
}
- release_stripe(nsh);
+ raid5_release_stripe(nsh);
}
/* critical section pass, GFP_NOIO no longer needed */
@@ -2256,7 +2256,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
static int drop_one_stripe(struct r5conf *conf)
{
struct stripe_head *sh;
- int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
+ int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
spin_lock_irq(conf->hash_locks + hash);
sh = get_free_stripe(conf, hash);
@@ -2277,17 +2277,15 @@ static void shrink_stripes(struct r5conf *conf)
drop_one_stripe(conf))
;
- if (conf->slab_cache)
- kmem_cache_destroy(conf->slab_cache);
+ kmem_cache_destroy(conf->slab_cache);
conf->slab_cache = NULL;
}
-static void raid5_end_read_request(struct bio * bi, int error)
+static void raid5_end_read_request(struct bio * bi)
{
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
- int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
char b[BDEVNAME_SIZE];
struct md_rdev *rdev = NULL;
sector_t s;
@@ -2296,9 +2294,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
if (bi == &sh->dev[i].req)
break;
- pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
+ pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
- uptodate);
+ bi->bi_error);
if (i == disks) {
BUG();
return;
@@ -2317,7 +2315,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
s = sh->sector + rdev->new_data_offset;
else
s = sh->sector + rdev->data_offset;
- if (uptodate) {
+ if (!bi->bi_error) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
/* Note that this cannot happen on a
@@ -2402,16 +2400,15 @@ static void raid5_end_read_request(struct bio * bi, int error)
rdev_dec_pending(rdev, conf->mddev);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
-static void raid5_end_write_request(struct bio *bi, int error)
+static void raid5_end_write_request(struct bio *bi)
{
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
struct md_rdev *uninitialized_var(rdev);
- int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
sector_t first_bad;
int bad_sectors;
int replacement = 0;
@@ -2434,23 +2431,23 @@ static void raid5_end_write_request(struct bio *bi, int error)
break;
}
}
- pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
+ pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
- uptodate);
+ bi->bi_error);
if (i == disks) {
BUG();
return;
}
if (replacement) {
- if (!uptodate)
+ if (bi->bi_error)
md_error(conf->mddev, rdev);
else if (is_badblock(rdev, sh->sector,
STRIPE_SECTORS,
&first_bad, &bad_sectors))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
- if (!uptodate) {
+ if (bi->bi_error) {
set_bit(STRIPE_DEGRADED, &sh->state);
set_bit(WriteErrorSeen, &rdev->flags);
set_bit(R5_WriteError, &sh->dev[i].flags);
@@ -2471,20 +2468,18 @@ static void raid5_end_write_request(struct bio *bi, int error)
}
rdev_dec_pending(rdev, conf->mddev);
- if (sh->batch_head && !uptodate && !replacement)
+ if (sh->batch_head && bi->bi_error && !replacement)
set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
if (sh->batch_head && sh != sh->batch_head)
- release_stripe(sh->batch_head);
+ raid5_release_stripe(sh->batch_head);
}
-static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
-
static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{
struct r5dev *dev = &sh->dev[i];
@@ -2500,7 +2495,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
dev->rreq.bi_private = sh;
dev->flags = 0;
- dev->sector = compute_blocknr(sh, i, previous);
+ dev->sector = raid5_compute_blocknr(sh, i, previous);
}
static void error(struct mddev *mddev, struct md_rdev *rdev)
@@ -2519,6 +2514,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
printk(KERN_ALERT
"md/raid:%s: Disk failure on %s, disabling device.\n"
"md/raid:%s: Operation continuing on %d devices.\n",
@@ -2532,9 +2528,9 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
* Input: a 'big' sector number,
* Output: index of the data and parity disk, and the sector # in them.
*/
-static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
- int previous, int *dd_idx,
- struct stripe_head *sh)
+sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
+ int previous, int *dd_idx,
+ struct stripe_head *sh)
{
sector_t stripe, stripe2;
sector_t chunk_number;
@@ -2734,7 +2730,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
return new_sector;
}
-static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
+sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
{
struct r5conf *conf = sh->raid_conf;
int raid_disks = sh->disks;
@@ -3071,7 +3067,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
static void
handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks,
- struct bio **return_bi)
+ struct bio_list *return_bi)
{
int i;
BUG_ON(sh->batch_head);
@@ -3106,17 +3102,19 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi)
bitmap_end = 1;
+ r5l_stripe_write_finished(sh);
+
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+ bi->bi_error = -EIO;
if (!raid5_dec_bi_active_stripes(bi)) {
md_write_end(conf->mddev);
- bi->bi_next = *return_bi;
- *return_bi = bi;
+ bio_list_add(return_bi, bi);
}
bi = nextbi;
}
@@ -3136,11 +3134,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+ bi->bi_error = -EIO;
if (!raid5_dec_bi_active_stripes(bi)) {
md_write_end(conf->mddev);
- bi->bi_next = *return_bi;
- *return_bi = bi;
+ bio_list_add(return_bi, bi);
}
bi = bi2;
}
@@ -3149,6 +3147,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
* the data has not reached the cache yet.
*/
if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
+ s->failed > conf->max_degraded &&
(!test_bit(R5_Insync, &sh->dev[i].flags) ||
test_bit(R5_ReadError, &sh->dev[i].flags))) {
spin_lock_irq(&sh->stripe_lock);
@@ -3157,15 +3156,16 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
spin_unlock_irq(&sh->stripe_lock);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
+ if (bi)
+ s->to_read--;
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi =
r5_next_bio(bi, sh->dev[i].sector);
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
- if (!raid5_dec_bi_active_stripes(bi)) {
- bi->bi_next = *return_bi;
- *return_bi = bi;
- }
+
+ bi->bi_error = -EIO;
+ if (!raid5_dec_bi_active_stripes(bi))
+ bio_list_add(return_bi, bi);
bi = nextbi;
}
}
@@ -3177,6 +3177,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
*/
clear_bit(R5_LOCKED, &sh->dev[i].flags);
}
+ s->to_write = 0;
+ s->written = 0;
if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
if (atomic_dec_and_test(&conf->pending_full_writes))
@@ -3308,7 +3310,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
*/
return 0;
- for (i = 0; i < s->failed; i++) {
+ for (i = 0; i < s->failed && i < 2; i++) {
if (fdev[i]->towrite &&
!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
!test_bit(R5_OVERWRITE, &fdev[i]->flags))
@@ -3332,7 +3334,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
sh->sector < sh->raid_conf->mddev->recovery_cp)
/* reconstruct-write isn't being forced */
return 0;
- for (i = 0; i < s->failed; i++) {
+ for (i = 0; i < s->failed && i < 2; i++) {
if (s->failed_num[i] != sh->pd_idx &&
s->failed_num[i] != sh->qd_idx &&
!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
@@ -3444,7 +3446,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
* never LOCKED, so we don't need to test 'failed' directly.
*/
static void handle_stripe_clean_event(struct r5conf *conf,
- struct stripe_head *sh, int disks, struct bio **return_bi)
+ struct stripe_head *sh, int disks, struct bio_list *return_bi)
{
int i;
struct r5dev *dev;
@@ -3478,8 +3480,7 @@ returnbi:
wbi2 = r5_next_bio(wbi, dev->sector);
if (!raid5_dec_bi_active_stripes(wbi)) {
md_write_end(conf->mddev);
- wbi->bi_next = *return_bi;
- *return_bi = wbi;
+ bio_list_add(return_bi, wbi);
}
wbi = wbi2;
}
@@ -3503,8 +3504,12 @@ returnbi:
WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
WARN_ON(dev->page != dev->orig_page);
}
+
+ r5l_stripe_write_finished(sh);
+
if (!discard_pending &&
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
+ int hash;
clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
if (sh->qd_idx >= 0) {
@@ -3518,16 +3523,17 @@ returnbi:
* no updated data, so remove it from hash list and the stripe
* will be reinitialized
*/
- spin_lock_irq(&conf->device_lock);
unhash:
+ hash = sh->hash_lock_index;
+ spin_lock_irq(conf->hash_locks + hash);
remove_hash(sh);
+ spin_unlock_irq(conf->hash_locks + hash);
if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head, batch_list);
if (sh != head_sh)
goto unhash;
}
- spin_unlock_irq(&conf->device_lock);
sh = head_sh;
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
@@ -3943,10 +3949,10 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
struct stripe_head *sh2;
struct async_submit_ctl submit;
- sector_t bn = compute_blocknr(sh, i, 1);
+ sector_t bn = raid5_compute_blocknr(sh, i, 1);
sector_t s = raid5_compute_sector(conf, bn, 0,
&dd_idx, NULL);
- sh2 = get_active_stripe(conf, s, 0, 1, 1);
+ sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
if (sh2 == NULL)
/* so far only the early blocks of this stripe
* have been requested. When later blocks
@@ -3956,7 +3962,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
/* must have already done this block */
- release_stripe(sh2);
+ raid5_release_stripe(sh2);
continue;
}
@@ -3977,7 +3983,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
set_bit(STRIPE_EXPAND_READY, &sh2->state);
set_bit(STRIPE_HANDLE, &sh2->state);
}
- release_stripe(sh2);
+ raid5_release_stripe(sh2);
}
/* done submitting copies, wait for them to complete */
@@ -4012,6 +4018,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
s->failed_num[0] = -1;
s->failed_num[1] = -1;
+ s->log_failed = r5l_log_disk_error(conf);
/* Now to look around and see what can be done */
rcu_read_lock();
@@ -4263,7 +4270,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
if (handle_flags == 0 ||
sh->state & handle_flags)
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
spin_lock_irq(&head_sh->stripe_lock);
head_sh->batch_head = NULL;
@@ -4324,6 +4331,9 @@ static void handle_stripe(struct stripe_head *sh)
analyse_stripe(sh, &s);
+ if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
+ goto finish;
+
if (s.handle_bad_blocks) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
@@ -4352,7 +4362,7 @@ static void handle_stripe(struct stripe_head *sh)
/* check if the array has lost more than max_degraded devices and,
* if so, some requests might need to be failed.
*/
- if (s.failed > conf->max_degraded) {
+ if (s.failed > conf->max_degraded || s.log_failed) {
sh->check_state = 0;
sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0);
@@ -4510,7 +4520,7 @@ static void handle_stripe(struct stripe_head *sh)
/* Finish reconstruct operations initiated by the expansion process */
if (sh->reconstruct_state == reconstruct_state_result) {
struct stripe_head *sh_src
- = get_active_stripe(conf, sh->sector, 1, 1, 1);
+ = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
/* sh cannot be written until sh_src has been read.
* so arrange for sh to be delayed a little
@@ -4520,11 +4530,11 @@ static void handle_stripe(struct stripe_head *sh)
if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
&sh_src->state))
atomic_inc(&conf->preread_active_stripes);
- release_stripe(sh_src);
+ raid5_release_stripe(sh_src);
goto finish;
}
if (sh_src)
- release_stripe(sh_src);
+ raid5_release_stripe(sh_src);
sh->reconstruct_state = reconstruct_state_idle;
clear_bit(STRIPE_EXPANDING, &sh->state);
@@ -4612,7 +4622,15 @@ finish:
md_wakeup_thread(conf->mddev->thread);
}
- return_io(s.return_bi);
+ if (!bio_list_empty(&s.return_bi)) {
+ if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
+ spin_lock_irq(&conf->device_lock);
+ bio_list_merge(&conf->return_bi, &s.return_bi);
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else
+ return_io(&s.return_bi);
+ }
clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
}
@@ -4669,43 +4687,14 @@ static int raid5_congested(struct mddev *mddev, int bits)
return 0;
}
-/* We want read requests to align with chunks where possible,
- * but write requests don't need to.
- */
-static int raid5_mergeable_bvec(struct mddev *mddev,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- int max;
- unsigned int chunk_sectors = mddev->chunk_sectors;
- unsigned int bio_sectors = bvm->bi_size >> 9;
-
- /*
- * always allow writes to be mergeable, read as well if array
- * is degraded as we'll go through stripe cache anyway.
- */
- if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
- return biovec->bv_len;
-
- if (mddev->new_chunk_sectors < mddev->chunk_sectors)
- chunk_sectors = mddev->new_chunk_sectors;
- max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
- if (max < 0) max = 0;
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- else
- return max;
-}
-
static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{
+ struct r5conf *conf = mddev->private;
sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
- unsigned int chunk_sectors = mddev->chunk_sectors;
+ unsigned int chunk_sectors;
unsigned int bio_sectors = bio_sectors(bio);
- if (mddev->new_chunk_sectors < mddev->chunk_sectors)
- chunk_sectors = mddev->new_chunk_sectors;
+ chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
return chunk_sectors >=
((sector & (chunk_sectors - 1)) + bio_sectors);
}
@@ -4756,13 +4745,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
* first).
* If the read failed..
*/
-static void raid5_align_endio(struct bio *bi, int error)
+static void raid5_align_endio(struct bio *bi)
{
struct bio* raid_bi = bi->bi_private;
struct mddev *mddev;
struct r5conf *conf;
- int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
struct md_rdev *rdev;
+ int error = bi->bi_error;
bio_put(bi);
@@ -4773,10 +4762,10 @@ static void raid5_align_endio(struct bio *bi, int error)
rdev_dec_pending(rdev, conf->mddev);
- if (!error && uptodate) {
+ if (!error) {
trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
raid_bi, 0);
- bio_endio(raid_bi, 0);
+ bio_endio(raid_bi);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
return;
@@ -4787,26 +4776,7 @@ static void raid5_align_endio(struct bio *bi, int error)
add_bio_to_retry(raid_bi, conf);
}
-static int bio_fits_rdev(struct bio *bi)
-{
- struct request_queue *q = bdev_get_queue(bi->bi_bdev);
-
- if (bio_sectors(bi) > queue_max_sectors(q))
- return 0;
- blk_recount_segments(q, bi);
- if (bi->bi_phys_segments > queue_max_segments(q))
- return 0;
-
- if (q->merge_bvec_fn)
- /* it's too hard to apply the merge_bvec_fn at this stage,
- * just just give up
- */
- return 0;
-
- return 1;
-}
-
-static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
+static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
{
struct r5conf *conf = mddev->private;
int dd_idx;
@@ -4815,7 +4785,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
sector_t end_sector;
if (!in_chunk_boundary(mddev, raid_bio)) {
- pr_debug("chunk_aligned_read : non aligned\n");
+ pr_debug("%s: non aligned\n", __func__);
return 0;
}
/*
@@ -4857,13 +4827,11 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
rcu_read_unlock();
raid_bio->bi_next = (void*)rdev;
align_bi->bi_bdev = rdev->bdev;
- __clear_bit(BIO_SEG_VALID, &align_bi->bi_flags);
+ bio_clear_flag(align_bi, BIO_SEG_VALID);
- if (!bio_fits_rdev(align_bi) ||
- is_badblock(rdev, align_bi->bi_iter.bi_sector,
+ if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
bio_sectors(align_bi),
&first_bad, &bad_sectors)) {
- /* too big in some way, or has a known bad block */
bio_put(align_bi);
rdev_dec_pending(rdev, mddev);
return 0;
@@ -4892,6 +4860,31 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
}
}
+static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
+{
+ struct bio *split;
+
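+	/*
+	 * Split the read at chunk boundaries and try each piece with
+	 * raid5_read_one_chunk(). If a piece cannot take the aligned fast
+	 * path, the not-yet-split remainder is resubmitted via
+	 * generic_make_request() and the failing piece is returned so
+	 * make_request() sends it through the stripe cache; returning NULL
+	 * means the whole bio was dispatched as aligned reads.
+	 */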
+ do {
+ sector_t sector = raid_bio->bi_iter.bi_sector;
+ unsigned chunk_sects = mddev->chunk_sectors;
+ unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
+
+ if (sectors < bio_sectors(raid_bio)) {
+ split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
+ bio_chain(split, raid_bio);
+ } else
+ split = raid_bio;
+
+ if (!raid5_read_one_chunk(mddev, split)) {
+ if (split != raid_bio)
+ generic_make_request(raid_bio);
+ return split;
+ }
+ } while (split != raid_bio);
+
+ return NULL;
+}
+
/* __get_priority_stripe - get the next stripe to process
*
* Full stripe writes are allowed to pass preread active stripes up until
@@ -5033,7 +5026,7 @@ static void release_stripe_plug(struct mddev *mddev,
struct raid5_plug_cb *cb;
if (!blk_cb) {
- release_stripe(sh);
+ raid5_release_stripe(sh);
return;
}
@@ -5049,7 +5042,7 @@ static void release_stripe_plug(struct mddev *mddev,
if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
list_add_tail(&sh->lru, &cb->list);
else
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
static void make_discard_request(struct mddev *mddev, struct bio *bi)
@@ -5084,12 +5077,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
DEFINE_WAIT(w);
int d;
again:
- sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+ sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
prepare_to_wait(&conf->wait_for_overlap, &w,
TASK_UNINTERRUPTIBLE);
set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
if (test_bit(STRIPE_SYNCING, &sh->state)) {
- release_stripe(sh);
+ raid5_release_stripe(sh);
schedule();
goto again;
}
@@ -5101,7 +5094,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
if (sh->dev[d].towrite || sh->dev[d].toread) {
set_bit(R5_Overlap, &sh->dev[d].flags);
spin_unlock_irq(&sh->stripe_lock);
- release_stripe(sh);
+ raid5_release_stripe(sh);
schedule();
goto again;
}
@@ -5140,7 +5133,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
remaining = raid5_dec_bi_active_stripes(bi);
if (remaining == 0) {
md_write_end(mddev);
- bio_endio(bi, 0);
+ bio_endio(bi);
}
}
@@ -5157,8 +5150,15 @@ static void make_request(struct mddev *mddev, struct bio * bi)
bool do_prepare;
if (unlikely(bi->bi_rw & REQ_FLUSH)) {
- md_flush_request(mddev, bi);
- return;
+ int ret = r5l_handle_flush_request(conf->log, bi);
+
+ if (ret == 0)
+ return;
+ if (ret == -ENODEV) {
+ md_flush_request(mddev, bi);
+ return;
+ }
+ /* ret == -EAGAIN, fallback */
}
md_write_start(mddev, bi);
@@ -5169,9 +5169,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
* data on failed drives.
*/
if (rw == READ && mddev->degraded == 0 &&
- mddev->reshape_position == MaxSector &&
- chunk_aligned_read(mddev,bi))
- return;
+ mddev->reshape_position == MaxSector) {
+ bi = chunk_aligned_read(mddev, bi);
+ if (!bi)
+ return;
+ }
if (unlikely(bi->bi_rw & REQ_DISCARD)) {
make_discard_request(mddev, bi);
@@ -5229,7 +5231,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
(unsigned long long)new_sector,
(unsigned long long)logical_sector);
- sh = get_active_stripe(conf, new_sector, previous,
+ sh = raid5_get_active_stripe(conf, new_sector, previous,
(bi->bi_rw&RWA_MASK), 0);
if (sh) {
if (unlikely(previous)) {
@@ -5250,7 +5252,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
must_retry = 1;
spin_unlock_irq(&conf->device_lock);
if (must_retry) {
- release_stripe(sh);
+ raid5_release_stripe(sh);
schedule();
do_prepare = true;
goto retry;
@@ -5260,14 +5262,14 @@ static void make_request(struct mddev *mddev, struct bio * bi)
/* Might have got the wrong stripe_head
* by accident
*/
- release_stripe(sh);
+ raid5_release_stripe(sh);
goto retry;
}
if (rw == WRITE &&
logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) {
- release_stripe(sh);
+ raid5_release_stripe(sh);
/* As the suspend_* range is controlled by
* userspace, we want an interruptible
* wait.
@@ -5290,7 +5292,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
* and wait a while
*/
md_wakeup_thread(mddev->thread);
- release_stripe(sh);
+ raid5_release_stripe(sh);
schedule();
do_prepare = true;
goto retry;
@@ -5304,7 +5306,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
release_stripe_plug(mddev, sh);
} else {
/* cannot get stripe for read-ahead, just give-up */
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+ bi->bi_error = -EIO;
break;
}
}
@@ -5318,7 +5320,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
bi, 0);
- bio_endio(bi, 0);
+ bio_endio(bi);
}
}
@@ -5347,6 +5349,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sector_t stripe_addr;
int reshape_sectors;
struct list_head stripes;
+ sector_t retn;
if (sector_nr == 0) {
/* If restarting in the middle, skip the initial sectors */
@@ -5354,6 +5357,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
conf->reshape_progress < raid5_size(mddev, 0, 0)) {
sector_nr = raid5_size(mddev, 0, 0)
- conf->reshape_progress;
+ } else if (mddev->reshape_backwards &&
+ conf->reshape_progress == MaxSector) {
+ /* shouldn't happen, but just in case, finish up.*/
+ sector_nr = MaxSector;
} else if (!mddev->reshape_backwards &&
conf->reshape_progress > 0)
sector_nr = conf->reshape_progress;
@@ -5362,7 +5369,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
mddev->curr_resync_completed = sector_nr;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
*skipped = 1;
- return sector_nr;
+ retn = sector_nr;
+ goto finish;
}
}
@@ -5370,10 +5378,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
* If old and new chunk sizes differ, we need to process the
* largest of these
*/
- if (mddev->new_chunk_sectors > mddev->chunk_sectors)
- reshape_sectors = mddev->new_chunk_sectors;
- else
- reshape_sectors = mddev->chunk_sectors;
+
+ reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
/* We update the metadata at least every 10 seconds, or when
* the data about to be copied would over-write the source of
@@ -5388,11 +5394,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
safepos = conf->reshape_safe;
sector_div(safepos, data_disks);
if (mddev->reshape_backwards) {
- writepos -= min_t(sector_t, reshape_sectors, writepos);
+ BUG_ON(writepos < reshape_sectors);
+ writepos -= reshape_sectors;
readpos += reshape_sectors;
safepos += reshape_sectors;
} else {
writepos += reshape_sectors;
+ /* readpos and safepos are worst-case calculations.
+ * A negative number is overly pessimistic, and causes
+ * obvious problems for unsigned storage. So clip to 0.
+ */
readpos -= min_t(sector_t, reshape_sectors, readpos);
safepos -= min_t(sector_t, reshape_sectors, safepos);
}
@@ -5468,7 +5479,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
int j;
int skipped_disk = 0;
- sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
+ sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
set_bit(STRIPE_EXPANDING, &sh->state);
atomic_inc(&conf->reshape_stripes);
/* If any of this stripe is beyond the end of the old
@@ -5481,7 +5492,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (conf->level == 6 &&
j == sh->qd_idx)
continue;
- s = compute_blocknr(sh, j, 0);
+ s = raid5_compute_blocknr(sh, j, 0);
if (s < raid5_size(mddev, 0, 0)) {
skipped_disk = 1;
continue;
@@ -5517,10 +5528,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (last_sector >= mddev->dev_sectors)
last_sector = mddev->dev_sectors - 1;
while (first_sector <= last_sector) {
- sh = get_active_stripe(conf, first_sector, 1, 0, 1);
+ sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
first_sector += STRIPE_SECTORS;
}
/* Now that the sources are clearly marked, we can release
@@ -5529,13 +5540,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
while (!list_empty(&stripes)) {
sh = list_entry(stripes.next, struct stripe_head, lru);
list_del_init(&sh->lru);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
/* If this takes us to the resync_max point where we have to pause,
* then we need to write out the superblock.
*/
sector_nr += reshape_sectors;
- if ((sector_nr - mddev->curr_resync_completed) * 2
+ retn = reshape_sectors;
+finish:
+ if (mddev->curr_resync_completed > mddev->resync_max ||
+ (sector_nr - mddev->curr_resync_completed) * 2
>= mddev->resync_max - mddev->curr_resync_completed) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
@@ -5560,7 +5574,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
ret:
- return reshape_sectors;
+ return retn;
}
static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
@@ -5622,11 +5636,11 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
}
- bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
- sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
+ sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
if (sh == NULL) {
- sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
+ sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
/* make sure we don't swamp the stripe cache if someone else
* is trying to get access
*/
@@ -5650,7 +5664,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
return STRIPE_SECTORS;
}
@@ -5689,7 +5703,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
/* already done this stripe */
continue;
- sh = get_active_stripe(conf, sector, 0, 1, 1);
+ sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
if (!sh) {
/* failed to get a stripe - must wait */
@@ -5699,7 +5713,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
}
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
- release_stripe(sh);
+ raid5_release_stripe(sh);
raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
return handled;
@@ -5707,14 +5721,14 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
handle_stripe(sh);
- release_stripe(sh);
+ raid5_release_stripe(sh);
handled++;
}
remaining = raid5_dec_bi_active_stripes(raid_bio);
if (remaining == 0) {
trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
raid_bio, 0);
- bio_endio(raid_bio, 0);
+ bio_endio(raid_bio);
}
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
@@ -5737,8 +5751,12 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
if (!list_empty(temp_inactive_list + i))
break;
- if (i == NR_STRIPE_HASH_LOCKS)
+ if (i == NR_STRIPE_HASH_LOCKS) {
+ spin_unlock_irq(&conf->device_lock);
+ r5l_flush_stripe_to_raid(conf->log);
+ spin_lock_irq(&conf->device_lock);
return batch_size;
+ }
release_inactive = true;
}
spin_unlock_irq(&conf->device_lock);
@@ -5746,6 +5764,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
release_inactive_stripe_list(conf, temp_inactive_list,
NR_STRIPE_HASH_LOCKS);
+ r5l_flush_stripe_to_raid(conf->log);
if (release_inactive) {
spin_lock_irq(&conf->device_lock);
return 0;
@@ -5753,6 +5772,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < batch_size; i++)
handle_stripe(batch[i]);
+ r5l_write_stripe_run(conf->log);
cond_resched();
@@ -5816,6 +5836,18 @@ static void raid5d(struct md_thread *thread)
md_check_recovery(mddev);
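+	/*
+	 * Bios parked on conf->return_bi were failed while MD_CHANGE_PENDING
+	 * was set (see handle_stripe()); complete them only once the flag has
+	 * cleared, so the error is not reported to the caller before the
+	 * superblock recording the failure has been written.
+	 */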
+ if (!bio_list_empty(&conf->return_bi) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ struct bio_list tmp = BIO_EMPTY_LIST;
+ spin_lock_irq(&conf->device_lock);
+ if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ bio_list_merge(&tmp, &conf->return_bi);
+ bio_list_init(&conf->return_bi);
+ }
+ spin_unlock_irq(&conf->device_lock);
+ return_io(&tmp);
+ }
+
blk_start_plug(&plug);
handled = 0;
spin_lock_irq(&conf->device_lock);
@@ -5874,6 +5906,8 @@ static void raid5d(struct md_thread *thread)
mutex_unlock(&conf->cache_size_mutex);
}
+ r5l_flush_stripe_to_raid(conf->log);
+
async_tx_issue_pending_all();
blk_finish_plug(&plug);
@@ -6256,8 +6290,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
/* size is defined by the smallest of previous and new size */
raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
- sectors &= ~((sector_t)mddev->chunk_sectors - 1);
- sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
+ sectors &= ~((sector_t)conf->chunk_sectors - 1);
+ sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
return sectors * (raid_disks - conf->max_degraded);
}
@@ -6311,8 +6345,11 @@ static void raid5_free_percpu(struct r5conf *conf)
static void free_conf(struct r5conf *conf)
{
+ if (conf->log)
+ r5l_exit_log(conf->log);
if (conf->shrinker.seeks)
unregister_shrinker(&conf->shrinker);
+
free_thread_groups(conf);
shrink_stripes(conf);
raid5_free_percpu(conf);
@@ -6388,7 +6425,8 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink,
if (mutex_trylock(&conf->cache_size_mutex)) {
ret= 0;
- while (ret < sc->nr_to_scan) {
+ while (ret < sc->nr_to_scan &&
+ conf->max_nr_stripes > conf->min_nr_stripes) {
if (drop_one_stripe(conf) == 0) {
ret = SHRINK_STOP;
break;
@@ -6474,6 +6512,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
INIT_LIST_HEAD(&conf->hold_list);
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list);
+ bio_list_init(&conf->return_bi);
init_llist_head(&conf->released_stripes);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
@@ -6523,7 +6562,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
raid_disk = rdev->raid_disk;
if (raid_disk >= max_disks
- || raid_disk < 0)
+ || raid_disk < 0 || test_bit(Journal, &rdev->flags))
continue;
disk = conf->disks + raid_disk;
@@ -6563,6 +6602,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (conf->reshape_progress != MaxSector) {
conf->prev_chunk_sectors = mddev->chunk_sectors;
conf->prev_algo = mddev->layout;
+ } else {
+ conf->prev_chunk_sectors = conf->chunk_sectors;
+ conf->prev_algo = conf->algorithm;
}
conf->min_nr_stripes = NR_STRIPES;
@@ -6640,6 +6682,7 @@ static int run(struct mddev *mddev)
int working_disks = 0;
int dirty_parity_disks = 0;
struct md_rdev *rdev;
+ struct md_rdev *journal_dev = NULL;
sector_t reshape_offset = 0;
int i;
long long min_offset_diff = 0;
@@ -6652,6 +6695,11 @@ static int run(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
long long diff;
+
+ if (test_bit(Journal, &rdev->flags)) {
+ journal_dev = rdev;
+ continue;
+ }
if (rdev->raid_disk < 0)
continue;
diff = (rdev->new_data_offset - rdev->data_offset);
@@ -6682,6 +6730,14 @@ static int run(struct mddev *mddev)
sector_t here_new, here_old;
int old_disks;
int max_degraded = (mddev->level == 6 ? 2 : 1);
+ int chunk_sectors;
+ int new_data_disks;
+
+ if (journal_dev) {
+ printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
if (mddev->new_level != mddev->level) {
printk(KERN_ERR "md/raid:%s: unsupported reshape "
@@ -6693,28 +6749,25 @@ static int run(struct mddev *mddev)
/* reshape_position must be on a new-stripe boundary, and one
* further up in new geometry must map after here in old
* geometry.
+ * If the chunk sizes are different, then as we perform reshape
+ * in units of the largest of the two, reshape_position needs to
+ * be a multiple of the largest chunk size times new data disks.
*/
here_new = mddev->reshape_position;
- if (sector_div(here_new, mddev->new_chunk_sectors *
- (mddev->raid_disks - max_degraded))) {
+ chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
+ new_data_disks = mddev->raid_disks - max_degraded;
+ if (sector_div(here_new, chunk_sectors * new_data_disks)) {
printk(KERN_ERR "md/raid:%s: reshape_position not "
"on a stripe boundary\n", mdname(mddev));
return -EINVAL;
}
- reshape_offset = here_new * mddev->new_chunk_sectors;
+ reshape_offset = here_new * chunk_sectors;
/* here_new is the stripe we will write to */
here_old = mddev->reshape_position;
- sector_div(here_old, mddev->chunk_sectors *
- (old_disks-max_degraded));
+ sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
/* here_old is the first stripe that we might need to read
* from */
if (mddev->delta_disks == 0) {
- if ((here_new * mddev->new_chunk_sectors !=
- here_old * mddev->chunk_sectors)) {
- printk(KERN_ERR "md/raid:%s: reshape position is"
- " confused - aborting\n", mdname(mddev));
- return -EINVAL;
- }
/* We cannot be sure it is safe to start an in-place
* reshape. It is only safe if user-space is monitoring
* and taking constant backups.
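
With mismatched old and new chunk sizes, reshape proceeds in units of the larger one, so reshape_position must divide evenly by (largest chunk size times new data disks); sector_div() performs that division in place and returns the remainder. The standalone arithmetic check below mirrors the test above using an invented RAID5 geometry.

/* Userspace check mirroring the reshape_position boundary test.
 * The sample geometry is hypothetical. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

int main(void)
{
	sector_t reshape_position = 6144;		/* sectors */
	unsigned old_chunk = 128, new_chunk = 256;	/* sectors per chunk */
	unsigned raid_disks = 5, max_degraded = 1;	/* RAID5 */

	unsigned chunk_sectors  = old_chunk > new_chunk ? old_chunk : new_chunk;
	unsigned new_data_disks = raid_disks - max_degraded;
	sector_t stripe    = reshape_position / (chunk_sectors * new_data_disks);
	sector_t remainder = reshape_position % (chunk_sectors * new_data_disks);

	if (remainder != 0)
		printf("reshape_position not on a stripe boundary\n");
	else
		printf("reshape starts at new-layout stripe %llu\n",
		       (unsigned long long)stripe);
	return 0;
}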
@@ -6733,10 +6786,10 @@ static int run(struct mddev *mddev)
return -EINVAL;
}
} else if (mddev->reshape_backwards
- ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
- here_old * mddev->chunk_sectors)
- : (here_new * mddev->new_chunk_sectors >=
- here_old * mddev->chunk_sectors + (-min_offset_diff))) {
+ ? (here_new * chunk_sectors + min_offset_diff <=
+ here_old * chunk_sectors)
+ : (here_new * chunk_sectors >=
+ here_old * chunk_sectors + (-min_offset_diff))) {
/* Reading from the same stripe as writing to - bad */
printk(KERN_ERR "md/raid:%s: reshape_position too early for "
"auto-recovery - aborting.\n",
@@ -6761,6 +6814,13 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !journal_dev) {
+ printk(KERN_ERR "md/raid:%s: journal disk is missing, force array readonly\n",
+ mdname(mddev));
+ mddev->ro = 1;
+ set_disk_ro(mddev->gendisk, 1);
+ }
+
conf->min_offset_diff = min_offset_diff;
mddev->thread = conf->thread;
conf->thread = NULL;
@@ -6964,6 +7024,14 @@ static int run(struct mddev *mddev)
mddev->queue);
}
+ if (journal_dev) {
+ char b[BDEVNAME_SIZE];
+
+ printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
+ mdname(mddev), bdevname(journal_dev->bdev, b));
+ r5l_init_log(conf, journal_dev);
+ }
+
return 0;
abort:
md_unregister_thread(&mddev->thread);
@@ -6988,7 +7056,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
int i;
seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
- mddev->chunk_sectors / 2, mddev->layout);
+ conf->chunk_sectors / 2, mddev->layout);
seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++)
seq_printf (seq, "%s",
@@ -7073,6 +7141,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct disk_info *p = conf->disks + number;
print_raid5_conf(conf);
+ if (test_bit(Journal, &rdev->flags)) {
+ /*
+ * the journal disk is not removable, but we need to give the other
+ * disks a chance to update their superblocks; otherwise the journal
+ * disk will be considered 'fresh'
+ */
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ return -EINVAL;
+ }
if (rdev == p->rdev)
rdevp = &p->rdev;
else if (rdev == p->replacement)
@@ -7135,6 +7212,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int first = 0;
int last = conf->raid_disks - 1;
+ if (test_bit(Journal, &rdev->flags))
+ return -EINVAL;
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
@@ -7194,7 +7273,11 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
* worth it.
*/
sector_t newsize;
- sectors &= ~((sector_t)mddev->chunk_sectors - 1);
+ struct r5conf *conf = mddev->private;
+
+ if (conf->log)
+ return -EINVAL;
+ sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
if (mddev->external_size &&
mddev->array_sectors > newsize)
@@ -7245,6 +7328,8 @@ static int check_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
+ if (conf->log)
+ return -EINVAL;
if (mddev->delta_disks == 0 &&
mddev->new_layout == mddev->layout &&
mddev->new_chunk_sectors == mddev->chunk_sectors)
@@ -7433,6 +7518,7 @@ static void end_reshape(struct r5conf *conf)
rdev->data_offset = rdev->new_data_offset;
smp_wmb();
conf->reshape_progress = MaxSector;
+ conf->mddev->reshape_position = MaxSector;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
@@ -7520,6 +7606,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
unlock_all_device_hash_locks_irq(conf);
break;
}
+ r5l_quiesce(conf->log, state);
}
static void *raid45_takeover_raid0(struct mddev *mddev, int level)
@@ -7778,7 +7865,6 @@ static struct md_personality raid6_personality =
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.congested = raid5_congested,
- .mergeable_bvec = raid5_mergeable_bvec,
};
static struct md_personality raid5_personality =
{
@@ -7802,7 +7888,6 @@ static struct md_personality raid5_personality =
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.congested = raid5_congested,
- .mergeable_bvec = raid5_mergeable_bvec,
};
static struct md_personality raid4_personality =
@@ -7827,7 +7912,6 @@ static struct md_personality raid4_personality =
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.congested = raid5_congested,
- .mergeable_bvec = raid5_mergeable_bvec,
};
static int __init raid5_init(void)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index d051442..a415e1c 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -223,6 +223,9 @@ struct stripe_head {
struct stripe_head *batch_head; /* protected by stripe lock */
spinlock_t batch_lock; /* only header's lock is useful */
struct list_head batch_list; /* protected by head's batch lock*/
+
+ struct r5l_io_unit *log_io;
+ struct list_head log_list;
/**
* struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
@@ -244,6 +247,7 @@ struct stripe_head {
struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */
unsigned long flags;
+ u32 log_checksum;
} dev[1]; /* allocated with extra space depending of RAID geometry */
};
@@ -265,9 +269,10 @@ struct stripe_head_state {
int dec_preread_active;
unsigned long ops_request;
- struct bio *return_bi;
+ struct bio_list return_bi;
struct md_rdev *blocked_rdev;
int handle_bad_blocks;
+ int log_failed;
};
/* Flags for struct r5dev.flags */
@@ -340,6 +345,7 @@ enum {
STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
* to batch yet.
*/
+ STRIPE_LOG_TRAPPED, /* trapped into log */
};
#define STRIPE_EXPAND_SYNC_FLAGS \
@@ -476,6 +482,9 @@ struct r5conf {
int skip_copy; /* Don't copy data from bio to stripe cache */
struct list_head *last_hold; /* detect hold_list promotions */
+ /* bios to have bi_end_io called after metadata is synced */
+ struct bio_list return_bi;
+
atomic_t reshape_stripes; /* stripes with pending writes for reshape */
/* unfortunately we need two cache names as we temporarily have
* two caches.
@@ -540,6 +549,7 @@ struct r5conf {
struct r5worker_group *worker_groups;
int group_cnt;
int worker_cnt_per_group;
+ struct r5l_log *log;
};
@@ -606,4 +616,21 @@ static inline int algorithm_is_DDF(int layout)
extern void md_raid5_kick_device(struct r5conf *conf);
extern int raid5_set_cache_size(struct mddev *mddev, int size);
+extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
+extern void raid5_release_stripe(struct stripe_head *sh);
+extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
+ int previous, int *dd_idx,
+ struct stripe_head *sh);
+extern struct stripe_head *
+raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
+ int previous, int noblock, int noquiesce);
+extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+extern void r5l_exit_log(struct r5l_log *log);
+extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
+extern void r5l_write_stripe_run(struct r5l_log *log);
+extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
+extern void r5l_stripe_write_finished(struct stripe_head *sh);
+extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
+extern void r5l_quiesce(struct r5l_log *log, int state);
+extern bool r5l_log_disk_error(struct r5conf *conf);
#endif
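
The r5l_* declarations added above form the interface between the raid5 core and the new write journal in raid5-cache.c. The compile-clean userspace mock below only suggests the intended call ordering taken from the hunks earlier in this diff (journal the stripe, kick the log I/O, later write it through to the RAID disks); the bodies are stand-ins, not the real implementation.

/* Userspace mock (not kernel code) of the r5l_* call ordering. */
#include <stdio.h>

struct r5l_log { int dummy; };
struct stripe_head { int dummy; };

static int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	/* real code: queue stripe data and parity into the current log I/O unit */
	(void)log; (void)sh;
	printf("stripe queued to journal\n");
	return 0;
}

static void r5l_write_stripe_run(struct r5l_log *log)
{
	(void)log;
	printf("journal I/O submitted\n");
}

static void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	(void)log;
	printf("journalled stripes written through to RAID disks\n");
}

int main(void)
{
	struct r5l_log log = { 0 };
	struct stripe_head sh = { 0 };

	if (r5l_write_stripe(&log, &sh) == 0)	/* journal first ...          */
		r5l_write_stripe_run(&log);	/* ... then start the log I/O */
	r5l_flush_stripe_to_raid(&log);		/* later, from raid5d         */
	return 0;
}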