Diffstat (limited to 'drivers/md')
26 files changed, 308 insertions, 142 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 2b46bf1..4c9852d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -421,9 +421,11 @@ out:
 
 	if (watermark <= WATERMARK_METADATA) {
 		SET_GC_MARK(b, GC_MARK_METADATA);
+		SET_GC_MOVE(b, 0);
 		b->prio = BTREE_PRIO;
 	} else {
 		SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+		SET_GC_MOVE(b, 0);
 		b->prio = INITIAL_PRIO;
 	}
 
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 964353c..dbdbca5 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -197,7 +197,7 @@ struct bucket {
 	uint8_t		disk_gen;
 	uint8_t		last_gc; /* Most out of date gen in the btree */
 	uint8_t		gc_gen;
-	uint16_t	gc_mark;
+	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
 };
 
 /*
@@ -209,7 +209,8 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
 #define GC_MARK_RECLAIMABLE	0
 #define GC_MARK_DIRTY		1
 #define GC_MARK_METADATA	2
-BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);
+BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13);
+BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
 
 #include "journal.h"
 #include "stats.h"
@@ -371,14 +372,14 @@ struct cached_dev {
 	unsigned char		writeback_percent;
 	unsigned		writeback_delay;
 
-	int			writeback_rate_change;
-	int64_t			writeback_rate_derivative;
 	uint64_t		writeback_rate_target;
+	int64_t			writeback_rate_proportional;
+	int64_t			writeback_rate_derivative;
+	int64_t			writeback_rate_change;
 
 	unsigned		writeback_rate_update_seconds;
 	unsigned		writeback_rate_d_term;
 	unsigned		writeback_rate_p_term_inverse;
-	unsigned		writeback_rate_d_smooth;
 };
 
 enum alloc_watermarks {
@@ -444,7 +445,6 @@ struct cache {
 	 * call prio_write() to keep gens from wrapping.
 	 */
 	uint8_t		need_save_prio;
-	unsigned	gc_move_threshold;
 
 	/*
 	 * If nonzero, we know we aren't going to find any buckets to invalidate
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index b62f379..946ecd3 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1561,6 +1561,28 @@ size_t bch_btree_gc_finish(struct cache_set *c)
 		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
 			    GC_MARK_METADATA);
 
+	/* don't reclaim buckets to which writeback keys point */
+	rcu_read_lock();
+	for (i = 0; i < c->nr_uuids; i++) {
+		struct bcache_device *d = c->devices[i];
+		struct cached_dev *dc;
+		struct keybuf_key *w, *n;
+		unsigned j;
+
+		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
+			continue;
+		dc = container_of(d, struct cached_dev, disk);
+
+		spin_lock(&dc->writeback_keys.lock);
+		rbtree_postorder_for_each_entry_safe(w, n,
+					&dc->writeback_keys.keys, node)
+			for (j = 0; j < KEY_PTRS(&w->key); j++)
+				SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
+					    GC_MARK_DIRTY);
+		spin_unlock(&dc->writeback_keys.lock);
+	}
+	rcu_read_unlock();
+
 	for_each_cache(ca, c, i) {
 		uint64_t *i;
@@ -1817,7 +1839,8 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
 			if (KEY_START(k) > KEY_START(insert) + sectors_found)
 				goto check_failed;
 
-			if (KEY_PTRS(replace_key) != KEY_PTRS(k))
+			if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
+			    KEY_DIRTY(k) != KEY_DIRTY(replace_key))
 				goto check_failed;
 
 			/* skip past gen */
@@ -2217,7 +2240,7 @@ struct btree_insert_op {
 	struct bkey	*replace_key;
 };
 
-int btree_insert_fn(struct btree_op *b_op, struct btree *b)
+static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
 {
 	struct btree_insert_op *op = container_of(b_op,
 					struct btree_insert_op, op);
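The bcache.h hunk above shrinks GC_SECTORS_USED to 13 bits so that bit 15 of the 16-bit gc_mark word can carry the new GC_MOVE flag, with GC_MARK keeping bits 0-1. A minimal userspace sketch of that kind of field packing follows; FIELD_GET/FIELD_SET are hypothetical helpers for illustration, not the kernel's BITMASK() macro.

/*
 * Illustrative only: packing a 2-bit mark, a 13-bit sector count and a
 * 1-bit "move" flag into one 16-bit word, mirroring the layout the
 * BITMASK() lines above describe.
 */
#include <stdint.h>
#include <stdio.h>

#define FIELD_GET(w, off, bits)      (((w) >> (off)) & ((1u << (bits)) - 1))
#define FIELD_SET(w, off, bits, val) \
	((w) = ((w) & ~(((1u << (bits)) - 1) << (off))) | \
	       (((val) & ((1u << (bits)) - 1)) << (off)))

int main(void)
{
	uint16_t gc_mark = 0;

	FIELD_SET(gc_mark, 0, 2, 2);		/* GC_MARK: bits 0-1        */
	FIELD_SET(gc_mark, 2, 13, 1000);	/* GC_SECTORS_USED: bits 2-14 */
	FIELD_SET(gc_mark, 15, 1, 1);		/* GC_MOVE: bit 15          */

	printf("mark=%u sectors=%u move=%u\n",
	       FIELD_GET(gc_mark, 0, 2),
	       FIELD_GET(gc_mark, 2, 13),
	       FIELD_GET(gc_mark, 15, 1));
	return 0;
}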
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 581f95d..052bd24 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -25,10 +25,9 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
 	unsigned i;
 
 	for (i = 0; i < KEY_PTRS(k); i++) {
-		struct cache *ca = PTR_CACHE(c, k, i);
 		struct bucket *g = PTR_BUCKET(c, k, i);
 
-		if (GC_SECTORS_USED(g) < ca->gc_move_threshold)
+		if (GC_MOVE(g))
 			return true;
 	}
 
@@ -65,11 +64,16 @@ static void write_moving_finish(struct closure *cl)
 
 static void read_moving_endio(struct bio *bio, int error)
 {
+	struct bbio *b = container_of(bio, struct bbio, bio);
 	struct moving_io *io = container_of(bio->bi_private,
 					    struct moving_io, cl);
 
 	if (error)
 		io->op.error = error;
+	else if (!KEY_DIRTY(&b->key) &&
+		 ptr_stale(io->op.c, &b->key, 0)) {
+		io->op.error = -EINTR;
+	}
 
 	bch_bbio_endio(io->op.c, bio, error, "reading data to move");
 }
@@ -141,6 +145,11 @@ static void read_moving(struct cache_set *c)
 		if (!w)
 			break;
 
+		if (ptr_stale(c, &w->key, 0)) {
+			bch_keybuf_del(&c->moving_gc_keys, w);
+			continue;
+		}
+
 		io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
@@ -184,7 +193,8 @@ static bool bucket_cmp(struct bucket *l, struct bucket *r)
 
 static unsigned bucket_heap_top(struct cache *ca)
 {
-	return GC_SECTORS_USED(heap_peek(&ca->heap));
+	struct bucket *b;
+	return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
 }
 
 void bch_moving_gc(struct cache_set *c)
@@ -226,9 +236,8 @@ void bch_moving_gc(struct cache_set *c)
 			sectors_to_move -= GC_SECTORS_USED(b);
 		}
 
-		ca->gc_move_threshold = bucket_heap_top(ca);
-
-		pr_debug("threshold %u", ca->gc_move_threshold);
+		while (heap_pop(&ca->heap, b, bucket_cmp))
+			SET_GC_MOVE(b, 1);
 	}
 
 	mutex_unlock(&c->bucket_lock);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 60fb604..93d593f 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1672,7 +1672,7 @@ err:
 static bool can_attach_cache(struct cache *ca, struct cache_set *c)
 {
 	return ca->sb.block_size	== c->sb.block_size &&
-	       ca->sb.bucket_size	== c->sb.block_size &&
+	       ca->sb.bucket_size	== c->sb.bucket_size &&
 	       ca->sb.nr_in_set		== c->sb.nr_in_set;
 }
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 80d4c2b..a1f8561 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -83,7 +83,6 @@ rw_attribute(writeback_rate);
 rw_attribute(writeback_rate_update_seconds);
 rw_attribute(writeback_rate_d_term);
 rw_attribute(writeback_rate_p_term_inverse);
-rw_attribute(writeback_rate_d_smooth);
 read_attribute(writeback_rate_debug);
 
 read_attribute(stripe_size);
@@ -129,31 +128,41 @@ SHOW(__bch_cached_dev)
 	var_printf(writeback_running,	"%i");
 	var_print(writeback_delay);
 	var_print(writeback_percent);
-	sysfs_print(writeback_rate,	dc->writeback_rate.rate);
+	sysfs_hprint(writeback_rate,	dc->writeback_rate.rate << 9);
 
 	var_print(writeback_rate_update_seconds);
 	var_print(writeback_rate_d_term);
 	var_print(writeback_rate_p_term_inverse);
-	var_print(writeback_rate_d_smooth);
 
 	if (attr == &sysfs_writeback_rate_debug) {
+		char rate[20];
 		char dirty[20];
-		char derivative[20];
 		char target[20];
-		bch_hprint(dirty,
-			   bcache_dev_sectors_dirty(&dc->disk) << 9);
-		bch_hprint(derivative,	dc->writeback_rate_derivative << 9);
+		char proportional[20];
+		char derivative[20];
+		char change[20];
+		s64 next_io;
+
+		bch_hprint(rate,	dc->writeback_rate.rate << 9);
+		bch_hprint(dirty,	bcache_dev_sectors_dirty(&dc->disk) << 9);
 		bch_hprint(target,	dc->writeback_rate_target << 9);
+		bch_hprint(proportional,dc->writeback_rate_proportional << 9);
+		bch_hprint(derivative,	dc->writeback_rate_derivative << 9);
+		bch_hprint(change,	dc->writeback_rate_change << 9);
+
+		next_io = div64_s64(dc->writeback_rate.next - local_clock(),
+				    NSEC_PER_MSEC);
 
 		return sprintf(buf,
-			       "rate:\t\t%u\n"
-			       "change:\t\t%i\n"
+			       "rate:\t\t%s/sec\n"
 			       "dirty:\t\t%s\n"
+			       "target:\t\t%s\n"
+			       "proportional:\t%s\n"
 			       "derivative:\t%s\n"
-			       "target:\t\t%s\n",
-			       dc->writeback_rate.rate,
-			       dc->writeback_rate_change,
-			       dirty, derivative, target);
+			       "change:\t\t%s/sec\n"
+			       "next io:\t%llims\n",
+			       rate, dirty, target, proportional,
+			       derivative, change, next_io);
 	}
 
 	sysfs_hprint(dirty_data,
@@ -189,6 +198,7 @@ STORE(__cached_dev)
 	struct kobj_uevent_env *env;
 
 #define d_strtoul(var)		sysfs_strtoul(var, dc->var)
+#define d_strtoul_nonzero(var)	sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
 #define d_strtoi_h(var)		sysfs_hatoi(var, dc->var)
 
 	sysfs_strtoul(data_csum,	dc->disk.data_csum);
@@ -197,16 +207,15 @@ STORE(__cached_dev)
 	d_strtoul(writeback_metadata);
 	d_strtoul(writeback_running);
 	d_strtoul(writeback_delay);
-	sysfs_strtoul_clamp(writeback_rate,
-			    dc->writeback_rate.rate, 1, 1000000);
+
 	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
 
-	d_strtoul(writeback_rate_update_seconds);
+	sysfs_strtoul_clamp(writeback_rate,
+			    dc->writeback_rate.rate, 1, INT_MAX);
+
+	d_strtoul_nonzero(writeback_rate_update_seconds);
 	d_strtoul(writeback_rate_d_term);
-	d_strtoul(writeback_rate_p_term_inverse);
-	sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
-			    dc->writeback_rate_p_term_inverse, 1, INT_MAX);
-	d_strtoul(writeback_rate_d_smooth);
+	d_strtoul_nonzero(writeback_rate_p_term_inverse);
 
 	d_strtoi_h(sequential_cutoff);
 	d_strtoi_h(readahead);
@@ -313,7 +322,6 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_writeback_rate_update_seconds,
 	&sysfs_writeback_rate_d_term,
 	&sysfs_writeback_rate_p_term_inverse,
-	&sysfs_writeback_rate_d_smooth,
 	&sysfs_writeback_rate_debug,
 	&sysfs_dirty_data,
 	&sysfs_stripe_size,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index c57621e..db3ae4c 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -209,7 +209,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
 	uint64_t now = local_clock();
 
-	d->next += div_u64(done, d->rate);
+	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+
+	if (time_before64(now + NSEC_PER_SEC, d->next))
+		d->next = now + NSEC_PER_SEC;
+
+	if (time_after64(now - NSEC_PER_SEC * 2, d->next))
+		d->next = now - NSEC_PER_SEC * 2;
 
 	return time_after64(d->next, now)
 		? div_u64(d->next - now, NSEC_PER_SEC / HZ)
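The util.c hunk above makes bch_next_delay() scale completed work to nanoseconds and then clamp d->next to within one second ahead of, or two seconds behind, the current time. A rough userspace sketch of that clamped rate limiter follows; the names are invented and plain comparisons stand in for the kernel's wrap-safe time_before64()/time_after64().

/*
 * Illustrative sketch: "rate" is units per second, next_delay() returns
 * how many nanoseconds the caller should wait before issuing more work.
 */
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

struct ratelimit {
	uint64_t next;	/* absolute time (ns) the next work may be issued */
	uint64_t rate;	/* units per second */
};

uint64_t next_delay(struct ratelimit *d, uint64_t done, uint64_t now)
{
	d->next += done * NSEC_PER_SEC / d->rate;

	/* never schedule more than one second into the future... */
	if (d->next > now + NSEC_PER_SEC)
		d->next = now + NSEC_PER_SEC;

	/* ...and never fall more than two seconds behind */
	if (now > 2 * NSEC_PER_SEC && d->next < now - 2 * NSEC_PER_SEC)
		d->next = now - 2 * NSEC_PER_SEC;

	return d->next > now ? d->next - now : 0;
}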
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 362c4b3..1030c60 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -110,7 +110,7 @@ do { \
 	_r;								\
 })
 
-#define heap_peek(h)	((h)->size ? (h)->data[0] : NULL)
+#define heap_peek(h)	((h)->used ? (h)->data[0] : NULL)
 
 #define heap_full(h)	((h)->used == (h)->size)
 
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 04657e9..f4300e4 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -30,38 +30,40 @@ static void __update_writeback_rate(struct cached_dev *dc)
 
 	/* PD controller */
 
-	int change = 0;
-	int64_t error;
 	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
 	int64_t derivative = dirty - dc->disk.sectors_dirty_last;
+	int64_t proportional = dirty - target;
+	int64_t change;
 
 	dc->disk.sectors_dirty_last = dirty;
 
-	derivative *= dc->writeback_rate_d_term;
-	derivative = clamp(derivative, -dirty, dirty);
+	/* Scale to sectors per second */
 
-	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
-			      dc->writeback_rate_d_smooth, 0);
+	proportional *= dc->writeback_rate_update_seconds;
+	proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
 
-	/* Avoid divide by zero */
-	if (!target)
-		goto out;
+	derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
 
-	error = div64_s64((dirty + derivative - target) << 8, target);
+	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
+			      (dc->writeback_rate_d_term /
+			       dc->writeback_rate_update_seconds) ?: 1, 0);
+
+	derivative *= dc->writeback_rate_d_term;
+	derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
 
-	change = div_s64((dc->writeback_rate.rate * error) >> 8,
-			 dc->writeback_rate_p_term_inverse);
+	change = proportional + derivative;
 
 	/* Don't increase writeback rate if the device isn't keeping up */
 	if (change > 0 &&
 	    time_after64(local_clock(),
-			 dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
+			 dc->writeback_rate.next + NSEC_PER_MSEC))
 		change = 0;
 
 	dc->writeback_rate.rate =
-		clamp_t(int64_t, dc->writeback_rate.rate + change,
+		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
 			1, NSEC_PER_MSEC);
-out:
+
+	dc->writeback_rate_proportional = proportional;
 	dc->writeback_rate_derivative = derivative;
 	dc->writeback_rate_change = change;
 	dc->writeback_rate_target = target;
@@ -87,15 +89,11 @@ static void update_writeback_rate(struct work_struct *work)
 
 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
 {
-	uint64_t ret;
-
 	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
 	    !dc->writeback_percent)
 		return 0;
 
-	ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
-
-	return min_t(uint64_t, ret, HZ);
+	return bch_next_delay(&dc->writeback_rate, sectors);
 }
 
 struct dirty_io {
@@ -241,7 +239,7 @@ static void read_dirty(struct cached_dev *dc)
 		if (KEY_START(&w->key) != dc->last_read ||
 		    jiffies_to_msecs(delay) > 50)
 			while (!kthread_should_stop() && delay)
-				delay = schedule_timeout_interruptible(delay);
+				delay = schedule_timeout_uninterruptible(delay);
 
 		dc->last_read	= KEY_OFFSET(&w->key);
@@ -438,7 +436,7 @@ static int bch_writeback_thread(void *arg)
 		while (delay &&
 		       !kthread_should_stop() &&
 		       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
-			delay = schedule_timeout_interruptible(delay);
+			delay = schedule_timeout_uninterruptible(delay);
 	}
 }
@@ -476,6 +474,8 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
 
 	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
 			   sectors_dirty_init_fn, 0);
+
+	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
 }
 
 int bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -490,18 +490,15 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_delay		= 30;
 	dc->writeback_rate.rate		= 1024;
 
-	dc->writeback_rate_update_seconds = 30;
-	dc->writeback_rate_d_term	= 16;
-	dc->writeback_rate_p_term_inverse = 64;
-	dc->writeback_rate_d_smooth	= 8;
+	dc->writeback_rate_update_seconds = 5;
+	dc->writeback_rate_d_term	= 30;
+	dc->writeback_rate_p_term_inverse = 6000;
 
 	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
 					      "bcache_writeback");
 	if (IS_ERR(dc->writeback_thread))
 		return PTR_ERR(dc->writeback_thread);
 
-	set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE);
-
 	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 	schedule_delayed_work(&dc->writeback_rate_update,
 			      dc->writeback_rate_update_seconds * HZ);
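The writeback.c hunk above replaces the old error-based controller with separate proportional and derivative terms, both scaled by writeback_rate_update_seconds and writeback_rate_p_term_inverse. A simplified sketch of that arithmetic follows; the field names are invented, and a crude fixed-weight average stands in for the kernel's ewma_add().

/*
 * Simplified proportional-derivative update.  Units are sectors and
 * sectors per second, as in the patch; defaults mirror the new values.
 */
#include <stdint.h>

struct wb_state {
	int64_t rate;			/* sectors/sec, kept >= 1 */
	int64_t sectors_dirty_last;
	int64_t derivative_ewma;
	int64_t update_seconds;		/* e.g. 5    */
	int64_t d_term;			/* e.g. 30   */
	int64_t p_term_inverse;		/* e.g. 6000 */
};

void wb_update_rate(struct wb_state *s, int64_t dirty, int64_t target)
{
	int64_t proportional = dirty - target;
	int64_t derivative = dirty - s->sectors_dirty_last;
	int64_t change;

	s->sectors_dirty_last = dirty;

	/* scale the error to sectors/sec over one update interval */
	proportional = proportional * s->update_seconds / s->p_term_inverse;

	/* rate of change of the dirty count, smoothed, then weighted */
	derivative /= s->update_seconds;
	s->derivative_ewma += (derivative - s->derivative_ewma) / 8;
	derivative = s->derivative_ewma * s->d_term / s->p_term_inverse;

	change = proportional + derivative;

	s->rate += change;
	if (s->rate < 1)
		s->rate = 1;
}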
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 4113b60..a1b58a6 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1717,6 +1717,11 @@ static int __init dm_bufio_init(void)
 {
 	__u64 mem;
 
+	dm_bufio_allocated_kmem_cache = 0;
+	dm_bufio_allocated_get_free_pages = 0;
+	dm_bufio_allocated_vmalloc = 0;
+	dm_bufio_current_allocated = 0;
+
 	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
 	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index bfba97d..d13a168 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -730,15 +730,18 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 	int r = 0;
 	bool updated = updated_this_tick(mq, e);
 
-	requeue_and_update_tick(mq, e);
-
 	if ((!discarded_oblock && updated) ||
-	    !should_promote(mq, e, discarded_oblock, data_dir))
+	    !should_promote(mq, e, discarded_oblock, data_dir)) {
+		requeue_and_update_tick(mq, e);
 		result->op = POLICY_MISS;
-	else if (!can_migrate)
+
+	} else if (!can_migrate)
 		r = -EWOULDBLOCK;
-	else
+
+	else {
+		requeue_and_update_tick(mq, e);
 		r = pre_cache_to_cache(mq, e, result);
+	}
 
 	return r;
 }
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 7c8dd1f..99f9162 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2763,7 +2763,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
 {
 	int r;
 
-	r = dm_cache_resize(cache->cmd, cache->cache_size);
+	r = dm_cache_resize(cache->cmd, new_size);
 	if (r) {
 		DMERR("could not resize cache metadata");
 		return r;
 	}
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 84c8601..fc8482a 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -20,6 +20,7 @@ struct delay_c {
 	struct timer_list delay_timer;
 	struct mutex timer_lock;
+	struct workqueue_struct *kdelayd_wq;
 	struct work_struct flush_expired_bios;
 	struct list_head delayed_bios;
 	atomic_t may_delay;
@@ -45,14 +46,13 @@ struct dm_delay_info {
 
 static DEFINE_MUTEX(delayed_bios_lock);
 
-static struct workqueue_struct *kdelayd_wq;
 static struct kmem_cache *delayed_cache;
 
 static void handle_delayed_timer(unsigned long data)
 {
 	struct delay_c *dc = (struct delay_c *)data;
 
-	queue_work(kdelayd_wq, &dc->flush_expired_bios);
+	queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
 }
 
 static void queue_timeout(struct delay_c *dc, unsigned long expires)
@@ -191,6 +191,12 @@ out:
 		goto bad_dev_write;
 	}
 
+	dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
+	if (!dc->kdelayd_wq) {
+		DMERR("Couldn't start kdelayd");
+		goto bad_queue;
+	}
+
 	setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc);
 
 	INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
@@ -203,6 +209,8 @@ out:
 	ti->private = dc;
 	return 0;
 
+bad_queue:
+	mempool_destroy(dc->delayed_pool);
 bad_dev_write:
 	if (dc->dev_write)
 		dm_put_device(ti, dc->dev_write);
@@ -217,7 +225,7 @@ static void delay_dtr(struct dm_target *ti)
 {
 	struct delay_c *dc = ti->private;
 
-	flush_workqueue(kdelayd_wq);
+	destroy_workqueue(dc->kdelayd_wq);
 
 	dm_put_device(ti, dc->dev_read);
@@ -351,12 +359,6 @@ static int __init dm_delay_init(void)
 {
 	int r = -ENOMEM;
 
-	kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
-	if (!kdelayd_wq) {
-		DMERR("Couldn't start kdelayd");
-		goto bad_queue;
-	}
-
 	delayed_cache = KMEM_CACHE(dm_delay_info, 0);
 	if (!delayed_cache) {
 		DMERR("Couldn't create delayed bio cache.");
@@ -374,8 +376,6 @@ static int __init dm_delay_init(void)
 bad_register:
 	kmem_cache_destroy(delayed_cache);
 bad_memcache:
-	destroy_workqueue(kdelayd_wq);
-bad_queue:
 	return r;
 }
@@ -383,7 +383,6 @@ static void __exit dm_delay_exit(void)
 {
 	dm_unregister_target(&delay_target);
 	kmem_cache_destroy(delayed_cache);
-	destroy_workqueue(kdelayd_wq);
 }
 
 /* Module hooks */
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 80b5cab..01b6a11 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -66,6 +66,18 @@ struct dm_snapshot {
 
 	atomic_t pending_exceptions_count;
 
+	/* Protected by "lock" */
+	sector_t exception_start_sequence;
+
+	/* Protected by kcopyd single-threaded callback */
+	sector_t exception_complete_sequence;
+
+	/*
+	 * A list of pending exceptions that completed out of order.
+	 * Protected by kcopyd single-threaded callback.
+	 */
+	struct list_head out_of_order_list;
+
 	mempool_t *pending_pool;
 
 	struct dm_exception_table pending;
@@ -173,6 +185,14 @@ struct dm_snap_pending_exception {
 	 */
 	int started;
 
+	/* There was copying error. */
+	int copy_error;
+
+	/* A sequence number, it is used for in-order completion. */
+	sector_t exception_sequence;
+
+	struct list_head out_of_order_entry;
+
 	/*
 	 * For writing a complete chunk, bypassing the copy.
 	 */
@@ -1094,6 +1114,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	s->valid = 1;
 	s->active = 0;
 	atomic_set(&s->pending_exceptions_count, 0);
+	s->exception_start_sequence = 0;
+	s->exception_complete_sequence = 0;
+	INIT_LIST_HEAD(&s->out_of_order_list);
 	init_rwsem(&s->lock);
 	INIT_LIST_HEAD(&s->list);
 	spin_lock_init(&s->pe_lock);
@@ -1444,6 +1467,19 @@ static void commit_callback(void *context, int success)
 	pending_complete(pe, success);
 }
 
+static void complete_exception(struct dm_snap_pending_exception *pe)
+{
+	struct dm_snapshot *s = pe->snap;
+
+	if (unlikely(pe->copy_error))
+		pending_complete(pe, 0);
+
+	else
+		/* Update the metadata if we are persistent */
+		s->store->type->commit_exception(s->store, &pe->e,
+						 commit_callback, pe);
+}
+
 /*
  * Called when the copy I/O has finished.  kcopyd actually runs
  * this code so don't block.
@@ -1453,13 +1489,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
 	struct dm_snap_pending_exception *pe = context;
 	struct dm_snapshot *s = pe->snap;
 
-	if (read_err || write_err)
-		pending_complete(pe, 0);
+	pe->copy_error = read_err || write_err;
 
-	else
-		/* Update the metadata if we are persistent */
-		s->store->type->commit_exception(s->store, &pe->e,
-						 commit_callback, pe);
+	if (pe->exception_sequence == s->exception_complete_sequence) {
+		s->exception_complete_sequence++;
+		complete_exception(pe);
+
+		while (!list_empty(&s->out_of_order_list)) {
+			pe = list_entry(s->out_of_order_list.next,
+					struct dm_snap_pending_exception, out_of_order_entry);
+			if (pe->exception_sequence != s->exception_complete_sequence)
+				break;
+			s->exception_complete_sequence++;
+			list_del(&pe->out_of_order_entry);
+			complete_exception(pe);
+		}
+	} else {
+		struct list_head *lh;
+		struct dm_snap_pending_exception *pe2;
+
+		list_for_each_prev(lh, &s->out_of_order_list) {
+			pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry);
+			if (pe2->exception_sequence < pe->exception_sequence)
+				break;
+		}
+		list_add(&pe->out_of_order_entry, lh);
+	}
 }
@@ -1554,6 +1609,8 @@ __find_pending_exception(struct dm_snapshot *s,
 		return NULL;
 	}
 
+	pe->exception_sequence = s->exception_start_sequence++;
+
 	dm_insert_exception(&s->pending, &pe->e);
 
 	return pe;
@@ -2193,7 +2250,7 @@ static struct target_type origin_target = {
 
 static struct target_type snapshot_target = {
 	.name    = "snapshot",
-	.version = {1, 11, 1},
+	.version = {1, 12, 0},
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
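The dm-snap.c changes above give every pending exception a sequence number at submission and only complete exceptions in that order, parking early completions on out_of_order_list until the expected sequence number arrives. A self-contained sketch of the same idea follows, with invented types and a plain singly linked list in place of the kernel list_head machinery.

/*
 * Sketch of in-order completion via sequence numbers: an on-time
 * completion runs immediately and then drains any parked successors;
 * everything else is inserted into a list kept sorted by sequence.
 */
#include <stdint.h>
#include <stdio.h>

struct pending {
	uint64_t seq;
	struct pending *next;		/* link in the out-of-order list */
};

struct sequencer {
	uint64_t complete_seq;		/* next sequence allowed to complete */
	struct pending *out_of_order;	/* parked completions, ascending by seq */
};

static void do_complete(struct pending *pe)
{
	printf("completed %llu\n", (unsigned long long)pe->seq);
}

void on_copy_done(struct sequencer *s, struct pending *pe)
{
	if (pe->seq != s->complete_seq) {
		/* too early: park it in sequence order */
		struct pending **pp = &s->out_of_order;

		while (*pp && (*pp)->seq < pe->seq)
			pp = &(*pp)->next;
		pe->next = *pp;
		*pp = pe;
		return;
	}

	do_complete(pe);
	s->complete_seq++;

	/* drain any successors that were waiting on us */
	while (s->out_of_order && s->out_of_order->seq == s->complete_seq) {
		struct pending *head = s->out_of_order;

		s->out_of_order = head->next;
		do_complete(head);
		s->complete_seq++;
	}
}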
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 3d404c1..28a9012 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -964,6 +964,7 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 
 int __init dm_statistics_init(void)
 {
+	shared_memory_amount = 0;
 	dm_stat_need_rcu_barrier = 0;
 	return 0;
 }
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 465f08c..3ba6a38 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -200,6 +200,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 
 	num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
 
+	if (!num_targets) {
+		kfree(t);
+		return -ENOMEM;
+	}
+
 	if (alloc_targets(t, num_targets)) {
 		kfree(t);
 		return -ENOMEM;
 	}
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 60bce43..8a30ad5 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1697,6 +1697,14 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 	up_write(&pmd->root_lock);
 }
 
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
+{
+	down_write(&pmd->root_lock);
+	pmd->read_only = false;
+	dm_bm_set_read_write(pmd->bm);
+	up_write(&pmd->root_lock);
+}
+
 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 					dm_block_t threshold,
 					dm_sm_threshold_fn fn,
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 845ebbe..7bcc0e1 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -193,6 +193,7 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_siz
  * that nothing is changing.
  */
 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd);
 
 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 					dm_block_t threshold,
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 1abb4a2..357eb27 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -645,7 +645,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	 */
 	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
 	if (r) {
-		DMERR_LIMIT("dm_thin_insert_block() failed");
+		DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d",
+			    dm_device_name(pool->pool_md), r);
+		set_pool_mode(pool, PM_READ_ONLY);
 		cell_error(pool, m->cell);
 		goto out;
 	}
@@ -887,32 +889,23 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 	}
 }
 
-static int commit(struct pool *pool)
-{
-	int r;
-
-	r = dm_pool_commit_metadata(pool->pmd);
-	if (r)
-		DMERR_LIMIT("%s: commit failed: error = %d",
-			    dm_device_name(pool->pool_md), r);
-
-	return r;
-}
-
 /*
  * A non-zero return indicates read_only or fail_io mode.
  * Many callers don't care about the return value.
 */
-static int commit_or_fallback(struct pool *pool)
+static int commit(struct pool *pool)
 {
 	int r;
 
 	if (get_pool_mode(pool) != PM_WRITE)
 		return -EINVAL;
 
-	r = commit(pool);
-	if (r)
+	r = dm_pool_commit_metadata(pool->pmd);
+	if (r) {
+		DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d",
+			    dm_device_name(pool->pool_md), r);
 		set_pool_mode(pool, PM_READ_ONLY);
+	}
 
 	return r;
 }
@@ -949,7 +942,9 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		 * Try to commit to see if that will free up some
		 * more space.
		 */
-		(void) commit_or_fallback(pool);
+		r = commit(pool);
+		if (r)
+			return r;
 
 		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 		if (r)
@@ -963,7 +958,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
		 * table reload).
		 */
		if (!free_blocks) {
-			DMWARN("%s: no free space available.",
+			DMWARN("%s: no free data space available.",
			       dm_device_name(pool->pool_md));
			spin_lock_irqsave(&pool->lock, flags);
			pool->no_free_space = 1;
@@ -973,8 +968,16 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 	}
 
 	r = dm_pool_alloc_data_block(pool->pmd, result);
-	if (r)
+	if (r) {
+		if (r == -ENOSPC &&
+		    !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
+		    !free_blocks) {
+			DMWARN("%s: no free metadata space available.",
+			       dm_device_name(pool->pool_md));
+			set_pool_mode(pool, PM_READ_ONLY);
+		}
 		return r;
+	}
 
 	return 0;
 }
@@ -1355,7 +1358,7 @@ static void process_deferred_bios(struct pool *pool)
 	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
 		return;
 
-	if (commit_or_fallback(pool)) {
+	if (commit(pool)) {
 		while ((bio = bio_list_pop(&bios)))
 			bio_io_error(bio);
 		return;
@@ -1403,6 +1406,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
 	case PM_FAIL:
 		DMERR("%s: switching pool to failure mode",
 		      dm_device_name(pool->pool_md));
+		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_fail;
 		pool->process_discard = process_bio_fail;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
@@ -1427,6 +1431,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
 		break;
 
 	case PM_WRITE:
+		dm_pool_metadata_read_write(pool->pmd);
 		pool->process_bio = process_bio;
 		pool->process_discard = process_discard;
 		pool->process_prepared_mapping = process_prepared_mapping;
@@ -1643,12 +1648,19 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	struct pool_c *pt = ti->private;
 
 	/*
-	 * We want to make sure that degraded pools are never upgraded.
+	 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
 	 */
 	enum pool_mode old_mode = pool->pf.mode;
 	enum pool_mode new_mode = pt->adjusted_pf.mode;
 
-	if (old_mode > new_mode)
+	/*
+	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
+	 * not going to recover without a thin_repair.  So we never let the
+	 * pool move out of the old mode.  On the other hand a PM_READ_ONLY
+	 * may have been due to a lack of metadata or data space, and may
+	 * now work (ie. if the underlying devices have been resized).
+	 */
+	if (old_mode == PM_FAIL)
 		new_mode = old_mode;
 
 	pool->ti = ti;
@@ -2272,7 +2284,7 @@ static int pool_preresume(struct dm_target *ti)
 		return r;
 
 	if (need_commit1 || need_commit2)
-		(void) commit_or_fallback(pool);
+		(void) commit(pool);
 
 	return 0;
 }
@@ -2299,7 +2311,7 @@ static void pool_postsuspend(struct dm_target *ti)
 	cancel_delayed_work(&pool->waker);
 	flush_workqueue(pool->wq);
 
-	(void) commit_or_fallback(pool);
+	(void) commit(pool);
 }
 
 static int check_arg_count(unsigned argc, unsigned args_required)
@@ -2433,7 +2445,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct
 	if (r)
 		return r;
 
-	(void) commit_or_fallback(pool);
+	(void) commit(pool);
 
 	r = dm_pool_reserve_metadata_snap(pool->pmd);
 	if (r)
@@ -2495,7 +2507,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
 		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
 
 	if (!r)
-		(void) commit_or_fallback(pool);
+		(void) commit(pool);
 
 	return r;
 }
@@ -2550,7 +2562,7 @@ static void pool_status(struct dm_target *ti, status_type_t type,
 		/* Commit to ensure statistics aren't out-of-date */
 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
-			(void) commit_or_fallback(pool);
+			(void) commit(pool);
 
 		r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
 		if (r) {
diff --git a/drivers/md/md.c b/drivers/md/md.c
index b07fed3..16d84e0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7765,7 +7765,7 @@ void md_check_recovery(struct mddev *mddev)
 	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return;
 	if ( ! (
-		(mddev->flags & ~ (1<<MD_CHANGE_PENDING)) ||
+		(mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
 		(mddev->external == 0 && mddev->safemode == 1) ||
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index af96e24..1d75b1d 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -317,8 +317,16 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
 	 * The shadow op will often be a noop.  Only insert if it really
	 * copied data.
	 */
-	if (dm_block_location(*block) != b)
+	if (dm_block_location(*block) != b) {
+		/*
+		 * dm_tm_shadow_block will have already decremented the old
+		 * block, but it is still referenced by the btree.  We
+		 * increment to stop the insert decrementing it below zero
+		 * when overwriting the old value.
+		 */
+		dm_tm_inc(info->btree_info.tm, b);
 		r = insert_ablock(info, index, *block, root);
+	}
 
 	return r;
 }
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index a7e8bf2..064a3c2 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -626,6 +626,12 @@ void dm_bm_set_read_only(struct dm_block_manager *bm)
 }
 EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
 
+void dm_bm_set_read_write(struct dm_block_manager *bm)
+{
+	bm->read_only = false;
+}
+EXPORT_SYMBOL_GPL(dm_bm_set_read_write);
+
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
 {
 	return crc32c(~(u32) 0, data, len) ^ init_xor;
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 9a82083..13cd58e1 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -108,9 +108,9 @@ int dm_bm_unlock(struct dm_block *b);
 int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
 			   struct dm_block *superblock);
 
- /*
-  * Request data be prefetched into the cache.
-  */
+/*
+ * Request data is prefetched into the cache.
+ */
 void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
 
@@ -125,6 +125,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
 * be returned if you do.
 */
 void dm_bm_set_read_only(struct dm_block_manager *bm);
+void dm_bm_set_read_write(struct dm_block_manager *bm);
 
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 6058569..466a60b 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -381,7 +381,7 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
 }
 
 static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
-			uint32_t (*mutator)(void *context, uint32_t old),
+			int (*mutator)(void *context, uint32_t old, uint32_t *new),
 			void *context, enum allocation_event *ev)
 {
 	int r;
@@ -410,11 +410,17 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
 
 	if (old > 2) {
 		r = sm_ll_lookup_big_ref_count(ll, b, &old);
-		if (r < 0)
+		if (r < 0) {
+			dm_tm_unlock(ll->tm, nb);
 			return r;
+		}
 	}
 
-	ref_count = mutator(context, old);
+	r = mutator(context, old, &ref_count);
+	if (r) {
+		dm_tm_unlock(ll->tm, nb);
+		return r;
+	}
 
 	if (ref_count <= 2) {
 		sm_set_bitmap(bm_le, bit, ref_count);
@@ -465,9 +471,10 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
 	return ll->save_ie(ll, index, &ie_disk);
 }
 
-static uint32_t set_ref_count(void *context, uint32_t old)
+static int set_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return *((uint32_t *) context);
+	*new = *((uint32_t *) context);
+	return 0;
 }
 
 int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
@@ -476,9 +483,10 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
 	return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev);
 }
 
-static uint32_t inc_ref_count(void *context, uint32_t old)
+static int inc_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return old + 1;
+	*new = old + 1;
+	return 0;
 }
 
 int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
@@ -486,9 +494,15 @@ int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
 	return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev);
 }
 
-static uint32_t dec_ref_count(void *context, uint32_t old)
+static int dec_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return old - 1;
+	if (!old) {
+		DMERR_LIMIT("unable to decrement a reference count below 0");
+		return -EINVAL;
+	}
+
+	*new = old - 1;
+	return 0;
 }
 
 int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 1c95968..58fc1ee 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -384,12 +384,16 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 	int r = sm_metadata_new_block_(sm, b);
 
-	if (r)
+	if (r) {
 		DMERR("unable to allocate new metadata block");
+		return r;
+	}
 
 	r = sm_metadata_get_nr_free(sm, &count);
-	if (r)
+	if (r) {
 		DMERR("couldn't get free block count");
+		return r;
+	}
 
 	check_threshold(&smm->threshold, count);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index bef353c..eea6337 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -678,26 +678,23 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 			} else
 				init_stripe(sh, sector, previous);
 		} else {
+			spin_lock(&conf->device_lock);
 			if (atomic_read(&sh->count)) {
 				BUG_ON(!list_empty(&sh->lru)
 				    && !test_bit(STRIPE_EXPANDING, &sh->state)
 				    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
-				    && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
+				    );
 			} else {
-				spin_lock(&conf->device_lock);
 				if (!test_bit(STRIPE_HANDLE, &sh->state))
 					atomic_inc(&conf->active_stripes);
-				if (list_empty(&sh->lru) &&
-				    !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) &&
-				    !test_bit(STRIPE_EXPANDING, &sh->state))
-					BUG();
+				BUG_ON(list_empty(&sh->lru));
 				list_del_init(&sh->lru);
 				if (sh->group) {
 					sh->group->stripes_cnt--;
 					sh->group = NULL;
 				}
-				spin_unlock(&conf->device_lock);
 			}
+			spin_unlock(&conf->device_lock);
 		}
 	} while (sh == NULL);
@@ -5473,7 +5470,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
 	for (i = 0; i < *group_cnt; i++) {
 		struct r5worker_group *group;
 
-		group = worker_groups[i];
+		group = &(*worker_groups)[i];
 		INIT_LIST_HEAD(&group->handle_list);
 		group->conf = conf;
 		group->workers = workers + i * cnt;
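The dm-space-map-common.c change above turns the sm_ll_mutate() mutator callback from "return the new count" into "return an error and write the new count through an out-parameter", which is what lets dec_ref_count() refuse to drop a reference count below zero. A small standalone sketch of that callback shape follows; the names are illustrative, not the kernel's.

/*
 * Sketch of the reworked mutator callback: report failure through the
 * return value, hand back the new count via an out-parameter.
 */
#include <errno.h>
#include <stdint.h>

typedef int (*mutator_fn)(void *context, uint32_t old, uint32_t *new_count);

static int inc_ref(void *context, uint32_t old, uint32_t *new_count)
{
	(void)context;
	*new_count = old + 1;
	return 0;
}

static int dec_ref(void *context, uint32_t old, uint32_t *new_count)
{
	(void)context;
	if (!old)
		return -EINVAL;		/* refuse to drop below zero */
	*new_count = old - 1;
	return 0;
}

/* caller bails out (and would unlock/roll back) when the mutator fails */
static int mutate(uint32_t *count, mutator_fn fn, void *context)
{
	uint32_t new_count;
	int r = fn(context, *count, &new_count);

	if (r)
		return r;
	*count = new_count;
	return 0;
}

int main(void)
{
	uint32_t count = 0;
	int r;

	r = mutate(&count, inc_ref, 0);			/* count -> 1       */
	r = r ? r : mutate(&count, dec_ref, 0);		/* count -> 0       */
	r = r ? r : mutate(&count, dec_ref, 0);		/* fails: -EINVAL   */
	return r == -EINVAL ? 0 : 1;
}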