author     Linus Torvalds <torvalds@linux-foundation.org>  2014-01-30 11:40:10 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-01-30 11:40:10 -0800
commit     53d8ab29f8f6d67e37857b68189b38fa3d87dd8e (patch)
tree       3c770b58f0404c67b1b084f626dcafa8464c7512 /drivers/md
parent     f568849edac8611d603e00bd6cbbcfea09395ae6 (diff)
parent     14424be4dbfa127001ad623869f7ee4c7635e991 (diff)
Merge branch 'for-3.14/drivers' of git://git.kernel.dk/linux-block
Pull block IO driver changes from Jens Axboe:
- bcache update from Kent Overstreet.
- two bcache fixes from Nicholas Swenson.
- cciss pci init error fix from Andrew.
- underflow fix in the parallel IDE pg_write code from Dan Carpenter.
I'm sure the 1 (or 0) users of that are now happy.
- two PCI related fixes for sx8 from Jingoo Han.
- floppy init fix for first block read from Jiri Kosina.
- pktcdvd error return miss fix from Julia Lawall.
- removal of IRQF_SHARED from the SEGA Dreamcast CD-ROM code from
Michael Opdenacker.
- comment typo fix for the loop driver from Olaf Hering.
- potential oops fix for null_blk from Raghavendra K T.
- two fixes from Sam Bradshaw (Micron) for the mtip32xx driver, fixing
  an OOM problem and a problem with handling security-locked conditions.
* 'for-3.14/drivers' of git://git.kernel.dk/linux-block: (47 commits)
mg_disk: Spelling s/finised/finished/
null_blk: Null pointer dereference problem in alloc_page_buffers
mtip32xx: Correctly handle security locked condition
mtip32xx: Make SGL container per-command to eliminate high order dma allocation
drivers/block/loop.c: fix comment typo in loop_config_discard
drivers/block/cciss.c:cciss_init_one(): use proper errnos
drivers/block/paride/pg.c: underflow bug in pg_write()
drivers/block/sx8.c: remove unnecessary pci_set_drvdata()
drivers/block/sx8.c: use module_pci_driver()
floppy: bail out in open() if drive is not responding to block0 read
bcache: Fix auxiliary search trees for key size > cacheline size
bcache: Don't return -EINTR when insert finished
bcache: Improve bucket_prio() calculation
bcache: Add bch_bkey_equal_header()
bcache: update bch_bkey_try_merge
bcache: Move insert_fixup() to btree_keys_ops
bcache: Convert sorting to btree_keys
bcache: Convert debug code to btree_keys
bcache: Convert btree_iter to struct btree_keys
bcache: Refactor bset_tree sysfs stats
...
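
Several of the bcache commits above ("Move insert_fixup() to btree_keys_ops", "Convert sorting to btree_keys", "Convert btree_iter to struct btree_keys") pull format-specific key handling behind an ops table so the generic bset code no longer reaches into struct btree or struct cache_set. The following is a minimal illustrative sketch of that pattern, using toy names rather than the exact kernel definitions:

```c
/*
 * Illustrative sketch only: format-specific behaviour (extents vs. btree
 * pointers) sits behind a small ops table; generic bset code calls through
 * it. Names and fields are simplified, not the kernel structures.
 */
#include <stdbool.h>
#include <stddef.h>

struct bkey;            /* key layout is irrelevant to the generic code */
struct btree_keys;

struct btree_keys_ops {
	/* try to merge r into l; returns true on success */
	bool (*key_merge)(struct btree_keys *, struct bkey *, struct bkey *);
	/* whether comparisons should use the key's start (extents) or the key */
	bool is_extents;
};

struct btree_keys {
	const struct btree_keys_ops *ops;
	/* ... sorted sets, auxiliary search trees, etc. ... */
};

/*
 * Generic front end: bail out for key types that never merge, otherwise
 * defer to the format-specific hook (compare bch_bkey_try_merge() in the
 * bset.c hunk further down).
 */
static bool try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
{
	if (!b->ops->key_merge)
		return false;
	return b->ops->key_merge(b, l, r);
}
```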
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/Makefile   |    5
-rw-r--r--  drivers/md/bcache/alloc.c    |   89
-rw-r--r--  drivers/md/bcache/bcache.h   |   82
-rw-r--r--  drivers/md/bcache/bset.c     |  904
-rw-r--r--  drivers/md/bcache/bset.h     |  440
-rw-r--r--  drivers/md/bcache/btree.c    |  676
-rw-r--r--  drivers/md/bcache/btree.h    |   62
-rw-r--r--  drivers/md/bcache/closure.c  |   90
-rw-r--r--  drivers/md/bcache/closure.h  |  355
-rw-r--r--  drivers/md/bcache/debug.c    |  247
-rw-r--r--  drivers/md/bcache/debug.h    |   27
-rw-r--r--  drivers/md/bcache/extents.c  |  616
-rw-r--r--  drivers/md/bcache/extents.h  |   13
-rw-r--r--  drivers/md/bcache/journal.c  |   75
-rw-r--r--  drivers/md/bcache/journal.h  |    1
-rw-r--r--  drivers/md/bcache/movinggc.c |    2
-rw-r--r--  drivers/md/bcache/request.c  |   72
-rw-r--r--  drivers/md/bcache/request.h  |   21
-rw-r--r--  drivers/md/bcache/super.c    |  103
-rw-r--r--  drivers/md/bcache/sysfs.c    |   79
-rw-r--r--  drivers/md/bcache/util.h     |    8
-rw-r--r--  drivers/md/raid5.c           |    1
22 files changed, 2213 insertions(+), 1755 deletions(-)
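
One recurring theme in the alloc.c changes that follow is the replacement of a single free FIFO guarded by watermark[] thresholds with one FIFO per reserve (RESERVE_BTREE/PRIO/MOVINGGC/NONE): a freed bucket goes to the first reserve with room, prios first, and an allocation pops from the general pool before falling back to its own reserve. A hedged sketch of that push/pop logic in plain C, with a toy ring buffer standing in for the kernel's fifo macros (names are illustrative):

```c
#include <stdbool.h>

enum alloc_reserve { RESERVE_BTREE, RESERVE_PRIO, RESERVE_MOVINGGC,
		     RESERVE_NONE, RESERVE_NR };

#define FIFO_SIZE 8U

struct fifo {
	long data[FIFO_SIZE];
	unsigned head, tail;            /* push at head, pop at tail */
};

static bool fifo_push(struct fifo *f, long v)
{
	if (f->head - f->tail == FIFO_SIZE)
		return false;                           /* full */
	f->data[f->head++ % FIFO_SIZE] = v;
	return true;
}

static bool fifo_pop(struct fifo *f, long *v)
{
	if (f->head == f->tail)
		return false;                           /* empty */
	*v = f->data[f->tail++ % FIFO_SIZE];
	return true;
}

struct cache { struct fifo free[RESERVE_NR]; };

/* Mirrors bch_allocator_push(): prios/gens are the most important reserve,
 * then fall back to the first reserve that has space. */
static bool allocator_push(struct cache *ca, long bucket)
{
	unsigned i;

	if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
		return true;
	for (i = 0; i < RESERVE_NR; i++)
		if (fifo_push(&ca->free[i], bucket))
			return true;
	return false;
}

/* Mirrors the fast path of bch_bucket_alloc(): try the general pool first,
 * then the caller's own reserve; -1 means the caller has to wait. */
static long bucket_alloc(struct cache *ca, enum alloc_reserve reserve)
{
	long r;

	if (fifo_pop(&ca->free[RESERVE_NONE], &r) ||
	    fifo_pop(&ca->free[reserve], &r))
		return r;
	return -1;
}
```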
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index 0e9c825..c488b84 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -1,7 +1,8 @@ obj-$(CONFIG_BCACHE) += bcache.o -bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ - movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o +bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ + io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o CFLAGS_request.o += -Iblock diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 4c9852d..c0d37d0 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -132,10 +132,16 @@ bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) { BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); - if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && - CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) - return false; + if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) { + unsigned i; + + for (i = 0; i < RESERVE_NONE; i++) + if (!fifo_full(&ca->free[i])) + goto add; + return false; + } +add: b->prio = 0; if (can_inc_bucket_gen(b) && @@ -162,8 +168,21 @@ static void invalidate_one_bucket(struct cache *ca, struct bucket *b) fifo_push(&ca->free_inc, b - ca->buckets); } -#define bucket_prio(b) \ - (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b)) +/* + * Determines what order we're going to reuse buckets, smallest bucket_prio() + * first: we also take into account the number of sectors of live data in that + * bucket, and in order for that multiply to make sense we have to scale bucket + * + * Thus, we scale the bucket priorities so that the bucket with the smallest + * prio is worth 1/8th of what INITIAL_PRIO is worth. 
+ */ + +#define bucket_prio(b) \ +({ \ + unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ + \ + (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ +}) #define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) #define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) @@ -304,6 +323,21 @@ do { \ __set_current_state(TASK_RUNNING); \ } while (0) +static int bch_allocator_push(struct cache *ca, long bucket) +{ + unsigned i; + + /* Prios/gens are actually the most important reserve */ + if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) + return true; + + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) + return true; + + return false; +} + static int bch_allocator_thread(void *arg) { struct cache *ca = arg; @@ -336,9 +370,7 @@ static int bch_allocator_thread(void *arg) mutex_lock(&ca->set->bucket_lock); } - allocator_wait(ca, !fifo_full(&ca->free)); - - fifo_push(&ca->free, bucket); + allocator_wait(ca, bch_allocator_push(ca, bucket)); wake_up(&ca->set->bucket_wait); } @@ -365,34 +397,29 @@ static int bch_allocator_thread(void *arg) } } -long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) +long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) { DEFINE_WAIT(w); struct bucket *b; long r; /* fastpath */ - if (fifo_used(&ca->free) > ca->watermark[watermark]) { - fifo_pop(&ca->free, r); + if (fifo_pop(&ca->free[RESERVE_NONE], r) || + fifo_pop(&ca->free[reserve], r)) goto out; - } if (!wait) return -1; - while (1) { - if (fifo_used(&ca->free) > ca->watermark[watermark]) { - fifo_pop(&ca->free, r); - break; - } - + do { prepare_to_wait(&ca->set->bucket_wait, &w, TASK_UNINTERRUPTIBLE); mutex_unlock(&ca->set->bucket_lock); schedule(); mutex_lock(&ca->set->bucket_lock); - } + } while (!fifo_pop(&ca->free[RESERVE_NONE], r) && + !fifo_pop(&ca->free[reserve], r)); finish_wait(&ca->set->bucket_wait, &w); out: @@ -401,12 +428,14 @@ out: if (expensive_debug_checks(ca->set)) { size_t iter; long i; + unsigned j; for (iter = 0; iter < prio_buckets(ca) * 2; iter++) BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); - fifo_for_each(i, &ca->free, iter) - BUG_ON(i == r); + for (j = 0; j < RESERVE_NR; j++) + fifo_for_each(i, &ca->free[j], iter) + BUG_ON(i == r); fifo_for_each(i, &ca->free_inc, iter) BUG_ON(i == r); fifo_for_each(i, &ca->unused, iter) @@ -419,7 +448,7 @@ out: SET_GC_SECTORS_USED(b, ca->sb.bucket_size); - if (watermark <= WATERMARK_METADATA) { + if (reserve <= RESERVE_PRIO) { SET_GC_MARK(b, GC_MARK_METADATA); SET_GC_MOVE(b, 0); b->prio = BTREE_PRIO; @@ -445,7 +474,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) } } -int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, +int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, struct bkey *k, int n, bool wait) { int i; @@ -459,7 +488,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, for (i = 0; i < n; i++) { struct cache *ca = c->cache_by_alloc[i]; - long b = bch_bucket_alloc(ca, watermark, wait); + long b = bch_bucket_alloc(ca, reserve, wait); if (b == -1) goto err; @@ -478,12 +507,12 @@ err: return -1; } -int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, +int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, struct bkey *k, int n, bool wait) { int ret; mutex_lock(&c->bucket_lock); - ret = __bch_bucket_alloc_set(c, watermark, k, n, wait); + ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); mutex_unlock(&c->bucket_lock); return ret; } @@ -573,8 +602,8 @@ bool 
bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { unsigned watermark = write_prio - ? WATERMARK_MOVINGGC - : WATERMARK_NONE; + ? RESERVE_MOVINGGC + : RESERVE_NONE; spin_unlock(&c->data_bucket_lock); @@ -689,7 +718,7 @@ int bch_cache_allocator_init(struct cache *ca) * Then 8 for btree allocations * Then half for the moving garbage collector */ - +#if 0 ca->watermark[WATERMARK_PRIO] = 0; ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); @@ -699,6 +728,6 @@ int bch_cache_allocator_init(struct cache *ca) ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + ca->watermark[WATERMARK_MOVINGGC]; - +#endif return 0; } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index dbdbca5..0c707e4 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -187,6 +187,7 @@ #include <linux/types.h> #include <linux/workqueue.h> +#include "bset.h" #include "util.h" #include "closure.h" @@ -309,7 +310,8 @@ struct cached_dev { struct cache_sb sb; struct bio sb_bio; struct bio_vec sb_bv[1]; - struct closure_with_waitlist sb_write; + struct closure sb_write; + struct semaphore sb_write_mutex; /* Refcount on the cache set. Always nonzero when we're caching. */ atomic_t count; @@ -382,12 +384,12 @@ struct cached_dev { unsigned writeback_rate_p_term_inverse; }; -enum alloc_watermarks { - WATERMARK_PRIO, - WATERMARK_METADATA, - WATERMARK_MOVINGGC, - WATERMARK_NONE, - WATERMARK_MAX +enum alloc_reserve { + RESERVE_BTREE, + RESERVE_PRIO, + RESERVE_MOVINGGC, + RESERVE_NONE, + RESERVE_NR, }; struct cache { @@ -399,8 +401,6 @@ struct cache { struct kobject kobj; struct block_device *bdev; - unsigned watermark[WATERMARK_MAX]; - struct task_struct *alloc_thread; struct closure prio; @@ -429,7 +429,7 @@ struct cache { * because all the data they contained was overwritten), so we only * need to discard them before they can be moved to the free list. 
*/ - DECLARE_FIFO(long, free); + DECLARE_FIFO(long, free)[RESERVE_NR]; DECLARE_FIFO(long, free_inc); DECLARE_FIFO(long, unused); @@ -514,7 +514,8 @@ struct cache_set { uint64_t cached_dev_sectors; struct closure caching; - struct closure_with_waitlist sb_write; + struct closure sb_write; + struct semaphore sb_write_mutex; mempool_t *search; mempool_t *bio_meta; @@ -629,13 +630,15 @@ struct cache_set { #ifdef CONFIG_BCACHE_DEBUG struct btree *verify_data; + struct bset *verify_ondisk; struct mutex verify_lock; #endif unsigned nr_uuids; struct uuid_entry *uuids; BKEY_PADDED(uuid_bucket); - struct closure_with_waitlist uuid_write; + struct closure uuid_write; + struct semaphore uuid_write_mutex; /* * A btree node on disk could have too many bsets for an iterator to fit @@ -643,13 +646,7 @@ struct cache_set { */ mempool_t *fill_iter; - /* - * btree_sort() is a merge sort and requires temporary space - single - * element mempool - */ - struct mutex sort_lock; - struct bset *sort; - unsigned sort_crit_factor; + struct bset_sort_state sort; /* List of buckets we're currently writing data to */ struct list_head data_buckets; @@ -665,7 +662,6 @@ struct cache_set { unsigned congested_read_threshold_us; unsigned congested_write_threshold_us; - struct time_stats sort_time; struct time_stats btree_gc_time; struct time_stats btree_split_time; struct time_stats btree_read_time; @@ -683,9 +679,9 @@ struct cache_set { unsigned error_decay; unsigned short journal_delay_ms; + bool expensive_debug_checks; unsigned verify:1; unsigned key_merging_disabled:1; - unsigned expensive_debug_checks:1; unsigned gc_always_rewrite:1; unsigned shrinker_disabled:1; unsigned copy_gc_enabled:1; @@ -707,13 +703,8 @@ struct bbio { struct bio bio; }; -static inline unsigned local_clock_us(void) -{ - return local_clock() >> 10; -} - #define BTREE_PRIO USHRT_MAX -#define INITIAL_PRIO 32768 +#define INITIAL_PRIO 32768U #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) #define btree_blocks(b) \ @@ -726,21 +717,6 @@ static inline unsigned local_clock_us(void) #define bucket_bytes(c) ((c)->sb.bucket_size << 9) #define block_bytes(c) ((c)->sb.block_size << 9) -#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) -#define set_bytes(i) __set_bytes(i, i->keys) - -#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c)) -#define set_blocks(i, c) __set_blocks(i, (i)->keys, c) - -#define node(i, j) ((struct bkey *) ((i)->d + (j))) -#define end(i) node(i, (i)->keys) - -#define index(i, b) \ - ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \ - block_bytes(b->c))) - -#define btree_data_space(b) (PAGE_SIZE << (b)->page_order) - #define prios_per_bucket(c) \ ((bucket_bytes(c) - sizeof(struct prio_set)) / \ sizeof(struct bucket_disk)) @@ -783,20 +759,34 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); } -/* Btree key macros */ +static inline uint8_t gen_after(uint8_t a, uint8_t b) +{ + uint8_t r = a - b; + return r > 128U ? 
0 : r; +} -static inline void bkey_init(struct bkey *k) +static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, + unsigned i) { - *k = ZERO_KEY; + return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); } +static inline bool ptr_available(struct cache_set *c, const struct bkey *k, + unsigned i) +{ + return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); +} + +/* Btree key macros */ + /* * This is used for various on disk data structures - cache_sb, prio_set, bset, * jset: The checksum is _always_ the first 8 bytes of these structs */ #define csum_set(i) \ bch_crc64(((void *) (i)) + sizeof(uint64_t), \ - ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) + ((void *) bset_bkey_last(i)) - \ + (((void *) (i)) + sizeof(uint64_t))) /* Error handling macros */ diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 7d388b8..4f6b594 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -5,30 +5,134 @@ * Copyright 2012 Google, Inc. */ -#include "bcache.h" -#include "btree.h" -#include "debug.h" +#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ +#include "util.h" +#include "bset.h" + +#include <linux/console.h> #include <linux/random.h> #include <linux/prefetch.h> +#ifdef CONFIG_BCACHE_DEBUG + +void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) +{ + struct bkey *k, *next; + + for (k = i->start; k < bset_bkey_last(i); k = next) { + next = bkey_next(k); + + printk(KERN_ERR "block %u key %zi/%u: ", set, + (uint64_t *) k - i->d, i->keys); + + if (b->ops->key_dump) + b->ops->key_dump(b, k); + else + printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); + + if (next < bset_bkey_last(i) && + bkey_cmp(k, b->ops->is_extents ? + &START_KEY(next) : next) > 0) + printk(KERN_ERR "Key skipped backwards\n"); + } +} + +void bch_dump_bucket(struct btree_keys *b) +{ + unsigned i; + + console_lock(); + for (i = 0; i <= b->nsets; i++) + bch_dump_bset(b, b->set[i].data, + bset_sector_offset(b, b->set[i].data)); + console_unlock(); +} + +int __bch_count_data(struct btree_keys *b) +{ + unsigned ret = 0; + struct btree_iter iter; + struct bkey *k; + + if (b->ops->is_extents) + for_each_key(b, k, &iter) + ret += KEY_SIZE(k); + return ret; +} + +void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) +{ + va_list args; + struct bkey *k, *p = NULL; + struct btree_iter iter; + const char *err; + + for_each_key(b, k, &iter) { + if (b->ops->is_extents) { + err = "Keys out of order"; + if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) + goto bug; + + if (bch_ptr_invalid(b, k)) + continue; + + err = "Overlapping keys"; + if (p && bkey_cmp(p, &START_KEY(k)) > 0) + goto bug; + } else { + if (bch_ptr_bad(b, k)) + continue; + + err = "Duplicate keys"; + if (p && !bkey_cmp(p, k)) + goto bug; + } + p = k; + } +#if 0 + err = "Key larger than btree node key"; + if (p && bkey_cmp(p, &b->key) > 0) + goto bug; +#endif + return; +bug: + bch_dump_bucket(b); + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + panic("bch_check_keys error: %s:\n", err); +} + +static void bch_btree_iter_next_check(struct btree_iter *iter) +{ + struct bkey *k = iter->data->k, *next = bkey_next(k); + + if (next < iter->data->end && + bkey_cmp(k, iter->b->ops->is_extents ? 
+ &START_KEY(next) : next) > 0) { + bch_dump_bucket(iter->b); + panic("Key skipped backwards\n"); + } +} + +#else + +static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} + +#endif + /* Keylists */ -int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) +int __bch_keylist_realloc(struct keylist *l, unsigned u64s) { size_t oldsize = bch_keylist_nkeys(l); - size_t newsize = oldsize + 2 + nptrs; + size_t newsize = oldsize + u64s; uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; uint64_t *new_keys; - /* The journalling code doesn't handle the case where the keys to insert - * is bigger than an empty write: If we just return -ENOMEM here, - * bio_insert() and bio_invalidate() will insert the keys created so far - * and finish the rest when the keylist is empty. - */ - if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) - return -ENOMEM; - newsize = roundup_pow_of_two(newsize); if (newsize <= KEYLIST_INLINE || @@ -71,136 +175,6 @@ void bch_keylist_pop_front(struct keylist *l) bch_keylist_bytes(l)); } -/* Pointer validation */ - -static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) -{ - unsigned i; - - for (i = 0; i < KEY_PTRS(k); i++) - if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); - size_t bucket = PTR_BUCKET_NR(c, k, i); - size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); - - if (KEY_SIZE(k) + r > c->sb.bucket_size || - bucket < ca->sb.first_bucket || - bucket >= ca->sb.nbuckets) - return true; - } - - return false; -} - -bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) -{ - char buf[80]; - - if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) - goto bad; - - if (__ptr_invalid(c, k)) - goto bad; - - return false; -bad: - bch_bkey_to_text(buf, sizeof(buf), k); - cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); - return true; -} - -bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k) -{ - char buf[80]; - - if (!KEY_SIZE(k)) - return true; - - if (KEY_SIZE(k) > KEY_OFFSET(k)) - goto bad; - - if (__ptr_invalid(c, k)) - goto bad; - - return false; -bad: - bch_bkey_to_text(buf, sizeof(buf), k); - cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); - return true; -} - -static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k, - unsigned ptr) -{ - struct bucket *g = PTR_BUCKET(b->c, k, ptr); - char buf[80]; - - if (mutex_trylock(&b->c->bucket_lock)) { - if (b->level) { - if (KEY_DIRTY(k) || - g->prio != BTREE_PRIO || - (b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_METADATA)) - goto err; - - } else { - if (g->prio == BTREE_PRIO) - goto err; - - if (KEY_DIRTY(k) && - b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_DIRTY) - goto err; - } - mutex_unlock(&b->c->bucket_lock); - } - - return false; -err: - mutex_unlock(&b->c->bucket_lock); - bch_bkey_to_text(buf, sizeof(buf), k); - btree_bug(b, -"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", - buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); - return true; -} - -bool bch_ptr_bad(struct btree *b, const struct bkey *k) -{ - struct bucket *g; - unsigned i, stale; - - if (!bkey_cmp(k, &ZERO_KEY) || - !KEY_PTRS(k) || - bch_ptr_invalid(b, k)) - return true; - - for (i = 0; i < KEY_PTRS(k); i++) { - if (!ptr_available(b->c, k, i)) - return true; - - g = PTR_BUCKET(b->c, k, i); - stale = ptr_stale(b->c, k, i); - - btree_bug_on(stale > 96, b, - "key too 
stale: %i, need_gc %u", - stale, b->c->need_gc); - - btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), - b, "stale dirty pointer"); - - if (stale) - return true; - - if (expensive_debug_checks(b->c) && - ptr_bad_expensive_checks(b, k, i)) - return true; - } - - return false; -} - /* Key/pointer manipulation */ void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, @@ -255,56 +229,138 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k) return true; } -static uint64_t merge_chksums(struct bkey *l, struct bkey *r) +/* Auxiliary search trees */ + +/* 32 bits total: */ +#define BKEY_MID_BITS 3 +#define BKEY_EXPONENT_BITS 7 +#define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS) +#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) + +struct bkey_float { + unsigned exponent:BKEY_EXPONENT_BITS; + unsigned m:BKEY_MID_BITS; + unsigned mantissa:BKEY_MANTISSA_BITS; +} __packed; + +/* + * BSET_CACHELINE was originally intended to match the hardware cacheline size - + * it used to be 64, but I realized the lookup code would touch slightly less + * memory if it was 128. + * + * It definites the number of bytes (in struct bset) per struct bkey_float in + * the auxiliar search tree - when we're done searching the bset_float tree we + * have this many bytes left that we do a linear search over. + * + * Since (after level 5) every level of the bset_tree is on a new cacheline, + * we're touching one fewer cacheline in the bset tree in exchange for one more + * cacheline in the linear search - but the linear search might stop before it + * gets to the second cacheline. + */ + +#define BSET_CACHELINE 128 + +/* Space required for the btree node keys */ +static inline size_t btree_keys_bytes(struct btree_keys *b) { - return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & - ~((uint64_t)1 << 63); + return PAGE_SIZE << b->page_order; } -/* Tries to merge l and r: l should be lower than r - * Returns true if we were able to merge. If we did merge, l will be the merged - * key, r will be untouched. 
- */ -bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r) +static inline size_t btree_keys_cachelines(struct btree_keys *b) { - unsigned i; + return btree_keys_bytes(b) / BSET_CACHELINE; +} - if (key_merging_disabled(b->c)) - return false; +/* Space required for the auxiliary search trees */ +static inline size_t bset_tree_bytes(struct btree_keys *b) +{ + return btree_keys_cachelines(b) * sizeof(struct bkey_float); +} - if (KEY_PTRS(l) != KEY_PTRS(r) || - KEY_DIRTY(l) != KEY_DIRTY(r) || - bkey_cmp(l, &START_KEY(r))) - return false; +/* Space required for the prev pointers */ +static inline size_t bset_prev_bytes(struct btree_keys *b) +{ + return btree_keys_cachelines(b) * sizeof(uint8_t); +} - for (i = 0; i < KEY_PTRS(l); i++) - if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || - PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) - return false; +/* Memory allocation */ - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ - if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { - SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); - SET_KEY_SIZE(l, USHRT_MAX); +void bch_btree_keys_free(struct btree_keys *b) +{ + struct bset_tree *t = b->set; - bch_cut_front(l, r); - return false; - } + if (bset_prev_bytes(b) < PAGE_SIZE) + kfree(t->prev); + else + free_pages((unsigned long) t->prev, + get_order(bset_prev_bytes(b))); - if (KEY_CSUM(l)) { - if (KEY_CSUM(r)) - l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); - else - SET_KEY_CSUM(l, 0); - } + if (bset_tree_bytes(b) < PAGE_SIZE) + kfree(t->tree); + else + free_pages((unsigned long) t->tree, + get_order(bset_tree_bytes(b))); - SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); - SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); + free_pages((unsigned long) t->data, b->page_order); - return true; + t->prev = NULL; + t->tree = NULL; + t->data = NULL; +} +EXPORT_SYMBOL(bch_btree_keys_free); + +int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp) +{ + struct bset_tree *t = b->set; + + BUG_ON(t->data); + + b->page_order = page_order; + + t->data = (void *) __get_free_pages(gfp, b->page_order); + if (!t->data) + goto err; + + t->tree = bset_tree_bytes(b) < PAGE_SIZE + ? kmalloc(bset_tree_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); + if (!t->tree) + goto err; + + t->prev = bset_prev_bytes(b) < PAGE_SIZE + ? 
kmalloc(bset_prev_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); + if (!t->prev) + goto err; + + return 0; +err: + bch_btree_keys_free(b); + return -ENOMEM; } +EXPORT_SYMBOL(bch_btree_keys_alloc); + +void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, + bool *expensive_debug_checks) +{ + unsigned i; + + b->ops = ops; + b->expensive_debug_checks = expensive_debug_checks; + b->nsets = 0; + b->last_set_unwritten = 0; + + /* XXX: shouldn't be needed */ + for (i = 0; i < MAX_BSETS; i++) + b->set[i].size = 0; + /* + * Second loop starts at 1 because b->keys[0]->data is the memory we + * allocated + */ + for (i = 1; i < MAX_BSETS; i++) + b->set[i].data = NULL; +} +EXPORT_SYMBOL(bch_btree_keys_init); /* Binary tree stuff for auxiliary search trees */ @@ -455,9 +511,11 @@ static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k) return ((void *) k - (void *) t->data) / BSET_CACHELINE; } -static unsigned bkey_to_cacheline_offset(struct bkey *k) +static unsigned bkey_to_cacheline_offset(struct bset_tree *t, + unsigned cacheline, + struct bkey *k) { - return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t); + return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0); } static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) @@ -504,7 +562,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j) : tree_to_prev_bkey(t, j >> ffs(j)); struct bkey *r = is_power_of_2(j + 1) - ? node(t->data, t->data->keys - bkey_u64s(&t->end)) + ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end)) : tree_to_bkey(t, j >> (ffz(j) + 1)); BUG_ON(m < l || m > r); @@ -528,9 +586,9 @@ static void make_bfloat(struct bset_tree *t, unsigned j) f->exponent = 127; } -static void bset_alloc_tree(struct btree *b, struct bset_tree *t) +static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t) { - if (t != b->sets) { + if (t != b->set) { unsigned j = roundup(t[-1].size, 64 / sizeof(struct bkey_float)); @@ -538,33 +596,54 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t) t->prev = t[-1].prev + j; } - while (t < b->sets + MAX_BSETS) + while (t < b->set + MAX_BSETS) t++->size = 0; } -static void bset_build_unwritten_tree(struct btree *b) +static void bch_bset_build_unwritten_tree(struct btree_keys *b) { - struct bset_tree *t = b->sets + b->nsets; + struct bset_tree *t = bset_tree_last(b); + + BUG_ON(b->last_set_unwritten); + b->last_set_unwritten = 1; bset_alloc_tree(b, t); - if (t->tree != b->sets->tree + bset_tree_space(b)) { - t->prev[0] = bkey_to_cacheline_offset(t->data->start); + if (t->tree != b->set->tree + btree_keys_cachelines(b)) { + t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start); t->size = 1; } } -static void bset_build_written_tree(struct btree *b) +void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) +{ + if (i != b->set->data) { + b->set[++b->nsets].data = i; + i->seq = b->set->data->seq; + } else + get_random_bytes(&i->seq, sizeof(uint64_t)); + + i->magic = magic; + i->version = 0; + i->keys = 0; + + bch_bset_build_unwritten_tree(b); +} +EXPORT_SYMBOL(bch_bset_init_next); + +void bch_bset_build_written_tree(struct btree_keys *b) { - struct bset_tree *t = b->sets + b->nsets; - struct bkey *k = t->data->start; + struct bset_tree *t = bset_tree_last(b); + struct bkey *prev = NULL, *k = t->data->start; unsigned j, cacheline = 1; + b->last_set_unwritten = 0; + bset_alloc_tree(b, t); t->size = min_t(unsigned, - bkey_to_cacheline(t, end(t->data)), - 
b->sets->tree + bset_tree_space(b) - t->tree); + bkey_to_cacheline(t, bset_bkey_last(t->data)), + b->set->tree + btree_keys_cachelines(b) - t->tree); if (t->size < 2) { t->size = 0; @@ -577,16 +656,14 @@ static void bset_build_written_tree(struct btree *b) for (j = inorder_next(0, t->size); j; j = inorder_next(j, t->size)) { - while (bkey_to_cacheline(t, k) != cacheline) - k = bkey_next(k); + while (bkey_to_cacheline(t, k) < cacheline) + prev = k, k = bkey_next(k); - t->prev[j] = bkey_u64s(k); - k = bkey_next(k); - cacheline++; - t->tree[j].m = bkey_to_cacheline_offset(k); + t->prev[j] = bkey_u64s(prev); + t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k); } - while (bkey_next(k) != end(t->data)) + while (bkey_next(k) != bset_bkey_last(t->data)) k = bkey_next(k); t->end = *k; @@ -597,14 +674,17 @@ static void bset_build_written_tree(struct btree *b) j = inorder_next(j, t->size)) make_bfloat(t, j); } +EXPORT_SYMBOL(bch_bset_build_written_tree); -void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k) +/* Insert */ + +void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k) { struct bset_tree *t; unsigned inorder, j = 1; - for (t = b->sets; t <= &b->sets[b->nsets]; t++) - if (k < end(t->data)) + for (t = b->set; t <= bset_tree_last(b); t++) + if (k < bset_bkey_last(t->data)) goto found_set; BUG(); @@ -617,7 +697,7 @@ found_set: if (k == t->data->start) goto fix_left; - if (bkey_next(k) == end(t->data)) { + if (bkey_next(k) == bset_bkey_last(t->data)) { t->end = *k; goto fix_right; } @@ -642,10 +722,12 @@ fix_right: do { j = j * 2 + 1; } while (j < t->size); } +EXPORT_SYMBOL(bch_bset_fix_invalidated_key); -void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) +static void bch_bset_fix_lookup_table(struct btree_keys *b, + struct bset_tree *t, + struct bkey *k) { - struct bset_tree *t = &b->sets[b->nsets]; unsigned shift = bkey_u64s(k); unsigned j = bkey_to_cacheline(t, k); @@ -657,8 +739,8 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) * lookup table for the first key that is strictly greater than k: * it's either k's cacheline or the next one */ - if (j < t->size && - table_to_bkey(t, j) <= k) + while (j < t->size && + table_to_bkey(t, j) <= k) j++; /* Adjust all the lookup table entries, and find a new key for any that @@ -673,54 +755,124 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) while (k < cacheline_to_bkey(t, j, 0)) k = bkey_next(k); - t->prev[j] = bkey_to_cacheline_offset(k); + t->prev[j] = bkey_to_cacheline_offset(t, j, k); } } - if (t->size == b->sets->tree + bset_tree_space(b) - t->tree) + if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree) return; /* Possibly add a new entry to the end of the lookup table */ for (k = table_to_bkey(t, t->size - 1); - k != end(t->data); + k != bset_bkey_last(t->data); k = bkey_next(k)) if (t->size == bkey_to_cacheline(t, k)) { - t->prev[t->size] = bkey_to_cacheline_offset(k); + t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k); t->size++; } } -void bch_bset_init_next(struct btree *b) +/* + * Tries to merge l and r: l should be lower than r + * Returns true if we were able to merge. If we did merge, l will be the merged + * key, r will be untouched. 
+ */ +bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r) { - struct bset *i = write_block(b); + if (!b->ops->key_merge) + return false; - if (i != b->sets[0].data) { - b->sets[++b->nsets].data = i; - i->seq = b->sets[0].data->seq; - } else - get_random_bytes(&i->seq, sizeof(uint64_t)); + /* + * Generic header checks + * Assumes left and right are in order + * Left and right must be exactly aligned + */ + if (!bch_bkey_equal_header(l, r) || + bkey_cmp(l, &START_KEY(r))) + return false; - i->magic = bset_magic(&b->c->sb); - i->version = 0; - i->keys = 0; + return b->ops->key_merge(b, l, r); +} +EXPORT_SYMBOL(bch_bkey_try_merge); - bset_build_unwritten_tree(b); +void bch_bset_insert(struct btree_keys *b, struct bkey *where, + struct bkey *insert) +{ + struct bset_tree *t = bset_tree_last(b); + + BUG_ON(!b->last_set_unwritten); + BUG_ON(bset_byte_offset(b, t->data) + + __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) > + PAGE_SIZE << b->page_order); + + memmove((uint64_t *) where + bkey_u64s(insert), + where, + (void *) bset_bkey_last(t->data) - (void *) where); + + t->data->keys += bkey_u64s(insert); + bkey_copy(where, insert); + bch_bset_fix_lookup_table(b, t, where); } +EXPORT_SYMBOL(bch_bset_insert); + +unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k, + struct bkey *replace_key) +{ + unsigned status = BTREE_INSERT_STATUS_NO_INSERT; + struct bset *i = bset_tree_last(b)->data; + struct bkey *m, *prev = NULL; + struct btree_iter iter; + + BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); + + m = bch_btree_iter_init(b, &iter, b->ops->is_extents + ? PRECEDING_KEY(&START_KEY(k)) + : PRECEDING_KEY(k)); + + if (b->ops->insert_fixup(b, k, &iter, replace_key)) + return status; + + status = BTREE_INSERT_STATUS_INSERT; + + while (m != bset_bkey_last(i) && + bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) + prev = m, m = bkey_next(m); + + /* prev is in the tree, if we merge we're done */ + status = BTREE_INSERT_STATUS_BACK_MERGE; + if (prev && + bch_bkey_try_merge(b, prev, k)) + goto merged; +#if 0 + status = BTREE_INSERT_STATUS_OVERWROTE; + if (m != bset_bkey_last(i) && + KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) + goto copy; +#endif + status = BTREE_INSERT_STATUS_FRONT_MERGE; + if (m != bset_bkey_last(i) && + bch_bkey_try_merge(b, k, m)) + goto copy; + + bch_bset_insert(b, m, k); +copy: bkey_copy(m, k); +merged: + return status; +} +EXPORT_SYMBOL(bch_btree_insert_key); + +/* Lookup */ struct bset_search_iter { struct bkey *l, *r; }; -static struct bset_search_iter bset_search_write_set(struct btree *b, - struct bset_tree *t, +static struct bset_search_iter bset_search_write_set(struct bset_tree *t, const struct bkey *search) { unsigned li = 0, ri = t->size; - BUG_ON(!b->nsets && - t->size < bkey_to_cacheline(t, end(t->data))); - while (li + 1 != ri) { unsigned m = (li + ri) >> 1; @@ -732,12 +884,11 @@ static struct bset_search_iter bset_search_write_set(struct btree *b, return (struct bset_search_iter) { table_to_bkey(t, li), - ri < t->size ? table_to_bkey(t, ri) : end(t->data) + ri < t->size ? 
table_to_bkey(t, ri) : bset_bkey_last(t->data) }; } -static struct bset_search_iter bset_search_tree(struct btree *b, - struct bset_tree *t, +static struct bset_search_iter bset_search_tree(struct bset_tree *t, const struct bkey *search) { struct bkey *l, *r; @@ -784,7 +935,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b, f = &t->tree[inorder_next(j, t->size)]; r = cacheline_to_bkey(t, inorder, f->m); } else - r = end(t->data); + r = bset_bkey_last(t->data); } else { r = cacheline_to_bkey(t, inorder, f->m); @@ -798,7 +949,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b, return (struct bset_search_iter) {l, r}; } -struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, +struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, const struct bkey *search) { struct bset_search_iter i; @@ -820,7 +971,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, if (unlikely(!t->size)) { i.l = t->data->start; - i.r = end(t->data); + i.r = bset_bkey_last(t->data); } else if (bset_written(b, t)) { /* * Each node in the auxiliary search tree covers a certain range @@ -830,23 +981,27 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, */ if (unlikely(bkey_cmp(search, &t->end) >= 0)) - return end(t->data); + return bset_bkey_last(t->data); if (unlikely(bkey_cmp(search, t->data->start) < 0)) return t->data->start; - i = bset_search_tree(b, t, search); - } else - i = bset_search_write_set(b, t, search); + i = bset_search_tree(t, search); + } else { + BUG_ON(!b->nsets && + t->size < bkey_to_cacheline(t, bset_bkey_last(t->data))); - if (expensive_debug_checks(b->c)) { + i = bset_search_write_set(t, search); + } + + if (btree_keys_expensive_checks(b)) { BUG_ON(bset_written(b, t) && i.l != t->data->start && bkey_cmp(tree_to_prev_bkey(t, inorder_to_tree(bkey_to_cacheline(t, i.l), t)), search) > 0); - BUG_ON(i.r != end(t->data) && + BUG_ON(i.r != bset_bkey_last(t->data) && bkey_cmp(i.r, search) <= 0); } @@ -856,22 +1011,17 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, return i.l; } +EXPORT_SYMBOL(__bch_bset_search); /* Btree iterator */ -/* - * Returns true if l > r - unless l == r, in which case returns true if l is - * older than r. - * - * Necessary for btree_sort_fixup() - if there are multiple keys that compare - * equal in different sets, we have to process them newest to oldest. - */ +typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, + struct btree_iter_set); + static inline bool btree_iter_cmp(struct btree_iter_set l, struct btree_iter_set r) { - int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); - - return c ? 
c > 0 : l.k < r.k; + return bkey_cmp(l.k, r.k) > 0; } static inline bool btree_iter_end(struct btree_iter *iter) @@ -888,8 +1038,10 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, btree_iter_cmp)); } -struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, - struct bkey *search, struct bset_tree *start) +static struct bkey *__bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search, + struct bset_tree *start) { struct bkey *ret = NULL; iter->size = ARRAY_SIZE(iter->data); @@ -899,15 +1051,24 @@ struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, iter->b = b; #endif - for (; start <= &b->sets[b->nsets]; start++) { + for (; start <= bset_tree_last(b); start++) { ret = bch_bset_search(b, start, search); - bch_btree_iter_push(iter, ret, end(start->data)); + bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); } return ret; } -struct bkey *bch_btree_iter_next(struct btree_iter *iter) +struct bkey *bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search) +{ + return __bch_btree_iter_init(b, iter, search, b->set); +} +EXPORT_SYMBOL(bch_btree_iter_init); + +static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, + btree_iter_cmp_fn *cmp) { struct btree_iter_set unused; struct bkey *ret = NULL; @@ -924,16 +1085,23 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) } if (iter->data->k == iter->data->end) - heap_pop(iter, unused, btree_iter_cmp); + heap_pop(iter, unused, cmp); else - heap_sift(iter, 0, btree_iter_cmp); + heap_sift(iter, 0, cmp); } return ret; } +struct bkey *bch_btree_iter_next(struct btree_iter *iter) +{ + return __bch_btree_iter_next(iter, btree_iter_cmp); + +} +EXPORT_SYMBOL(bch_btree_iter_next); + struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, - struct btree *b, ptr_filter_fn fn) + struct btree_keys *b, ptr_filter_fn fn) { struct bkey *ret; @@ -946,70 +1114,58 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, /* Mergesort */ -static void sort_key_next(struct btree_iter *iter, - struct btree_iter_set *i) +void bch_bset_sort_state_free(struct bset_sort_state *state) { - i->k = bkey_next(i->k); - - if (i->k == i->end) - *i = iter->data[--iter->used]; + if (state->pool) + mempool_destroy(state->pool); } -static void btree_sort_fixup(struct btree_iter *iter) +int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order) { - while (iter->used > 1) { - struct btree_iter_set *top = iter->data, *i = top + 1; + spin_lock_init(&state->time.lock); - if (iter->used > 2 && - btree_iter_cmp(i[0], i[1])) - i++; + state->page_order = page_order; + state->crit_factor = int_sqrt(1 << page_order); - if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) - break; - - if (!KEY_SIZE(i->k)) { - sort_key_next(iter, i); - heap_sift(iter, i - top, btree_iter_cmp); - continue; - } - - if (top->k > i->k) { - if (bkey_cmp(top->k, i->k) >= 0) - sort_key_next(iter, i); - else - bch_cut_front(top->k, i->k); + state->pool = mempool_create_page_pool(1, page_order); + if (!state->pool) + return -ENOMEM; - heap_sift(iter, i - top, btree_iter_cmp); - } else { - /* can't happen because of comparison func */ - BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); - bch_cut_back(&START_KEY(i->k), top->k); - } - } + return 0; } +EXPORT_SYMBOL(bch_bset_sort_state_init); -static void btree_mergesort(struct btree *b, struct bset *out, +static void btree_mergesort(struct btree_keys *b, struct bset *out, 
struct btree_iter *iter, bool fixup, bool remove_stale) { + int i; struct bkey *k, *last = NULL; - bool (*bad)(struct btree *, const struct bkey *) = remove_stale + BKEY_PADDED(k) tmp; + bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale ? bch_ptr_bad : bch_ptr_invalid; + /* Heapify the iterator, using our comparison function */ + for (i = iter->used / 2 - 1; i >= 0; --i) + heap_sift(iter, i, b->ops->sort_cmp); + while (!btree_iter_end(iter)) { - if (fixup && !b->level) - btree_sort_fixup(iter); + if (b->ops->sort_fixup && fixup) + k = b->ops->sort_fixup(iter, &tmp.k); + else + k = NULL; + + if (!k) + k = __bch_btree_iter_next(iter, b->ops->sort_cmp); - k = bch_btree_iter_next(iter); if (bad(b, k)) continue; if (!last) { last = out->start; bkey_copy(last, k); - } else if (b->level || - !bch_bkey_try_merge(b, last, k)) { + } else if (!bch_bkey_try_merge(b, last, k)) { last = bkey_next(last); bkey_copy(last, k); } @@ -1020,27 +1176,27 @@ static void btree_mergesort(struct btree *b, struct bset *out, pr_debug("sorted %i keys", out->keys); } -static void __btree_sort(struct btree *b, struct btree_iter *iter, - unsigned start, unsigned order, bool fixup) +static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, + unsigned start, unsigned order, bool fixup, + struct bset_sort_state *state) { uint64_t start_time; - bool remove_stale = !b->written; + bool used_mempool = false; struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, order); if (!out) { - mutex_lock(&b->c->sort_lock); - out = b->c->sort; - order = ilog2(bucket_pages(b->c)); + BUG_ON(order > state->page_order); + + out = page_address(mempool_alloc(state->pool, GFP_NOIO)); + used_mempool = true; + order = state->page_order; } start_time = local_clock(); - btree_mergesort(b, out, iter, fixup, remove_stale); + btree_mergesort(b, out, iter, fixup, false); b->nsets = start; - if (!fixup && !start && b->written) - bch_btree_verify(b, out); - if (!start && order == b->page_order) { /* * Our temporary buffer is the same size as the btree node's @@ -1048,84 +1204,76 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, * memcpy() */ - out->magic = bset_magic(&b->c->sb); - out->seq = b->sets[0].data->seq; - out->version = b->sets[0].data->version; - swap(out, b->sets[0].data); - - if (b->c->sort == b->sets[0].data) - b->c->sort = out; + out->magic = b->set->data->magic; + out->seq = b->set->data->seq; + out->version = b->set->data->version; + swap(out, b->set->data); } else { - b->sets[start].data->keys = out->keys; - memcpy(b->sets[start].data->start, out->start, - (void *) end(out) - (void *) out->start); + b->set[start].data->keys = out->keys; + memcpy(b->set[start].data->start, out->start, + (void *) bset_bkey_last(out) - (void *) out->start); } - if (out == b->c->sort) - mutex_unlock(&b->c->sort_lock); + if (used_mempool) + mempool_free(virt_to_page(out), state->pool); else free_pages((unsigned long) out, order); - if (b->written) - bset_build_written_tree(b); + bch_bset_build_written_tree(b); if (!start) - bch_time_stats_update(&b->c->sort_time, start_time); + bch_time_stats_update(&state->time, start_time); } -void bch_btree_sort_partial(struct btree *b, unsigned start) +void bch_btree_sort_partial(struct btree_keys *b, unsigned start, + struct bset_sort_state *state) { size_t order = b->page_order, keys = 0; struct btree_iter iter; int oldsize = bch_count_data(b); - __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); - - BUG_ON(b->sets[b->nsets].data == write_block(b) && 
- (b->sets[b->nsets].size || b->nsets)); - + __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); if (start) { unsigned i; for (i = start; i <= b->nsets; i++) - keys += b->sets[i].data->keys; + keys += b->set[i].data->keys; - order = roundup_pow_of_two(__set_bytes(b->sets->data, - keys)) / PAGE_SIZE; - if (order) - order = ilog2(order); + order = get_order(__set_bytes(b->set->data, keys)); } - __btree_sort(b, &iter, start, order, false); + __btree_sort(b, &iter, start, order, false, state); - EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); + EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } +EXPORT_SYMBOL(bch_btree_sort_partial); -void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) +void bch_btree_sort_and_fix_extents(struct btree_keys *b, + struct btree_iter *iter, + struct bset_sort_state *state) { - BUG_ON(!b->written); - __btree_sort(b, iter, 0, b->page_order, true); + __btree_sort(b, iter, 0, b->page_order, true, state); } -void bch_btree_sort_into(struct btree *b, struct btree *new) +void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, + struct bset_sort_state *state) { uint64_t start_time = local_clock(); struct btree_iter iter; bch_btree_iter_init(b, &iter, NULL); - btree_mergesort(b, new->sets->data, &iter, false, true); + btree_mergesort(b, new->set->data, &iter, false, true); - bch_time_stats_update(&b->c->sort_time, start_time); + bch_time_stats_update(&state->time, start_time); - bkey_copy_key(&new->key, &b->key); - new->sets->size = 0; + new->set->size = 0; // XXX: why? } #define SORT_CRIT (4096 / sizeof(uint64_t)) -void bch_btree_sort_lazy(struct btree *b) +void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) { unsigned crit = SORT_CRIT; int i; @@ -1134,50 +1282,32 @@ void bch_btree_sort_lazy(struct btree *b) if (!b->nsets) goto out; - /* If not a leaf node, always sort */ - if (b->level) { - bch_btree_sort(b); - return; - } - for (i = b->nsets - 1; i >= 0; --i) { - crit *= b->c->sort_crit_factor; + crit *= state->crit_factor; - if (b->sets[i].data->keys < crit) { - bch_btree_sort_partial(b, i); + if (b->set[i].data->keys < crit) { + bch_btree_sort_partial(b, i, state); return; } } /* Sort if we'd overflow */ if (b->nsets + 1 == MAX_BSETS) { - bch_btree_sort(b); + bch_btree_sort(b, state); return; } out: - bset_build_written_tree(b); + bch_bset_build_written_tree(b); } +EXPORT_SYMBOL(bch_btree_sort_lazy); -/* Sysfs stuff */ - -struct bset_stats { - struct btree_op op; - size_t nodes; - size_t sets_written, sets_unwritten; - size_t bytes_written, bytes_unwritten; - size_t floats, failed; -}; - -static int btree_bset_stats(struct btree_op *op, struct btree *b) +void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) { - struct bset_stats *stats = container_of(op, struct bset_stats, op); unsigned i; - stats->nodes++; - for (i = 0; i <= b->nsets; i++) { - struct bset_tree *t = &b->sets[i]; + struct bset_tree *t = &b->set[i]; size_t bytes = t->data->keys * sizeof(uint64_t); size_t j; @@ -1195,32 +1325,4 @@ static int btree_bset_stats(struct btree_op *op, struct btree *b) stats->bytes_unwritten += bytes; } } - - return MAP_CONTINUE; -} - -int bch_bset_print_stats(struct cache_set *c, char *buf) -{ - struct bset_stats t; - int ret; - - memset(&t, 0, sizeof(struct bset_stats)); - bch_btree_op_init(&t.op, -1); - - ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); - if (ret < 0) - return ret; - - return snprintf(buf, PAGE_SIZE, - "btree nodes: %zu\n" - 
"written sets: %zu\n" - "unwritten sets: %zu\n" - "written key bytes: %zu\n" - "unwritten key bytes: %zu\n" - "floats: %zu\n" - "failed: %zu\n", - t.nodes, - t.sets_written, t.sets_unwritten, - t.bytes_written, t.bytes_unwritten, - t.floats, t.failed); } diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 1d3c24f..003260f 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -1,7 +1,11 @@ #ifndef _BCACHE_BSET_H #define _BCACHE_BSET_H -#include <linux/slab.h> +#include <linux/bcache.h> +#include <linux/kernel.h> +#include <linux/types.h> + +#include "util.h" /* for time_stats */ /* * BKEYS: @@ -142,20 +146,13 @@ * first key in that range of bytes again. */ -/* Btree key comparison/iteration */ +struct btree_keys; +struct btree_iter; +struct btree_iter_set; +struct bkey_float; #define MAX_BSETS 4U -struct btree_iter { - size_t size, used; -#ifdef CONFIG_BCACHE_DEBUG - struct btree *b; -#endif - struct btree_iter_set { - struct bkey *k, *end; - } data[MAX_BSETS]; -}; - struct bset_tree { /* * We construct a binary tree in an array as if the array @@ -165,14 +162,14 @@ struct bset_tree { */ /* size of the binary tree and prev array */ - unsigned size; + unsigned size; /* function of size - precalculated for to_inorder() */ - unsigned extra; + unsigned extra; /* copy of the last key in the set */ - struct bkey end; - struct bkey_float *tree; + struct bkey end; + struct bkey_float *tree; /* * The nodes in the bset tree point to specific keys - this @@ -182,12 +179,219 @@ struct bset_tree { * to keep bkey_float to 4 bytes and prev isn't used in the fast * path. */ - uint8_t *prev; + uint8_t *prev; /* The actual btree node, with pointers to each sorted set */ - struct bset *data; + struct bset *data; +}; + +struct btree_keys_ops { + bool (*sort_cmp)(struct btree_iter_set, + struct btree_iter_set); + struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *); + bool (*insert_fixup)(struct btree_keys *, struct bkey *, + struct btree_iter *, struct bkey *); + bool (*key_invalid)(struct btree_keys *, + const struct bkey *); + bool (*key_bad)(struct btree_keys *, const struct bkey *); + bool (*key_merge)(struct btree_keys *, + struct bkey *, struct bkey *); + void (*key_to_text)(char *, size_t, const struct bkey *); + void (*key_dump)(struct btree_keys *, const struct bkey *); + + /* + * Only used for deciding whether to use START_KEY(k) or just the key + * itself in a couple places + */ + bool is_extents; +}; + +struct btree_keys { + const struct btree_keys_ops *ops; + uint8_t page_order; + uint8_t nsets; + unsigned last_set_unwritten:1; + bool *expensive_debug_checks; + + /* + * Sets of sorted keys - the real btree node - plus a binary search tree + * + * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point + * to the memory we have allocated for this btree node. Additionally, + * set[0]->data points to the entire btree node as it exists on disk. 
+ */ + struct bset_tree set[MAX_BSETS]; +}; + +static inline struct bset_tree *bset_tree_last(struct btree_keys *b) +{ + return b->set + b->nsets; +} + +static inline bool bset_written(struct btree_keys *b, struct bset_tree *t) +{ + return t <= b->set + b->nsets - b->last_set_unwritten; +} + +static inline bool bkey_written(struct btree_keys *b, struct bkey *k) +{ + return !b->last_set_unwritten || k < b->set[b->nsets].data->start; +} + +static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i) +{ + return ((size_t) i) - ((size_t) b->set->data); +} + +static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i) +{ + return bset_byte_offset(b, i) >> 9; +} + +#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) +#define set_bytes(i) __set_bytes(i, i->keys) + +#define __set_blocks(i, k, block_bytes) \ + DIV_ROUND_UP(__set_bytes(i, k), block_bytes) +#define set_blocks(i, block_bytes) \ + __set_blocks(i, (i)->keys, block_bytes) + +static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b) +{ + struct bset_tree *t = bset_tree_last(b); + + BUG_ON((PAGE_SIZE << b->page_order) < + (bset_byte_offset(b, t->data) + set_bytes(t->data))); + + if (!b->last_set_unwritten) + return 0; + + return ((PAGE_SIZE << b->page_order) - + (bset_byte_offset(b, t->data) + set_bytes(t->data))) / + sizeof(u64); +} + +static inline struct bset *bset_next_set(struct btree_keys *b, + unsigned block_bytes) +{ + struct bset *i = bset_tree_last(b)->data; + + return ((void *) i) + roundup(set_bytes(i), block_bytes); +} + +void bch_btree_keys_free(struct btree_keys *); +int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t); +void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *, + bool *); + +void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t); +void bch_bset_build_written_tree(struct btree_keys *); +void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *); +bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *); +void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *); +unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *, + struct bkey *); + +enum { + BTREE_INSERT_STATUS_NO_INSERT = 0, + BTREE_INSERT_STATUS_INSERT, + BTREE_INSERT_STATUS_BACK_MERGE, + BTREE_INSERT_STATUS_OVERWROTE, + BTREE_INSERT_STATUS_FRONT_MERGE, }; +/* Btree key iteration */ + +struct btree_iter { + size_t size, used; +#ifdef CONFIG_BCACHE_DEBUG + struct btree_keys *b; +#endif + struct btree_iter_set { + struct bkey *k, *end; + } data[MAX_BSETS]; +}; + +typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *); + +struct bkey *bch_btree_iter_next(struct btree_iter *); +struct bkey *bch_btree_iter_next_filter(struct btree_iter *, + struct btree_keys *, ptr_filter_fn); + +void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); +struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *, + struct bkey *); + +struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *, + const struct bkey *); + +/* + * Returns the first key that is strictly greater than search + */ +static inline struct bkey *bch_bset_search(struct btree_keys *b, + struct bset_tree *t, + const struct bkey *search) +{ + return search ? 
__bch_bset_search(b, t, search) : t->data->start; +} + +#define for_each_key_filter(b, k, iter, filter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next_filter((iter), (b), filter));) + +#define for_each_key(b, k, iter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next(iter));) + +/* Sorting */ + +struct bset_sort_state { + mempool_t *pool; + + unsigned page_order; + unsigned crit_factor; + + struct time_stats time; +}; + +void bch_bset_sort_state_free(struct bset_sort_state *); +int bch_bset_sort_state_init(struct bset_sort_state *, unsigned); +void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *); +void bch_btree_sort_into(struct btree_keys *, struct btree_keys *, + struct bset_sort_state *); +void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *, + struct bset_sort_state *); +void bch_btree_sort_partial(struct btree_keys *, unsigned, + struct bset_sort_state *); + +static inline void bch_btree_sort(struct btree_keys *b, + struct bset_sort_state *state) +{ + bch_btree_sort_partial(b, 0, state); +} + +struct bset_stats { + size_t sets_written, sets_unwritten; + size_t bytes_written, bytes_unwritten; + size_t floats, failed; +}; + +void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *); + +/* Bkey utility code */ + +#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) + +static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx) +{ + return bkey_idx(i->start, idx); +} + +static inline void bkey_init(struct bkey *k) +{ + *k = ZERO_KEY; +} + static __always_inline int64_t bkey_cmp(const struct bkey *l, const struct bkey *r) { @@ -196,6 +400,62 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l, : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); } +void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, + unsigned); +bool __bch_cut_front(const struct bkey *, struct bkey *); +bool __bch_cut_back(const struct bkey *, struct bkey *); + +static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) +{ + BUG_ON(bkey_cmp(where, k) > 0); + return __bch_cut_front(where, k); +} + +static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) +{ + BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); + return __bch_cut_back(where, k); +} + +#define PRECEDING_KEY(_k) \ +({ \ + struct bkey *_ret = NULL; \ + \ + if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ + _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ + \ + if (!_ret->low) \ + _ret->high--; \ + _ret->low--; \ + } \ + \ + _ret; \ +}) + +static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k) +{ + return b->ops->key_invalid(b, k); +} + +static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k) +{ + return b->ops->key_bad(b, k); +} + +static inline void bch_bkey_to_text(struct btree_keys *b, char *buf, + size_t size, const struct bkey *k) +{ + return b->ops->key_to_text(buf, size, k); +} + +static inline bool bch_bkey_equal_header(const struct bkey *l, + const struct bkey *r) +{ + return (KEY_DIRTY(l) == KEY_DIRTY(r) && + KEY_PTRS(l) == KEY_PTRS(r) && + KEY_CSUM(l) == KEY_CSUM(l)); +} + /* Keylists */ struct keylist { @@ -257,136 +517,44 @@ static inline size_t bch_keylist_bytes(struct keylist *l) struct bkey *bch_keylist_pop(struct keylist *); void bch_keylist_pop_front(struct keylist *); -int bch_keylist_realloc(struct keylist *, int, struct cache_set *); - -void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, - unsigned); 
-bool __bch_cut_front(const struct bkey *, struct bkey *); -bool __bch_cut_back(const struct bkey *, struct bkey *); +int __bch_keylist_realloc(struct keylist *, unsigned); -static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) -{ - BUG_ON(bkey_cmp(where, k) > 0); - return __bch_cut_front(where, k); -} +/* Debug stuff */ -static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) -{ - BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); - return __bch_cut_back(where, k); -} - -const char *bch_ptr_status(struct cache_set *, const struct bkey *); -bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); -bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *); - -bool bch_ptr_bad(struct btree *, const struct bkey *); - -static inline uint8_t gen_after(uint8_t a, uint8_t b) -{ - uint8_t r = a - b; - return r > 128U ? 0 : r; -} - -static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, - unsigned i) -{ - return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); -} - -static inline bool ptr_available(struct cache_set *c, const struct bkey *k, - unsigned i) -{ - return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); -} - - -typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); - -struct bkey *bch_btree_iter_next(struct btree_iter *); -struct bkey *bch_btree_iter_next_filter(struct btree_iter *, - struct btree *, ptr_filter_fn); - -void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); -struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *, - struct bkey *, struct bset_tree *); - -/* 32 bits total: */ -#define BKEY_MID_BITS 3 -#define BKEY_EXPONENT_BITS 7 -#define BKEY_MANTISSA_BITS 22 -#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) - -struct bkey_float { - unsigned exponent:BKEY_EXPONENT_BITS; - unsigned m:BKEY_MID_BITS; - unsigned mantissa:BKEY_MANTISSA_BITS; -} __packed; - -/* - * BSET_CACHELINE was originally intended to match the hardware cacheline size - - * it used to be 64, but I realized the lookup code would touch slightly less - * memory if it was 128. - * - * It definites the number of bytes (in struct bset) per struct bkey_float in - * the auxiliar search tree - when we're done searching the bset_float tree we - * have this many bytes left that we do a linear search over. - * - * Since (after level 5) every level of the bset_tree is on a new cacheline, - * we're touching one fewer cacheline in the bset tree in exchange for one more - * cacheline in the linear search - but the linear search might stop before it - * gets to the second cacheline. - */ - -#define BSET_CACHELINE 128 -#define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE) +#ifdef CONFIG_BCACHE_DEBUG -#define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float)) -#define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t)) +int __bch_count_data(struct btree_keys *); +void __bch_check_keys(struct btree_keys *, const char *, ...); +void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); +void bch_dump_bucket(struct btree_keys *); -void bch_bset_init_next(struct btree *); +#else -void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); -void bch_bset_fix_lookup_table(struct btree *, struct bkey *); +static inline int __bch_count_data(struct btree_keys *b) { return -1; } +static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) 
{} +static inline void bch_dump_bucket(struct btree_keys *b) {} +void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); -struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, - const struct bkey *); +#endif -/* - * Returns the first key that is strictly greater than search - */ -static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, - const struct bkey *search) +static inline bool btree_keys_expensive_checks(struct btree_keys *b) { - return search ? __bch_bset_search(b, t, search) : t->data->start; +#ifdef CONFIG_BCACHE_DEBUG + return *b->expensive_debug_checks; +#else + return false; +#endif } -#define PRECEDING_KEY(_k) \ -({ \ - struct bkey *_ret = NULL; \ - \ - if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ - _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ - \ - if (!_ret->low) \ - _ret->high--; \ - _ret->low--; \ - } \ - \ - _ret; \ -}) - -bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); -void bch_btree_sort_lazy(struct btree *); -void bch_btree_sort_into(struct btree *, struct btree *); -void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *); -void bch_btree_sort_partial(struct btree *, unsigned); - -static inline void bch_btree_sort(struct btree *b) +static inline int bch_count_data(struct btree_keys *b) { - bch_btree_sort_partial(b, 0); + return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1; } -int bch_bset_print_stats(struct cache_set *, char *); +#define bch_check_keys(b, ...) \ +do { \ + if (btree_keys_expensive_checks(b)) \ + __bch_check_keys(b, __VA_ARGS__); \ +} while (0) #endif diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 946ecd3..98cc0a8 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -23,7 +23,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" -#include "writeback.h" +#include "extents.h" #include <linux/slab.h> #include <linux/bitops.h> @@ -89,13 +89,6 @@ * Test module load/unload */ -enum { - BTREE_INSERT_STATUS_INSERT, - BTREE_INSERT_STATUS_BACK_MERGE, - BTREE_INSERT_STATUS_OVERWROTE, - BTREE_INSERT_STATUS_FRONT_MERGE, -}; - #define MAX_NEED_GC 64 #define MAX_SAVE_PRIO 72 @@ -106,14 +99,6 @@ enum { static struct workqueue_struct *btree_io_wq; -static inline bool should_split(struct btree *b) -{ - struct bset *i = write_block(b); - return b->written >= btree_blocks(b) || - (b->written + __set_blocks(i, i->keys + 15, b->c) - > btree_blocks(b)); -} - #define insert_lock(s, b) ((b)->level <= (s)->lock) /* @@ -167,6 +152,8 @@ static inline bool should_split(struct btree *b) _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ } \ rw_unlock(_w, _b); \ + if (_r == -EINTR) \ + schedule(); \ bch_cannibalize_unlock(c); \ if (_r == -ENOSPC) { \ wait_event((c)->try_wait, \ @@ -175,9 +162,15 @@ static inline bool should_split(struct btree *b) } \ } while (_r == -EINTR); \ \ + finish_wait(&(c)->bucket_wait, &(op)->wait); \ _r; \ }) +static inline struct bset *write_block(struct btree *b) +{ + return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); +} + /* Btree key manipulation */ void bkey_put(struct cache_set *c, struct bkey *k) @@ -194,16 +187,16 @@ void bkey_put(struct cache_set *c, struct bkey *k) static uint64_t btree_csum_set(struct btree *b, struct bset *i) { uint64_t crc = b->key.ptr[0]; - void *data = (void *) i + 8, *end = end(i); + void *data = (void *) i + 8, *end = bset_bkey_last(i); crc = bch_crc64_update(crc, data, end - data); return crc ^ 0xffffffffffffffffULL; } -static void 
bch_btree_node_read_done(struct btree *b) +void bch_btree_node_read_done(struct btree *b) { const char *err = "bad btree header"; - struct bset *i = b->sets[0].data; + struct bset *i = btree_bset_first(b); struct btree_iter *iter; iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); @@ -211,21 +204,22 @@ static void bch_btree_node_read_done(struct btree *b) iter->used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter->b = b; + iter->b = &b->keys; #endif if (!i->seq) goto err; for (; - b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; + b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; i = write_block(b)) { err = "unsupported bset version"; if (i->version > BCACHE_BSET_VERSION) goto err; err = "bad btree header"; - if (b->written + set_blocks(i, b->c) > btree_blocks(b)) + if (b->written + set_blocks(i, block_bytes(b->c)) > + btree_blocks(b)) goto err; err = "bad magic"; @@ -245,39 +239,40 @@ static void bch_btree_node_read_done(struct btree *b) } err = "empty set"; - if (i != b->sets[0].data && !i->keys) + if (i != b->keys.set[0].data && !i->keys) goto err; - bch_btree_iter_push(iter, i->start, end(i)); + bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); - b->written += set_blocks(i, b->c); + b->written += set_blocks(i, block_bytes(b->c)); } err = "corrupted btree"; for (i = write_block(b); - index(i, b) < btree_blocks(b); + bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); i = ((void *) i) + block_bytes(b->c)) - if (i->seq == b->sets[0].data->seq) + if (i->seq == b->keys.set[0].data->seq) goto err; - bch_btree_sort_and_fix_extents(b, iter); + bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); - i = b->sets[0].data; + i = b->keys.set[0].data; err = "short btree key"; - if (b->sets[0].size && - bkey_cmp(&b->key, &b->sets[0].end) < 0) + if (b->keys.set[0].size && + bkey_cmp(&b->key, &b->keys.set[0].end) < 0) goto err; if (b->written < btree_blocks(b)) - bch_bset_init_next(b); + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->sb)); out: mempool_free(iter, b->c->fill_iter); return; err: set_btree_node_io_error(b); - bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", + bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", err, PTR_BUCKET_NR(b->c, &b->key, 0), - index(i, b), i->keys); + bset_block_offset(b, i), i->keys); goto out; } @@ -287,7 +282,7 @@ static void btree_node_read_endio(struct bio *bio, int error) closure_put(cl); } -void bch_btree_node_read(struct btree *b) +static void bch_btree_node_read(struct btree *b) { uint64_t start_time = local_clock(); struct closure cl; @@ -303,7 +298,7 @@ void bch_btree_node_read(struct btree *b) bio->bi_end_io = btree_node_read_endio; bio->bi_private = &cl; - bch_bio_map(bio, b->sets[0].data); + bch_bio_map(bio, b->keys.set[0].data); bch_submit_bbio(bio, b->c, &b->key, 0); closure_sync(&cl); @@ -340,9 +335,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w) w->journal = NULL; } +static void btree_node_write_unlock(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io); + + up(&b->io_mutex); +} + static void __btree_node_write_done(struct closure *cl) { - struct btree *b = container_of(cl, struct btree, io.cl); + struct btree *b = container_of(cl, struct btree, io); struct btree_write *w = btree_prev_write(b); bch_bbio_free(b->bio, b->c); @@ -353,12 +355,12 @@ static void __btree_node_write_done(struct closure *cl) queue_delayed_work(btree_io_wq, &b->work, msecs_to_jiffies(30000)); - closure_return(cl); + 
closure_return_with_destructor(cl, btree_node_write_unlock); } static void btree_node_write_done(struct closure *cl) { - struct btree *b = container_of(cl, struct btree, io.cl); + struct btree *b = container_of(cl, struct btree, io); struct bio_vec *bv; int n; @@ -371,7 +373,7 @@ static void btree_node_write_done(struct closure *cl) static void btree_node_write_endio(struct bio *bio, int error) { struct closure *cl = bio->bi_private; - struct btree *b = container_of(cl, struct btree, io.cl); + struct btree *b = container_of(cl, struct btree, io); if (error) set_btree_node_io_error(b); @@ -382,8 +384,8 @@ static void btree_node_write_endio(struct bio *bio, int error) static void do_btree_node_write(struct btree *b) { - struct closure *cl = &b->io.cl; - struct bset *i = b->sets[b->nsets].data; + struct closure *cl = &b->io; + struct bset *i = btree_bset_last(b); BKEY_PADDED(key) k; i->version = BCACHE_BSET_VERSION; @@ -395,7 +397,7 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_end_io = btree_node_write_endio; b->bio->bi_private = cl; b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; - b->bio->bi_iter.bi_size = set_blocks(i, b->c) * block_bytes(b->c); + b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); bch_bio_map(b->bio, i); /* @@ -414,7 +416,8 @@ static void do_btree_node_write(struct btree *b) */ bkey_copy(&k.key, &b->key); - SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); + SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + + bset_sector_offset(&b->keys, i)); if (!bio_alloc_pages(b->bio, GFP_NOIO)) { int j; @@ -435,40 +438,54 @@ static void do_btree_node_write(struct btree *b) bch_submit_bbio(b->bio, b->c, &k.key, 0); closure_sync(cl); - __btree_node_write_done(cl); + continue_at_nobarrier(cl, __btree_node_write_done, NULL); } } void bch_btree_node_write(struct btree *b, struct closure *parent) { - struct bset *i = b->sets[b->nsets].data; + struct bset *i = btree_bset_last(b); trace_bcache_btree_write(b); BUG_ON(current->bio_list); BUG_ON(b->written >= btree_blocks(b)); BUG_ON(b->written && !i->keys); - BUG_ON(b->sets->data->seq != i->seq); - bch_check_keys(b, "writing"); + BUG_ON(btree_bset_first(b)->seq != i->seq); + bch_check_keys(&b->keys, "writing"); cancel_delayed_work(&b->work); /* If caller isn't waiting for write, parent refcount is cache set */ - closure_lock(&b->io, parent ?: &b->c->cl); + down(&b->io_mutex); + closure_init(&b->io, parent ?: &b->c->cl); clear_bit(BTREE_NODE_dirty, &b->flags); change_bit(BTREE_NODE_write_idx, &b->flags); do_btree_node_write(b); - b->written += set_blocks(i, b->c); - atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, + atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size, &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); - bch_btree_sort_lazy(b); + b->written += set_blocks(i, block_bytes(b->c)); + + /* If not a leaf node, always sort */ + if (b->level && b->keys.nsets) + bch_btree_sort(&b->keys, &b->c->sort); + else + bch_btree_sort_lazy(&b->keys, &b->c->sort); + + /* + * do verify if there was more than one set initially (i.e. 
we did a + * sort) and we sorted down to a single set: + */ + if (i != b->keys.set->data && !b->keys.nsets) + bch_btree_verify(b); if (b->written < btree_blocks(b)) - bch_bset_init_next(b); + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->sb)); } static void bch_btree_node_write_sync(struct btree *b) @@ -493,7 +510,7 @@ static void btree_node_write_work(struct work_struct *w) static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) { - struct bset *i = b->sets[b->nsets].data; + struct bset *i = btree_bset_last(b); struct btree_write *w = btree_current_write(b); BUG_ON(!b->written); @@ -528,24 +545,6 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) * mca -> memory cache */ -static void mca_reinit(struct btree *b) -{ - unsigned i; - - b->flags = 0; - b->written = 0; - b->nsets = 0; - - for (i = 0; i < MAX_BSETS; i++) - b->sets[i].size = 0; - /* - * Second loop starts at 1 because b->sets[0]->data is the memory we - * allocated - */ - for (i = 1; i < MAX_BSETS; i++) - b->sets[i].data = NULL; -} - #define mca_reserve(c) (((c->root && c->root->level) \ ? c->root->level : 1) * 8 + 16) #define mca_can_free(c) \ @@ -553,28 +552,12 @@ static void mca_reinit(struct btree *b) static void mca_data_free(struct btree *b) { - struct bset_tree *t = b->sets; - BUG_ON(!closure_is_unlocked(&b->io.cl)); + BUG_ON(b->io_mutex.count != 1); - if (bset_prev_bytes(b) < PAGE_SIZE) - kfree(t->prev); - else - free_pages((unsigned long) t->prev, - get_order(bset_prev_bytes(b))); + bch_btree_keys_free(&b->keys); - if (bset_tree_bytes(b) < PAGE_SIZE) - kfree(t->tree); - else - free_pages((unsigned long) t->tree, - get_order(bset_tree_bytes(b))); - - free_pages((unsigned long) t->data, b->page_order); - - t->prev = NULL; - t->tree = NULL; - t->data = NULL; - list_move(&b->list, &b->c->btree_cache_freed); b->c->bucket_cache_used--; + list_move(&b->list, &b->c->btree_cache_freed); } static void mca_bucket_free(struct btree *b) @@ -593,34 +576,16 @@ static unsigned btree_order(struct bkey *k) static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) { - struct bset_tree *t = b->sets; - BUG_ON(t->data); - - b->page_order = max_t(unsigned, - ilog2(b->c->btree_pages), - btree_order(k)); - - t->data = (void *) __get_free_pages(gfp, b->page_order); - if (!t->data) - goto err; - - t->tree = bset_tree_bytes(b) < PAGE_SIZE - ? kmalloc(bset_tree_bytes(b), gfp) - : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); - if (!t->tree) - goto err; - - t->prev = bset_prev_bytes(b) < PAGE_SIZE - ? 
kmalloc(bset_prev_bytes(b), gfp) - : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); - if (!t->prev) - goto err; - - list_move(&b->list, &b->c->btree_cache); - b->c->bucket_cache_used++; - return; -err: - mca_data_free(b); + if (!bch_btree_keys_alloc(&b->keys, + max_t(unsigned, + ilog2(b->c->btree_pages), + btree_order(k)), + gfp)) { + b->c->bucket_cache_used++; + list_move(&b->list, &b->c->btree_cache); + } else { + list_move(&b->list, &b->c->btree_cache_freed); + } } static struct btree *mca_bucket_alloc(struct cache_set *c, @@ -635,7 +600,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, INIT_LIST_HEAD(&b->list); INIT_DELAYED_WORK(&b->work, btree_node_write_work); b->c = c; - closure_init_unlocked(&b->io); + sema_init(&b->io_mutex, 1); mca_data_alloc(b, k, gfp); return b; @@ -651,24 +616,31 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) if (!down_write_trylock(&b->lock)) return -ENOMEM; - BUG_ON(btree_node_dirty(b) && !b->sets[0].data); + BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); - if (b->page_order < min_order || - (!flush && - (btree_node_dirty(b) || - atomic_read(&b->io.cl.remaining) != -1))) { - rw_unlock(true, b); - return -ENOMEM; + if (b->keys.page_order < min_order) + goto out_unlock; + + if (!flush) { + if (btree_node_dirty(b)) + goto out_unlock; + + if (down_trylock(&b->io_mutex)) + goto out_unlock; + up(&b->io_mutex); } if (btree_node_dirty(b)) bch_btree_node_write_sync(b); /* wait for any in flight btree write */ - closure_wait_event(&b->io.wait, &cl, - atomic_read(&b->io.cl.remaining) == -1); + down(&b->io_mutex); + up(&b->io_mutex); return 0; +out_unlock: + rw_unlock(true, b); + return -ENOMEM; } static unsigned long bch_mca_scan(struct shrinker *shrink, @@ -714,14 +686,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, } } - /* - * Can happen right when we first start up, before we've read in any - * btree nodes - */ - if (list_empty(&c->btree_cache)) - goto out; - for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { + if (list_empty(&c->btree_cache)) + goto out; + b = list_first_entry(&c->btree_cache, struct btree, list); list_rotate_left(&c->btree_cache); @@ -767,6 +735,8 @@ void bch_btree_cache_free(struct cache_set *c) #ifdef CONFIG_BCACHE_DEBUG if (c->verify_data) list_move(&c->verify_data->list, &c->btree_cache); + + free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c))); #endif list_splice(&c->btree_cache_freeable, @@ -807,10 +777,13 @@ int bch_btree_cache_alloc(struct cache_set *c) #ifdef CONFIG_BCACHE_DEBUG mutex_init(&c->verify_lock); + c->verify_ondisk = (void *) + __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); + c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); if (c->verify_data && - c->verify_data->sets[0].data) + c->verify_data->keys.set->data) list_del_init(&c->verify_data->list); else c->verify_data = NULL; @@ -908,7 +881,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) list_for_each_entry(b, &c->btree_cache_freed, list) if (!mca_reap(b, 0, false)) { mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); - if (!b->sets[0].data) + if (!b->keys.set[0].data) goto err; else goto out; @@ -919,10 +892,10 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) goto err; BUG_ON(!down_write_trylock(&b->lock)); - if (!b->sets->data) + if (!b->keys.set->data) goto err; out: - BUG_ON(!closure_is_unlocked(&b->io.cl)); + BUG_ON(b->io_mutex.count != 1); bkey_copy(&b->key, k); list_move(&b->list, 
&c->btree_cache); @@ -930,10 +903,17 @@ out: hlist_add_head_rcu(&b->hash, mca_hash(c, k)); lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); - b->level = level; b->parent = (void *) ~0UL; + b->flags = 0; + b->written = 0; + b->level = level; - mca_reinit(b); + if (!b->level) + bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, + &b->c->expensive_debug_checks); + else + bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, + &b->c->expensive_debug_checks); return b; err: @@ -994,13 +974,13 @@ retry: b->accessed = 1; - for (; i <= b->nsets && b->sets[i].size; i++) { - prefetch(b->sets[i].tree); - prefetch(b->sets[i].data); + for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { + prefetch(b->keys.set[i].tree); + prefetch(b->keys.set[i].data); } - for (; i <= b->nsets; i++) - prefetch(b->sets[i].data); + for (; i <= b->keys.nsets; i++) + prefetch(b->keys.set[i].data); if (btree_node_io_error(b)) { rw_unlock(write, b); @@ -1063,7 +1043,7 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) goto err; bkey_put(c, &k.key); @@ -1080,7 +1060,7 @@ retry: } b->accessed = 1; - bch_bset_init_next(b); + bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); mutex_unlock(&c->bucket_lock); @@ -1098,8 +1078,10 @@ err: static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) { struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); - if (!IS_ERR_OR_NULL(n)) - bch_btree_sort_into(b, n); + if (!IS_ERR_OR_NULL(n)) { + bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); + bkey_copy_key(&n->key, &b->key); + } return n; } @@ -1120,6 +1102,28 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) atomic_inc(&b->c->prio_blocked); } +static int btree_check_reserve(struct btree *b, struct btree_op *op) +{ + struct cache_set *c = b->c; + struct cache *ca; + unsigned i, reserve = c->root->level * 2 + 1; + int ret = 0; + + mutex_lock(&c->bucket_lock); + + for_each_cache(ca, c, i) + if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { + if (op) + prepare_to_wait(&c->bucket_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + ret = -EINTR; + break; + } + + mutex_unlock(&c->bucket_lock); + return ret; +} + /* Garbage collection */ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) @@ -1183,11 +1187,11 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) gc->nodes++; - for_each_key_filter(b, k, &iter, bch_ptr_invalid) { + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { stale = max(stale, btree_mark_key(b, k)); keys++; - if (bch_ptr_bad(b, k)) + if (bch_ptr_bad(&b->keys, k)) continue; gc->key_bytes += bkey_u64s(k); @@ -1197,9 +1201,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) gc->data += KEY_SIZE(k); } - for (t = b->sets; t <= &b->sets[b->nsets]; t++) + for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) btree_bug_on(t->size && - bset_written(b, t) && + bset_written(&b->keys, t) && bkey_cmp(&b->key, &t->end) < 0, b, "found short btree key in gc"); @@ -1243,7 +1247,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, blocks = btree_default_blocks(b->c) * 2 / 3; if (nodes < 2 || - __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) + __set_blocks(b->keys.set[0].data, keys, + block_bytes(b->c)) > blocks * (nodes - 1)) return 0; for (i = 0; i < 
nodes; i++) { @@ -1253,18 +1258,19 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, } for (i = nodes - 1; i > 0; --i) { - struct bset *n1 = new_nodes[i]->sets->data; - struct bset *n2 = new_nodes[i - 1]->sets->data; + struct bset *n1 = btree_bset_first(new_nodes[i]); + struct bset *n2 = btree_bset_first(new_nodes[i - 1]); struct bkey *k, *last = NULL; keys = 0; if (i > 1) { for (k = n2->start; - k < end(n2); + k < bset_bkey_last(n2); k = bkey_next(k)) { if (__set_blocks(n1, n1->keys + keys + - bkey_u64s(k), b->c) > blocks) + bkey_u64s(k), + block_bytes(b->c)) > blocks) break; last = k; @@ -1280,7 +1286,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, * though) */ if (__set_blocks(n1, n1->keys + n2->keys, - b->c) > btree_blocks(new_nodes[i])) + block_bytes(b->c)) > + btree_blocks(new_nodes[i])) goto out_nocoalesce; keys = n2->keys; @@ -1288,27 +1295,28 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, last = &r->b->key; } - BUG_ON(__set_blocks(n1, n1->keys + keys, - b->c) > btree_blocks(new_nodes[i])); + BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > + btree_blocks(new_nodes[i])); if (last) bkey_copy_key(&new_nodes[i]->key, last); - memcpy(end(n1), + memcpy(bset_bkey_last(n1), n2->start, - (void *) node(n2, keys) - (void *) n2->start); + (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); n1->keys += keys; r[i].keys = n1->keys; memmove(n2->start, - node(n2, keys), - (void *) end(n2) - (void *) node(n2, keys)); + bset_bkey_idx(n2, keys), + (void *) bset_bkey_last(n2) - + (void *) bset_bkey_idx(n2, keys)); n2->keys -= keys; - if (bch_keylist_realloc(keylist, - KEY_PTRS(&new_nodes[i]->key), b->c)) + if (__bch_keylist_realloc(keylist, + bkey_u64s(&new_nodes[i]->key))) goto out_nocoalesce; bch_btree_node_write(new_nodes[i], &cl); @@ -1316,7 +1324,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, } for (i = 0; i < nodes; i++) { - if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) + if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key))) goto out_nocoalesce; make_btree_freeing_key(r[i].b, keylist->top); @@ -1324,7 +1332,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, } /* We emptied out this node */ - BUG_ON(new_nodes[0]->sets->data->keys); + BUG_ON(btree_bset_first(new_nodes[0])->keys); btree_node_free(new_nodes[0]); rw_unlock(true, new_nodes[0]); @@ -1370,7 +1378,7 @@ static unsigned btree_gc_count_keys(struct btree *b) struct btree_iter iter; unsigned ret = 0; - for_each_key_filter(b, k, &iter, bch_ptr_bad) + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) ret += bkey_u64s(k); return ret; @@ -1390,13 +1398,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, struct gc_merge_info *last = r + GC_MERGE_NODES - 1; bch_keylist_init(&keys); - bch_btree_iter_init(b, &iter, &b->c->gc_done); + bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); for (i = 0; i < GC_MERGE_NODES; i++) r[i].b = ERR_PTR(-EINTR); while (1) { - k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); if (k) { r->b = bch_btree_node_get(b->c, k, b->level - 1, true); if (IS_ERR(r->b)) { @@ -1416,7 +1424,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, if (!IS_ERR(last->b)) { should_rewrite = btree_gc_mark_node(last->b, gc); - if (should_rewrite) { + if (should_rewrite && + !btree_check_reserve(b, NULL)) { n = btree_node_alloc_replacement(last->b, false); @@ -1705,7 +1714,7 @@ 
static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, struct bucket *g; struct btree_iter iter; - for_each_key_filter(b, k, &iter, bch_ptr_invalid) { + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { for (i = 0; i < KEY_PTRS(k); i++) { if (!ptr_available(b->c, k, i)) continue; @@ -1728,10 +1737,11 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, } if (b->level) { - bch_btree_iter_init(b, &iter, NULL); + bch_btree_iter_init(&b->keys, &iter, NULL); do { - k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter, &b->keys, + bch_ptr_bad); if (k) btree_node_prefetch(b->c, k, b->level - 1); @@ -1774,235 +1784,36 @@ err: /* Btree insertion */ -static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) -{ - struct bset *i = b->sets[b->nsets].data; - - memmove((uint64_t *) where + bkey_u64s(insert), - where, - (void *) end(i) - (void *) where); - - i->keys += bkey_u64s(insert); - bkey_copy(where, insert); - bch_bset_fix_lookup_table(b, where); -} - -static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, - struct btree_iter *iter, - struct bkey *replace_key) +static bool btree_insert_key(struct btree *b, struct bkey *k, + struct bkey *replace_key) { - void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) - { - if (KEY_DIRTY(k)) - bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), - offset, -sectors); - } - - uint64_t old_offset; - unsigned old_size, sectors_found = 0; - - while (1) { - struct bkey *k = bch_btree_iter_next(iter); - if (!k || - bkey_cmp(&START_KEY(k), insert) >= 0) - break; - - if (bkey_cmp(k, &START_KEY(insert)) <= 0) - continue; - - old_offset = KEY_START(k); - old_size = KEY_SIZE(k); - - /* - * We might overlap with 0 size extents; we can't skip these - * because if they're in the set we're inserting to we have to - * adjust them so they don't overlap with the key we're - * inserting. But we don't want to check them for replace - * operations. - */ - - if (replace_key && KEY_SIZE(k)) { - /* - * k might have been split since we inserted/found the - * key we're replacing - */ - unsigned i; - uint64_t offset = KEY_START(k) - - KEY_START(replace_key); - - /* But it must be a subset of the replace key */ - if (KEY_START(k) < KEY_START(replace_key) || - KEY_OFFSET(k) > KEY_OFFSET(replace_key)) - goto check_failed; - - /* We didn't find a key that we were supposed to */ - if (KEY_START(k) > KEY_START(insert) + sectors_found) - goto check_failed; - - if (KEY_PTRS(k) != KEY_PTRS(replace_key) || - KEY_DIRTY(k) != KEY_DIRTY(replace_key)) - goto check_failed; - - /* skip past gen */ - offset <<= 8; - - BUG_ON(!KEY_PTRS(replace_key)); + unsigned status; - for (i = 0; i < KEY_PTRS(replace_key); i++) - if (k->ptr[i] != replace_key->ptr[i] + offset) - goto check_failed; - - sectors_found = KEY_OFFSET(k) - KEY_START(insert); - } - - if (bkey_cmp(insert, k) < 0 && - bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { - /* - * We overlapped in the middle of an existing key: that - * means we have to split the old key. But we have to do - * slightly different things depending on whether the - * old key has been written out yet. - */ - - struct bkey *top; - - subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); - - if (bkey_written(b, k)) { - /* - * We insert a new key to cover the top of the - * old key, and the old key is modified in place - * to represent the bottom split. 
- * - * It's completely arbitrary whether the new key - * is the top or the bottom, but it has to match - * up with what btree_sort_fixup() does - it - * doesn't check for this kind of overlap, it - * depends on us inserting a new key for the top - * here. - */ - top = bch_bset_search(b, &b->sets[b->nsets], - insert); - shift_keys(b, top, k); - } else { - BKEY_PADDED(key) temp; - bkey_copy(&temp.key, k); - shift_keys(b, k, &temp.key); - top = bkey_next(k); - } - - bch_cut_front(insert, top); - bch_cut_back(&START_KEY(insert), k); - bch_bset_fix_invalidated_key(b, k); - return false; - } - - if (bkey_cmp(insert, k) < 0) { - bch_cut_front(insert, k); - } else { - if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) - old_offset = KEY_START(insert); - - if (bkey_written(b, k) && - bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { - /* - * Completely overwrote, so we don't have to - * invalidate the binary search tree - */ - bch_cut_front(k, k); - } else { - __bch_cut_back(&START_KEY(insert), k); - bch_bset_fix_invalidated_key(b, k); - } - } - - subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); - } + BUG_ON(bkey_cmp(k, &b->key) > 0); -check_failed: - if (replace_key) { - if (!sectors_found) { - return true; - } else if (sectors_found < KEY_SIZE(insert)) { - SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - - (KEY_SIZE(insert) - sectors_found)); - SET_KEY_SIZE(insert, sectors_found); - } - } + status = bch_btree_insert_key(&b->keys, k, replace_key); + if (status != BTREE_INSERT_STATUS_NO_INSERT) { + bch_check_keys(&b->keys, "%u for %s", status, + replace_key ? "replace" : "insert"); - return false; + trace_bcache_btree_insert_key(b, k, replace_key != NULL, + status); + return true; + } else + return false; } -static bool btree_insert_key(struct btree *b, struct btree_op *op, - struct bkey *k, struct bkey *replace_key) +static size_t insert_u64s_remaining(struct btree *b) { - struct bset *i = b->sets[b->nsets].data; - struct bkey *m, *prev; - unsigned status = BTREE_INSERT_STATUS_INSERT; - - BUG_ON(bkey_cmp(k, &b->key) > 0); - BUG_ON(b->level && !KEY_PTRS(k)); - BUG_ON(!b->level && !KEY_OFFSET(k)); - - if (!b->level) { - struct btree_iter iter; - - /* - * bset_search() returns the first key that is strictly greater - * than the search key - but for back merging, we want to find - * the previous key. - */ - prev = NULL; - m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k))); + ssize_t ret = bch_btree_keys_u64s_remaining(&b->keys); - if (fix_overlapping_extents(b, k, &iter, replace_key)) { - op->insert_collision = true; - return false; - } - - if (KEY_DIRTY(k)) - bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), - KEY_START(k), KEY_SIZE(k)); - - while (m != end(i) && - bkey_cmp(k, &START_KEY(m)) > 0) - prev = m, m = bkey_next(m); - - if (key_merging_disabled(b->c)) - goto insert; - - /* prev is in the tree, if we merge we're done */ - status = BTREE_INSERT_STATUS_BACK_MERGE; - if (prev && - bch_bkey_try_merge(b, prev, k)) - goto merged; - - status = BTREE_INSERT_STATUS_OVERWROTE; - if (m != end(i) && - KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) - goto copy; - - status = BTREE_INSERT_STATUS_FRONT_MERGE; - if (m != end(i) && - bch_bkey_try_merge(b, k, m)) - goto copy; - } else { - BUG_ON(replace_key); - m = bch_bset_search(b, &b->sets[b->nsets], k); - } - -insert: shift_keys(b, m, k); -copy: bkey_copy(m, k); -merged: - bch_check_keys(b, "%u for %s", status, - replace_key ? 
"replace" : "insert"); - - if (b->level && !KEY_OFFSET(k)) - btree_current_write(b)->prio_blocked++; - - trace_bcache_btree_insert_key(b, k, replace_key != NULL, status); + /* + * Might land in the middle of an existing extent and have to split it + */ + if (b->keys.ops->is_extents) + ret -= KEY_MAX_U64S; - return true; + return max(ret, 0L); } static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, @@ -2010,21 +1821,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, struct bkey *replace_key) { bool ret = false; - int oldsize = bch_count_data(b); + int oldsize = bch_count_data(&b->keys); while (!bch_keylist_empty(insert_keys)) { - struct bset *i = write_block(b); struct bkey *k = insert_keys->keys; - if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) - > btree_blocks(b)) + if (bkey_u64s(k) > insert_u64s_remaining(b)) break; if (bkey_cmp(k, &b->key) <= 0) { if (!b->level) bkey_put(b->c, k); - ret |= btree_insert_key(b, op, k, replace_key); + ret |= btree_insert_key(b, k, replace_key); bch_keylist_pop_front(insert_keys); } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { BKEY_PADDED(key) temp; @@ -2033,16 +1842,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, bch_cut_back(&b->key, &temp.key); bch_cut_front(&b->key, insert_keys->keys); - ret |= btree_insert_key(b, op, &temp.key, replace_key); + ret |= btree_insert_key(b, &temp.key, replace_key); break; } else { break; } } + if (!ret) + op->insert_collision = true; + BUG_ON(!bch_keylist_empty(insert_keys) && b->level); - BUG_ON(bch_count_data(b) < oldsize); + BUG_ON(bch_count_data(&b->keys) < oldsize); return ret; } @@ -2059,16 +1871,21 @@ static int btree_split(struct btree *b, struct btree_op *op, closure_init_stack(&cl); bch_keylist_init(&parent_keys); + if (!b->level && + btree_check_reserve(b, op)) + return -EINTR; + n1 = btree_node_alloc_replacement(b, true); if (IS_ERR(n1)) goto err; - split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; + split = set_blocks(btree_bset_first(n1), + block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; if (split) { unsigned keys = 0; - trace_bcache_btree_node_split(b, n1->sets[0].data->keys); + trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); n2 = bch_btree_node_alloc(b->c, b->level, true); if (IS_ERR(n2)) @@ -2087,18 +1904,20 @@ static int btree_split(struct btree *b, struct btree_op *op, * search tree yet */ - while (keys < (n1->sets[0].data->keys * 3) / 5) - keys += bkey_u64s(node(n1->sets[0].data, keys)); + while (keys < (btree_bset_first(n1)->keys * 3) / 5) + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), + keys)); - bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); - keys += bkey_u64s(node(n1->sets[0].data, keys)); + bkey_copy_key(&n1->key, + bset_bkey_idx(btree_bset_first(n1), keys)); + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); - n2->sets[0].data->keys = n1->sets[0].data->keys - keys; - n1->sets[0].data->keys = keys; + btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; + btree_bset_first(n1)->keys = keys; - memcpy(n2->sets[0].data->start, - end(n1->sets[0].data), - n2->sets[0].data->keys * sizeof(uint64_t)); + memcpy(btree_bset_first(n2)->start, + bset_bkey_last(btree_bset_first(n1)), + btree_bset_first(n2)->keys * sizeof(uint64_t)); bkey_copy_key(&n2->key, &b->key); @@ -2106,7 +1925,7 @@ static int btree_split(struct btree *b, struct btree_op *op, bch_btree_node_write(n2, &cl); rw_unlock(true, n2); } else { - 
trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); + trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); bch_btree_insert_keys(n1, op, insert_keys, replace_key); } @@ -2149,18 +1968,21 @@ static int btree_split(struct btree *b, struct btree_op *op, return 0; err_free2: + bkey_put(b->c, &n2->key); btree_node_free(n2); rw_unlock(true, n2); err_free1: + bkey_put(b->c, &n1->key); btree_node_free(n1); rw_unlock(true, n1); err: + WARN(1, "bcache: btree split failed"); + if (n3 == ERR_PTR(-EAGAIN) || n2 == ERR_PTR(-EAGAIN) || n1 == ERR_PTR(-EAGAIN)) return -EAGAIN; - pr_warn("couldn't split"); return -ENOMEM; } @@ -2171,7 +1993,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, { BUG_ON(b->level && replace_key); - if (should_split(b)) { + if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { if (current->bio_list) { op->lock = b->c->root->level + 1; return -EAGAIN; @@ -2180,11 +2002,13 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, return -EINTR; } else { /* Invalidated all iterators */ - return btree_split(b, op, insert_keys, replace_key) ?: - -EINTR; + int ret = btree_split(b, op, insert_keys, replace_key); + + return bch_keylist_empty(insert_keys) ? + 0 : ret ?: -EINTR; } } else { - BUG_ON(write_block(b) != b->sets[b->nsets].data); + BUG_ON(write_block(b) != btree_bset_last(b)); if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { if (!b->level) @@ -2323,9 +2147,9 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, struct bkey *k; struct btree_iter iter; - bch_btree_iter_init(b, &iter, from); + bch_btree_iter_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, b, + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = btree(map_nodes_recurse, k, b, op, from, fn, flags); @@ -2356,9 +2180,9 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, struct bkey *k; struct btree_iter iter; - bch_btree_iter_init(b, &iter, from); + bch_btree_iter_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = !b->level ? fn(op, b, k) : btree(map_keys_recurse, k, b, op, from, fn, flags); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 767e755..af065e9 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -130,20 +130,12 @@ struct btree { unsigned long flags; uint16_t written; /* would be nice to kill */ uint8_t level; - uint8_t nsets; - uint8_t page_order; - - /* - * Set of sorted keys - the real btree node - plus a binary search tree - * - * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point - * to the memory we have allocated for this btree node. Additionally, - * set[0]->data points to the entire btree node as it exists on disk. 
- */ - struct bset_tree sets[MAX_BSETS]; + + struct btree_keys keys; /* For outstanding btree writes, used as a lock - protects write_idx */ - struct closure_with_waitlist io; + struct closure io; + struct semaphore io_mutex; struct list_head list; struct delayed_work work; @@ -179,24 +171,19 @@ static inline struct btree_write *btree_prev_write(struct btree *b) return b->writes + (btree_node_write_idx(b) ^ 1); } -static inline unsigned bset_offset(struct btree *b, struct bset *i) +static inline struct bset *btree_bset_first(struct btree *b) { - return (((size_t) i) - ((size_t) b->sets->data)) >> 9; + return b->keys.set->data; } -static inline struct bset *write_block(struct btree *b) +static inline struct bset *btree_bset_last(struct btree *b) { - return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); + return bset_tree_last(&b->keys)->data; } -static inline bool bset_written(struct btree *b, struct bset_tree *t) +static inline unsigned bset_block_offset(struct btree *b, struct bset *i) { - return t->data < write_block(b); -} - -static inline bool bkey_written(struct btree *b, struct bkey *k) -{ - return k < write_block(b)->start; + return bset_sector_offset(&b->keys, i) >> b->c->block_bits; } static inline void set_gc_sectors(struct cache_set *c) @@ -204,21 +191,6 @@ static inline void set_gc_sectors(struct cache_set *c) atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); } -static inline struct bkey *bch_btree_iter_init(struct btree *b, - struct btree_iter *iter, - struct bkey *search) -{ - return __bch_btree_iter_init(b, iter, search, b->sets); -} - -static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) -{ - if (b->level) - return bch_btree_ptr_invalid(b->c, k); - else - return bch_extent_ptr_invalid(b->c, k); -} - void bkey_put(struct cache_set *c, struct bkey *k); /* Looping macros */ @@ -229,17 +201,12 @@ void bkey_put(struct cache_set *c, struct bkey *k); iter++) \ hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) -#define for_each_key_filter(b, k, iter, filter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next_filter((iter), b, filter));) - -#define for_each_key(b, k, iter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next(iter));) - /* Recursing down the btree */ struct btree_op { + /* for waiting on btree reserve in btree_split() */ + wait_queue_t wait; + /* Btree level at which we start taking write locks */ short lock; @@ -249,6 +216,7 @@ struct btree_op { static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) { memset(op, 0, sizeof(struct btree_op)); + init_wait(&op->wait); op->lock = write_lock_level; } @@ -267,7 +235,7 @@ static inline void rw_unlock(bool w, struct btree *b) (w ? 
up_write : up_read)(&b->lock); } -void bch_btree_node_read(struct btree *); +void bch_btree_node_read_done(struct btree *); void bch_btree_node_write(struct btree *, struct closure *); void bch_btree_set_root(struct btree *); diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index dfff241..7a228de 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -11,19 +11,6 @@ #include "closure.h" -#define CL_FIELD(type, field) \ - case TYPE_ ## type: \ - return &container_of(cl, struct type, cl)->field - -static struct closure_waitlist *closure_waitlist(struct closure *cl) -{ - switch (cl->type) { - CL_FIELD(closure_with_waitlist, wait); - default: - return NULL; - } -} - static inline void closure_put_after_sub(struct closure *cl, int flags) { int r = flags & CLOSURE_REMAINING_MASK; @@ -42,17 +29,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) closure_queue(cl); } else { struct closure *parent = cl->parent; - struct closure_waitlist *wait = closure_waitlist(cl); closure_fn *destructor = cl->fn; closure_debug_destroy(cl); - smp_mb(); - atomic_set(&cl->remaining, -1); - - if (wait) - closure_wake_up(wait); - if (destructor) destructor(cl); @@ -69,19 +49,18 @@ void closure_sub(struct closure *cl, int v) } EXPORT_SYMBOL(closure_sub); +/** + * closure_put - decrement a closure's refcount + */ void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); } EXPORT_SYMBOL(closure_put); -static void set_waiting(struct closure *cl, unsigned long f) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - cl->waiting_on = f; -#endif -} - +/** + * closure_wake_up - wake up all closures on a wait list, without memory barrier + */ void __closure_wake_up(struct closure_waitlist *wait_list) { struct llist_node *list; @@ -106,27 +85,34 @@ void __closure_wake_up(struct closure_waitlist *wait_list) cl = container_of(reverse, struct closure, list); reverse = llist_next(reverse); - set_waiting(cl, 0); + closure_set_waiting(cl, 0); closure_sub(cl, CLOSURE_WAITING + 1); } } EXPORT_SYMBOL(__closure_wake_up); -bool closure_wait(struct closure_waitlist *list, struct closure *cl) +/** + * closure_wait - add a closure to a waitlist + * + * @waitlist will own a ref on @cl, which will be released when + * closure_wake_up() is called on @waitlist. + * + */ +bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) { if (atomic_read(&cl->remaining) & CLOSURE_WAITING) return false; - set_waiting(cl, _RET_IP_); + closure_set_waiting(cl, _RET_IP_); atomic_add(CLOSURE_WAITING + 1, &cl->remaining); - llist_add(&cl->list, &list->list); + llist_add(&cl->list, &waitlist->list); return true; } EXPORT_SYMBOL(closure_wait); /** - * closure_sync() - sleep until a closure a closure has nothing left to wait on + * closure_sync - sleep until a closure has nothing left to wait on * * Sleeps until the refcount hits 1 - the thread that's running the closure owns * the last refcount. @@ -148,46 +134,6 @@ void closure_sync(struct closure *cl) } EXPORT_SYMBOL(closure_sync); -/** - * closure_trylock() - try to acquire the closure, without waiting - * @cl: closure to lock - * - * Returns true if the closure was succesfully locked.
- */ -bool closure_trylock(struct closure *cl, struct closure *parent) -{ - if (atomic_cmpxchg(&cl->remaining, -1, - CLOSURE_REMAINING_INITIALIZER) != -1) - return false; - - smp_mb(); - - cl->parent = parent; - if (parent) - closure_get(parent); - - closure_set_ret_ip(cl); - closure_debug_create(cl); - return true; -} -EXPORT_SYMBOL(closure_trylock); - -void __closure_lock(struct closure *cl, struct closure *parent, - struct closure_waitlist *wait_list) -{ - struct closure wait; - closure_init_stack(&wait); - - while (1) { - if (closure_trylock(cl, parent)) - return; - - closure_wait_event(wait_list, &wait, - atomic_read(&cl->remaining) == -1); - } -} -EXPORT_SYMBOL(__closure_lock); - #ifdef CONFIG_BCACHE_CLOSURES_DEBUG static LIST_HEAD(closure_list); diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 9762f1b..7ef7461 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -72,30 +72,6 @@ * closure - _always_ use continue_at(). Doing so consistently will help * eliminate an entire class of particularly pernicious races. * - * For a closure to wait on an arbitrary event, we need to introduce waitlists: - * - * struct closure_waitlist list; - * closure_wait_event(list, cl, condition); - * closure_wake_up(wait_list); - * - * These work analagously to wait_event() and wake_up() - except that instead of - * operating on the current thread (for wait_event()) and lists of threads, they - * operate on an explicit closure and lists of closures. - * - * Because it's a closure we can now wait either synchronously or - * asynchronously. closure_wait_event() returns the current value of the - * condition, and if it returned false continue_at() or closure_sync() can be - * used to wait for it to become true. - * - * It's useful for waiting on things when you can't sleep in the context in - * which you must check the condition (perhaps a spinlock held, or you might be - * beneath generic_make_request() - in which case you can't sleep on IO). - * - * closure_wait_event() will wait either synchronously or asynchronously, - * depending on whether the closure is in blocking mode or not. You can pick a - * mode explicitly with closure_wait_event_sync() and - * closure_wait_event_async(), which do just what you might expect. - * * Lastly, you might have a wait list dedicated to a specific event, and have no * need for specifying the condition - you just want to wait until someone runs * closure_wake_up() on the appropriate wait list. In that case, just use @@ -121,40 +97,6 @@ * All this implies that a closure should typically be embedded in a particular * struct (which its refcount will normally control the lifetime of), and that * struct can very much be thought of as a stack frame. - * - * Locking: - * - * Closures are based on work items but they can be thought of as more like - * threads - in that like threads and unlike work items they have a well - * defined lifetime; they are created (with closure_init()) and eventually - * complete after a continue_at(cl, NULL, NULL). - * - * Suppose you've got some larger structure with a closure embedded in it that's - * used for periodically doing garbage collection. You only want one garbage - * collection happening at a time, so the natural thing to do is protect it with - * a lock. 
However, it's difficult to use a lock protecting a closure correctly - * because the unlock should come after the last continue_to() (additionally, if - * you're using the closure asynchronously a mutex won't work since a mutex has - * to be unlocked by the same process that locked it). - * - * So to make it less error prone and more efficient, we also have the ability - * to use closures as locks: - * - * closure_init_unlocked(); - * closure_trylock(); - * - * That's all we need for trylock() - the last closure_put() implicitly unlocks - * it for you. But for closure_lock(), we also need a wait list: - * - * struct closure_with_waitlist frobnicator_cl; - * - * closure_init_unlocked(&frobnicator_cl); - * closure_lock(&frobnicator_cl); - * - * A closure_with_waitlist embeds a closure and a wait list - much like struct - * delayed_work embeds a work item and a timer_list. The important thing is, use - * it exactly like you would a regular closure and closure_put() will magically - * handle everything for you. */ struct closure; @@ -164,12 +106,6 @@ struct closure_waitlist { struct llist_head list; }; -enum closure_type { - TYPE_closure = 0, - TYPE_closure_with_waitlist = 1, - MAX_CLOSURE_TYPE = 1, -}; - enum closure_state { /* * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by @@ -224,8 +160,6 @@ struct closure { atomic_t remaining; - enum closure_type type; - #ifdef CONFIG_BCACHE_CLOSURES_DEBUG #define CLOSURE_MAGIC_DEAD 0xc054dead #define CLOSURE_MAGIC_ALIVE 0xc054a11e @@ -237,34 +171,12 @@ struct closure { #endif }; -struct closure_with_waitlist { - struct closure cl; - struct closure_waitlist wait; -}; - -extern unsigned invalid_closure_type(void); - -#define __CLOSURE_TYPE(cl, _t) \ - __builtin_types_compatible_p(typeof(cl), struct _t) \ - ? 
TYPE_ ## _t : \ - -#define __closure_type(cl) \ -( \ - __CLOSURE_TYPE(cl, closure) \ - __CLOSURE_TYPE(cl, closure_with_waitlist) \ - invalid_closure_type() \ -) - void closure_sub(struct closure *cl, int v); void closure_put(struct closure *cl); void __closure_wake_up(struct closure_waitlist *list); bool closure_wait(struct closure_waitlist *list, struct closure *cl); void closure_sync(struct closure *cl); -bool closure_trylock(struct closure *cl, struct closure *parent); -void __closure_lock(struct closure *cl, struct closure *parent, - struct closure_waitlist *wait_list); - #ifdef CONFIG_BCACHE_CLOSURES_DEBUG void closure_debug_init(void); @@ -293,134 +205,97 @@ static inline void closure_set_ret_ip(struct closure *cl) #endif } -static inline void closure_get(struct closure *cl) +static inline void closure_set_waiting(struct closure *cl, unsigned long f) { #ifdef CONFIG_BCACHE_CLOSURES_DEBUG - BUG_ON((atomic_inc_return(&cl->remaining) & - CLOSURE_REMAINING_MASK) <= 1); -#else - atomic_inc(&cl->remaining); + cl->waiting_on = f; #endif } -static inline void closure_set_stopped(struct closure *cl) +static inline void __closure_end_sleep(struct closure *cl) { - atomic_sub(CLOSURE_RUNNING, &cl->remaining); + __set_current_state(TASK_RUNNING); + + if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) + atomic_sub(CLOSURE_SLEEPING, &cl->remaining); } -static inline bool closure_is_unlocked(struct closure *cl) +static inline void __closure_start_sleep(struct closure *cl) { - return atomic_read(&cl->remaining) == -1; + closure_set_ip(cl); + cl->task = current; + set_current_state(TASK_UNINTERRUPTIBLE); + + if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) + atomic_add(CLOSURE_SLEEPING, &cl->remaining); } -static inline void do_closure_init(struct closure *cl, struct closure *parent, - bool running) +static inline void closure_set_stopped(struct closure *cl) { - cl->parent = parent; - if (parent) - closure_get(parent); - - if (running) { - closure_debug_create(cl); - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); - } else - atomic_set(&cl->remaining, -1); + atomic_sub(CLOSURE_RUNNING, &cl->remaining); +} +static inline void set_closure_fn(struct closure *cl, closure_fn *fn, + struct workqueue_struct *wq) +{ + BUG_ON(object_is_on_stack(cl)); closure_set_ip(cl); + cl->fn = fn; + cl->wq = wq; + /* between atomic_dec() in closure_put() */ + smp_mb__before_atomic_dec(); } -/* - * Hack to get at the embedded closure if there is one, by doing an unsafe cast: - * the result of __closure_type() is thrown away, it's used merely for type - * checking. - */ -#define __to_internal_closure(cl) \ -({ \ - BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \ - (struct closure *) cl; \ -}) - -#define closure_init_type(cl, parent, running) \ -do { \ - struct closure *_cl = __to_internal_closure(cl); \ - _cl->type = __closure_type(*(cl)); \ - do_closure_init(_cl, parent, running); \ -} while (0) +static inline void closure_queue(struct closure *cl) +{ + struct workqueue_struct *wq = cl->wq; + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + BUG_ON(!queue_work(wq, &cl->work)); + } else + cl->fn(cl); +} /** - * __closure_init() - Initialize a closure, skipping the memset() - * - * May be used instead of closure_init() when memory has already been zeroed. 
+ * closure_get - increment a closure's refcount */ -#define __closure_init(cl, parent) \ - closure_init_type(cl, parent, true) +static inline void closure_get(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + BUG_ON((atomic_inc_return(&cl->remaining) & + CLOSURE_REMAINING_MASK) <= 1); +#else + atomic_inc(&cl->remaining); +#endif +} /** - * closure_init() - Initialize a closure, setting the refcount to 1 + * closure_init - Initialize a closure, setting the refcount to 1 * @cl: closure to initialize * @parent: parent of the new closure. cl will take a refcount on it for its * lifetime; may be NULL. */ -#define closure_init(cl, parent) \ -do { \ - memset((cl), 0, sizeof(*(cl))); \ - __closure_init(cl, parent); \ -} while (0) - -static inline void closure_init_stack(struct closure *cl) +static inline void closure_init(struct closure *cl, struct closure *parent) { memset(cl, 0, sizeof(struct closure)); - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); -} - -/** - * closure_init_unlocked() - Initialize a closure but leave it unlocked. - * @cl: closure to initialize - * - * For when the closure will be used as a lock. The closure may not be used - * until after a closure_lock() or closure_trylock(). - */ -#define closure_init_unlocked(cl) \ -do { \ - memset((cl), 0, sizeof(*(cl))); \ - closure_init_type(cl, NULL, false); \ -} while (0) - -/** - * closure_lock() - lock and initialize a closure. - * @cl: the closure to lock - * @parent: the new parent for this closure - * - * The closure must be of one of the types that has a waitlist (otherwise we - * wouldn't be able to sleep on contention). - * - * @parent has exactly the same meaning as in closure_init(); if non null, the - * closure will take a reference on @parent which will be released when it is - * unlocked. - */ -#define closure_lock(cl, parent) \ - __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) + cl->parent = parent; + if (parent) + closure_get(parent); -static inline void __closure_end_sleep(struct closure *cl) -{ - __set_current_state(TASK_RUNNING); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); - if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) - atomic_sub(CLOSURE_SLEEPING, &cl->remaining); + closure_debug_create(cl); + closure_set_ip(cl); } -static inline void __closure_start_sleep(struct closure *cl) +static inline void closure_init_stack(struct closure *cl) { - closure_set_ip(cl); - cl->task = current; - set_current_state(TASK_UNINTERRUPTIBLE); - - if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) - atomic_add(CLOSURE_SLEEPING, &cl->remaining); + memset(cl, 0, sizeof(struct closure)); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); } /** - * closure_wake_up() - wake up all closures on a wait list. + * closure_wake_up - wake up all closures on a wait list. */ static inline void closure_wake_up(struct closure_waitlist *list) { @@ -428,69 +303,19 @@ static inline void closure_wake_up(struct closure_waitlist *list) __closure_wake_up(list); } -/* - * Wait on an event, synchronously or asynchronously - analogous to wait_event() - * but for closures. - * - * The loop is oddly structured so as to avoid a race; we must check the - * condition again after we've added ourself to the waitlist. We know if we were - * already on the waitlist because closure_wait() returns false; thus, we only - * schedule or break if closure_wait() returns false. If it returns true, we - * just loop again - rechecking the condition. 
- * - * The __closure_wake_up() is necessary because we may race with the event - * becoming true; i.e. we see event false -> wait -> recheck condition, but the - * thread that made the event true may have called closure_wake_up() before we - * added ourself to the wait list. - * - * We have to call closure_sync() at the end instead of just - * __closure_end_sleep() because a different thread might've called - * closure_wake_up() before us and gotten preempted before they dropped the - * refcount on our closure. If this was a stack allocated closure, that would be - * bad. +/** + * continue_at - jump to another function with barrier + * + * After @cl is no longer waiting on anything (i.e. all outstanding refs have + * been dropped with closure_put()), it will resume execution at @fn running out + * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). + * + * NOTE: This macro expands to a return in the calling function! + * + * This is because after calling continue_at() you no longer have a ref on @cl, + * and whatever @cl owns may be freed out from under you - a running closure fn + * has a ref on its own closure which continue_at() drops. */ -#define closure_wait_event(list, cl, condition) \ -({ \ - typeof(condition) ret; \ - \ - while (1) { \ - ret = (condition); \ - if (ret) { \ - __closure_wake_up(list); \ - closure_sync(cl); \ - break; \ - } \ - \ - __closure_start_sleep(cl); \ - \ - if (!closure_wait(list, cl)) \ - schedule(); \ - } \ - \ - ret; \ -}) - -static inline void closure_queue(struct closure *cl) -{ - struct workqueue_struct *wq = cl->wq; - if (wq) { - INIT_WORK(&cl->work, cl->work.func); - BUG_ON(!queue_work(wq, &cl->work)); - } else - cl->fn(cl); -} - -static inline void set_closure_fn(struct closure *cl, closure_fn *fn, - struct workqueue_struct *wq) -{ - BUG_ON(object_is_on_stack(cl)); - closure_set_ip(cl); - cl->fn = fn; - cl->wq = wq; - /* between atomic_dec() in closure_put() */ - smp_mb__before_atomic_dec(); -} - #define continue_at(_cl, _fn, _wq) \ do { \ set_closure_fn(_cl, _fn, _wq); \ @@ -498,8 +323,28 @@ do { \ return; \ } while (0) +/** + * closure_return - finish execution of a closure + * + * This is used to indicate that @cl is finished: when all outstanding refs on + * @cl have been dropped @cl's ref on its parent closure (as passed to + * closure_init()) will be dropped, if one was specified - thus this can be + * thought of as returning to the parent closure. + */ #define closure_return(_cl) continue_at((_cl), NULL, NULL) +/** + * continue_at_nobarrier - jump to another function without barrier + * + * Causes @fn to be executed out of @cl, in @wq context (or called directly if + * @wq is NULL). + * + * NOTE: like continue_at(), this macro expands to a return in the caller! + * + * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, + * thus it's not safe to touch anything protected by @cl after a + * continue_at_nobarrier(). + */ #define continue_at_nobarrier(_cl, _fn, _wq) \ do { \ set_closure_fn(_cl, _fn, _wq); \ @@ -507,6 +352,15 @@ do { \ return; \ } while (0) +/** + * closure_return - finish execution of a closure, with destructor + * + * Works like closure_return(), except @destructor will be called when all + * outstanding refs on @cl have been dropped; @destructor may be used to safely + * free the memory occupied by @cl, and it is called with the ref on the parent + * closure still held - so @destructor could safely return an item to a + * freelist protected by @cl's parent. 
+ */ #define closure_return_with_destructor(_cl, _destructor) \ do { \ set_closure_fn(_cl, _destructor, NULL); \ @@ -514,6 +368,13 @@ do { \ return; \ } while (0) +/** + * closure_call - execute @fn out of a new, uninitialized closure + * + * Typically used when running out of one closure, and we want to run @fn + * asynchronously out of a new closure - @parent will then wait for @cl to + * finish. + */ static inline void closure_call(struct closure *cl, closure_fn fn, struct workqueue_struct *wq, struct closure *parent) @@ -522,12 +383,4 @@ static inline void closure_call(struct closure *cl, closure_fn fn, continue_at_nobarrier(cl, fn, wq); } -static inline void closure_trylock_call(struct closure *cl, closure_fn fn, - struct workqueue_struct *wq, - struct closure *parent) -{ - if (closure_trylock(cl, parent)) - continue_at_nobarrier(cl, fn, wq); -} - #endif /* _LINUX_CLOSURE_H */ diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 03cb4d1..8b1f1d5 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -8,6 +8,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "extents.h" #include <linux/console.h> #include <linux/debugfs.h> @@ -17,156 +18,88 @@ static struct dentry *debug; -const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) -{ - unsigned i; - - for (i = 0; i < KEY_PTRS(k); i++) - if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); - size_t bucket = PTR_BUCKET_NR(c, k, i); - size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); - - if (KEY_SIZE(k) + r > c->sb.bucket_size) - return "bad, length too big"; - if (bucket < ca->sb.first_bucket) - return "bad, short offset"; - if (bucket >= ca->sb.nbuckets) - return "bad, offset past end of device"; - if (ptr_stale(c, k, i)) - return "stale"; - } - - if (!bkey_cmp(k, &ZERO_KEY)) - return "bad, null key"; - if (!KEY_PTRS(k)) - return "bad, no pointers"; - if (!KEY_SIZE(k)) - return "zeroed key"; - return ""; -} - -int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) -{ - unsigned i = 0; - char *out = buf, *end = buf + size; - -#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) - - p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); - - if (KEY_PTRS(k)) - while (1) { - p("%llu:%llu gen %llu", - PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); - - if (++i == KEY_PTRS(k)) - break; - - p(", "); - } - - p("]"); - - if (KEY_DIRTY(k)) - p(" dirty"); - if (KEY_CSUM(k)) - p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); -#undef p - return out - buf; -} - #ifdef CONFIG_BCACHE_DEBUG -static void dump_bset(struct btree *b, struct bset *i) -{ - struct bkey *k, *next; - unsigned j; - char buf[80]; - - for (k = i->start; k < end(i); k = next) { - next = bkey_next(k); - - bch_bkey_to_text(buf, sizeof(buf), k); - printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), - (uint64_t *) k - i->d, i->keys, buf); - - for (j = 0; j < KEY_PTRS(k); j++) { - size_t n = PTR_BUCKET_NR(b->c, k, j); - printk(" bucket %zu", n); - - if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) - printk(" prio %i", - PTR_BUCKET(b->c, k, j)->prio); - } +#define for_each_written_bset(b, start, i) \ + for (i = (start); \ + (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ + i->seq == (start)->seq; \ + i = (void *) i + set_blocks(i, block_bytes(b->c)) * \ + block_bytes(b->c)) - printk(" %s\n", bch_ptr_status(b->c, k)); - - if (next < end(i) && - bkey_cmp(k, !b->level ? 
&START_KEY(next) : next) > 0) - printk(KERN_ERR "Key skipped backwards\n"); - } -} - -static void bch_dump_bucket(struct btree *b) -{ - unsigned i; - - console_lock(); - for (i = 0; i <= b->nsets; i++) - dump_bset(b, b->sets[i].data); - console_unlock(); -} - -void bch_btree_verify(struct btree *b, struct bset *new) +void bch_btree_verify(struct btree *b) { struct btree *v = b->c->verify_data; - struct closure cl; - closure_init_stack(&cl); + struct bset *ondisk, *sorted, *inmemory; + struct bio *bio; - if (!b->c->verify) + if (!b->c->verify || !b->c->verify_ondisk) return; - closure_wait_event(&b->io.wait, &cl, - atomic_read(&b->io.cl.remaining) == -1); - + down(&b->io_mutex); mutex_lock(&b->c->verify_lock); + ondisk = b->c->verify_ondisk; + sorted = b->c->verify_data->keys.set->data; + inmemory = b->keys.set->data; + bkey_copy(&v->key, &b->key); v->written = 0; v->level = b->level; + v->keys.ops = b->keys.ops; + + bio = bch_bbio_alloc(b->c); + bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; + bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); + bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; + bch_bio_map(bio, sorted); - bch_btree_node_read(v); - closure_wait_event(&v->io.wait, &cl, - atomic_read(&b->io.cl.remaining) == -1); + submit_bio_wait(REQ_META|READ_SYNC, bio); + bch_bbio_free(bio, b->c); - if (new->keys != v->sets[0].data->keys || - memcmp(new->start, - v->sets[0].data->start, - (void *) end(new) - (void *) new->start)) { - unsigned i, j; + memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9); + + bch_btree_node_read_done(v); + sorted = v->keys.set->data; + + if (inmemory->keys != sorted->keys || + memcmp(inmemory->start, + sorted->start, + (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) { + struct bset *i; + unsigned j; console_lock(); - printk(KERN_ERR "*** original memory node:\n"); - for (i = 0; i <= b->nsets; i++) - dump_bset(b, b->sets[i].data); + printk(KERN_ERR "*** in memory:\n"); + bch_dump_bset(&b->keys, inmemory, 0); - printk(KERN_ERR "*** sorted memory node:\n"); - dump_bset(b, new); + printk(KERN_ERR "*** read back in:\n"); + bch_dump_bset(&v->keys, sorted, 0); - printk(KERN_ERR "*** on disk node:\n"); - dump_bset(v, v->sets[0].data); + for_each_written_bset(b, ondisk, i) { + unsigned block = ((void *) i - (void *) ondisk) / + block_bytes(b->c); + + printk(KERN_ERR "*** on disk block %u:\n", block); + bch_dump_bset(&b->keys, i, block); + } - for (j = 0; j < new->keys; j++) - if (new->d[j] != v->sets[0].data->d[j]) + printk(KERN_ERR "*** block %zu not written\n", + ((void *) i - (void *) ondisk) / block_bytes(b->c)); + + for (j = 0; j < inmemory->keys; j++) + if (inmemory->d[j] != sorted->d[j]) break; + printk(KERN_ERR "b->written %u\n", b->written); + console_unlock(); panic("verify failed at %u\n", j); } mutex_unlock(&b->c->verify_lock); + up(&b->io_mutex); } void bch_data_verify(struct cached_dev *dc, struct bio *bio) @@ -207,74 +140,6 @@ out_put: bio_put(check); } -int __bch_count_data(struct btree *b) -{ - unsigned ret = 0; - struct btree_iter iter; - struct bkey *k; - - if (!b->level) - for_each_key(b, k, &iter) - ret += KEY_SIZE(k); - return ret; -} - -void __bch_check_keys(struct btree *b, const char *fmt, ...) 
-{ - va_list args; - struct bkey *k, *p = NULL; - struct btree_iter iter; - const char *err; - - for_each_key(b, k, &iter) { - if (!b->level) { - err = "Keys out of order"; - if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) - goto bug; - - if (bch_ptr_invalid(b, k)) - continue; - - err = "Overlapping keys"; - if (p && bkey_cmp(p, &START_KEY(k)) > 0) - goto bug; - } else { - if (bch_ptr_bad(b, k)) - continue; - - err = "Duplicate keys"; - if (p && !bkey_cmp(p, k)) - goto bug; - } - p = k; - } - - err = "Key larger than btree node key"; - if (p && bkey_cmp(p, &b->key) > 0) - goto bug; - - return; -bug: - bch_dump_bucket(b); - - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - - panic("bcache error: %s:\n", err); -} - -void bch_btree_iter_next_check(struct btree_iter *iter) -{ - struct bkey *k = iter->data->k, *next = bkey_next(k); - - if (next < iter->data->end && - bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) { - bch_dump_bucket(iter->b); - panic("Key skipped backwards\n"); - } -} - #endif #ifdef CONFIG_DEBUG_FS @@ -321,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, if (!w) break; - bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); + bch_extent_to_text(kbuf, sizeof(kbuf), &w->key); i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); bch_keybuf_del(&i->keys, w); } diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 2ede60e..1f63c19 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -1,47 +1,30 @@ #ifndef _BCACHE_DEBUG_H #define _BCACHE_DEBUG_H -/* Btree/bkey debug printing */ - -int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); +struct bio; +struct cached_dev; +struct cache_set; #ifdef CONFIG_BCACHE_DEBUG -void bch_btree_verify(struct btree *, struct bset *); +void bch_btree_verify(struct btree *); void bch_data_verify(struct cached_dev *, struct bio *); -int __bch_count_data(struct btree *); -void __bch_check_keys(struct btree *, const char *, ...); -void bch_btree_iter_next_check(struct btree_iter *); -#define EBUG_ON(cond) BUG_ON(cond) #define expensive_debug_checks(c) ((c)->expensive_debug_checks) #define key_merging_disabled(c) ((c)->key_merging_disabled) #define bypass_torture_test(d) ((d)->bypass_torture_test) #else /* DEBUG */ -static inline void bch_btree_verify(struct btree *b, struct bset *i) {} +static inline void bch_btree_verify(struct btree *b) {} static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} -static inline int __bch_count_data(struct btree *b) { return -1; } -static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {} -static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} -#define EBUG_ON(cond) do { if (cond); } while (0) #define expensive_debug_checks(c) 0 #define key_merging_disabled(c) 0 #define bypass_torture_test(d) 0 #endif -#define bch_count_data(b) \ - (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1) - -#define bch_check_keys(b, ...) \ -do { \ - if (expensive_debug_checks((b)->c)) \ - __bch_check_keys(b, __VA_ARGS__); \ -} while (0) - #ifdef CONFIG_DEBUG_FS void bch_debug_init_cache_set(struct cache_set *); #else diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c new file mode 100644 index 0000000..c3ead58 --- /dev/null +++ b/drivers/md/bcache/extents.c @@ -0,0 +1,616 @@ +/* + * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> + * + * Uses a block device as cache for other block devices; optimized for SSDs. 
+ * All allocation is done in buckets, which should match the erase block size + * of the device. + * + * Buckets containing cached data are kept on a heap sorted by priority; + * bucket priority is increased on cache hit, and periodically all the buckets + * on the heap have their priority scaled down. This currently is just used as + * an LRU but in the future should allow for more intelligent heuristics. + * + * Buckets have an 8 bit counter; freeing is accomplished by incrementing the + * counter. Garbage collection is used to remove stale pointers. + * + * Indexing is done via a btree; nodes are not necessarily fully sorted, rather + * as keys are inserted we only sort the pages that have not yet been written. + * When garbage collection is run, we resort the entire node. + * + * All configuration is done via sysfs; see Documentation/bcache.txt. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "extents.h" +#include "writeback.h" + +static void sort_key_next(struct btree_iter *iter, + struct btree_iter_set *i) +{ + i->k = bkey_next(i->k); + + if (i->k == i->end) + *i = iter->data[--iter->used]; +} + +static bool bch_key_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(l.k, r.k); + + return c ? c > 0 : l.k < r.k; +} + +static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = PTR_CACHE(c, k, i); + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->sb.bucket_size || + bucket < ca->sb.first_bucket || + bucket >= ca->sb.nbuckets) + return true; + } + + return false; +} + +/* Common among btree and extent ptrs */ + +static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = PTR_CACHE(c, k, i); + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->sb.bucket_size) + return "bad, length too big"; + if (bucket < ca->sb.first_bucket) + return "bad, short offset"; + if (bucket >= ca->sb.nbuckets) + return "bad, offset past end of device"; + if (ptr_stale(c, k, i)) + return "stale"; + } + + if (!bkey_cmp(k, &ZERO_KEY)) + return "bad, null key"; + if (!KEY_PTRS(k)) + return "bad, no pointers"; + if (!KEY_SIZE(k)) + return "zeroed key"; + return ""; +} + +void bch_extent_to_text(char *buf, size_t size, const struct bkey *k) +{ + unsigned i = 0; + char *out = buf, *end = buf + size; + +#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) + + p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); + + for (i = 0; i < KEY_PTRS(k); i++) { + if (i) + p(", "); + + if (PTR_DEV(k, i) == PTR_CHECK_DEV) + p("check dev"); + else + p("%llu:%llu gen %llu", PTR_DEV(k, i), + PTR_OFFSET(k, i), PTR_GEN(k, i)); + } + + p("]"); + + if (KEY_DIRTY(k)) + p(" dirty"); + if (KEY_CSUM(k)) + p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); +#undef p +} + +static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) +{ + struct btree *b = container_of(keys, struct btree, keys); + unsigned j; + char buf[80]; + + bch_extent_to_text(buf, sizeof(buf), k); + printk(" %s", buf); + + for (j = 0; j < KEY_PTRS(k); j++) { + size_t n = PTR_BUCKET_NR(b->c, k, j); + printk(" bucket %zu", n); + + if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) + printk(" prio %i", + PTR_BUCKET(b->c, k, j)->prio); + } + + printk(" %s\n", bch_ptr_status(b->c, k)); +} + +/* Btree ptrs */ + +bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + char buf[80]; + + if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) + goto bad; + + if (__ptr_invalid(c, k)) + goto bad; + + return false; +bad: + bch_extent_to_text(buf, sizeof(buf), k); + cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); + return true; +} + +static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + return __bch_btree_ptr_invalid(b->c, k); +} + +static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) +{ + unsigned i; + char buf[80]; + struct bucket *g; + + if (mutex_trylock(&b->c->bucket_lock)) { + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(b->c, k, i)) { + g = PTR_BUCKET(b->c, k, i); + + if (KEY_DIRTY(k) || + g->prio != BTREE_PRIO || + (b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_METADATA)) + goto err; + } + + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_extent_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent btree pointer %s: bucket %li pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", + buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + return true; +} + +static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + unsigned i; + + if (!bkey_cmp(k, &ZERO_KEY) || + !KEY_PTRS(k) || + bch_ptr_invalid(bk, k)) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) + if (!ptr_available(b->c, k, i) || + ptr_stale(b->c, k, i)) + return true; + + if (expensive_debug_checks(b->c) && + btree_ptr_bad_expensive(b, k)) + return true; + + return false; +} + +static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key) +{ + struct btree *b = container_of(bk, struct btree, keys); + + if (!KEY_OFFSET(insert)) + btree_current_write(b)->prio_blocked++; + + return false; +} + +const struct btree_keys_ops bch_btree_keys_ops = { + .sort_cmp = bch_key_sort_cmp, + .insert_fixup = bch_btree_ptr_insert_fixup, + .key_invalid = bch_btree_ptr_invalid, + .key_bad = bch_btree_ptr_bad, + .key_to_text = bch_extent_to_text, + .key_dump = bch_bkey_dump, +}; + +/* Extents */ + +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. 
+ * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. + */ +static bool bch_extent_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); + + return c ? c > 0 : l.k < r.k; +} + +static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, + struct bkey *tmp) +{ + while (iter->used > 1) { + struct btree_iter_set *top = iter->data, *i = top + 1; + + if (iter->used > 2 && + bch_extent_sort_cmp(i[0], i[1])) + i++; + + if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) + break; + + if (!KEY_SIZE(i->k)) { + sort_key_next(iter, i); + heap_sift(iter, i - top, bch_extent_sort_cmp); + continue; + } + + if (top->k > i->k) { + if (bkey_cmp(top->k, i->k) >= 0) + sort_key_next(iter, i); + else + bch_cut_front(top->k, i->k); + + heap_sift(iter, i - top, bch_extent_sort_cmp); + } else { + /* can't happen because of comparison func */ + BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); + + if (bkey_cmp(i->k, top->k) < 0) { + bkey_copy(tmp, top->k); + + bch_cut_back(&START_KEY(i->k), tmp); + bch_cut_front(i->k, top->k); + heap_sift(iter, 0, bch_extent_sort_cmp); + + return tmp; + } else { + bch_cut_back(&START_KEY(i->k), top->k); + } + } + } + + return NULL; +} + +static bool bch_extent_insert_fixup(struct btree_keys *b, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key) +{ + struct cache_set *c = container_of(b, struct btree, keys)->c; + + void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) + { + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(c, KEY_INODE(k), + offset, -sectors); + } + + uint64_t old_offset; + unsigned old_size, sectors_found = 0; + + BUG_ON(!KEY_OFFSET(insert)); + BUG_ON(!KEY_SIZE(insert)); + + while (1) { + struct bkey *k = bch_btree_iter_next(iter); + if (!k) + break; + + if (bkey_cmp(&START_KEY(k), insert) >= 0) { + if (KEY_SIZE(k)) + break; + else + continue; + } + + if (bkey_cmp(k, &START_KEY(insert)) <= 0) + continue; + + old_offset = KEY_START(k); + old_size = KEY_SIZE(k); + + /* + * We might overlap with 0 size extents; we can't skip these + * because if they're in the set we're inserting to we have to + * adjust them so they don't overlap with the key we're + * inserting. But we don't want to check them for replace + * operations. + */ + + if (replace_key && KEY_SIZE(k)) { + /* + * k might have been split since we inserted/found the + * key we're replacing + */ + unsigned i; + uint64_t offset = KEY_START(k) - + KEY_START(replace_key); + + /* But it must be a subset of the replace key */ + if (KEY_START(k) < KEY_START(replace_key) || + KEY_OFFSET(k) > KEY_OFFSET(replace_key)) + goto check_failed; + + /* We didn't find a key that we were supposed to */ + if (KEY_START(k) > KEY_START(insert) + sectors_found) + goto check_failed; + + if (!bch_bkey_equal_header(k, replace_key)) + goto check_failed; + + /* skip past gen */ + offset <<= 8; + + BUG_ON(!KEY_PTRS(replace_key)); + + for (i = 0; i < KEY_PTRS(replace_key); i++) + if (k->ptr[i] != replace_key->ptr[i] + offset) + goto check_failed; + + sectors_found = KEY_OFFSET(k) - KEY_START(insert); + } + + if (bkey_cmp(insert, k) < 0 && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { + /* + * We overlapped in the middle of an existing key: that + * means we have to split the old key. But we have to do + * slightly different things depending on whether the + * old key has been written out yet. 
+ */ + + struct bkey *top; + + subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); + + if (bkey_written(b, k)) { + /* + * We insert a new key to cover the top of the + * old key, and the old key is modified in place + * to represent the bottom split. + * + * It's completely arbitrary whether the new key + * is the top or the bottom, but it has to match + * up with what btree_sort_fixup() does - it + * doesn't check for this kind of overlap, it + * depends on us inserting a new key for the top + * here. + */ + top = bch_bset_search(b, bset_tree_last(b), + insert); + bch_bset_insert(b, top, k); + } else { + BKEY_PADDED(key) temp; + bkey_copy(&temp.key, k); + bch_bset_insert(b, k, &temp.key); + top = bkey_next(k); + } + + bch_cut_front(insert, top); + bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + goto out; + } + + if (bkey_cmp(insert, k) < 0) { + bch_cut_front(insert, k); + } else { + if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) + old_offset = KEY_START(insert); + + if (bkey_written(b, k) && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { + /* + * Completely overwrote, so we don't have to + * invalidate the binary search tree + */ + bch_cut_front(k, k); + } else { + __bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + } + } + + subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); + } + +check_failed: + if (replace_key) { + if (!sectors_found) { + return true; + } else if (sectors_found < KEY_SIZE(insert)) { + SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - + (KEY_SIZE(insert) - sectors_found)); + SET_KEY_SIZE(insert, sectors_found); + } + } +out: + if (KEY_DIRTY(insert)) + bcache_dev_sectors_dirty_add(c, KEY_INODE(insert), + KEY_START(insert), + KEY_SIZE(insert)); + + return false; +} + +static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + char buf[80]; + + if (!KEY_SIZE(k)) + return true; + + if (KEY_SIZE(k) > KEY_OFFSET(k)) + goto bad; + + if (__ptr_invalid(b->c, k)) + goto bad; + + return false; +bad: + bch_extent_to_text(buf, sizeof(buf), k); + cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k)); + return true; +} + +static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, + unsigned ptr) +{ + struct bucket *g = PTR_BUCKET(b->c, k, ptr); + char buf[80]; + + if (mutex_trylock(&b->c->bucket_lock)) { + if (b->c->gc_mark_valid && + ((GC_MARK(g) != GC_MARK_DIRTY && + KEY_DIRTY(k)) || + GC_MARK(g) == GC_MARK_METADATA)) + goto err; + + if (g->prio == BTREE_PRIO) + goto err; + + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_extent_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", + buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + return true; +} + +static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + struct bucket *g; + unsigned i, stale; + + if (!KEY_PTRS(k) || + bch_extent_invalid(bk, k)) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) + if (!ptr_available(b->c, k, i)) + return true; + + if (!expensive_debug_checks(b->c) && KEY_DIRTY(k)) + return false; + + for (i = 0; i < KEY_PTRS(k); i++) { + g = PTR_BUCKET(b->c, k, i); + stale = ptr_stale(b->c, k, i); + + btree_bug_on(stale > 96, b, + "key too 
stale: %i, need_gc %u", + stale, b->c->need_gc); + + btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), + b, "stale dirty pointer"); + + if (stale) + return true; + + if (expensive_debug_checks(b->c) && + bch_extent_bad_expensive(b, k, i)) + return true; + } + + return false; +} + +static uint64_t merge_chksums(struct bkey *l, struct bkey *r) +{ + return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & + ~((uint64_t)1 << 63); +} + +static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r) +{ + struct btree *b = container_of(bk, struct btree, keys); + unsigned i; + + if (key_merging_disabled(b->c)) + return false; + + for (i = 0; i < KEY_PTRS(l); i++) + if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || + PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) + return false; + + /* Keys with no pointers aren't restricted to one bucket and could + * overflow KEY_SIZE + */ + if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { + SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); + SET_KEY_SIZE(l, USHRT_MAX); + + bch_cut_front(l, r); + return false; + } + + if (KEY_CSUM(l)) { + if (KEY_CSUM(r)) + l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); + else + SET_KEY_CSUM(l, 0); + } + + SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); + SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); + + return true; +} + +const struct btree_keys_ops bch_extent_keys_ops = { + .sort_cmp = bch_extent_sort_cmp, + .sort_fixup = bch_extent_sort_fixup, + .insert_fixup = bch_extent_insert_fixup, + .key_invalid = bch_extent_invalid, + .key_bad = bch_extent_bad, + .key_merge = bch_extent_merge, + .key_to_text = bch_extent_to_text, + .key_dump = bch_bkey_dump, + .is_extents = true, +}; diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h new file mode 100644 index 0000000..e4e2340 --- /dev/null +++ b/drivers/md/bcache/extents.h @@ -0,0 +1,13 @@ +#ifndef _BCACHE_EXTENTS_H +#define _BCACHE_EXTENTS_H + +extern const struct btree_keys_ops bch_btree_keys_ops; +extern const struct btree_keys_ops bch_extent_keys_ops; + +struct bkey; +struct cache_set; + +void bch_extent_to_text(char *, size_t, const struct bkey *); +bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); + +#endif /* _BCACHE_EXTENTS_H */ diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 7eafdf0..18039af 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -44,11 +44,11 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, closure_init_stack(&cl); - pr_debug("reading %llu", (uint64_t) bucket); + pr_debug("reading %u", bucket_index); while (offset < ca->sb.bucket_size) { reread: left = ca->sb.bucket_size - offset; - len = min_t(unsigned, left, PAGE_SECTORS * 8); + len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS); bio_reset(bio); bio->bi_iter.bi_sector = bucket + offset; @@ -74,19 +74,28 @@ reread: left = ca->sb.bucket_size - offset; struct list_head *where; size_t blocks, bytes = set_bytes(j); - if (j->magic != jset_magic(&ca->sb)) + if (j->magic != jset_magic(&ca->sb)) { + pr_debug("%u: bad magic", bucket_index); return ret; + } - if (bytes > left << 9) + if (bytes > left << 9 || + bytes > PAGE_SIZE << JSET_BITS) { + pr_info("%u: too big, %zu bytes, offset %u", + bucket_index, bytes, offset); return ret; + } if (bytes > len << 9) goto reread; - if (j->csum != csum_set(j)) + if (j->csum != csum_set(j)) { + pr_info("%u: bad csum, %zu bytes, offset %u", + bucket_index, bytes, offset); return ret; + } - blocks = set_blocks(j, ca->set); + 
blocks = set_blocks(j, block_bytes(ca->set)); while (!list_empty(list)) { i = list_first_entry(list, @@ -275,7 +284,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) } for (k = i->j.start; - k < end(&i->j); + k < bset_bkey_last(&i->j); k = bkey_next(k)) { unsigned j; @@ -313,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) n, i->j.seq - 1, start, end); for (k = i->j.start; - k < end(&i->j); + k < bset_bkey_last(&i->j); k = bkey_next(k)) { trace_bcache_journal_replay_key(k); @@ -555,6 +564,14 @@ static void journal_write_done(struct closure *cl) continue_at_nobarrier(cl, journal_write, system_wq); } +static void journal_write_unlock(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, journal.io); + + c->journal.io_in_flight = 0; + spin_unlock(&c->journal.lock); +} + static void journal_write_unlocked(struct closure *cl) __releases(c->journal.lock) { @@ -562,22 +579,15 @@ static void journal_write_unlocked(struct closure *cl) struct cache *ca; struct journal_write *w = c->journal.cur; struct bkey *k = &c->journal.key; - unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; + unsigned i, sectors = set_blocks(w->data, block_bytes(c)) * + c->sb.block_size; struct bio *bio; struct bio_list list; bio_list_init(&list); if (!w->need_write) { - /* - * XXX: have to unlock closure before we unlock journal lock, - * else we race with bch_journal(). But this way we race - * against cache set unregister. Doh. - */ - set_closure_fn(cl, NULL, NULL); - closure_sub(cl, CLOSURE_RUNNING + 1); - spin_unlock(&c->journal.lock); - return; + closure_return_with_destructor(cl, journal_write_unlock); } else if (journal_full(&c->journal)) { journal_reclaim(c); spin_unlock(&c->journal.lock); @@ -586,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl) continue_at(cl, journal_write, system_wq); } - c->journal.blocks_free -= set_blocks(w->data, c); + c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); w->data->btree_level = c->root->level; @@ -653,10 +663,12 @@ static void journal_try_write(struct cache_set *c) w->need_write = true; - if (closure_trylock(cl, &c->cl)) - journal_write_unlocked(cl); - else + if (!c->journal.io_in_flight) { + c->journal.io_in_flight = 1; + closure_call(cl, journal_write_unlocked, NULL, &c->cl); + } else { spin_unlock(&c->journal.lock); + } } static struct journal_write *journal_wait_for_write(struct cache_set *c, @@ -664,6 +676,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, { size_t sectors; struct closure cl; + bool wait = false; closure_init_stack(&cl); @@ -673,16 +686,19 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, struct journal_write *w = c->journal.cur; sectors = __set_blocks(w->data, w->data->keys + nkeys, - c) * c->sb.block_size; + block_bytes(c)) * c->sb.block_size; if (sectors <= min_t(size_t, c->journal.blocks_free * c->sb.block_size, PAGE_SECTORS << JSET_BITS)) return w; - /* XXX: tracepoint */ + if (wait) + closure_wait(&c->journal.wait, &cl); + if (!journal_full(&c->journal)) { - trace_bcache_journal_entry_full(c); + if (wait) + trace_bcache_journal_entry_full(c); /* * XXX: If we were inserting so many keys that they @@ -692,12 +708,11 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, */ BUG_ON(!w->data->keys); - closure_wait(&w->wait, &cl); journal_try_write(c); /* unlocks */ } else { - trace_bcache_journal_full(c); + if (wait) + trace_bcache_journal_full(c); - 
closure_wait(&c->journal.wait, &cl); journal_reclaim(c); spin_unlock(&c->journal.lock); @@ -706,6 +721,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, closure_sync(&cl); spin_lock(&c->journal.lock); + wait = true; } } @@ -736,7 +752,7 @@ atomic_t *bch_journal(struct cache_set *c, w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); - memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); + memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys)); w->data->keys += bch_keylist_nkeys(keys); ret = &fifo_back(&c->journal.pin); @@ -780,7 +796,6 @@ int bch_journal_alloc(struct cache_set *c) { struct journal *j = &c->journal; - closure_init_unlocked(&j->io); spin_lock_init(&j->lock); INIT_DELAYED_WORK(&j->work, journal_write_work); diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index a6472fd..9180c44 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -104,6 +104,7 @@ struct journal { /* used when waiting because the journal was full */ struct closure_waitlist wait; struct closure io; + int io_in_flight; struct delayed_work work; /* Number of blocks free in the bucket(s) we're currently writing to */ diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 052bd24..9eb60d1 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -211,7 +211,7 @@ void bch_moving_gc(struct cache_set *c) for_each_cache(ca, c, i) { unsigned sectors_to_move = 0; unsigned reserve_sectors = ca->sb.bucket_size * - min(fifo_used(&ca->free), ca->free.size / 2); + fifo_used(&ca->free[RESERVE_MOVINGGC]); ca->heap.used = 0; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index c906571..72cd213 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -254,6 +254,24 @@ static void bch_data_insert_keys(struct closure *cl) closure_return(cl); } +static int bch_keylist_realloc(struct keylist *l, unsigned u64s, + struct cache_set *c) +{ + size_t oldsize = bch_keylist_nkeys(l); + size_t newsize = oldsize + u64s; + + /* + * The journalling code doesn't handle the case where the keys to insert + * is bigger than an empty write: If we just return -ENOMEM here, + * bio_insert() and bio_invalidate() will insert the keys created so far + * and finish the rest when the keylist is empty. + */ + if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) + return -ENOMEM; + + return __bch_keylist_realloc(l, u64s); +} + static void bch_data_invalidate(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); @@ -266,7 +284,7 @@ static void bch_data_invalidate(struct closure *cl) unsigned sectors = min(bio_sectors(bio), 1U << (KEY_SIZE_BITS - 1)); - if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) + if (bch_keylist_realloc(&op->insert_keys, 2, op->c)) goto out; bio->bi_iter.bi_sector += sectors; @@ -356,7 +374,7 @@ static void bch_data_insert_start(struct closure *cl) /* 1 for the device pointer and 1 for the chksum */ if (bch_keylist_realloc(&op->insert_keys, - 1 + (op->csum ? 1 : 0), + 3 + (op->csum ? 
1 : 0), op->c)) continue_at(cl, bch_data_insert_keys, bcache_wq); @@ -596,14 +614,12 @@ struct search { /* Stack frame for bio_complete */ struct closure cl; - struct bcache_device *d; - struct bbio bio; struct bio *orig_bio; struct bio *cache_miss; + struct bcache_device *d; unsigned insert_bio_sectors; - unsigned recoverable:1; unsigned write:1; unsigned read_dirty_data:1; @@ -629,7 +645,8 @@ static void bch_cache_read_endio(struct bio *bio, int error) if (error) s->iop.error = error; - else if (ptr_stale(s->iop.c, &b->key, 0)) { + else if (!KEY_DIRTY(&b->key) && + ptr_stale(s->iop.c, &b->key, 0)) { atomic_long_inc(&s->iop.c->cache_read_races); s->iop.error = -EINTR; } @@ -710,10 +727,13 @@ static void cache_lookup(struct closure *cl) { struct search *s = container_of(cl, struct search, iop.cl); struct bio *bio = &s->bio.bio; + int ret; + + bch_btree_op_init(&s->op, -1); - int ret = bch_btree_map_keys(&s->op, s->iop.c, - &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), - cache_lookup_fn, MAP_END_KEY); + ret = bch_btree_map_keys(&s->op, s->iop.c, + &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), + cache_lookup_fn, MAP_END_KEY); if (ret == -EAGAIN) continue_at(cl, cache_lookup, bcache_wq); @@ -754,12 +774,12 @@ static void bio_complete(struct search *s) } } -static void do_bio_hook(struct search *s) +static void do_bio_hook(struct search *s, struct bio *orig_bio) { struct bio *bio = &s->bio.bio; bio_init(bio); - __bio_clone_fast(bio, s->orig_bio); + __bio_clone_fast(bio, orig_bio); bio->bi_end_io = request_endio; bio->bi_private = &s->cl; @@ -778,26 +798,32 @@ static void search_free(struct closure *cl) mempool_free(s, s->d->c->search); } -static struct search *search_alloc(struct bio *bio, struct bcache_device *d) +static inline struct search *search_alloc(struct bio *bio, + struct bcache_device *d) { struct search *s; s = mempool_alloc(d->c->search, GFP_NOIO); - memset(s, 0, offsetof(struct search, iop.insert_keys)); - __closure_init(&s->cl, NULL); + closure_init(&s->cl, NULL); + do_bio_hook(s, bio); - s->iop.inode = d->id; - s->iop.c = d->c; - s->d = d; - s->op.lock = -1; - s->iop.write_point = hash_long((unsigned long) current, 16); s->orig_bio = bio; - s->write = (bio->bi_rw & REQ_WRITE) != 0; - s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; + s->cache_miss = NULL; + s->d = d; s->recoverable = 1; + s->write = (bio->bi_rw & REQ_WRITE) != 0; + s->read_dirty_data = 0; s->start_time = jiffies; - do_bio_hook(s); + + s->iop.c = d->c; + s->iop.bio = NULL; + s->iop.inode = d->id; + s->iop.write_point = hash_long((unsigned long) current, 16); + s->iop.write_prio = 0; + s->iop.error = 0; + s->iop.flags = 0; + s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; return s; } @@ -843,7 +869,7 @@ static void cached_dev_read_error(struct closure *cl) trace_bcache_read_retry(s->orig_bio); s->iop.error = 0; - do_bio_hook(s); + do_bio_hook(s, s->orig_bio); /* XXX: invalidate cache */ diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 2cd65bf..39f21db 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -13,17 +13,22 @@ struct data_insert_op { uint16_t write_prio; short error; - unsigned bypass:1; - unsigned writeback:1; - unsigned flush_journal:1; - unsigned csum:1; + union { + uint16_t flags; - unsigned replace:1; - unsigned replace_collision:1; + struct { + unsigned bypass:1; + unsigned writeback:1; + unsigned flush_journal:1; + unsigned csum:1; - unsigned insert_data_done:1; + unsigned replace:1; + unsigned 
replace_collision:1; + + unsigned insert_data_done:1; + }; + }; - /* Anything past this point won't get zeroed in search_alloc() */ struct keylist insert_keys; BKEY_PADDED(replace_key); }; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 93d593f..24a3a15 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -9,6 +9,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "extents.h" #include "request.h" #include "writeback.h" @@ -225,7 +226,7 @@ static void write_bdev_super_endio(struct bio *bio, int error) struct cached_dev *dc = bio->bi_private; /* XXX: error checking */ - closure_put(&dc->sb_write.cl); + closure_put(&dc->sb_write); } static void __write_super(struct cache_sb *sb, struct bio *bio) @@ -263,12 +264,20 @@ static void __write_super(struct cache_sb *sb, struct bio *bio) submit_bio(REQ_WRITE, bio); } +static void bch_write_bdev_super_unlock(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write); + + up(&dc->sb_write_mutex); +} + void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) { - struct closure *cl = &dc->sb_write.cl; + struct closure *cl = &dc->sb_write; struct bio *bio = &dc->sb_bio; - closure_lock(&dc->sb_write, parent); + down(&dc->sb_write_mutex); + closure_init(cl, parent); bio_reset(bio); bio->bi_bdev = dc->bdev; @@ -278,7 +287,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) closure_get(cl); __write_super(&dc->sb, bio); - closure_return(cl); + closure_return_with_destructor(cl, bch_write_bdev_super_unlock); } static void write_super_endio(struct bio *bio, int error) @@ -286,16 +295,24 @@ static void write_super_endio(struct bio *bio, int error) struct cache *ca = bio->bi_private; bch_count_io_errors(ca, error, "writing superblock"); - closure_put(&ca->set->sb_write.cl); + closure_put(&ca->set->sb_write); +} + +static void bcache_write_super_unlock(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, sb_write); + + up(&c->sb_write_mutex); } void bcache_write_super(struct cache_set *c) { - struct closure *cl = &c->sb_write.cl; + struct closure *cl = &c->sb_write; struct cache *ca; unsigned i; - closure_lock(&c->sb_write, &c->cl); + down(&c->sb_write_mutex); + closure_init(cl, &c->cl); c->sb.seq++; @@ -317,7 +334,7 @@ void bcache_write_super(struct cache_set *c) __write_super(&ca->sb, bio); } - closure_return(cl); + closure_return_with_destructor(cl, bcache_write_super_unlock); } /* UUID io */ @@ -325,23 +342,31 @@ void bcache_write_super(struct cache_set *c) static void uuid_endio(struct bio *bio, int error) { struct closure *cl = bio->bi_private; - struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); + struct cache_set *c = container_of(cl, struct cache_set, uuid_write); cache_set_err_on(error, c, "accessing uuids"); bch_bbio_free(bio, c); closure_put(cl); } +static void uuid_io_unlock(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, uuid_write); + + up(&c->uuid_write_mutex); +} + static void uuid_io(struct cache_set *c, unsigned long rw, struct bkey *k, struct closure *parent) { - struct closure *cl = &c->uuid_write.cl; + struct closure *cl = &c->uuid_write; struct uuid_entry *u; unsigned i; char buf[80]; BUG_ON(!parent); - closure_lock(&c->uuid_write, parent); + down(&c->uuid_write_mutex); + closure_init(cl, parent); for (i = 0; i < KEY_PTRS(k); i++) { struct bio *bio = bch_bbio_alloc(c); @@ -359,7 +384,7 @@ static void uuid_io(struct 
cache_set *c, unsigned long rw, break; } - bch_bkey_to_text(buf, sizeof(buf), k); + bch_extent_to_text(buf, sizeof(buf), k); pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) @@ -368,14 +393,14 @@ static void uuid_io(struct cache_set *c, unsigned long rw, u - c->uuids, u->uuid, u->label, u->first_reg, u->last_reg, u->invalidated); - closure_return(cl); + closure_return_with_destructor(cl, uuid_io_unlock); } static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) { struct bkey *k = &j->uuid_bucket; - if (bch_btree_ptr_invalid(c, k)) + if (__bch_btree_ptr_invalid(c, k)) return "bad uuid pointer"; bkey_copy(&c->uuid_bucket, k); @@ -420,7 +445,7 @@ static int __uuid_write(struct cache_set *c) lockdep_assert_held(&bch_register_lock); - if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) + if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) return 1; SET_KEY_SIZE(&k.key, c->sb.bucket_size); @@ -538,8 +563,8 @@ void bch_prio_write(struct cache *ca) atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), &ca->meta_sectors_written); - pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), - fifo_used(&ca->free_inc), fifo_used(&ca->unused)); + //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), + // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); for (i = prio_buckets(ca) - 1; i >= 0; --i) { long bucket; @@ -558,7 +583,7 @@ void bch_prio_write(struct cache *ca) p->magic = pset_magic(&ca->sb); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); - bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); + bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); @@ -1098,7 +1123,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); INIT_WORK(&dc->detach, cached_dev_detach_finish); - closure_init_unlocked(&dc->sb_write); + sema_init(&dc->sb_write_mutex, 1); INIT_LIST_HEAD(&dc->io_lru); spin_lock_init(&dc->io_lock); bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); @@ -1110,6 +1135,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); } + dc->disk.stripe_size = q->limits.io_opt >> 9; + + if (dc->disk.stripe_size) + dc->partial_stripes_expensive = + q->limits.raid_partial_stripes_expensive; + ret = bcache_device_init(&dc->disk, block_size, dc->bdev->bd_part->nr_sects - dc->sb.data_offset); if (ret) @@ -1321,8 +1352,8 @@ static void cache_set_free(struct closure *cl) if (ca) kobject_put(&ca->kobj); + bch_bset_sort_state_free(&c->sort); free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); - free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); if (c->bio_split) bioset_free(c->bio_split); @@ -1447,21 +1478,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->block_bits = ilog2(sb->block_size); c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); - c->btree_pages = c->sb.bucket_size / PAGE_SECTORS; + c->btree_pages = bucket_pages(c); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); - c->sort_crit_factor = int_sqrt(c->btree_pages); - - closure_init_unlocked(&c->sb_write); + sema_init(&c->sb_write_mutex, 1); mutex_init(&c->bucket_lock); init_waitqueue_head(&c->try_wait); init_waitqueue_head(&c->bucket_wait); 
- closure_init_unlocked(&c->uuid_write); - mutex_init(&c->sort_lock); + sema_init(&c->uuid_write_mutex, 1); - spin_lock_init(&c->sort_time.lock); spin_lock_init(&c->btree_gc_time.lock); spin_lock_init(&c->btree_split_time.lock); spin_lock_init(&c->btree_read_time.lock); @@ -1489,11 +1516,11 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) bucket_pages(c))) || !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || - !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || bch_journal_alloc(c) || bch_btree_cache_alloc(c) || - bch_open_buckets_alloc(c)) + bch_open_buckets_alloc(c) || + bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) goto err; c->congested_read_threshold_us = 2000; @@ -1549,7 +1576,7 @@ static void run_cache_set(struct cache_set *c) k = &j->btree_root; err = "bad btree root"; - if (bch_btree_ptr_invalid(c, k)) + if (__bch_btree_ptr_invalid(c, k)) goto err; err = "error reading btree root"; @@ -1743,6 +1770,7 @@ err: void bch_cache_release(struct kobject *kobj) { struct cache *ca = container_of(kobj, struct cache, kobj); + unsigned i; if (ca->set) ca->set->cache[ca->sb.nr_this_dev] = NULL; @@ -1756,7 +1784,9 @@ void bch_cache_release(struct kobject *kobj) free_heap(&ca->heap); free_fifo(&ca->unused); free_fifo(&ca->free_inc); - free_fifo(&ca->free); + + for (i = 0; i < RESERVE_NR; i++) + free_fifo(&ca->free[i]); if (ca->sb_bio.bi_inline_vecs[0].bv_page) put_page(ca->sb_bio.bi_io_vec[0].bv_page); @@ -1782,10 +1812,12 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) ca->journal.bio.bi_max_vecs = 8; ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; - free = roundup_pow_of_two(ca->sb.nbuckets) >> 9; - free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2); + free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; - if (!init_fifo(&ca->free, free, GFP_KERNEL) || + if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || !init_heap(&ca->heap, free << 3, GFP_KERNEL) || @@ -2030,7 +2062,8 @@ static void bcache_exit(void) kobject_put(bcache_kobj); if (bcache_wq) destroy_workqueue(bcache_wq); - unregister_blkdev(bcache_major, "bcache"); + if (bcache_major) + unregister_blkdev(bcache_major, "bcache"); unregister_reboot_notifier(&reboot); } diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index a1f8561..c6ab693 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -102,7 +102,6 @@ rw_attribute(bypass_torture_test); rw_attribute(key_merging_disabled); rw_attribute(gc_always_rewrite); rw_attribute(expensive_debug_checks); -rw_attribute(freelist_percent); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); @@ -401,6 +400,48 @@ static struct attribute *bch_flash_dev_files[] = { }; KTYPE(bch_flash_dev); +struct bset_stats_op { + struct btree_op op; + size_t nodes; + struct bset_stats stats; +}; + +static int btree_bset_stats(struct btree_op *b_op, struct btree *b) +{ + struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); + + op->nodes++; + bch_btree_keys_stats(&b->keys, &op->stats); + + return MAP_CONTINUE; +} + +int 
bch_bset_print_stats(struct cache_set *c, char *buf) +{ + struct bset_stats_op op; + int ret; + + memset(&op, 0, sizeof(op)); + bch_btree_op_init(&op.op, -1); + + ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats); + if (ret < 0) + return ret; + + return snprintf(buf, PAGE_SIZE, + "btree nodes: %zu\n" + "written sets: %zu\n" + "unwritten sets: %zu\n" + "written key bytes: %zu\n" + "unwritten key bytes: %zu\n" + "floats: %zu\n" + "failed: %zu\n", + op.nodes, + op.stats.sets_written, op.stats.sets_unwritten, + op.stats.bytes_written, op.stats.bytes_unwritten, + op.stats.floats, op.stats.failed); +} + SHOW(__bch_cache_set) { unsigned root_usage(struct cache_set *c) @@ -419,7 +460,7 @@ lock_root: rw_lock(false, b, b->level); } while (b != c->root); - for_each_key_filter(b, k, &iter, bch_ptr_bad) + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) bytes += bkey_bytes(k); rw_unlock(false, b); @@ -434,7 +475,7 @@ lock_root: mutex_lock(&c->bucket_lock); list_for_each_entry(b, &c->btree_cache, list) - ret += 1 << (b->page_order + PAGE_SHIFT); + ret += 1 << (b->keys.page_order + PAGE_SHIFT); mutex_unlock(&c->bucket_lock); return ret; @@ -491,7 +532,7 @@ lock_root: sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); - sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); + sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); @@ -711,9 +752,6 @@ SHOW(__bch_cache) sysfs_print(io_errors, atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); - sysfs_print(freelist_percent, ca->free.size * 100 / - ((size_t) ca->sb.nbuckets)); - if (attr == &sysfs_cache_replacement_policy) return bch_snprint_string_list(buf, PAGE_SIZE, cache_replacement_policies, @@ -820,32 +858,6 @@ STORE(__bch_cache) } } - if (attr == &sysfs_freelist_percent) { - DECLARE_FIFO(long, free); - long i; - size_t p = strtoul_or_return(buf); - - p = clamp_t(size_t, - ((size_t) ca->sb.nbuckets * p) / 100, - roundup_pow_of_two(ca->sb.nbuckets) >> 9, - ca->sb.nbuckets / 2); - - if (!init_fifo_exact(&free, p, GFP_KERNEL)) - return -ENOMEM; - - mutex_lock(&ca->set->bucket_lock); - - fifo_move(&free, &ca->free); - fifo_swap(&free, &ca->free); - - mutex_unlock(&ca->set->bucket_lock); - - while (fifo_pop(&free, i)) - atomic_dec(&ca->buckets[i].pin); - - free_fifo(&free); - } - if (attr == &sysfs_clear_stats) { atomic_long_set(&ca->sectors_written, 0); atomic_long_set(&ca->btree_sectors_written, 0); @@ -869,7 +881,6 @@ static struct attribute *bch_cache_files[] = { &sysfs_metadata_written, &sysfs_io_errors, &sysfs_clear_stats, - &sysfs_freelist_percent, &sysfs_cache_replacement_policy, NULL }; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 1030c60..ac7d0d1 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -2,6 +2,7 @@ #ifndef _BCACHE_UTIL_H #define _BCACHE_UTIL_H +#include <linux/blkdev.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/llist.h> @@ -17,11 +18,13 @@ struct closure; #ifdef CONFIG_BCACHE_DEBUG +#define EBUG_ON(cond) BUG_ON(cond) #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) #else /* DEBUG */ +#define EBUG_ON(cond) do { if (cond); } while (0) #define atomic_dec_bug(v) atomic_dec(v) #define atomic_inc_bug(v, i) atomic_inc(v) @@ -391,6 +394,11 @@ struct time_stats { void 
bch_time_stats_update(struct time_stats *stats, uint64_t time); +static inline unsigned local_clock_us(void) +{ + return local_clock() >> 10; +} + #define NSEC_PER_ns 1L #define NSEC_PER_us NSEC_PER_USEC #define NSEC_PER_ms NSEC_PER_MSEC diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 67ca9c3..f1feade 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6103,6 +6103,7 @@ static int run(struct mddev *mddev) blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks - conf->max_degraded)); + mddev->queue->limits.raid_partial_stripes_expensive = 1; /* * We can only discard a whole stripe. It doesn't make sense to * discard data disk but write parity disk |
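Taken together, the closure.h and super.c hunks above show the central cleanup in this series: the closure-as-lock machinery (closure_lock(), closure_trylock(), closure_init_unlocked(), the closure type tags) is removed, and the metadata-write paths that relied on it now pair an ordinary semaphore with closure_init() and closure_return_with_destructor(). The sketch below is condensed from the bch_write_bdev_super() hunk above, not a standalone build unit; the struct cached_dev field types (struct closure sb_write, struct semaphore sb_write_mutex) are inferred from the container_of() and sema_init() calls in those hunks, and the bio setup is elided.

/* Pattern adopted by the super.c hunks above: a plain semaphore
 * serializes writers, and the closure's destructor releases it once
 * every outstanding I/O reference has been dropped. */

static void bch_write_bdev_super_unlock(struct closure *cl)
{
        struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);

        up(&dc->sb_write_mutex);
}

void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
{
        struct closure *cl = &dc->sb_write;     /* plain struct closure now */
        struct bio *bio = &dc->sb_bio;

        down(&dc->sb_write_mutex);              /* was: closure_lock(&dc->sb_write, parent) */
        closure_init(cl, parent);

        /* ... bio_reset()/target setup elided; the endio handler ends
         * with closure_put(&dc->sb_write) ... */

        closure_get(cl);                        /* ref dropped by write_bdev_super_endio() */
        __write_super(&dc->sb, bio);

        /* Expands to a return; when the endio's put drops the last ref,
         * the destructor runs, releasing sb_write_mutex and then the
         * ref taken on @parent. */
        closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}

The same three-step shape (take the semaphore, closure_init(), finish with closure_return_with_destructor() whose destructor does the up()) is what bcache_write_super() and uuid_io() switch to in the hunks above.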
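The journal.c and journal.h hunks apply the same conversion to a trickier case, where the closure previously doubled as a try-lock: journal_try_write() now serializes writers with an io_in_flight flag protected by journal.lock and starts the write with closure_call(), while the nothing-to-write branch of journal_write_unlocked() replaces the old open-coded unlock (set_closure_fn(NULL) plus closure_sub()) with closure_return_with_destructor(). A condensed sketch of that control flow, with the jset bio construction and the full-journal flush details elided; the local cl = &c->journal.io comes from unchanged context around the hunks and is an inference, not shown above.

static void journal_write_unlock(struct closure *cl)
{
        struct cache_set *c = container_of(cl, struct cache_set, journal.io);

        c->journal.io_in_flight = 0;
        spin_unlock(&c->journal.lock);
}

static void journal_write_unlocked(struct closure *cl)
        __releases(c->journal.lock)
{
        struct cache_set *c = container_of(cl, struct cache_set, journal.io);
        struct journal_write *w = c->journal.cur;

        if (!w->need_write) {
                /* Expands to a return; the destructor clears io_in_flight
                 * and drops journal.lock only after every ref on the
                 * closure is gone, closing the old race with bch_journal(). */
                closure_return_with_destructor(cl, journal_write_unlock);
        } else if (journal_full(&c->journal)) {
                journal_reclaim(c);
                spin_unlock(&c->journal.lock);
                /* ... write out a btree node to free journal space ... */
                continue_at(cl, journal_write, system_wq);
        }

        /* ... otherwise build and submit the jset bios ... */
}

static void journal_try_write(struct cache_set *c)
{
        struct closure *cl = &c->journal.io;    /* inferred from context */
        struct journal_write *w = c->journal.cur;

        w->need_write = true;

        if (!c->journal.io_in_flight) {
                c->journal.io_in_flight = 1;
                /* was: closure_trylock(cl, &c->cl) / journal_write_unlocked(cl) */
                closure_call(cl, journal_write_unlocked, NULL, &c->cl);
        } else {
                spin_unlock(&c->journal.lock);
        }
}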