From cc9d3c382bc1674884c2e5e468d51230a9503dee Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Fri, 13 Sep 2013 14:54:30 +0900 Subject: dm mpath: do not fail path on -ENOSPC Since ENOSPC is a target-side error, dm-mpath should just pass the error information to upper layer instead of retrying itself with path failover. Otherwise it will end up failing all paths down while path checkers find all paths ok. ENOSPC can now be returned from SCSI device after commit a9d6ceb8 ("[SCSI] return ENOSPC on thin provisioning failure"). Signed-off-by: Jun'ichi Nomura Acked-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index b759a12..075747e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1268,6 +1268,7 @@ static int noretry_error(int error) case -EREMOTEIO: case -EILSEQ: case -ENODATA: + case -ENOSPC: return 1; } -- cgit v1.1 From bbf3f8cbdc139860a14c4fc2bb25432427045aaa Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 13 Sep 2013 17:42:24 -0400 Subject: dm stats: fix possible counter corruption on 32-bit systems There was a deliberate race condition in dm_stat_for_entry() to avoid the overhead of disabling and enabling interrupts. The race could result in some events not being counted on 64-bit architectures. However, on 32-bit architectures, operations on long long variables are not atomic, so the race condition could cause the counter to jump by 2^32. Such jumps could be disruptive, so we need to do proper locking on 32-bit architectures. Signed-off-by: Mikulas Patocka Cc: Alasdair G. Kergon Signed-off-by: Mike Snitzer --- drivers/md/dm-stats.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 8ae31e8..3d404c1 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -451,19 +451,26 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry, struct dm_stat_percpu *p; /* - * For strict correctness we should use local_irq_disable/enable + * For strict correctness we should use local_irq_save/restore * instead of preempt_disable/enable. * - * This is racy if the driver finishes bios from non-interrupt - * context as well as from interrupt context or from more different - * interrupts. + * preempt_disable/enable is racy if the driver finishes bios + * from non-interrupt context as well as from interrupt context + * or from more different interrupts. * - * However, the race only results in not counting some events, - * so it is acceptable. + * On 64-bit architectures the race only results in not counting some + * events, so it is acceptable. On 32-bit architectures the race could + * cause the counter going off by 2^32, so we need to do proper locking + * there. * * part_stat_lock()/part_stat_unlock() have this race too. */ +#if BITS_PER_LONG == 32 + unsigned long flags; + local_irq_save(flags); +#else preempt_disable(); +#endif p = &s->stat_percpu[smp_processor_id()][entry]; if (!end) { @@ -478,7 +485,11 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry, p->ticks[idx] += duration; } +#if BITS_PER_LONG == 32 + local_irq_restore(flags); +#else preempt_enable(); +#endif } static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw, -- cgit v1.1 From 5ea330a75bd86b2b2a01d7b85c516983238306fb Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 18 Sep 2013 19:14:22 -0400 Subject: dm snapshot: workaround for a false positive lockdep warning The kernel reports a lockdep warning if a snapshot is invalidated because it runs out of space. The lockdep warning was triggered by commit 0976dfc1d0cd80a4e9dfaf87bd87 ("workqueue: Catch more locking problems with flush_work()") in v3.5. The warning is false positive. The real cause for the warning is that the lockdep engine treats different instances of md->lock as a single lock. This patch is a workaround - we use flush_workqueue instead of flush_work. This code path is not performance sensitive (it is called only on initialization or invalidation), thus it doesn't matter that we flush the whole workqueue. The real fix for the problem would be to teach the lockdep engine to treat different instances of md->lock as separate locks. Signed-off-by: Mikulas Patocka Acked-by: Alasdair G Kergon Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.5+ --- drivers/md/dm-snap-persistent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 3ac4156..4caa8e6 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, */ INIT_WORK_ONSTACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); - flush_work(&req.work); + flush_workqueue(ps->metadata_wq); return req.result; } -- cgit v1.1 From 60e356f381954d79088d0455e357db48cfdd6857 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 18 Sep 2013 19:40:42 -0400 Subject: dm-snapshot: fix performance degradation due to small hash size LVM2, since version 2.02.96, creates origin with zero size, then loads the snapshot driver and then loads the origin. Consequently, the snapshot driver sees the origin size zero and sets the hash size to the lower bound 64. Such small hash table causes performance degradation. This patch changes it so that the hash size is determined by the size of snapshot volume, not minimum of origin and snapshot size. It doesn't make sense to set the snapshot size significantly larger than the origin size, so we do not need to take origin size into account when calculating the hash size. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-snap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c434e5a..aec57d7 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -725,17 +725,16 @@ static int calc_max_buckets(void) */ static int init_hash_tables(struct dm_snapshot *s) { - sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; + sector_t hash_size, cow_dev_size, max_buckets; /* * Calculate based on the size of the original volume or * the COW volume... */ cow_dev_size = get_dev_size(s->cow->bdev); - origin_dev_size = get_dev_size(s->origin->bdev); max_buckets = calc_max_buckets(); - hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; + hash_size = cow_dev_size >> s->store->chunk_shift; hash_size = min(hash_size, max_buckets); if (hash_size < 64) -- cgit v1.1 From f84cb8a46a771f36a04a02c61ea635c968ed5f6a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 19 Sep 2013 12:13:58 -0400 Subject: dm mpath: disable WRITE SAME if it fails Workaround the SCSI layer's problematic WRITE SAME heuristics by disabling WRITE SAME in the DM multipath device's queue_limits if an underlying device disabled it. The WRITE SAME heuristics, with both the original commit 5db44863b6eb ("[SCSI] sd: Implement support for WRITE SAME") and the updated commit 66c28f971 ("[SCSI] sd: Update WRITE SAME heuristics"), default to enabling WRITE SAME(10) even without successfully determining it is supported. After the first failed WRITE SAME the SCSI layer will disable WRITE SAME for the device (by setting sdkp->device->no_write_same which results in 'max_write_same_sectors' in device's queue_limits to be set to 0). When a device is stacked ontop of such a SCSI device any changes to that SCSI device's queue_limits do not automatically propagate up the stack. As such, a DM multipath device will not have its WRITE SAME support disabled. This causes the block layer to continue to issue WRITE SAME requests to the mpath device which causes paths to fail and (if mpath IO isn't configured to queue when no paths are available) it will result in actual IO errors to the upper layers. This fix doesn't help configurations that have additional devices stacked ontop of the mpath device (e.g. LVM created linear DM devices ontop). A proper fix that restacks all the queue_limits from the bottom of the device stack up will need to be explored if SCSI will continue to use this model of optimistically allowing op codes and then disabling them after they fail for the first time. Before this patch: EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: (null) device-mapper: multipath: XXX snitm debugging: got -EREMOTEIO (-121) device-mapper: multipath: XXX snitm debugging: failing WRITE SAME IO with error=-121 end_request: critical target error, dev dm-6, sector 528 dm-6: WRITE SAME failed. Manually zeroing. device-mapper: multipath: Failing path 8:112. end_request: I/O error, dev dm-6, sector 4616 dm-6: WRITE SAME failed. Manually zeroing. end_request: I/O error, dev dm-6, sector 4616 end_request: I/O error, dev dm-6, sector 5640 end_request: I/O error, dev dm-6, sector 6664 end_request: I/O error, dev dm-6, sector 7688 end_request: I/O error, dev dm-6, sector 524288 Buffer I/O error on device dm-6, logical block 65536 lost page write due to I/O error on dm-6 JBD2: Error -5 detected when updating journal superblock for dm-6-8. end_request: I/O error, dev dm-6, sector 524296 Aborting journal on device dm-6-8. end_request: I/O error, dev dm-6, sector 524288 Buffer I/O error on device dm-6, logical block 65536 lost page write due to I/O error on dm-6 JBD2: Error -5 detected when updating journal superblock for dm-6-8. # cat /sys/block/sdh/queue/write_same_max_bytes 0 # cat /sys/block/dm-6/queue/write_same_max_bytes 33553920 After this patch: EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: (null) device-mapper: multipath: XXX snitm debugging: got -EREMOTEIO (-121) device-mapper: multipath: XXX snitm debugging: WRITE SAME I/O failed with error=-121 end_request: critical target error, dev dm-6, sector 528 dm-6: WRITE SAME failed. Manually zeroing. # cat /sys/block/sdh/queue/write_same_max_bytes 0 # cat /sys/block/dm-6/queue/write_same_max_bytes 0 It should be noted that WRITE SAME support wasn't enabled in DM multipath until v3.10. Signed-off-by: Mike Snitzer Cc: Martin K. Petersen Cc: Hannes Reinecke Cc: stable@vger.kernel.org # 3.10+ --- drivers/md/dm-mpath.c | 11 ++++++++++- drivers/md/dm.c | 11 +++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 075747e..e08be94 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1299,8 +1299,17 @@ static int do_end_io(struct multipath *m, struct request *clone, if (!error && !clone->errors) return 0; /* I/O complete */ - if (noretry_error(error)) + if (noretry_error(error)) { + if ((clone->cmd_flags & REQ_WRITE_SAME) && + !clone->q->limits.max_write_same_sectors) { + struct queue_limits *limits; + + /* device doesn't really support WRITE SAME, disable it */ + limits = dm_get_queue_limits(dm_table_get_md(m->ti->table)); + limits->max_write_same_sectors = 0; + } return error; + } if (mpio->pgpath) fail_path(mpio->pgpath); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6a5e9ed..7b0ccdc 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2278,6 +2278,17 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md) } /* + * The queue_limits are only valid as long as you have a reference + * count on 'md'. + */ +struct queue_limits *dm_get_queue_limits(struct mapped_device *md) +{ + BUG_ON(!atomic_read(&md->holders)); + return &md->queue->limits; +} +EXPORT_SYMBOL_GPL(dm_get_queue_limits); + +/* * Fully initialize a request-based queue (->elevator, ->request_fn, etc). */ static int dm_init_request_based_queue(struct mapped_device *md) -- cgit v1.1 From b60ab990ccdf34b0159bf5ff52f4acee7c940d78 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 19 Sep 2013 18:49:11 -0400 Subject: dm thin: do not expose non-zero discard limits if discards disabled Fix issue where the block layer would stack the discard limits of the pool's data device even if the "ignore_discard" pool feature was specified. The pool and thin device(s) still had discards disabled because the QUEUE_FLAG_DISCARD request_queue flag wasn't set. But to avoid user confusion when "ignore_discard" is used: both the pool device and the thin device(s) have zeroes for all discard limits. Also, always set discard_zeroes_data_unsupported in targets because they should never advertise the 'discard_zeroes_data' capability (even if the pool's data device supports it). Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-thin.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index ed06342..2c0cf51 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2095,6 +2095,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) * them down to the data device. The thin device's discard * processing will cause mappings to be removed from the btree. */ + ti->discard_zeroes_data_unsupported = true; if (pf.discard_enabled && pf.discard_passdown) { ti->num_discard_bios = 1; @@ -2104,7 +2105,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) * thin devices' discard limits consistent). */ ti->discards_supported = true; - ti->discard_zeroes_data_unsupported = true; } ti->private = pt; @@ -2689,8 +2689,16 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) * They get transferred to the live pool in bind_control_target() * called from pool_preresume(). */ - if (!pt->adjusted_pf.discard_enabled) + if (!pt->adjusted_pf.discard_enabled) { + /* + * Must explicitly disallow stacking discard limits otherwise the + * block layer will stack them if pool's data device has support. + * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the + * user to see that, so make sure to set all discard limits to 0. + */ + limits->discard_granularity = 0; return; + } disable_passdown_if_not_supported(pt); @@ -2826,10 +2834,10 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook); /* In case the pool supports discards, pass them on. */ + ti->discard_zeroes_data_unsupported = true; if (tc->pool->pf.discard_enabled) { ti->discards_supported = true; ti->num_discard_bios = 1; - ti->discard_zeroes_data_unsupported = true; /* Discard bios must be split on a block boundary */ ti->split_discard_bios = true; } -- cgit v1.1 From 6cfa58573f9b4a543005baa002e137820504c7af Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Sep 2013 18:06:11 -0400 Subject: dm: lower bio-based mempool reservation Bio-based device mapper processing doesn't need larger mempools (like request-based DM does), so lower the number of reserved entries for bio-based operation. 16 was already used for bio-based DM's bioset but mistakenly wasn't used for it's _io_cache. Formalize difference between bio-based and request-based defaults by introducing RESERVED_BIO_BASED_IOS and RESERVED_REQUEST_BASED_IOS. (based on older code from Mikulas Patocka) Signed-off-by: Mike Snitzer Signed-off-by: Frank Mayhar Acked-by: Mikulas Patocka --- drivers/md/dm.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7b0ccdc..f2e5a50 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -211,7 +211,8 @@ struct dm_md_mempools { struct bio_set *bs; }; -#define MIN_IOS 256 +#define RESERVED_BIO_BASED_IOS 16 +#define RESERVED_REQUEST_BASED_IOS 256 static struct kmem_cache *_io_cache; static struct kmem_cache *_rq_tio_cache; @@ -2873,18 +2874,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u if (type == DM_TYPE_BIO_BASED) { cachep = _io_cache; - pool_size = 16; + pool_size = RESERVED_BIO_BASED_IOS; front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); } else if (type == DM_TYPE_REQUEST_BASED) { cachep = _rq_tio_cache; - pool_size = MIN_IOS; + pool_size = RESERVED_REQUEST_BASED_IOS; front_pad = offsetof(struct dm_rq_clone_bio_info, clone); /* per_bio_data_size is not used. See __bind_mempools(). */ WARN_ON(per_bio_data_size != 0); } else goto out; - pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep); + pools->io_pool = mempool_create_slab_pool(pool_size, cachep); if (!pools->io_pool) goto out; -- cgit v1.1 From f47908269fb53d522a956b78612a0037f5faf8e7 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Sep 2013 18:06:12 -0400 Subject: dm: add reserved_rq_based_ios module parameter Allow user to change the number of IOs that are reserved by request-based DM's mempools by writing to this file: /sys/module/dm_mod/parameters/reserved_rq_based_ios The default value is RESERVED_REQUEST_BASED_IOS (256). The maximum allowed value is RESERVED_MAX_IOS (1024). Export dm_get_reserved_rq_based_ios() for use by DM targets and core code. Switch to sizing dm-mpath's mempool using DM core's configurable 'reserved_rq_based_ios'. Signed-off-by: Mike Snitzer Signed-off-by: Frank Mayhar Acked-by: Mikulas Patocka --- drivers/md/dm-mpath.c | 6 +++--- drivers/md/dm.c | 38 +++++++++++++++++++++++++++++++++++++- drivers/md/dm.h | 2 ++ 3 files changed, 42 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index e08be94..de570a5 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -7,6 +7,7 @@ #include +#include "dm.h" #include "dm-path-selector.h" #include "dm-uevent.h" @@ -116,8 +117,6 @@ struct dm_mpath_io { typedef int (*action_fn) (struct pgpath *pgpath); -#define MIN_IOS 256 /* Mempool size */ - static struct kmem_cache *_mpio_cache; static struct workqueue_struct *kmultipathd, *kmpath_handlerd; @@ -190,6 +189,7 @@ static void free_priority_group(struct priority_group *pg, static struct multipath *alloc_multipath(struct dm_target *ti) { struct multipath *m; + unsigned min_ios = dm_get_reserved_rq_based_ios(); m = kzalloc(sizeof(*m), GFP_KERNEL); if (m) { @@ -202,7 +202,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) INIT_WORK(&m->trigger_event, trigger_event); init_waitqueue_head(&m->pg_init_wait); mutex_init(&m->work_mutex); - m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); + m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); if (!m->mpio_pool) { kfree(m); return NULL; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f2e5a50..1e85f1d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -213,9 +213,41 @@ struct dm_md_mempools { #define RESERVED_BIO_BASED_IOS 16 #define RESERVED_REQUEST_BASED_IOS 256 +#define RESERVED_MAX_IOS 1024 static struct kmem_cache *_io_cache; static struct kmem_cache *_rq_tio_cache; +/* + * Request-based DM's mempools' reserved IOs set by the user. + */ +static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; + +static unsigned __dm_get_reserved_ios(unsigned *reserved_ios, + unsigned def, unsigned max) +{ + unsigned ios = ACCESS_ONCE(*reserved_ios); + unsigned modified_ios = 0; + + if (!ios) + modified_ios = def; + else if (ios > max) + modified_ios = max; + + if (modified_ios) { + (void)cmpxchg(reserved_ios, ios, modified_ios); + ios = modified_ios; + } + + return ios; +} + +unsigned dm_get_reserved_rq_based_ios(void) +{ + return __dm_get_reserved_ios(&reserved_rq_based_ios, + RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS); +} +EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); + static int __init local_init(void) { int r = -ENOMEM; @@ -2878,7 +2910,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); } else if (type == DM_TYPE_REQUEST_BASED) { cachep = _rq_tio_cache; - pool_size = RESERVED_REQUEST_BASED_IOS; + pool_size = dm_get_reserved_rq_based_ios(); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); /* per_bio_data_size is not used. See __bind_mempools(). */ WARN_ON(per_bio_data_size != 0); @@ -2936,6 +2968,10 @@ module_exit(dm_exit); module_param(major, uint, 0); MODULE_PARM_DESC(major, "The major number of the device mapper"); + +module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); + MODULE_DESCRIPTION(DM_NAME " driver"); MODULE_AUTHOR("Joe Thornber "); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 5e604cc..1539650 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -184,6 +184,8 @@ void dm_free_md_mempools(struct dm_md_mempools *pools); /* * Helpers that are used by DM core */ +unsigned dm_get_reserved_rq_based_ios(void); + static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen) { return !maxlen || strlen(result) + 1 >= maxlen; -- cgit v1.1 From e8603136cb04ec2d0c9b4b5be7a071fc003cb399 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Sep 2013 18:06:12 -0400 Subject: dm: add reserved_bio_based_ios module parameter Allow user to change the number of IOs that are reserved by bio-based DM's mempools by writing to this file: /sys/module/dm_mod/parameters/reserved_bio_based_ios The default value is RESERVED_BIO_BASED_IOS (16). The maximum allowed value is RESERVED_MAX_IOS (1024). Export dm_get_reserved_bio_based_ios() for use by DM targets and core code. Switch to sizing dm-io's mempool and bioset using DM core's configurable 'reserved_bio_based_ios'. Signed-off-by: Mike Snitzer Signed-off-by: Frank Mayhar --- drivers/md/dm-io.c | 7 +++---- drivers/md/dm.c | 17 ++++++++++++++++- drivers/md/dm.h | 1 + 3 files changed, 20 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index ea49834..2a20986 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -19,8 +19,6 @@ #define DM_MSG_PREFIX "io" #define DM_IO_MAX_REGIONS BITS_PER_LONG -#define MIN_IOS 16 -#define MIN_BIOS 16 struct dm_io_client { mempool_t *pool; @@ -50,16 +48,17 @@ static struct kmem_cache *_dm_io_cache; struct dm_io_client *dm_io_client_create(void) { struct dm_io_client *client; + unsigned min_ios = dm_get_reserved_bio_based_ios(); client = kmalloc(sizeof(*client), GFP_KERNEL); if (!client) return ERR_PTR(-ENOMEM); - client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache); + client->pool = mempool_create_slab_pool(min_ios, _dm_io_cache); if (!client->pool) goto bad; - client->bios = bioset_create(MIN_BIOS, 0); + client->bios = bioset_create(min_ios, 0); if (!client->bios) goto bad; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 1e85f1d..b3e26c7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -218,6 +218,11 @@ static struct kmem_cache *_io_cache; static struct kmem_cache *_rq_tio_cache; /* + * Bio-based DM's mempools' reserved IOs set by the user. + */ +static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; + +/* * Request-based DM's mempools' reserved IOs set by the user. */ static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; @@ -241,6 +246,13 @@ static unsigned __dm_get_reserved_ios(unsigned *reserved_ios, return ios; } +unsigned dm_get_reserved_bio_based_ios(void) +{ + return __dm_get_reserved_ios(&reserved_bio_based_ios, + RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS); +} +EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); + unsigned dm_get_reserved_rq_based_ios(void) { return __dm_get_reserved_ios(&reserved_rq_based_ios, @@ -2906,7 +2918,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u if (type == DM_TYPE_BIO_BASED) { cachep = _io_cache; - pool_size = RESERVED_BIO_BASED_IOS; + pool_size = dm_get_reserved_bio_based_ios(); front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); } else if (type == DM_TYPE_REQUEST_BASED) { cachep = _rq_tio_cache; @@ -2969,6 +2981,9 @@ module_exit(dm_exit); module_param(major, uint, 0); MODULE_PARM_DESC(major, "The major number of the device mapper"); +module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); + module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 1539650..1d1ad7b 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -184,6 +184,7 @@ void dm_free_md_mempools(struct dm_md_mempools *pools); /* * Helpers that are used by DM core */ +unsigned dm_get_reserved_bio_based_ios(void); unsigned dm_get_reserved_rq_based_ios(void); static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen) -- cgit v1.1 From 6d9d21e35fbfa2934339e96934f862d118abac23 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2013 23:17:27 -0700 Subject: bcache: Fix a dumb journal discard bug That switch statement was obviously wrong, leading to some sort of weird spinning on rare occasion with discards enabled... Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 Signed-off-by: Linus Torvalds --- drivers/md/bcache/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index ba95ab8..c0017ca 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -428,7 +428,7 @@ static void do_journal_discard(struct cache *ca) return; } - switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) { + switch (atomic_read(&ja->discard_in_flight)) { case DISCARD_IN_FLIGHT: return; -- cgit v1.1 From aee6f1cfff3ce240eb4b43b41ca466b907acbd2e Mon Sep 17 00:00:00 2001 From: Gabriel de Perthuis Date: Mon, 23 Sep 2013 23:17:28 -0700 Subject: bcache: Strip endline when writing the label through sysfs sysfs attributes with unusual characters have crappy failure modes in Squeeze (udev 164); later versions of udev are unaffected. This should make these characters more unusual. Signed-off-by: Gabriel de Perthuis Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 Signed-off-by: Linus Torvalds --- drivers/md/bcache/sysfs.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4fe6ab2..924dcfd 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -223,8 +223,13 @@ STORE(__cached_dev) } if (attr == &sysfs_label) { - /* note: endlines are preserved */ - memcpy(dc->sb.label, buf, SB_LABEL_SIZE); + if (size > SB_LABEL_SIZE) + return -EINVAL; + memcpy(dc->sb.label, buf, size); + if (size < SB_LABEL_SIZE) + dc->sb.label[size] = '\0'; + if (size && dc->sb.label[size - 1] == '\n') + dc->sb.label[size - 1] = '\0'; bch_write_bdev_super(dc, NULL); if (dc->disk.c) { memcpy(dc->disk.c->uuids[dc->disk.id].label, -- cgit v1.1 From c426c4fd46f709ade2bddd51c5738729c7ae1db5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2013 23:17:29 -0700 Subject: bcache: Fix for when no journal entries are found The journal replay code didn't handle this case, causing it to go into an infinite loop... Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 Signed-off-by: Linus Torvalds --- drivers/md/bcache/journal.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index c0017ca..f5203df 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -153,7 +153,8 @@ int bch_journal_read(struct cache_set *c, struct list_head *list, bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); pr_debug("%u journal buckets", ca->sb.njournal_buckets); - /* Read journal buckets ordered by golden ratio hash to quickly + /* + * Read journal buckets ordered by golden ratio hash to quickly * find a sequence of buckets with valid journal entries */ for (i = 0; i < ca->sb.njournal_buckets; i++) { @@ -166,18 +167,20 @@ int bch_journal_read(struct cache_set *c, struct list_head *list, goto bsearch; } - /* If that fails, check all the buckets we haven't checked + /* + * If that fails, check all the buckets we haven't checked * already */ pr_debug("falling back to linear search"); - for (l = 0; l < ca->sb.njournal_buckets; l++) { - if (test_bit(l, bitmap)) - continue; - + for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); + l < ca->sb.njournal_buckets; + l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1)) if (read_bucket(l)) goto bsearch; - } + + if (list_empty(list)) + continue; bsearch: /* Binary search */ m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); @@ -197,10 +200,12 @@ bsearch: r = m; } - /* Read buckets in reverse order until we stop finding more + /* + * Read buckets in reverse order until we stop finding more * journal entries */ - pr_debug("finishing up"); + pr_debug("finishing up: m %u njournal_buckets %u", + m, ca->sb.njournal_buckets); l = m; while (1) { @@ -228,9 +233,10 @@ bsearch: } } - c->journal.seq = list_entry(list->prev, - struct journal_replay, - list)->j.seq; + if (!list_empty(list)) + c->journal.seq = list_entry(list->prev, + struct journal_replay, + list)->j.seq; return 0; #undef read_bucket -- cgit v1.1 From 61cbd250f867f98bb4738000afc6002d6f2b14bd Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 23 Sep 2013 23:17:30 -0700 Subject: bcache: Correct printf()-style format length modifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix drivers/md/bcache/btree.c: In function ‘bch_btree_node_read’: drivers/md/bcache/btree.c:259: warning: format ‘%lu’ expects type ‘long unsigned int’, but argument 3 has type ‘size_t’ Signed-off-by: Geert Uytterhoeven Signed-off-by: Kent Overstreet Signed-off-by: Linus Torvalds --- drivers/md/bcache/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index f9764e6..8fad840 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -255,7 +255,7 @@ void bch_btree_node_read(struct btree *b) return; err: - bch_cache_set_error(b->c, "io error reading bucket %lu", + bch_cache_set_error(b->c, "io error reading bucket %zu", PTR_BUCKET_NR(b->c, &b->key, 0)); } -- cgit v1.1 From c2a4f3183a1248f615a695fbd8905da55ad11bba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2013 23:17:31 -0700 Subject: bcache: Fix a writeback performance regression Background writeback works by scanning the btree for dirty data and adding those keys into a fixed size buffer, then for each dirty key in the keybuf writing it to the backing device. When read_dirty() finishes and it's time to scan for more dirty data, we need to wait for the outstanding writeback IO to finish - they still take up slots in the keybuf (so that foreground writes can check for them to avoid races) - without that wait, we'll continually rescan when we'll be able to add at most a key or two to the keybuf, and that takes locks that starves foreground IO. Doh. Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 Signed-off-by: Linus Torvalds --- drivers/md/bcache/bcache.h | 7 +++---- drivers/md/bcache/util.c | 11 ++++++++++- drivers/md/bcache/util.h | 12 +++++++++--- drivers/md/bcache/writeback.c | 43 +++++++++++++++++++++---------------------- 4 files changed, 43 insertions(+), 30 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index b39f6f0..0f12382 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -498,7 +498,7 @@ struct cached_dev { */ atomic_t has_dirty; - struct ratelimit writeback_rate; + struct bch_ratelimit writeback_rate; struct delayed_work writeback_rate_update; /* @@ -507,10 +507,9 @@ struct cached_dev { */ sector_t last_read; - /* Number of writeback bios in flight */ - atomic_t in_flight; + /* Limit number of writeback bios in flight */ + struct semaphore in_flight; struct closure_with_timer writeback; - struct closure_waitlist writeback_wait; struct keybuf writeback_keys; diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 98eb811..420dad5 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) stats->last = now ?: 1; } -unsigned bch_next_delay(struct ratelimit *d, uint64_t done) +/** + * bch_next_delay() - increment @d by the amount of work done, and return how + * long to delay until the next time to do some work. + * + * @d - the struct bch_ratelimit to update + * @done - the amount of work done, in arbitrary units + * + * Returns the amount of time to delay by, in jiffies + */ +uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) { uint64_t now = local_clock(); diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 1ae2a73..ea345c6 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -450,17 +450,23 @@ read_attribute(name ## _last_ ## frequency_units) (ewma) >> factor; \ }) -struct ratelimit { +struct bch_ratelimit { + /* Next time we want to do some work, in nanoseconds */ uint64_t next; + + /* + * Rate at which we want to do work, in units per nanosecond + * The units here correspond to the units passed to bch_next_delay() + */ unsigned rate; }; -static inline void ratelimit_reset(struct ratelimit *d) +static inline void bch_ratelimit_reset(struct bch_ratelimit *d) { d->next = local_clock(); } -unsigned bch_next_delay(struct ratelimit *d, uint64_t done); +uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done); #define __DIV_SAFE(n, d, zero) \ ({ \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 22cbff5..27ac519 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -94,11 +94,15 @@ static void update_writeback_rate(struct work_struct *work) static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) { + uint64_t ret; + if (atomic_read(&dc->disk.detaching) || !dc->writeback_percent) return 0; - return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); + ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); + + return min_t(uint64_t, ret, HZ); } /* Background writeback */ @@ -208,7 +212,7 @@ normal_refill: up_write(&dc->writeback_lock); - ratelimit_reset(&dc->writeback_rate); + bch_ratelimit_reset(&dc->writeback_rate); /* Punt to workqueue only so we don't recurse and blow the stack */ continue_at(cl, read_dirty, dirty_wq); @@ -318,9 +322,7 @@ static void write_dirty_finish(struct closure *cl) } bch_keybuf_del(&dc->writeback_keys, w); - atomic_dec_bug(&dc->in_flight); - - closure_wake_up(&dc->writeback_wait); + up(&dc->in_flight); closure_return_with_destructor(cl, dirty_io_destructor); } @@ -349,7 +351,7 @@ static void write_dirty(struct closure *cl) closure_bio_submit(&io->bio, cl, &io->dc->disk); - continue_at(cl, write_dirty_finish, dirty_wq); + continue_at(cl, write_dirty_finish, system_wq); } static void read_dirty_endio(struct bio *bio, int error) @@ -369,7 +371,7 @@ static void read_dirty_submit(struct closure *cl) closure_bio_submit(&io->bio, cl, &io->dc->disk); - continue_at(cl, write_dirty, dirty_wq); + continue_at(cl, write_dirty, system_wq); } static void read_dirty(struct closure *cl) @@ -394,12 +396,9 @@ static void read_dirty(struct closure *cl) if (delay > 0 && (KEY_START(&w->key) != dc->last_read || - jiffies_to_msecs(delay) > 50)) { - w->private = NULL; - - closure_delay(&dc->writeback, delay); - continue_at(cl, read_dirty, dirty_wq); - } + jiffies_to_msecs(delay) > 50)) + while (delay) + delay = schedule_timeout(delay); dc->last_read = KEY_OFFSET(&w->key); @@ -424,15 +423,10 @@ static void read_dirty(struct closure *cl) trace_bcache_writeback(&w->key); - closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); + down(&dc->in_flight); + closure_call(&io->cl, read_dirty_submit, NULL, cl); delay = writeback_delay(dc, KEY_SIZE(&w->key)); - - atomic_inc(&dc->in_flight); - - if (!closure_wait_event(&dc->writeback_wait, cl, - atomic_read(&dc->in_flight) < 64)) - continue_at(cl, read_dirty, dirty_wq); } if (0) { @@ -442,7 +436,11 @@ err: bch_keybuf_del(&dc->writeback_keys, w); } - refill_dirty(cl); + /* + * Wait for outstanding writeback IOs to finish (and keybuf slots to be + * freed) before refilling again + */ + continue_at(cl, refill_dirty, dirty_wq); } /* Init */ @@ -484,6 +482,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc) void bch_cached_dev_writeback_init(struct cached_dev *dc) { + sema_init(&dc->in_flight, 64); closure_init_unlocked(&dc->writeback); init_rwsem(&dc->writeback_lock); @@ -513,7 +512,7 @@ void bch_writeback_exit(void) int __init bch_writeback_init(void) { - dirty_wq = create_singlethread_workqueue("bcache_writeback"); + dirty_wq = create_workqueue("bcache_writeback"); if (!dirty_wq) return -ENOMEM; -- cgit v1.1 From 1394d6761b6e9e15ee7c632a6d48791188727b40 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2013 23:17:32 -0700 Subject: bcache: Fix a flush/fua performance bug bch_journal_meta() was missing the flush to make the journal write actually go down (instead of waiting up to journal_delay_ms)... Whoops Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 Signed-off-by: Linus Torvalds --- drivers/md/bcache/journal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index f5203df..8435f81 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -695,6 +695,7 @@ void bch_journal_meta(struct cache_set *c, struct closure *cl) if (cl) BUG_ON(!closure_wait(&w->wait, cl)); + closure_flush(&c->journal.io); __journal_try_write(c, true); } } -- cgit v1.1 From 79e3dab90d9f826ceca67c7890e048ac9169de49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2013 23:17:33 -0700 Subject: bcache: Fix a dumb CPU spinning bug in writeback schedule_timeout() != schedule_timeout_uninterruptible() Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 Signed-off-by: Linus Torvalds --- drivers/md/bcache/writeback.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 27ac519..ba3ee48 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -397,8 +397,7 @@ static void read_dirty(struct closure *cl) if (delay > 0 && (KEY_START(&w->key) != dc->last_read || jiffies_to_msecs(delay) > 50)) - while (delay) - delay = schedule_timeout(delay); + delay = schedule_timeout_uninterruptible(delay); dc->last_read = KEY_OFFSET(&w->key); -- cgit v1.1 From a698e08c82dfb9771e0bac12c7337c706d729b6d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2013 23:17:34 -0700 Subject: bcache: Fix a shrinker deadlock GFP_NOIO means we could be getting called recursively - mca_alloc() -> mca_data_alloc() - definitely can't use mutex_lock(bucket_lock) then. Whoops. Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 Signed-off-by: Linus Torvalds --- drivers/md/bcache/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 8fad840..f42fc7e 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -612,7 +612,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, return SHRINK_STOP; /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_WAIT) + if (sc->gfp_mask & __GFP_IO) mutex_lock(&c->bucket_lock); else if (!mutex_trylock(&c->bucket_lock)) return -1; -- cgit v1.1 From 84786438ed17978d72eeced580ab757e4da8830b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2013 23:17:35 -0700 Subject: bcache: Fix for handling overlapping extents when reading in a btree node btree_sort_fixup() was overly clever, because it was trying to avoid pulling a key off the btree iterator in more than one place. This led to a really obscure bug where we'd break early from the loop in btree_sort_fixup() if the current key overlapped with keys in more than one older set, and the next key it overlapped with was zero size. Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 Signed-off-by: Linus Torvalds --- drivers/md/bcache/bset.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 8010eed..22d1ae7 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -926,28 +926,45 @@ struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search) /* Mergesort */ +static void sort_key_next(struct btree_iter *iter, + struct btree_iter_set *i) +{ + i->k = bkey_next(i->k); + + if (i->k == i->end) + *i = iter->data[--iter->used]; +} + static void btree_sort_fixup(struct btree_iter *iter) { while (iter->used > 1) { struct btree_iter_set *top = iter->data, *i = top + 1; - struct bkey *k; if (iter->used > 2 && btree_iter_cmp(i[0], i[1])) i++; - for (k = i->k; - k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0; - k = bkey_next(k)) - if (top->k > i->k) - __bch_cut_front(top->k, k); - else if (KEY_SIZE(k)) - bch_cut_back(&START_KEY(k), top->k); - - if (top->k < i->k || k == i->k) + if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) break; - heap_sift(iter, i - top, btree_iter_cmp); + if (!KEY_SIZE(i->k)) { + sort_key_next(iter, i); + heap_sift(iter, i - top, btree_iter_cmp); + continue; + } + + if (top->k > i->k) { + if (bkey_cmp(top->k, i->k) >= 0) + sort_key_next(iter, i); + else + bch_cut_front(top->k, i->k); + + heap_sift(iter, i - top, btree_iter_cmp); + } else { + /* can't happen because of comparison func */ + BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); + bch_cut_back(&START_KEY(i->k), top->k); + } } } -- cgit v1.1 From c0f04d88e46d14de51f4baebb6efafb7d59e9f96 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2013 23:17:36 -0700 Subject: bcache: Fix flushes in writeback mode In writeback mode, when we get a cache flush we need to make sure we issue a flush to the backing device. The code for sending down an extra flush was wrong - by cloning the bio we were probably getting flags that didn't make sense for a bare flush, and also the old code was firing for FUA bios, for which we don't need to send a flush to the backing device. This was causing data corruption somehow - the mechanism was never determined, but this patch fixes it for the users that were seeing it. Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 Signed-off-by: Linus Torvalds --- drivers/md/bcache/request.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 786a1a4..71eb233 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -997,14 +997,17 @@ static void request_write(struct cached_dev *dc, struct search *s) } else { bch_writeback_add(dc); - if (s->op.flush_journal) { + if (bio->bi_rw & REQ_FLUSH) { /* Also need to send a flush to the backing device */ - s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, - dc->disk.bio_split); + struct bio *flush = bio_alloc_bioset(0, GFP_NOIO, + dc->disk.bio_split); - bio->bi_size = 0; - bio->bi_vcnt = 0; - closure_bio_submit(bio, cl, s->d); + flush->bi_rw = WRITE_FLUSH; + flush->bi_bdev = bio->bi_bdev; + flush->bi_end_io = request_endio; + flush->bi_private = cl; + + closure_bio_submit(flush, cl, s->d); } else { s->op.cache_bio = bio; } -- cgit v1.1