summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig4
-rw-r--r--drivers/md/dm-crypt.c392
-rw-r--r--drivers/md/dm-io.c17
-rw-r--r--drivers/md/dm-raid1.c9
-rw-r--r--drivers/md/dm-snap.c124
-rw-r--r--drivers/md/dm-thin.c11
-rw-r--r--drivers/md/dm.c48
-rw-r--r--drivers/md/md.c17
-rw-r--r--drivers/md/persistent-data/Kconfig2
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.c4
-rw-r--r--drivers/md/raid0.c2
-rw-r--r--drivers/md/raid1.c5
-rw-r--r--drivers/md/raid5.c13
13 files changed, 419 insertions, 229 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index c396444..63e05e3 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -178,7 +178,7 @@ config MD_FAULTY
source "drivers/md/bcache/Kconfig"
config BLK_DEV_DM_BUILTIN
- boolean
+ bool
config BLK_DEV_DM
tristate "Device mapper support"
@@ -197,7 +197,7 @@ config BLK_DEV_DM
If unsure, say N.
config DM_DEBUG
- boolean "Device mapper debugging support"
+ bool "Device mapper debugging support"
depends on BLK_DEV_DM
---help---
Enable this for messages that may help debug device-mapper problems.
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 08981be..713a962 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,9 +18,11 @@
#include <linux/slab.h>
#include <linux/crypto.h>
#include <linux/workqueue.h>
+#include <linux/kthread.h>
#include <linux/backing-dev.h>
#include <linux/atomic.h>
#include <linux/scatterlist.h>
+#include <linux/rbtree.h>
#include <asm/page.h>
#include <asm/unaligned.h>
#include <crypto/hash.h>
@@ -58,7 +60,8 @@ struct dm_crypt_io {
atomic_t io_pending;
int error;
sector_t sector;
- struct dm_crypt_io *base_io;
+
+ struct rb_node rb_node;
} CRYPTO_MINALIGN_ATTR;
struct dm_crypt_request {
@@ -108,7 +111,8 @@ struct iv_tcw_private {
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
*/
-enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
+enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
+ DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
/*
* The fields in here must be read only after initialization.
@@ -121,14 +125,18 @@ struct crypt_config {
* pool for per bio private data, crypto requests and
* encryption requeusts/buffer pages
*/
- mempool_t *io_pool;
mempool_t *req_pool;
mempool_t *page_pool;
struct bio_set *bs;
+ struct mutex bio_alloc_lock;
struct workqueue_struct *io_queue;
struct workqueue_struct *crypt_queue;
+ struct task_struct *write_thread;
+ wait_queue_head_t write_thread_wait;
+ struct rb_root write_tree;
+
char *cipher;
char *cipher_string;
@@ -172,9 +180,6 @@ struct crypt_config {
};
#define MIN_IOS 16
-#define MIN_POOL_PAGES 32
-
-static struct kmem_cache *_crypt_io_pool;
static void clone_init(struct dm_crypt_io *, struct bio *);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
@@ -946,57 +951,70 @@ static int crypt_convert(struct crypt_config *cc,
return 0;
}
+static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone);
+
/*
* Generate a new unfragmented bio with the given size
* This should never violate the device limitations
- * May return a smaller bio when running out of pages, indicated by
- * *out_of_pages set to 1.
+ *
+ * This function may be called concurrently. If we allocate from the mempool
+ * concurrently, there is a possibility of deadlock. For example, if we have
+ * mempool of 256 pages, two processes, each wanting 256, pages allocate from
+ * the mempool concurrently, it may deadlock in a situation where both processes
+ * have allocated 128 pages and the mempool is exhausted.
+ *
+ * In order to avoid this scenario we allocate the pages under a mutex.
+ *
+ * In order to not degrade performance with excessive locking, we try
+ * non-blocking allocations without a mutex first but on failure we fallback
+ * to blocking allocations with a mutex.
*/
-static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
- unsigned *out_of_pages)
+static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
{
struct crypt_config *cc = io->cc;
struct bio *clone;
unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
- unsigned i, len;
+ gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
+ unsigned i, len, remaining_size;
struct page *page;
+ struct bio_vec *bvec;
+
+retry:
+ if (unlikely(gfp_mask & __GFP_WAIT))
+ mutex_lock(&cc->bio_alloc_lock);
clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
if (!clone)
- return NULL;
+ goto return_clone;
clone_init(io, clone);
- *out_of_pages = 0;
+
+ remaining_size = size;
for (i = 0; i < nr_iovecs; i++) {
page = mempool_alloc(cc->page_pool, gfp_mask);
if (!page) {
- *out_of_pages = 1;
- break;
+ crypt_free_buffer_pages(cc, clone);
+ bio_put(clone);
+ gfp_mask |= __GFP_WAIT;
+ goto retry;
}
- /*
- * If additional pages cannot be allocated without waiting,
- * return a partially-allocated bio. The caller will then try
- * to allocate more bios while submitting this partial bio.
- */
- gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
+ len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size;
- len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
+ bvec = &clone->bi_io_vec[clone->bi_vcnt++];
+ bvec->bv_page = page;
+ bvec->bv_len = len;
+ bvec->bv_offset = 0;
- if (!bio_add_page(clone, page, len, 0)) {
- mempool_free(page, cc->page_pool);
- break;
- }
+ clone->bi_iter.bi_size += len;
- size -= len;
+ remaining_size -= len;
}
- if (!clone->bi_iter.bi_size) {
- bio_put(clone);
- return NULL;
- }
+return_clone:
+ if (unlikely(gfp_mask & __GFP_WAIT))
+ mutex_unlock(&cc->bio_alloc_lock);
return clone;
}
@@ -1020,7 +1038,6 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
io->base_bio = bio;
io->sector = sector;
io->error = 0;
- io->base_io = NULL;
io->ctx.req = NULL;
atomic_set(&io->io_pending, 0);
}
@@ -1033,13 +1050,11 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
/*
* One of the bios was finished. Check for completion of
* the whole request and correctly clean up the buffer.
- * If base_io is set, wait for the last fragment to complete.
*/
static void crypt_dec_pending(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->cc;
struct bio *base_bio = io->base_bio;
- struct dm_crypt_io *base_io = io->base_io;
int error = io->error;
if (!atomic_dec_and_test(&io->io_pending))
@@ -1047,16 +1062,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
if (io->ctx.req)
crypt_free_req(cc, io->ctx.req, base_bio);
- if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
- mempool_free(io, cc->io_pool);
-
- if (likely(!base_io))
- bio_endio(base_bio, error);
- else {
- if (error && !base_io->error)
- base_io->error = error;
- crypt_dec_pending(base_io);
- }
+
+ bio_endio(base_bio, error);
}
/*
@@ -1138,37 +1145,97 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
return 0;
}
+static void kcryptd_io_read_work(struct work_struct *work)
+{
+ struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+
+ crypt_inc_pending(io);
+ if (kcryptd_io_read(io, GFP_NOIO))
+ io->error = -ENOMEM;
+ crypt_dec_pending(io);
+}
+
+static void kcryptd_queue_read(struct dm_crypt_io *io)
+{
+ struct crypt_config *cc = io->cc;
+
+ INIT_WORK(&io->work, kcryptd_io_read_work);
+ queue_work(cc->io_queue, &io->work);
+}
+
static void kcryptd_io_write(struct dm_crypt_io *io)
{
struct bio *clone = io->ctx.bio_out;
+
generic_make_request(clone);
}
-static void kcryptd_io(struct work_struct *work)
+#define crypt_io_from_node(node) rb_entry((node), struct dm_crypt_io, rb_node)
+
+static int dmcrypt_write(void *data)
{
- struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+ struct crypt_config *cc = data;
+ struct dm_crypt_io *io;
- if (bio_data_dir(io->base_bio) == READ) {
- crypt_inc_pending(io);
- if (kcryptd_io_read(io, GFP_NOIO))
- io->error = -ENOMEM;
- crypt_dec_pending(io);
- } else
- kcryptd_io_write(io);
-}
+ while (1) {
+ struct rb_root write_tree;
+ struct blk_plug plug;
-static void kcryptd_queue_io(struct dm_crypt_io *io)
-{
- struct crypt_config *cc = io->cc;
+ DECLARE_WAITQUEUE(wait, current);
- INIT_WORK(&io->work, kcryptd_io);
- queue_work(cc->io_queue, &io->work);
+ spin_lock_irq(&cc->write_thread_wait.lock);
+continue_locked:
+
+ if (!RB_EMPTY_ROOT(&cc->write_tree))
+ goto pop_from_list;
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ __add_wait_queue(&cc->write_thread_wait, &wait);
+
+ spin_unlock_irq(&cc->write_thread_wait.lock);
+
+ if (unlikely(kthread_should_stop())) {
+ set_task_state(current, TASK_RUNNING);
+ remove_wait_queue(&cc->write_thread_wait, &wait);
+ break;
+ }
+
+ schedule();
+
+ set_task_state(current, TASK_RUNNING);
+ spin_lock_irq(&cc->write_thread_wait.lock);
+ __remove_wait_queue(&cc->write_thread_wait, &wait);
+ goto continue_locked;
+
+pop_from_list:
+ write_tree = cc->write_tree;
+ cc->write_tree = RB_ROOT;
+ spin_unlock_irq(&cc->write_thread_wait.lock);
+
+ BUG_ON(rb_parent(write_tree.rb_node));
+
+ /*
+ * Note: we cannot walk the tree here with rb_next because
+ * the structures may be freed when kcryptd_io_write is called.
+ */
+ blk_start_plug(&plug);
+ do {
+ io = crypt_io_from_node(rb_first(&write_tree));
+ rb_erase(&io->rb_node, &write_tree);
+ kcryptd_io_write(io);
+ } while (!RB_EMPTY_ROOT(&write_tree));
+ blk_finish_plug(&plug);
+ }
+ return 0;
}
static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
{
struct bio *clone = io->ctx.bio_out;
struct crypt_config *cc = io->cc;
+ unsigned long flags;
+ sector_t sector;
+ struct rb_node **rbp, *parent;
if (unlikely(io->error < 0)) {
crypt_free_buffer_pages(cc, clone);
@@ -1182,20 +1249,34 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
clone->bi_iter.bi_sector = cc->start + io->sector;
- if (async)
- kcryptd_queue_io(io);
- else
+ if (likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) {
generic_make_request(clone);
+ return;
+ }
+
+ spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
+ rbp = &cc->write_tree.rb_node;
+ parent = NULL;
+ sector = io->sector;
+ while (*rbp) {
+ parent = *rbp;
+ if (sector < crypt_io_from_node(parent)->sector)
+ rbp = &(*rbp)->rb_left;
+ else
+ rbp = &(*rbp)->rb_right;
+ }
+ rb_link_node(&io->rb_node, parent, rbp);
+ rb_insert_color(&io->rb_node, &cc->write_tree);
+
+ wake_up_locked(&cc->write_thread_wait);
+ spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags);
}
static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->cc;
struct bio *clone;
- struct dm_crypt_io *new_io;
int crypt_finished;
- unsigned out_of_pages = 0;
- unsigned remaining = io->base_bio->bi_iter.bi_size;
sector_t sector = io->sector;
int r;
@@ -1205,80 +1286,30 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
crypt_inc_pending(io);
crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector);
- /*
- * The allocated buffers can be smaller than the whole bio,
- * so repeat the whole process until all the data can be handled.
- */
- while (remaining) {
- clone = crypt_alloc_buffer(io, remaining, &out_of_pages);
- if (unlikely(!clone)) {
- io->error = -ENOMEM;
- break;
- }
-
- io->ctx.bio_out = clone;
- io->ctx.iter_out = clone->bi_iter;
-
- remaining -= clone->bi_iter.bi_size;
- sector += bio_sectors(clone);
-
- crypt_inc_pending(io);
-
- r = crypt_convert(cc, &io->ctx);
- if (r < 0)
- io->error = -EIO;
-
- crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
-
- /* Encryption was already finished, submit io now */
- if (crypt_finished) {
- kcryptd_crypt_write_io_submit(io, 0);
-
- /*
- * If there was an error, do not try next fragments.
- * For async, error is processed in async handler.
- */
- if (unlikely(r < 0))
- break;
+ clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
+ if (unlikely(!clone)) {
+ io->error = -EIO;
+ goto dec;
+ }
- io->sector = sector;
- }
+ io->ctx.bio_out = clone;
+ io->ctx.iter_out = clone->bi_iter;
- /*
- * Out of memory -> run queues
- * But don't wait if split was due to the io size restriction
- */
- if (unlikely(out_of_pages))
- congestion_wait(BLK_RW_ASYNC, HZ/100);
+ sector += bio_sectors(clone);
- /*
- * With async crypto it is unsafe to share the crypto context
- * between fragments, so switch to a new dm_crypt_io structure.
- */
- if (unlikely(!crypt_finished && remaining)) {
- new_io = mempool_alloc(cc->io_pool, GFP_NOIO);
- crypt_io_init(new_io, io->cc, io->base_bio, sector);
- crypt_inc_pending(new_io);
- crypt_convert_init(cc, &new_io->ctx, NULL,
- io->base_bio, sector);
- new_io->ctx.iter_in = io->ctx.iter_in;
-
- /*
- * Fragments after the first use the base_io
- * pending count.
- */
- if (!io->base_io)
- new_io->base_io = io;
- else {
- new_io->base_io = io->base_io;
- crypt_inc_pending(io->base_io);
- crypt_dec_pending(io);
- }
+ crypt_inc_pending(io);
+ r = crypt_convert(cc, &io->ctx);
+ if (r)
+ io->error = -EIO;
+ crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
- io = new_io;
- }
+ /* Encryption was already finished, submit io now */
+ if (crypt_finished) {
+ kcryptd_crypt_write_io_submit(io, 0);
+ io->sector = sector;
}
+dec:
crypt_dec_pending(io);
}
@@ -1481,6 +1512,9 @@ static void crypt_dtr(struct dm_target *ti)
if (!cc)
return;
+ if (cc->write_thread)
+ kthread_stop(cc->write_thread);
+
if (cc->io_queue)
destroy_workqueue(cc->io_queue);
if (cc->crypt_queue)
@@ -1495,8 +1529,6 @@ static void crypt_dtr(struct dm_target *ti)
mempool_destroy(cc->page_pool);
if (cc->req_pool)
mempool_destroy(cc->req_pool);
- if (cc->io_pool)
- mempool_destroy(cc->io_pool);
if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
cc->iv_gen_ops->dtr(cc);
@@ -1688,7 +1720,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
char dummy;
static struct dm_arg _args[] = {
- {0, 1, "Invalid number of feature args"},
+ {0, 3, "Invalid number of feature args"},
};
if (argc < 5) {
@@ -1710,13 +1742,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (ret < 0)
goto bad;
- ret = -ENOMEM;
- cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
- if (!cc->io_pool) {
- ti->error = "Cannot allocate crypt io mempool";
- goto bad;
- }
-
cc->dmreq_start = sizeof(struct ablkcipher_request);
cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
@@ -1734,6 +1759,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc));
}
+ ret = -ENOMEM;
cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size);
if (!cc->req_pool) {
@@ -1746,7 +1772,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size,
ARCH_KMALLOC_MINALIGN);
- cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
+ cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0);
if (!cc->page_pool) {
ti->error = "Cannot allocate page mempool";
goto bad;
@@ -1758,6 +1784,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
+ mutex_init(&cc->bio_alloc_lock);
+
ret = -EINVAL;
if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid iv_offset sector";
@@ -1788,15 +1816,26 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (ret)
goto bad;
- opt_string = dm_shift_arg(&as);
+ while (opt_params--) {
+ opt_string = dm_shift_arg(&as);
+ if (!opt_string) {
+ ti->error = "Not enough feature arguments";
+ goto bad;
+ }
- if (opt_params == 1 && opt_string &&
- !strcasecmp(opt_string, "allow_discards"))
- ti->num_discard_bios = 1;
- else if (opt_params) {
- ret = -EINVAL;
- ti->error = "Invalid feature arguments";
- goto bad;
+ if (!strcasecmp(opt_string, "allow_discards"))
+ ti->num_discard_bios = 1;
+
+ else if (!strcasecmp(opt_string, "same_cpu_crypt"))
+ set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+
+ else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
+ set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+
+ else {
+ ti->error = "Invalid feature arguments";
+ goto bad;
+ }
}
}
@@ -1807,13 +1846,28 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- cc->crypt_queue = alloc_workqueue("kcryptd",
- WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+ if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
+ cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+ else
+ cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
+ num_online_cpus());
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
goto bad;
}
+ init_waitqueue_head(&cc->write_thread_wait);
+ cc->write_tree = RB_ROOT;
+
+ cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
+ if (IS_ERR(cc->write_thread)) {
+ ret = PTR_ERR(cc->write_thread);
+ cc->write_thread = NULL;
+ ti->error = "Couldn't spawn write thread";
+ goto bad;
+ }
+ wake_up_process(cc->write_thread);
+
ti->num_flush_bios = 1;
ti->discard_zeroes_data_unsupported = true;
@@ -1848,7 +1902,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
if (bio_data_dir(io->base_bio) == READ) {
if (kcryptd_io_read(io, GFP_NOWAIT))
- kcryptd_queue_io(io);
+ kcryptd_queue_read(io);
} else
kcryptd_queue_crypt(io);
@@ -1860,6 +1914,7 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
{
struct crypt_config *cc = ti->private;
unsigned i, sz = 0;
+ int num_feature_args = 0;
switch (type) {
case STATUSTYPE_INFO:
@@ -1878,8 +1933,18 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
cc->dev->name, (unsigned long long)cc->start);
- if (ti->num_discard_bios)
- DMEMIT(" 1 allow_discards");
+ num_feature_args += !!ti->num_discard_bios;
+ num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+ num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+ if (num_feature_args) {
+ DMEMIT(" %d", num_feature_args);
+ if (ti->num_discard_bios)
+ DMEMIT(" allow_discards");
+ if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
+ DMEMIT(" same_cpu_crypt");
+ if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags))
+ DMEMIT(" submit_from_crypt_cpus");
+ }
break;
}
@@ -1976,7 +2041,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 13, 0},
+ .version = {1, 14, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
@@ -1994,15 +2059,9 @@ static int __init dm_crypt_init(void)
{
int r;
- _crypt_io_pool = KMEM_CACHE(dm_crypt_io, 0);
- if (!_crypt_io_pool)
- return -ENOMEM;
-
r = dm_register_target(&crypt_target);
- if (r < 0) {
+ if (r < 0)
DMERR("register failed %d", r);
- kmem_cache_destroy(_crypt_io_pool);
- }
return r;
}
@@ -2010,7 +2069,6 @@ static int __init dm_crypt_init(void)
static void __exit dm_crypt_exit(void)
{
dm_unregister_target(&crypt_target);
- kmem_cache_destroy(_crypt_io_pool);
}
module_init(dm_crypt_init);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index c09359d..74adcd2 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -289,6 +289,19 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
struct request_queue *q = bdev_get_queue(where->bdev);
unsigned short logical_block_size = queue_logical_block_size(q);
sector_t num_sectors;
+ unsigned int uninitialized_var(special_cmd_max_sectors);
+
+ /*
+ * Reject unsupported discard and write same requests.
+ */
+ if (rw & REQ_DISCARD)
+ special_cmd_max_sectors = q->limits.max_discard_sectors;
+ else if (rw & REQ_WRITE_SAME)
+ special_cmd_max_sectors = q->limits.max_write_same_sectors;
+ if ((rw & (REQ_DISCARD | REQ_WRITE_SAME)) && special_cmd_max_sectors == 0) {
+ dec_count(io, region, -EOPNOTSUPP);
+ return;
+ }
/*
* where->count may be zero if rw holds a flush and we need to
@@ -311,7 +324,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
store_io_and_region_in_bio(bio, io, region);
if (rw & REQ_DISCARD) {
- num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining);
+ num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
remaining -= num_sectors;
} else if (rw & REQ_WRITE_SAME) {
@@ -320,7 +333,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
*/
dp->get_page(dp, &page, &len, &offset);
bio_add_page(bio, page, logical_block_size, offset);
- num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining);
+ num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
offset = 0;
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 7dfdb5c..089d627 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -604,6 +604,15 @@ static void write_callback(unsigned long error, void *context)
return;
}
+ /*
+ * If the bio is discard, return an error, but do not
+ * degrade the array.
+ */
+ if (bio->bi_rw & REQ_DISCARD) {
+ bio_endio(bio, -EOPNOTSUPP);
+ return;
+ }
+
for (i = 0; i < ms->nr_mirrors; i++)
if (test_bit(i, &error))
fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 864b03f..f83a0f3 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -20,6 +20,8 @@
#include <linux/log2.h>
#include <linux/dm-kcopyd.h>
+#include "dm.h"
+
#include "dm-exception-store.h"
#define DM_MSG_PREFIX "snapshots"
@@ -291,12 +293,23 @@ struct origin {
};
/*
+ * This structure is allocated for each origin target
+ */
+struct dm_origin {
+ struct dm_dev *dev;
+ struct dm_target *ti;
+ unsigned split_boundary;
+ struct list_head hash_list;
+};
+
+/*
* Size of the hash table for origin volumes. If we make this
* the size of the minors list then it should be nearly perfect
*/
#define ORIGIN_HASH_SIZE 256
#define ORIGIN_MASK 0xFF
static struct list_head *_origins;
+static struct list_head *_dm_origins;
static struct rw_semaphore _origins_lock;
static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
@@ -310,12 +323,22 @@ static int init_origin_hash(void)
_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
GFP_KERNEL);
if (!_origins) {
- DMERR("unable to allocate memory");
+ DMERR("unable to allocate memory for _origins");
return -ENOMEM;
}
-
for (i = 0; i < ORIGIN_HASH_SIZE; i++)
INIT_LIST_HEAD(_origins + i);
+
+ _dm_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!_dm_origins) {
+ DMERR("unable to allocate memory for _dm_origins");
+ kfree(_origins);
+ return -ENOMEM;
+ }
+ for (i = 0; i < ORIGIN_HASH_SIZE; i++)
+ INIT_LIST_HEAD(_dm_origins + i);
+
init_rwsem(&_origins_lock);
return 0;
@@ -324,6 +347,7 @@ static int init_origin_hash(void)
static void exit_origin_hash(void)
{
kfree(_origins);
+ kfree(_dm_origins);
}
static unsigned origin_hash(struct block_device *bdev)
@@ -350,6 +374,30 @@ static void __insert_origin(struct origin *o)
list_add_tail(&o->hash_list, sl);
}
+static struct dm_origin *__lookup_dm_origin(struct block_device *origin)
+{
+ struct list_head *ol;
+ struct dm_origin *o;
+
+ ol = &_dm_origins[origin_hash(origin)];
+ list_for_each_entry (o, ol, hash_list)
+ if (bdev_equal(o->dev->bdev, origin))
+ return o;
+
+ return NULL;
+}
+
+static void __insert_dm_origin(struct dm_origin *o)
+{
+ struct list_head *sl = &_dm_origins[origin_hash(o->dev->bdev)];
+ list_add_tail(&o->hash_list, sl);
+}
+
+static void __remove_dm_origin(struct dm_origin *o)
+{
+ list_del(&o->hash_list);
+}
+
/*
* _origins_lock must be held when calling this function.
* Returns number of snapshots registered using the supplied cow device, plus:
@@ -1432,8 +1480,6 @@ out:
full_bio->bi_private = pe->full_bio_private;
atomic_inc(&full_bio->bi_remaining);
}
- free_pending_exception(pe);
-
increment_pending_exceptions_done_count();
up_write(&s->lock);
@@ -1450,6 +1496,8 @@ out:
}
retry_origin_bios(s, origin_bios);
+
+ free_pending_exception(pe);
}
static void commit_callback(void *context, int success)
@@ -1840,9 +1888,40 @@ static int snapshot_preresume(struct dm_target *ti)
static void snapshot_resume(struct dm_target *ti)
{
struct dm_snapshot *s = ti->private;
- struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+ struct dm_snapshot *snap_src = NULL, *snap_dest = NULL, *snap_merging = NULL;
+ struct dm_origin *o;
+ struct mapped_device *origin_md = NULL;
+ bool must_restart_merging = false;
+
+ down_read(&_origins_lock);
+
+ o = __lookup_dm_origin(s->origin->bdev);
+ if (o)
+ origin_md = dm_table_get_md(o->ti->table);
+ if (!origin_md) {
+ (void) __find_snapshots_sharing_cow(s, NULL, NULL, &snap_merging);
+ if (snap_merging)
+ origin_md = dm_table_get_md(snap_merging->ti->table);
+ }
+ if (origin_md == dm_table_get_md(ti->table))
+ origin_md = NULL;
+ if (origin_md) {
+ if (dm_hold(origin_md))
+ origin_md = NULL;
+ }
+
+ up_read(&_origins_lock);
+
+ if (origin_md) {
+ dm_internal_suspend_fast(origin_md);
+ if (snap_merging && test_bit(RUNNING_MERGE, &snap_merging->state_bits)) {
+ must_restart_merging = true;
+ stop_merge(snap_merging);
+ }
+ }
down_read(&_origins_lock);
+
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest) {
down_write(&snap_src->lock);
@@ -1851,8 +1930,16 @@ static void snapshot_resume(struct dm_target *ti)
up_write(&snap_dest->lock);
up_write(&snap_src->lock);
}
+
up_read(&_origins_lock);
+ if (origin_md) {
+ if (must_restart_merging)
+ start_merge(snap_merging);
+ dm_internal_resume_fast(origin_md);
+ dm_put(origin_md);
+ }
+
/* Now we have correct chunk size, reregister */
reregister_snapshot(s);
@@ -2133,11 +2220,6 @@ static int origin_write_extent(struct dm_snapshot *merging_snap,
* Origin: maps a linear range of a device, with hooks for snapshotting.
*/
-struct dm_origin {
- struct dm_dev *dev;
- unsigned split_boundary;
-};
-
/*
* Construct an origin mapping: <dev_path>
* The context for an origin is merely a 'struct dm_dev *'
@@ -2166,6 +2248,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_open;
}
+ o->ti = ti;
ti->private = o;
ti->num_flush_bios = 1;
@@ -2180,6 +2263,7 @@ bad_alloc:
static void origin_dtr(struct dm_target *ti)
{
struct dm_origin *o = ti->private;
+
dm_put_device(ti, o->dev);
kfree(o);
}
@@ -2216,6 +2300,19 @@ static void origin_resume(struct dm_target *ti)
struct dm_origin *o = ti->private;
o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev);
+
+ down_write(&_origins_lock);
+ __insert_dm_origin(o);
+ up_write(&_origins_lock);
+}
+
+static void origin_postsuspend(struct dm_target *ti)
+{
+ struct dm_origin *o = ti->private;
+
+ down_write(&_origins_lock);
+ __remove_dm_origin(o);
+ up_write(&_origins_lock);
}
static void origin_status(struct dm_target *ti, status_type_t type,
@@ -2258,12 +2355,13 @@ static int origin_iterate_devices(struct dm_target *ti,
static struct target_type origin_target = {
.name = "snapshot-origin",
- .version = {1, 8, 1},
+ .version = {1, 9, 0},
.module = THIS_MODULE,
.ctr = origin_ctr,
.dtr = origin_dtr,
.map = origin_map,
.resume = origin_resume,
+ .postsuspend = origin_postsuspend,
.status = origin_status,
.merge = origin_merge,
.iterate_devices = origin_iterate_devices,
@@ -2271,7 +2369,7 @@ static struct target_type origin_target = {
static struct target_type snapshot_target = {
.name = "snapshot",
- .version = {1, 12, 0},
+ .version = {1, 13, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
@@ -2285,7 +2383,7 @@ static struct target_type snapshot_target = {
static struct target_type merge_target = {
.name = dm_snapshot_merge_target_name,
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 654773c..921aafd 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2358,17 +2358,6 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
case -ENODATA:
- if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
- /*
- * This block isn't provisioned, and we have no way
- * of doing so.
- */
- handle_unserviceable_bio(tc->pool, bio);
- cell_defer_no_holder(tc, virt_cell);
- return DM_MAPIO_SUBMITTED;
- }
- /* fall through */
-
case -EWOULDBLOCK:
thin_defer_cell(tc, virt_cell);
return DM_MAPIO_SUBMITTED;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ec1444f..9b641b3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2571,7 +2571,7 @@ int dm_setup_md_queue(struct mapped_device *md)
return 0;
}
-static struct mapped_device *dm_find_md(dev_t dev)
+struct mapped_device *dm_get_md(dev_t dev)
{
struct mapped_device *md;
unsigned minor = MINOR(dev);
@@ -2582,12 +2582,15 @@ static struct mapped_device *dm_find_md(dev_t dev)
spin_lock(&_minor_lock);
md = idr_find(&_minor_idr, minor);
- if (md && (md == MINOR_ALLOCED ||
- (MINOR(disk_devt(dm_disk(md))) != minor) ||
- dm_deleting_md(md) ||
- test_bit(DMF_FREEING, &md->flags))) {
- md = NULL;
- goto out;
+ if (md) {
+ if ((md == MINOR_ALLOCED ||
+ (MINOR(disk_devt(dm_disk(md))) != minor) ||
+ dm_deleting_md(md) ||
+ test_bit(DMF_FREEING, &md->flags))) {
+ md = NULL;
+ goto out;
+ }
+ dm_get(md);
}
out:
@@ -2595,16 +2598,6 @@ out:
return md;
}
-
-struct mapped_device *dm_get_md(dev_t dev)
-{
- struct mapped_device *md = dm_find_md(dev);
-
- if (md)
- dm_get(md);
-
- return md;
-}
EXPORT_SYMBOL_GPL(dm_get_md);
void *dm_get_mdptr(struct mapped_device *md)
@@ -2623,6 +2616,19 @@ void dm_get(struct mapped_device *md)
BUG_ON(test_bit(DMF_FREEING, &md->flags));
}
+int dm_hold(struct mapped_device *md)
+{
+ spin_lock(&_minor_lock);
+ if (test_bit(DMF_FREEING, &md->flags)) {
+ spin_unlock(&_minor_lock);
+ return -EBUSY;
+ }
+ dm_get(md);
+ spin_unlock(&_minor_lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_hold);
+
const char *dm_device_name(struct mapped_device *md)
{
return md->name;
@@ -2645,10 +2651,16 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
if (dm_request_based(md))
flush_kthread_worker(&md->kworker);
+ /*
+ * Take suspend_lock so that presuspend and postsuspend methods
+ * do not race with internal suspend.
+ */
+ mutex_lock(&md->suspend_lock);
if (!dm_suspended_md(md)) {
dm_table_presuspend_targets(map);
dm_table_postsuspend_targets(map);
}
+ mutex_unlock(&md->suspend_lock);
/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
dm_put_live_table(md, srcu_idx);
@@ -3122,6 +3134,7 @@ void dm_internal_suspend_fast(struct mapped_device *md)
flush_workqueue(md->wq);
dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
+EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
void dm_internal_resume_fast(struct mapped_device *md)
{
@@ -3133,6 +3146,7 @@ void dm_internal_resume_fast(struct mapped_device *md)
done:
mutex_unlock(&md->suspend_lock);
}
+EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
/*-----------------------------------------------------------------
* Event notification.
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c8d2bac..717daad 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2555,7 +2555,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
return err ? err : len;
}
static struct rdev_sysfs_entry rdev_state =
-__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
+__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
static ssize_t
errors_show(struct md_rdev *rdev, char *page)
@@ -3638,7 +3638,8 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len)
return err ?: len;
}
static struct md_sysfs_entry md_resync_start =
-__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
+__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
+ resync_start_show, resync_start_store);
/*
* The array state can be:
@@ -3851,7 +3852,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
return err ?: len;
}
static struct md_sysfs_entry md_array_state =
-__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
+__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
static ssize_t
max_corrected_read_errors_show(struct mddev *mddev, char *page) {
@@ -4101,7 +4102,7 @@ out_unlock:
}
static struct md_sysfs_entry md_metadata =
-__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
+__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
static ssize_t
action_show(struct mddev *mddev, char *page)
@@ -4189,7 +4190,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
}
static struct md_sysfs_entry md_scan_mode =
-__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
static ssize_t
last_sync_action_show(struct mddev *mddev, char *page)
@@ -4335,7 +4336,8 @@ sync_completed_show(struct mddev *mddev, char *page)
return sprintf(page, "%llu / %llu\n", resync, max_sectors);
}
-static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
+static struct md_sysfs_entry md_sync_completed =
+ __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
static ssize_t
min_sync_show(struct mddev *mddev, char *page)
@@ -5078,7 +5080,8 @@ int md_run(struct mddev *mddev)
}
if (err) {
mddev_detach(mddev);
- pers->free(mddev, mddev->private);
+ if (mddev->private)
+ pers->free(mddev, mddev->private);
module_put(pers->owner);
bitmap_destroy(mddev);
return err;
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index 0c2dec7..78c74bb 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -8,7 +8,7 @@ config DM_PERSISTENT_DATA
device-mapper targets such as the thin provisioning target.
config DM_DEBUG_BLOCK_STACK_TRACING
- boolean "Keep stack trace of persistent data block lock holders"
+ bool "Keep stack trace of persistent data block lock holders"
depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
select STACKTRACE
---help---
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index cfbf961..ebb280a 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -78,7 +78,9 @@ static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b,
if (r)
return r;
- return count > 1;
+ *result = count > 1;
+
+ return 0;
}
static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b,
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index a13f738..3ed9f42 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -467,8 +467,6 @@ static int raid0_run(struct mddev *mddev)
dump_zones(mddev);
ret = md_integrity_register(mddev);
- if (ret)
- raid0_free(mddev, conf);
return ret;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4153da5..d34e238 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -560,7 +560,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
if (test_bit(WriteMostly, &rdev->flags)) {
/* Don't balance among write-mostly, just
* use the first as a last resort */
- if (best_disk < 0) {
+ if (best_dist_disk < 0) {
if (is_badblock(rdev, this_sector, sectors,
&first_bad, &bad_sectors)) {
if (first_bad < this_sector)
@@ -569,7 +569,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
best_good_sectors = first_bad - this_sector;
} else
best_good_sectors = sectors;
- best_disk = disk;
+ best_dist_disk = disk;
+ best_pending_disk = disk;
}
continue;
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e75d48c..cd2f96b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5121,12 +5121,17 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
schedule_timeout_uninterruptible(1);
}
/* Need to check if array will still be degraded after recovery/resync
- * We don't need to check the 'failed' flag as when that gets set,
- * recovery aborts.
+ * Note in case of > 1 drive failures it's possible we're rebuilding
+ * one drive while leaving another faulty drive in array.
*/
- for (i = 0; i < conf->raid_disks; i++)
- if (conf->disks[i].rdev == NULL)
+ rcu_read_lock();
+ for (i = 0; i < conf->raid_disks; i++) {
+ struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev);
+
+ if (rdev == NULL || test_bit(Faulty, &rdev->flags))
still_degraded = 1;
+ }
+ rcu_read_unlock();
bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
OpenPOWER on IntegriCloud