From ab4c1424882be9cd70b89abf2b484add355712fa Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 6 Jan 2009 03:05:09 +0000 Subject: dm: support barriers on simple devices Implement barrier support for single device DM devices This patch implements barrier support in DM for the common case of dm linear just remapping a single underlying device. In this case we can safely pass the barrier through because there can be no reordering between devices. NB. Any DM device might cease to support barriers if it gets reconfigured so code must continue to allow for a possible -EOPNOTSUPP on every barrier bio submitted. - agk Signed-off-by: Andi Kleen Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'drivers/md/dm-table.c') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 04e5fd7..ebaaf72 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -38,6 +38,8 @@ struct dm_table { sector_t *highs; struct dm_target *targets; + unsigned barriers_supported:1; + /* * Indicates the rw permissions for the new logical * device. This should be a combination of FMODE_READ @@ -227,6 +229,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode, INIT_LIST_HEAD(&t->devices); atomic_set(&t->holders, 1); + t->barriers_supported = 1; if (!num_targets) num_targets = KEYS_PER_NODE; @@ -728,6 +731,10 @@ int dm_table_add_target(struct dm_table *t, const char *type, /* FIXME: the plan is to combine high here and then have * the merge fn apply the target level restrictions. */ combine_restrictions_low(&t->limits, &tgt->limits); + + if (!(tgt->type->features & DM_TARGET_SUPPORTS_BARRIERS)) + t->barriers_supported = 0; + return 0; bad: @@ -772,6 +779,12 @@ int dm_table_complete(struct dm_table *t) check_for_valid_limits(&t->limits); + /* + * We only support barriers if there is exactly one underlying device. + */ + if (!list_is_singular(&t->devices)) + t->barriers_supported = 0; + /* how many indexes will the btree have ? */ leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); @@ -986,6 +999,12 @@ struct mapped_device *dm_table_get_md(struct dm_table *t) return t->md; } +int dm_table_barrier_ok(struct dm_table *t) +{ + return t->barriers_supported; +} +EXPORT_SYMBOL(dm_table_barrier_ok); + EXPORT_SYMBOL(dm_vcalloc); EXPORT_SYMBOL(dm_get_device); EXPORT_SYMBOL(dm_put_device); -- cgit v1.1 From d58168763f74d1edbc296d7038c60efe6493fdd4 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 6 Jan 2009 03:05:10 +0000 Subject: dm table: rework reference counting Rework table reference counting. The existing code uses a reference counter. When the last reference is dropped and the counter reaches zero, the table destructor is called. Table reference counters are acquired/released from upcalls from other kernel code (dm_any_congested, dm_merge_bvec, dm_unplug_all). If the reference counter reaches zero in one of the upcalls, the table destructor is called from almost random kernel code. This leads to various problems: * dm_any_congested being called under a spinlock, which calls the destructor, which calls some sleeping function. * the destructor attempting to take a lock that is already taken by the same process. * stale reference from some other kernel code keeps the table constructed, which keeps some devices open, even after successful return from "dmsetup remove". This can confuse lvm and prevent closing of underlying devices or reusing device minor numbers. The patch changes reference counting so that the table destructor can be called only at predetermined places. The table has always exactly one reference from either mapped_device->map or hash_cell->new_map. After this patch, this reference is not counted in table->holders. A pair of dm_create_table/dm_destroy_table functions is used for table creation/destruction. Temporary references from the other code increase table->holders. A pair of dm_table_get/dm_table_put functions is used to manipulate it. When the table is about to be destroyed, we wait for table->holders to reach 0. Then, we call the table destructor. We use active waiting with msleep(1), because the situation happens rarely (to one user in 5 years) and removing the device isn't performance-critical task: the user doesn't care if it takes one tick more or not. This way, the destructor is called only at specific points (dm_table_destroy function) and the above problems associated with lazy destruction can't happen. Finally remove the temporary protection added to dm_any_congested(). Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'drivers/md/dm-table.c') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ebaaf72..2fd66c3 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2001 Sistina Software (UK) Limited. - * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -15,6 +15,7 @@ #include #include #include +#include #include #define DM_MSG_PREFIX "table" @@ -24,6 +25,19 @@ #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) +/* + * The table has always exactly one reference from either mapped_device->map + * or hash_cell->new_map. This reference is not counted in table->holders. + * A pair of dm_create_table/dm_destroy_table functions is used for table + * creation/destruction. + * + * Temporary references from the other code increase table->holders. A pair + * of dm_table_get/dm_table_put functions is used to manipulate it. + * + * When the table is about to be destroyed, we wait for table->holders to + * drop to zero. + */ + struct dm_table { struct mapped_device *md; atomic_t holders; @@ -228,7 +242,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode, return -ENOMEM; INIT_LIST_HEAD(&t->devices); - atomic_set(&t->holders, 1); + atomic_set(&t->holders, 0); t->barriers_supported = 1; if (!num_targets) @@ -259,10 +273,14 @@ static void free_devices(struct list_head *devices) } } -static void table_destroy(struct dm_table *t) +void dm_table_destroy(struct dm_table *t) { unsigned int i; + while (atomic_read(&t->holders)) + msleep(1); + smp_mb(); + /* free the indexes (see dm_table_complete) */ if (t->depth >= 2) vfree(t->index[t->depth - 2]); @@ -300,8 +318,8 @@ void dm_table_put(struct dm_table *t) if (!t) return; - if (atomic_dec_and_test(&t->holders)) - table_destroy(t); + smp_mb__before_atomic_dec(); + atomic_dec(&t->holders); } /* -- cgit v1.1