From 49ece554288caf1a8ea9e546ab1ff5bc4b175456 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 08:37:00 -0500 Subject: rbd: fix leak of format 2 snapshot context When rbd_dev_v2_refresh() is called, the rbd device already has a snapshot context associated with it. But that never gets freed, the pointer just gets overwritten. Fix this by dropping the rbd device's reference to the snapshot context before overwriting the pointer. Because ceph_put_snap_context() already handles for a null pointer we don't need to check for that (for the probe case, where no context has yet been assigned). This resolves: http://tracker.ceph.com/issues/4912 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c2ca181..4263743 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4004,6 +4004,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) for (i = 0; i < snap_count; i++) snapc->snaps[i] = ceph_decode_64(&p); + ceph_put_snap_context(rbd_dev->header.snapc); rbd_dev->header.snapc = snapc; dout(" snap context seq = %llu, snap_count = %u\n", -- cgit v1.1 From e627db085e0dab7744b68f3c927be6ed6df2f7f9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 07:40:30 -0500 Subject: rbd: revalidate only for mapping size changes This commit: d98df63e rbd: revalidate_disk upon rbd resize instituted a call to revalidate_disk() to notify interested parties that a mapped image has changed size. This works well, as long as the the rbd device doesn't map a snapshot. A snapshot will never change size. However, the base image the snapshot is associated with can, and it can do so while the snapshot is mapped. The problem is that the test for the size is looking at the size of the base image, not the size of the mapped snapshot. This patch corrects that. Update the warning message shown in the event of error, and move it into the callers. This resolves: http://tracker.ceph.com/issues/4911 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4263743..5d5e3f0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2628,6 +2628,7 @@ out: static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) { struct rbd_device *rbd_dev = (struct rbd_device *)data; + int ret; if (!rbd_dev) return; @@ -2635,7 +2636,9 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, rbd_dev->header_name, (unsigned long long)notify_id, (unsigned int)opcode); - (void)rbd_dev_refresh(rbd_dev); + ret = rbd_dev_refresh(rbd_dev); + if (ret) + rbd_warn(rbd_dev, ": header refresh error (%d)\n", ret); rbd_obj_notify_ack(rbd_dev, notify_id); } @@ -3182,11 +3185,11 @@ static void rbd_exists_validate(struct rbd_device *rbd_dev) static int rbd_dev_refresh(struct rbd_device *rbd_dev) { - u64 image_size; + u64 mapping_size; int ret; rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); - image_size = rbd_dev->header.image_size; + mapping_size = rbd_dev->mapping.size; mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); if (rbd_dev->image_format == 1) ret = rbd_dev_v1_refresh(rbd_dev); @@ -3197,10 +3200,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) rbd_exists_validate(rbd_dev); mutex_unlock(&ctl_mutex); - if (ret) - rbd_warn(rbd_dev, "got notification but failed to " - " update snaps: %d\n", ret); - if (image_size != rbd_dev->header.image_size) + if (mapping_size != rbd_dev->mapping.size) revalidate_disk(rbd_dev->disk); return ret; @@ -3405,6 +3405,8 @@ static ssize_t rbd_image_refresh(struct device *dev, int ret; ret = rbd_dev_refresh(rbd_dev); + if (ret) + rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret); return ret < 0 ? ret : size; } -- cgit v1.1 From 00a653e216a8427547774ab3f2cc92709c3e28c9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 07:40:30 -0500 Subject: rbd: update capacity in rbd_dev_refresh() When a mapped image changes size, we change the capacity recorded for the Linux disk associated with it, in rbd_update_mapping_size(). That function is called in two places--the format 1 and format 2 refresh routines. There is no need to set the capacity while holding the header semaphore. Instead, do it in the common rbd_dev_refresh(), using the logic that's already there to initiate disk revalidation. Add handling in the request function, just in case a request that exceeds the capacity of the device comes in (perhaps one that was started before a refresh shrunk the device). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5d5e3f0..9c094c6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2874,6 +2874,13 @@ static void rbd_request_fn(struct request_queue *q) goto end_request; /* Shouldn't happen */ } + result = -EIO; + if (offset + length > rbd_dev->mapping.size) { + rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n", + offset, length, rbd_dev->mapping.size); + goto end_request; + } + result = -ENOMEM; img_request = rbd_img_request_create(rbd_dev, offset, length, write_request, false); @@ -3116,14 +3123,8 @@ static void rbd_update_mapping_size(struct rbd_device *rbd_dev) if (rbd_dev->spec->snap_id != CEPH_NOSNAP) return; - if (rbd_dev->mapping.size != rbd_dev->header.image_size) { - sector_t size; - + if (rbd_dev->mapping.size != rbd_dev->header.image_size) rbd_dev->mapping.size = rbd_dev->header.image_size; - size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; - dout("setting size to %llu sectors", (unsigned long long)size); - set_capacity(rbd_dev->disk, size); - } } /* @@ -3200,8 +3201,14 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) rbd_exists_validate(rbd_dev); mutex_unlock(&ctl_mutex); - if (mapping_size != rbd_dev->mapping.size) + if (mapping_size != rbd_dev->mapping.size) { + sector_t size; + + size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; + dout("setting size to %llu sectors", (unsigned long long)size); + set_capacity(rbd_dev->disk, size); revalidate_disk(rbd_dev->disk); + } return ret; } -- cgit v1.1 From 29334ba49c3e3defd9a2697cd4a199c597c30dc9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 07:40:30 -0500 Subject: rbd: kill rbd_update_mapping_size() Since rbd_update_mapping_size() is now a trivial wrapper, just open code it in its two callers. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9c094c6..4c98695 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3118,15 +3118,6 @@ static int rbd_read_header(struct rbd_device *rbd_dev, return ret; } -static void rbd_update_mapping_size(struct rbd_device *rbd_dev) -{ - if (rbd_dev->spec->snap_id != CEPH_NOSNAP) - return; - - if (rbd_dev->mapping.size != rbd_dev->header.image_size) - rbd_dev->mapping.size = rbd_dev->header.image_size; -} - /* * only read the first part of the ondisk header, without the snaps info */ @@ -3143,7 +3134,9 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev) /* Update image size, and check for resize of mapped image */ rbd_dev->header.image_size = h.image_size; - rbd_update_mapping_size(rbd_dev); + if (rbd_dev->spec->snap_id == CEPH_NOSNAP) + if (rbd_dev->mapping.size != rbd_dev->header.image_size) + rbd_dev->mapping.size = rbd_dev->header.image_size; /* rbd_dev->header.object_prefix shouldn't change */ kfree(rbd_dev->header.snap_sizes); @@ -4074,7 +4067,9 @@ static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev) ret = rbd_dev_v2_image_size(rbd_dev); if (ret) goto out; - rbd_update_mapping_size(rbd_dev); + if (rbd_dev->spec->snap_id == CEPH_NOSNAP) + if (rbd_dev->mapping.size != rbd_dev->header.image_size) + rbd_dev->mapping.size = rbd_dev->header.image_size; ret = rbd_dev_v2_snap_context(rbd_dev); dout("rbd_dev_v2_snap_context returned %d\n", ret); -- cgit v1.1 From c734b79655a91a24afcae73738a43a0db09a801a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 07:40:30 -0500 Subject: rbd: don't print warning if not mapping a parent The presence of the LAYERING bit in an rbd image's feature mask does not guarantee the image actually has a parent image. Currently that bit is set only when a clone (i.e., image with a parent) is created, but it is (currently) not cleared if that clone gets flattened back into a "normal" image. A "parent_id" query will leave the parent_spec for the image being mapped a null pointer, but will not return an error. Currently, whenever an image with the LAYERED feature gets mapped, a warning about the use of layered images gets printed. But we don't want to do this for a flattened image, so print the warning only if we find there is a parent spec after the probe. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4c98695..2d34aea 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4567,13 +4567,14 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) ret = rbd_dev_v2_parent_info(rbd_dev); if (ret) goto out_err; - /* - * Don't print a warning for parent images. We can - * tell this point because we won't know its pool - * name yet (just its pool id). + * Print a warning if this image has a parent. + * Don't print it if the image now being probed + * is itself a parent. We can tell at this point + * because we won't know its pool name yet (just its + * pool id). */ - if (rbd_dev->spec->pool_name) + if (rbd_dev->parent_spec && rbd_dev->spec->pool_name) rbd_warn(rbd_dev, "WARNING: kernel layering " "is EXPERIMENTAL!"); } -- cgit v1.1 From 8f4b7d9821715767ac28bbc2d401bbb5f3f9a448 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 07:40:30 -0500 Subject: rbd: don't look up snapshot id in rbd_dev_mapping_set() Currently rbd_dev_mapping_set() looks up the snapshot id for the snapshot whose name is found in the rbd device's spec structure. That function gets called by rbd_dev_device_setup(), which is called by rbd_add() *after* rbd_dev_image_probe(). If the image probe succeeds, the rbd device's spec will already have been updated to include names and ids for all fields. Therefore there's no need to look up the snapshot id in rbd_dev_mapping_set(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2d34aea..75a8cea 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -936,20 +936,11 @@ static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) { - const char *snap_name = rbd_dev->spec->snap_name; - u64 snap_id; + u64 snap_id = rbd_dev->spec->snap_id; u64 size = 0; u64 features = 0; int ret; - if (strcmp(snap_name, RBD_SNAP_HEAD_NAME)) { - snap_id = rbd_snap_id_by_name(rbd_dev, snap_name); - if (snap_id == CEPH_NOSNAP) - return -ENOENT; - } else { - snap_id = CEPH_NOSNAP; - } - ret = rbd_snap_size(rbd_dev, snap_id, &size); if (ret) return ret; -- cgit v1.1 From 6d80b130d516deef51666e210fde674c947b8b5c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 07:40:30 -0500 Subject: rbd: kill rbd_dev_clear_mapping() This function is a duplicate of rbd_dev_mapping_clear(), and was added by mistake. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 75a8cea..a62e59a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -966,13 +966,6 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) rbd_dev->mapping.read_only = true; } -static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev) -{ - rbd_dev->mapping.size = 0; - rbd_dev->mapping.features = 0; - rbd_dev->mapping.read_only = true; -} - static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) { char *name; @@ -4910,7 +4903,7 @@ static void rbd_dev_device_release(struct device *dev) rbd_free_disk(rbd_dev); clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); - rbd_dev_clear_mapping(rbd_dev); + rbd_dev_mapping_clear(rbd_dev); unregister_blkdev(rbd_dev->major, rbd_dev->name); rbd_dev->major = 0; rbd_dev_id_put(rbd_dev); -- cgit v1.1 From 51344a38ba2033be18a4ec23e318845caeccdc04 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 07:40:30 -0500 Subject: rbd: always set read-only flag in rbd_add() Hold off setting the read-only flag in rbd_add() for an image being mapped until we have successfully probed the image. At that point we know whether it's a snapshot mapping or not, so we can set the read-only flag in that one place rather than doing so (for snapshots) in rbd_dev_mapping_set(). To do this, pass a flag to the image probe routine indicating whether we want a read-only mapping. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a62e59a..b5cac79 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -359,7 +359,7 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); -static int rbd_dev_image_probe(struct rbd_device *rbd_dev); +static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only); static struct bus_attribute rbd_bus_attrs[] = { __ATTR(add, S_IWUSR, NULL, rbd_add), @@ -951,11 +951,6 @@ static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) rbd_dev->mapping.size = size; rbd_dev->mapping.features = features; - /* If we are mapping a snapshot it must be marked read-only */ - - if (snap_id != CEPH_NOSNAP) - rbd_dev->mapping.read_only = true; - return 0; } @@ -963,7 +958,6 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) { rbd_dev->mapping.size = 0; rbd_dev->mapping.features = 0; - rbd_dev->mapping.read_only = true; } static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) @@ -4620,7 +4614,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) if (!parent) goto out_err; - ret = rbd_dev_image_probe(parent); + ret = rbd_dev_image_probe(parent, true); if (ret < 0) goto out_err; rbd_dev->parent = parent; @@ -4743,7 +4737,7 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev) * device. For format 2 images this includes determining the image * id. */ -static int rbd_dev_image_probe(struct rbd_device *rbd_dev) +static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only) { int ret; int tmp; @@ -4778,6 +4772,12 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev) if (ret) goto err_out_probe; + /* If we are mapping a snapshot it must be marked read-only */ + + if (rbd_dev->spec->snap_id != CEPH_NOSNAP) + read_only = true; + rbd_dev->mapping.read_only = read_only; + ret = rbd_dev_probe_parent(rbd_dev); if (!ret) return 0; @@ -4811,6 +4811,7 @@ static ssize_t rbd_add(struct bus_type *bus, struct rbd_spec *spec = NULL; struct rbd_client *rbdc; struct ceph_osd_client *osdc; + bool read_only; int rc = -ENOMEM; if (!try_module_get(THIS_MODULE)) @@ -4820,6 +4821,9 @@ static ssize_t rbd_add(struct bus_type *bus, rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); if (rc < 0) goto err_out_module; + read_only = rbd_opts->read_only; + kfree(rbd_opts); + rbd_opts = NULL; /* done with this */ rbdc = rbd_get_client(ceph_opts); if (IS_ERR(rbdc)) { @@ -4850,11 +4854,7 @@ static ssize_t rbd_add(struct bus_type *bus, rbdc = NULL; /* rbd_dev now owns this */ spec = NULL; /* rbd_dev now owns this */ - rbd_dev->mapping.read_only = rbd_opts->read_only; - kfree(rbd_opts); - rbd_opts = NULL; /* done with this */ - - rc = rbd_dev_image_probe(rbd_dev); + rc = rbd_dev_image_probe(rbd_dev, read_only); if (rc < 0) goto err_out_rbd_dev; -- cgit v1.1 From f35a4dee14c31dc00807f3bcd59cc7a6959f63d7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 09:51:29 -0500 Subject: rbd: set the mapping size and features later Defer setting the size and features fields of a mapped image until after the Linux disk structure is set up. Set the capacity of the disk after that. Rearrange the definition of rbd_image_header, separating the fields that are set only once from those that can be updated. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b5cac79..a05b6e5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -100,21 +100,20 @@ * block device image metadata (in-memory version) */ struct rbd_image_header { - /* These four fields never change for a given rbd image */ + /* These six fields never change for a given rbd image */ char *object_prefix; - u64 features; __u8 obj_order; __u8 crypt_type; __u8 comp_type; + u64 stripe_unit; + u64 stripe_count; + u64 features; /* Might be changeable someday? */ /* The remaining fields need to be updated occasionally */ u64 image_size; struct ceph_snap_context *snapc; - char *snap_names; - u64 *snap_sizes; - - u64 stripe_unit; - u64 stripe_count; + char *snap_names; /* format 1 only */ + u64 *snap_sizes; /* format 1 only */ }; /* @@ -4637,10 +4636,6 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) { int ret; - ret = rbd_dev_mapping_set(rbd_dev); - if (ret) - return ret; - /* generate unique id: find highest unique id, add one */ rbd_dev_id_get(rbd_dev); @@ -4662,13 +4657,17 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) if (ret) goto err_out_blkdev; - ret = rbd_bus_add_dev(rbd_dev); + ret = rbd_dev_mapping_set(rbd_dev); if (ret) goto err_out_disk; + set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); + + ret = rbd_bus_add_dev(rbd_dev); + if (ret) + goto err_out_mapping; /* Everything's ready. Announce the disk to the world. */ - set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); add_disk(rbd_dev->disk); @@ -4677,6 +4676,8 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) return ret; +err_out_mapping: + rbd_dev_mapping_clear(rbd_dev); err_out_disk: rbd_free_disk(rbd_dev); err_out_blkdev: -- cgit v1.1 From 46578dcdca951f3da70d3a5a9b5166b2a492a182 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 09:51:29 -0500 Subject: rbd: zero format 1 header structure earlier The passed-in header structure is zeroed in rbd_header_from_disk(). Instead, have the caller do it. Note that there are two callers, rbd_dev_v1_refresh() and rbd_dev_v1_probe(). The latter already has a zeroed header structure so zeroing it isn't necessary there. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a05b6e5..b0fbc38 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -738,8 +738,6 @@ static int rbd_header_from_disk(struct rbd_image_header *header, size_t size; u32 i; - memset(header, 0, sizeof (*header)); - snap_count = le32_to_cpu(ondisk->snap_count); len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); @@ -3103,6 +3101,7 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev) int ret; struct rbd_image_header h; + memset(&h, 0, sizeof (h)); ret = rbd_read_header(rbd_dev, &h); if (ret < 0) return ret; -- cgit v1.1 From bb23e37acb2ae9604130c4819fb8ae0f784a3a2b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 09:51:29 -0500 Subject: rbd: refactor rbd_header_from_disk() This rearranges rbd_header_from_disk so that it: - allocates the snapshot context right away - keeps results in local variables, not changing the passed-in header until it's known we'll succeed - does initialization of set-once fields in a header only if they have not already been set The last point is moot at the moment, because rbd_read_header() (the only caller) always supplies a zero-filled header buffer. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 127 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 52 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b0fbc38..a3b6bf5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -727,86 +727,109 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) } /* - * Create a new header structure, translate header format from the on-disk - * header. + * Fill an rbd image header with information from the given format 1 + * on-disk header. */ static int rbd_header_from_disk(struct rbd_image_header *header, struct rbd_image_header_ondisk *ondisk) { + bool first_time = header->object_prefix == NULL; + struct ceph_snap_context *snapc; + char *object_prefix = NULL; + char *snap_names = NULL; + u64 *snap_sizes = NULL; u32 snap_count; - size_t len; size_t size; + int ret = -ENOMEM; u32 i; - snap_count = le32_to_cpu(ondisk->snap_count); + /* Allocate this now to avoid having to handle failure below */ - len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); - header->object_prefix = kmalloc(len + 1, GFP_KERNEL); - if (!header->object_prefix) - return -ENOMEM; - memcpy(header->object_prefix, ondisk->object_prefix, len); - header->object_prefix[len] = '\0'; + if (first_time) { + size_t len; + len = strnlen(ondisk->object_prefix, + sizeof (ondisk->object_prefix)); + object_prefix = kmalloc(len + 1, GFP_KERNEL); + if (!object_prefix) + return -ENOMEM; + memcpy(object_prefix, ondisk->object_prefix, len); + object_prefix[len] = '\0'; + } + + /* Allocate the snapshot context and fill it in */ + + snap_count = le32_to_cpu(ondisk->snap_count); + snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); + if (!snapc) + goto out_err; + snapc->seq = le64_to_cpu(ondisk->snap_seq); if (snap_count) { + struct rbd_image_snap_ondisk *snaps; u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); - /* Save a copy of the snapshot names */ + /* We'll keep a copy of the snapshot names... */ - if (snap_names_len > (u64) SIZE_MAX) - return -EIO; - header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); - if (!header->snap_names) + if (snap_names_len > (u64)SIZE_MAX) + goto out_2big; + snap_names = kmalloc(snap_names_len, GFP_KERNEL); + if (!snap_names) goto out_err; - /* - * Note that rbd_dev_v1_header_read() guarantees - * the ondisk buffer we're working with has - * snap_names_len bytes beyond the end of the - * snapshot id array, this memcpy() is safe. - */ - memcpy(header->snap_names, &ondisk->snaps[snap_count], - snap_names_len); - /* Record each snapshot's size */ + /* ...as well as the array of their sizes. */ size = snap_count * sizeof (*header->snap_sizes); - header->snap_sizes = kmalloc(size, GFP_KERNEL); - if (!header->snap_sizes) + snap_sizes = kmalloc(size, GFP_KERNEL); + if (!snap_sizes) goto out_err; - for (i = 0; i < snap_count; i++) - header->snap_sizes[i] = - le64_to_cpu(ondisk->snaps[i].image_size); - } else { - header->snap_names = NULL; - header->snap_sizes = NULL; + + /* + * Copy the names, and fill in each snapshot's id + * and size. + * + * Note that rbd_dev_v1_header_read() guarantees the + * ondisk buffer we're working with has + * snap_names_len bytes beyond the end of the + * snapshot id array, this memcpy() is safe. + */ + memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); + snaps = ondisk->snaps; + for (i = 0; i < snap_count; i++) { + snapc->snaps[i] = le64_to_cpu(snaps[i].id); + snap_sizes[i] = le64_to_cpu(snaps[i].image_size); + } } - header->features = 0; /* No features support in v1 images */ - header->obj_order = ondisk->options.order; - header->crypt_type = ondisk->options.crypt_type; - header->comp_type = ondisk->options.comp_type; + /* We won't fail any more, fill in the header */ + + if (first_time) { + header->object_prefix = object_prefix; + header->obj_order = ondisk->options.order; + header->crypt_type = ondisk->options.crypt_type; + header->comp_type = ondisk->options.comp_type; + /* The rest aren't used for format 1 images */ + header->stripe_unit = 0; + header->stripe_count = 0; + header->features = 0; + } - /* Allocate and fill in the snapshot context */ + /* The remaining fields always get updated (when we refresh) */ header->image_size = le64_to_cpu(ondisk->image_size); - - header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); - if (!header->snapc) - goto out_err; - header->snapc->seq = le64_to_cpu(ondisk->snap_seq); - for (i = 0; i < snap_count; i++) - header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id); + header->snapc = snapc; + header->snap_names = snap_names; + header->snap_sizes = snap_sizes; return 0; - +out_2big: + ret = -EIO; out_err: - kfree(header->snap_sizes); - header->snap_sizes = NULL; - kfree(header->snap_names); - header->snap_names = NULL; - kfree(header->object_prefix); - header->object_prefix = NULL; + kfree(snap_sizes); + kfree(snap_names); + ceph_put_snap_context(snapc); + kfree(object_prefix); - return -ENOMEM; + return ret; } static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) -- cgit v1.1 From 662518b128c27def65e9af4bea2b56a1e04b3251 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 09:51:29 -0500 Subject: rbd: update in-core header directly Now that rbd_header_from_disk() only fills in one-time fields once, we can extend it slightly so it releases the other fields before replacing their values. This way there's no need to pass a temporary buffer and then copy all the results in. Just use the rbd device header structure in rbd_header_from_disk() so its values get updated directly. Note that this means we need to take the header semaphore at the point we update things. So pass the rbd_dev rather than the address of its header as its first argument to rbd_header_from_disk(), and have it return an error code. As a result, rbd_dev_v1_header_read() does all the work, rbd_read_header() becomes unnecessary, and rbd_dev_v1_refresh() becomes a very simple wrapper. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 98 +++++++++++++++-------------------------------------- 1 file changed, 27 insertions(+), 71 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a3b6bf5..e4586f2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -730,9 +730,10 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) * Fill an rbd image header with information from the given format 1 * on-disk header. */ -static int rbd_header_from_disk(struct rbd_image_header *header, +static int rbd_header_from_disk(struct rbd_device *rbd_dev, struct rbd_image_header_ondisk *ondisk) { + struct rbd_image_header *header = &rbd_dev->header; bool first_time = header->object_prefix == NULL; struct ceph_snap_context *snapc; char *object_prefix = NULL; @@ -802,6 +803,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header, /* We won't fail any more, fill in the header */ + down_write(&rbd_dev->header_rwsem); if (first_time) { header->object_prefix = object_prefix; header->obj_order = ondisk->options.order; @@ -811,6 +813,10 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->stripe_unit = 0; header->stripe_count = 0; header->features = 0; + } else { + ceph_put_snap_context(header->snapc); + kfree(header->snap_names); + kfree(header->snap_sizes); } /* The remaining fields always get updated (when we refresh) */ @@ -820,6 +826,14 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->snap_names = snap_names; header->snap_sizes = snap_sizes; + /* Make sure mapping size is consistent with header info */ + + if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time) + if (rbd_dev->mapping.size != header->image_size) + rbd_dev->mapping.size = header->image_size; + + up_write(&rbd_dev->header_rwsem); + return 0; out_2big: ret = -EIO; @@ -3032,17 +3046,11 @@ out: } /* - * Read the complete header for the given rbd device. - * - * Returns a pointer to a dynamically-allocated buffer containing - * the complete and validated header. Caller can pass the address - * of a variable that will be filled in with the version of the - * header object at the time it was read. - * - * Returns a pointer-coded errno if a failure occurs. + * Read the complete header for the given rbd device. On successful + * return, the rbd_dev->header field will contain up-to-date + * information about the image. */ -static struct rbd_image_header_ondisk * -rbd_dev_v1_header_read(struct rbd_device *rbd_dev) +static int rbd_dev_v1_header_read(struct rbd_device *rbd_dev) { struct rbd_image_header_ondisk *ondisk = NULL; u32 snap_count = 0; @@ -3067,22 +3075,22 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev) size += names_size; ondisk = kmalloc(size, GFP_KERNEL); if (!ondisk) - return ERR_PTR(-ENOMEM); + return -ENOMEM; ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 0, size, ondisk); if (ret < 0) - goto out_err; + goto out; if ((size_t)ret < size) { ret = -ENXIO; rbd_warn(rbd_dev, "short header read (want %zd got %d)", size, ret); - goto out_err; + goto out; } if (!rbd_dev_ondisk_valid(ondisk)) { ret = -ENXIO; rbd_warn(rbd_dev, "invalid header"); - goto out_err; + goto out; } names_size = le64_to_cpu(ondisk->snap_names_len); @@ -3090,27 +3098,8 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev) snap_count = le32_to_cpu(ondisk->snap_count); } while (snap_count != want_count); - return ondisk; - -out_err: - kfree(ondisk); - - return ERR_PTR(ret); -} - -/* - * reload the ondisk the header - */ -static int rbd_read_header(struct rbd_device *rbd_dev, - struct rbd_image_header *header) -{ - struct rbd_image_header_ondisk *ondisk; - int ret; - - ondisk = rbd_dev_v1_header_read(rbd_dev); - if (IS_ERR(ondisk)) - return PTR_ERR(ondisk); - ret = rbd_header_from_disk(header, ondisk); + ret = rbd_header_from_disk(rbd_dev, ondisk); +out: kfree(ondisk); return ret; @@ -3121,40 +3110,7 @@ static int rbd_read_header(struct rbd_device *rbd_dev, */ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev) { - int ret; - struct rbd_image_header h; - - memset(&h, 0, sizeof (h)); - ret = rbd_read_header(rbd_dev, &h); - if (ret < 0) - return ret; - - down_write(&rbd_dev->header_rwsem); - - /* Update image size, and check for resize of mapped image */ - rbd_dev->header.image_size = h.image_size; - if (rbd_dev->spec->snap_id == CEPH_NOSNAP) - if (rbd_dev->mapping.size != rbd_dev->header.image_size) - rbd_dev->mapping.size = rbd_dev->header.image_size; - - /* rbd_dev->header.object_prefix shouldn't change */ - kfree(rbd_dev->header.snap_sizes); - kfree(rbd_dev->header.snap_names); - /* osd requests may still refer to snapc */ - ceph_put_snap_context(rbd_dev->header.snapc); - - rbd_dev->header.image_size = h.image_size; - rbd_dev->header.snapc = h.snapc; - rbd_dev->header.snap_names = h.snap_names; - rbd_dev->header.snap_sizes = h.snap_sizes; - /* Free the extra copy of the object prefix */ - if (strcmp(rbd_dev->header.object_prefix, h.object_prefix)) - rbd_warn(rbd_dev, "object prefix changed (ignoring)"); - kfree(h.object_prefix); - - up_write(&rbd_dev->header_rwsem); - - return ret; + return rbd_dev_v1_header_read(rbd_dev); } /* @@ -4517,7 +4473,7 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) /* Populate rbd image metadata */ - ret = rbd_read_header(rbd_dev, &rbd_dev->header); + ret = rbd_dev_v1_header_read(rbd_dev); if (ret < 0) goto out_err; -- cgit v1.1 From 30d60ba2f258da24b91edb880338c7178f901de9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 09:51:30 -0500 Subject: rbd: simplify rbd_dev_v1_probe() An rbd_dev structure's fields are all zero-filled for an initial probe, so there's no need to explicitly zero the parent_spec and parent_overlap fields in rbd_dev_v1_probe(). Removing these assignments makes rbd_dev_v1_probe() *almost* trivial. Move the dout() message that announces discovery of an image into rbd_dev_image_probe(), generalize to support images in either format and only show it if an image is fully discovered. This highlights that are some unnecessary cleanups in the error path for rbd_dev_v1_probe(), so they can be removed. Now rbd_dev_v1_probe() *is* a trivial wrapper function. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 37 +++++++------------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e4586f2..ae22381 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4469,31 +4469,7 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) { - int ret; - - /* Populate rbd image metadata */ - - ret = rbd_dev_v1_header_read(rbd_dev); - if (ret < 0) - goto out_err; - - /* Version 1 images have no parent (no layering) */ - - rbd_dev->parent_spec = NULL; - rbd_dev->parent_overlap = 0; - - dout("discovered version 1 image, header name is %s\n", - rbd_dev->header_name); - - return 0; - -out_err: - kfree(rbd_dev->header_name); - rbd_dev->header_name = NULL; - kfree(rbd_dev->spec->image_id); - rbd_dev->spec->image_id = NULL; - - return ret; + return rbd_dev_v1_header_read(rbd_dev); } static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) @@ -4553,9 +4529,6 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) if (ret) goto out_err; - dout("discovered version 2 image, header name is %s\n", - rbd_dev->header_name); - return 0; out_err: rbd_dev->parent_overlap = 0; @@ -4758,9 +4731,13 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only) rbd_dev->mapping.read_only = read_only; ret = rbd_dev_probe_parent(rbd_dev); - if (!ret) - return 0; + if (ret) + goto err_out_probe; + + dout("discovered format %u image, header name is %s\n", + rbd_dev->image_format, rbd_dev->header_name); + return 0; err_out_probe: rbd_dev_unprobe(rbd_dev); err_out_watch: -- cgit v1.1 From 99a41ebcee1a1ea0463b1b29d2e888de21a60c66 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 09:51:30 -0500 Subject: rbd: get rid of trivial v1 header wrappers Get rid of the trivial wrapper functions rbd_dev_v1_refresh() and rbd_dev_v1_probe(), substituting rbd_dev_v1_header_read() calls in their place. Rename rbd_dev_v1_header_read() to be rbd_dev_v1_header_info(), to be more generic (it will better reflect what happens with format 2 images). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ae22381..6748fe2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -788,7 +788,7 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, * Copy the names, and fill in each snapshot's id * and size. * - * Note that rbd_dev_v1_header_read() guarantees the + * Note that rbd_dev_v1_header_info() guarantees the * ondisk buffer we're working with has * snap_names_len bytes beyond the end of the * snapshot id array, this memcpy() is safe. @@ -3050,7 +3050,7 @@ out: * return, the rbd_dev->header field will contain up-to-date * information about the image. */ -static int rbd_dev_v1_header_read(struct rbd_device *rbd_dev) +static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) { struct rbd_image_header_ondisk *ondisk = NULL; u32 snap_count = 0; @@ -3106,14 +3106,6 @@ out: } /* - * only read the first part of the ondisk header, without the snaps info - */ -static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev) -{ - return rbd_dev_v1_header_read(rbd_dev); -} - -/* * Clear the rbd device's EXISTS flag if the snapshot it's mapped to * has disappeared from the (just updated) snapshot context. */ @@ -3141,7 +3133,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) mapping_size = rbd_dev->mapping.size; mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); if (rbd_dev->image_format == 1) - ret = rbd_dev_v1_refresh(rbd_dev); + ret = rbd_dev_v1_header_info(rbd_dev); else ret = rbd_dev_v2_refresh(rbd_dev); @@ -4467,11 +4459,6 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) memset(header, 0, sizeof (*header)); } -static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) -{ - return rbd_dev_v1_header_read(rbd_dev); -} - static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) { int ret; @@ -4714,7 +4701,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only) goto out_header_name; if (rbd_dev->image_format == 1) - ret = rbd_dev_v1_probe(rbd_dev); + ret = rbd_dev_v1_header_info(rbd_dev); else ret = rbd_dev_v2_probe(rbd_dev); if (ret) -- cgit v1.1 From 2df3fac75851dc4257b90dc72fdd3cf27ba177bc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 09:51:30 -0500 Subject: rbd: define rbd_dev_v2_header_info() This rearranges rbd_dev_v2_refresh() so it works more like rbd_dev_v1_header_info(). While format 1 images need to read the whole header object to get any information, format 2 can collect almost all information selectively. So the one-time initialization will remain in a separate function--based on rbd_dev_v2_probe(). Rename rbd_dev_v2_refresh() to be rbd_dev_v2_header_info(), and have it call rbd_dev_v2_header_onetime() if it's being called for the first time for the given rbd device. Rename rbd_dev_v2_probe() to be rbd_dev_v2_header_onetime() and remove the image size and snapshot context calls it held in common with the refresh function. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6748fe2..0d874a5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -425,7 +425,8 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request); static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); static int rbd_dev_refresh(struct rbd_device *rbd_dev); -static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev); +static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); +static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev); static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u64 snap_id); static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, @@ -3135,7 +3136,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) if (rbd_dev->image_format == 1) ret = rbd_dev_v1_header_info(rbd_dev); else - ret = rbd_dev_v2_refresh(rbd_dev); + ret = rbd_dev_v2_header_info(rbd_dev); /* If it's a mapped snapshot, validate its EXISTS flag */ @@ -4005,12 +4006,19 @@ out: return snap_name; } -static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev) +static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) { + bool first_time = rbd_dev->header.object_prefix == NULL; int ret; down_write(&rbd_dev->header_rwsem); + if (first_time) { + ret = rbd_dev_v2_header_onetime(rbd_dev); + if (ret) + goto out; + } + ret = rbd_dev_v2_image_size(rbd_dev); if (ret) goto out; @@ -4459,22 +4467,18 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) memset(header, 0, sizeof (*header)); } -static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) +static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) { int ret; - ret = rbd_dev_v2_image_size(rbd_dev); - if (ret) - goto out_err; - - /* Get the object prefix (a.k.a. block_name) for the image */ - ret = rbd_dev_v2_object_prefix(rbd_dev); if (ret) goto out_err; - /* Get the and check features for the image */ - + /* + * Get the and check features for the image. Currently the + * features are assumed to never change. + */ ret = rbd_dev_v2_features(rbd_dev); if (ret) goto out_err; @@ -4504,17 +4508,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) if (ret < 0) goto out_err; } - - /* crypto and compression type aren't (yet) supported for v2 images */ - - rbd_dev->header.crypt_type = 0; - rbd_dev->header.comp_type = 0; - - /* Get the snapshot context, plus the header version */ - - ret = rbd_dev_v2_snap_context(rbd_dev); - if (ret) - goto out_err; + /* No support for crypto and compression type format 2 images */ return 0; out_err: @@ -4703,7 +4697,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only) if (rbd_dev->image_format == 1) ret = rbd_dev_v1_header_info(rbd_dev); else - ret = rbd_dev_v2_probe(rbd_dev); + ret = rbd_dev_v2_header_info(rbd_dev); if (ret) goto err_out_watch; -- cgit v1.1 From 91c6febb3817be576785ef06aeaaa8ed423e0a2a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:32 -0500 Subject: rbd: fix an incorrect assertion condition In rbd_img_obj_parent_read_full_callback() there is an assertion intended to verify the size of the image request for a full parent read was the size of the original request's target object. But assertion was looking at the parent image order rather than the original one, and these values can differ. Fix that. This resolves: http://tracker.ceph.com/issues/4938 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0d874a5..15ac2a5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2186,13 +2186,13 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) result = img_request->result; obj_size = img_request->length; xferred = img_request->xferred; + rbd_img_request_put(img_request); - rbd_dev = img_request->rbd_dev; + rbd_assert(orig_request->img_request); + rbd_dev = orig_request->img_request->rbd_dev; rbd_assert(rbd_dev); rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order); - rbd_img_request_put(img_request); - if (result) goto out_err; -- cgit v1.1 From 5b2ab72d367d2682c1a237448fbc1595881a88fa Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: rbd: support reading parent page data Currently, rbd_img_parent_read() assumes the incoming object request contains bio data. But if a layered image is part of a multi-layer stack of images it will result in read requests of page data to parent images. Fortunately, it's not hard to add support for page data. This resolves: http://tracker.ceph.com/issues/4939 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 15ac2a5..2a0e9b8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2574,7 +2574,7 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request) rbd_assert(obj_request_img_data_test(obj_request)); rbd_assert(obj_request->img_request != NULL); rbd_assert(obj_request->result == (s32) -ENOENT); - rbd_assert(obj_request->type == OBJ_REQUEST_BIO); + rbd_assert(obj_request_type_valid(obj_request->type)); rbd_dev = obj_request->img_request->rbd_dev; rbd_assert(rbd_dev->parent != NULL); @@ -2590,8 +2590,12 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request) rbd_obj_request_get(obj_request); img_request->obj_request = obj_request; - result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, - obj_request->bio_list); + if (obj_request->type == OBJ_REQUEST_BIO) + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, + obj_request->bio_list); + else + result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, + obj_request->pages); if (result) goto out_err; -- cgit v1.1 From 7ce4eef7b5fad73b365b7e4b8892af3af72d4bd3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: rbd: set mapping read-only flag in rbd_add() The rbd_dev->mapping field for a parent image is not meaningful. Since rbd_image_probe() is used both for images being mapped and their parents, it doesn't make sense to set that flag in that function. So move the setting of the mapping.read_only flag out of rbd_dev_image_probe() and into rbd_add() instead. This resolves: http://tracker.ceph.com/issues/4940 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2a0e9b8..dbfc44a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -358,7 +358,7 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); -static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only); +static int rbd_dev_image_probe(struct rbd_device *rbd_dev); static struct bus_attribute rbd_bus_attrs[] = { __ATTR(add, S_IWUSR, NULL, rbd_add), @@ -4549,7 +4549,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) if (!parent) goto out_err; - ret = rbd_dev_image_probe(parent, true); + ret = rbd_dev_image_probe(parent); if (ret < 0) goto out_err; rbd_dev->parent = parent; @@ -4671,10 +4671,9 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev) /* * Probe for the existence of the header object for the given rbd - * device. For format 2 images this includes determining the image - * id. + * device. */ -static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only) +static int rbd_dev_image_probe(struct rbd_device *rbd_dev) { int ret; int tmp; @@ -4709,12 +4708,6 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only) if (ret) goto err_out_probe; - /* If we are mapping a snapshot it must be marked read-only */ - - if (rbd_dev->spec->snap_id != CEPH_NOSNAP) - read_only = true; - rbd_dev->mapping.read_only = read_only; - ret = rbd_dev_probe_parent(rbd_dev); if (ret) goto err_out_probe; @@ -4795,10 +4788,16 @@ static ssize_t rbd_add(struct bus_type *bus, rbdc = NULL; /* rbd_dev now owns this */ spec = NULL; /* rbd_dev now owns this */ - rc = rbd_dev_image_probe(rbd_dev, read_only); + rc = rbd_dev_image_probe(rbd_dev); if (rc < 0) goto err_out_rbd_dev; + /* If we are mapping a snapshot it must be marked read-only */ + + if (rbd_dev->spec->snap_id != CEPH_NOSNAP) + read_only = true; + rbd_dev->mapping.read_only = read_only; + rc = rbd_dev_device_setup(rbd_dev); if (!rc) return count; -- cgit v1.1 From 1f3ef78861ac4b510175e177899b9b5ba4bbed91 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: rbd: only set up watch for mapped images Any changes to parent images are immaterial to any mapped clone. So there is no need to have a watch event registered on header objects except for the header object of an image that is mapped. In fact, a watch request is a write operation, and we may only have read access to a parent image. We can't set up the watch request until we know the name of the header object though. So pass a flag to rbd_dev_image_probe() to indicate whether this probe is for a mapping or for a parent image. Change the second parameter to rbd_dev_header_watch_sync() be Boolean while we're at it. This resolves: http://tracker.ceph.com/issues/4941 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index dbfc44a..e417704 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -358,7 +358,7 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); -static int rbd_dev_image_probe(struct rbd_device *rbd_dev); +static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); static struct bus_attribute rbd_bus_attrs[] = { __ATTR(add, S_IWUSR, NULL, rbd_add), @@ -2664,7 +2664,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) * Request sync osd watch/unwatch. The value of "start" determines * whether a watch request is being initiated or torn down. */ -static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; @@ -2698,7 +2698,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) rbd_dev->watch_request->osd_req); osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, - rbd_dev->watch_event->cookie, 0, start); + rbd_dev->watch_event->cookie, 0, start ? 1 : 0); rbd_osd_req_format_write(obj_request); ret = rbd_obj_request_submit(osdc, obj_request); @@ -4549,7 +4549,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) if (!parent) goto out_err; - ret = rbd_dev_image_probe(parent); + ret = rbd_dev_image_probe(parent, false); if (ret < 0) goto out_err; rbd_dev->parent = parent; @@ -4654,12 +4654,7 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev) static void rbd_dev_image_release(struct rbd_device *rbd_dev) { - int ret; - rbd_dev_unprobe(rbd_dev); - ret = rbd_dev_header_watch_sync(rbd_dev, 0); - if (ret) - rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); kfree(rbd_dev->header_name); rbd_dev->header_name = NULL; rbd_dev->image_format = 0; @@ -4671,9 +4666,11 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev) /* * Probe for the existence of the header object for the given rbd - * device. + * device. If this image is the one being mapped (i.e., not a + * parent), initiate a watch on its header object before using that + * object to get detailed information about the rbd image. */ -static int rbd_dev_image_probe(struct rbd_device *rbd_dev) +static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) { int ret; int tmp; @@ -4693,9 +4690,11 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev) if (ret) goto err_out_format; - ret = rbd_dev_header_watch_sync(rbd_dev, 1); - if (ret) - goto out_header_name; + if (mapping) { + ret = rbd_dev_header_watch_sync(rbd_dev, true); + if (ret) + goto out_header_name; + } if (rbd_dev->image_format == 1) ret = rbd_dev_v1_header_info(rbd_dev); @@ -4719,9 +4718,12 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev) err_out_probe: rbd_dev_unprobe(rbd_dev); err_out_watch: - tmp = rbd_dev_header_watch_sync(rbd_dev, 0); - if (tmp) - rbd_warn(rbd_dev, "unable to tear down watch request\n"); + if (mapping) { + tmp = rbd_dev_header_watch_sync(rbd_dev, false); + if (tmp) + rbd_warn(rbd_dev, "unable to tear down " + "watch request (%d)\n", tmp); + } out_header_name: kfree(rbd_dev->header_name); rbd_dev->header_name = NULL; @@ -4788,7 +4790,7 @@ static ssize_t rbd_add(struct bus_type *bus, rbdc = NULL; /* rbd_dev now owns this */ spec = NULL; /* rbd_dev now owns this */ - rc = rbd_dev_image_probe(rbd_dev); + rc = rbd_dev_image_probe(rbd_dev, true); if (rc < 0) goto err_out_rbd_dev; @@ -4910,10 +4912,13 @@ static ssize_t rbd_remove(struct bus_type *bus, spin_unlock_irq(&rbd_dev->lock); if (ret < 0) goto done; - ret = count; rbd_bus_del_dev(rbd_dev); + ret = rbd_dev_header_watch_sync(rbd_dev, false); + if (ret) + rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); rbd_dev_image_release(rbd_dev); module_put(THIS_MODULE); + ret = count; done: mutex_unlock(&ctl_mutex); -- cgit v1.1 From c48f3f86e248b1649ad22151dd45ef2610101ed3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 8 May 2013 08:49:52 -0500 Subject: rbd: kill rbd_img_request_get() Get rid of rbd_img_request_get(), because it isn't used, and maybe won't ever be needed. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e417704..51c45e7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1357,13 +1357,6 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request) kref_put(&obj_request->kref, rbd_obj_request_destroy); } -static void rbd_img_request_get(struct rbd_img_request *img_request) -{ - dout("%s: img %p (was %d)\n", __func__, img_request, - atomic_read(&img_request->kref.refcount)); - kref_get(&img_request->kref); -} - static void rbd_img_request_destroy(struct kref *kref); static void rbd_img_request_put(struct rbd_img_request *img_request) { @@ -1888,9 +1881,6 @@ static struct rbd_img_request *rbd_img_request_create( INIT_LIST_HEAD(&img_request->obj_requests); kref_init(&img_request->kref); - rbd_img_request_get(img_request); /* Avoid a warning */ - rbd_img_request_put(img_request); /* TEMPORARY */ - dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, write_request ? "write" : "read", offset, length, img_request); -- cgit v1.1 From ebda6408f2227a774343c3cc2861384942143ff3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 10 May 2013 16:29:22 -0500 Subject: rbd: fix parent request size assumption The code that reads object data from the parent for a copyup on write request currently assumes that the size of that request is the size of a "full" object from the original target image. That is not necessarily the case. The parent overlap could reduce the request size below that. To fix that assumption we need to record the number of pages in the copyup_pages array, for both an image request and an object request. Rename a local variable in rbd_img_obj_parent_read_full_callback() to reflect we're recording the length of the parent read request, not the size of the target object. This resolves: http://tracker.ceph.com/issues/5038 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 51c45e7..597b9bb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -224,6 +224,7 @@ struct rbd_obj_request { }; }; struct page **copyup_pages; + u32 copyup_page_count; struct ceph_osd_request *osd_req; @@ -256,6 +257,7 @@ struct rbd_img_request { struct rbd_obj_request *obj_request; /* obj req initiator */ }; struct page **copyup_pages; + u32 copyup_page_count; spinlock_t completion_lock;/* protects next_completion */ u32 next_completion; rbd_img_callback_t callback; @@ -2119,7 +2121,7 @@ rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; struct rbd_device *rbd_dev; - u64 length; + struct page **pages; u32 page_count; rbd_assert(obj_request->type == OBJ_REQUEST_BIO); @@ -2129,12 +2131,14 @@ rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) rbd_dev = img_request->rbd_dev; rbd_assert(rbd_dev); - length = (u64)1 << rbd_dev->header.obj_order; - page_count = (u32)calc_pages_for(0, length); - rbd_assert(obj_request->copyup_pages); - ceph_release_page_vector(obj_request->copyup_pages, page_count); + pages = obj_request->copyup_pages; + rbd_assert(pages != NULL); obj_request->copyup_pages = NULL; + page_count = obj_request->copyup_page_count; + rbd_assert(page_count); + obj_request->copyup_page_count = 0; + ceph_release_page_vector(pages, page_count); /* * We want the transfer count to reflect the size of the @@ -2158,9 +2162,9 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) struct ceph_osd_client *osdc; struct rbd_device *rbd_dev; struct page **pages; + u32 page_count; int result; - u64 obj_size; - u64 xferred; + u64 parent_length; rbd_assert(img_request_child_test(img_request)); @@ -2169,19 +2173,21 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) pages = img_request->copyup_pages; rbd_assert(pages != NULL); img_request->copyup_pages = NULL; + page_count = img_request->copyup_page_count; + rbd_assert(page_count); + img_request->copyup_page_count = 0; orig_request = img_request->obj_request; rbd_assert(orig_request != NULL); rbd_assert(orig_request->type == OBJ_REQUEST_BIO); result = img_request->result; - obj_size = img_request->length; - xferred = img_request->xferred; + parent_length = img_request->length; + rbd_assert(parent_length == img_request->xferred); rbd_img_request_put(img_request); rbd_assert(orig_request->img_request); rbd_dev = orig_request->img_request->rbd_dev; rbd_assert(rbd_dev); - rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order); if (result) goto out_err; @@ -2195,11 +2201,12 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) goto out_err; orig_request->osd_req = osd_req; orig_request->copyup_pages = pages; + orig_request->copyup_page_count = page_count; /* Initialize the copyup op */ osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); - osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0, + osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, false, false); /* Then the original write request op */ @@ -2312,6 +2319,7 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) if (result) goto out_err; parent_request->copyup_pages = pages; + parent_request->copyup_page_count = page_count; parent_request->callback = rbd_img_obj_parent_read_full_callback; result = rbd_img_request_submit(parent_request); @@ -2319,6 +2327,7 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) return 0; parent_request->copyup_pages = NULL; + parent_request->copyup_page_count = 0; parent_request->obj_request = NULL; rbd_obj_request_put(obj_request); out_err: -- cgit v1.1 From b91f09f17b2a302f07022e2f766969e2536d71b3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 10 May 2013 16:29:22 -0500 Subject: rbd: support reading parent page data for writes Currently, rbd_img_obj_parent_read_full() assumes the incoming object request contains bio data. But if a layered image is part of a multi-layer stack of images it will result in read requests of page data to parent images. This is handling the same kind of issue as was resolved by this commit: 5b2ab72d rbd: support reading parent page data This resolves: http://tracker.ceph.com/issues/5027 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 597b9bb..5161e80 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2165,6 +2165,8 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) u32 page_count; int result; u64 parent_length; + u64 offset; + u64 length; rbd_assert(img_request_child_test(img_request)); @@ -2179,7 +2181,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) orig_request = img_request->obj_request; rbd_assert(orig_request != NULL); - rbd_assert(orig_request->type == OBJ_REQUEST_BIO); + rbd_assert(obj_request_type_valid(orig_request->type)); result = img_request->result; parent_length = img_request->length; rbd_assert(parent_length == img_request->xferred); @@ -2211,11 +2213,17 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) /* Then the original write request op */ + offset = orig_request->offset; + length = orig_request->length; osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, - orig_request->offset, - orig_request->length, 0, 0); - osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list, - orig_request->length); + offset, length, 0, 0); + if (orig_request->type == OBJ_REQUEST_BIO) + osd_req_op_extent_osd_data_bio(osd_req, 1, + orig_request->bio_list, length); + else + osd_req_op_extent_osd_data_pages(osd_req, 1, + orig_request->pages, length, + offset & ~PAGE_MASK, false, false); rbd_osd_req_format_write(orig_request); @@ -2261,7 +2269,7 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) int result; rbd_assert(obj_request_img_data_test(obj_request)); - rbd_assert(obj_request->type == OBJ_REQUEST_BIO); + rbd_assert(obj_request_type_valid(obj_request->type)); img_request = obj_request->img_request; rbd_assert(img_request != NULL); -- cgit v1.1 From 70cf49cfc7a4d1eb4aeea6cd128b88230be9d0b1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: rbd: ignore zero-overlap parent An rbd clone image that has an overlap with its parent of 0 is effectively not a layered image at all. Detect this case and treat such an image as non-layered. Issue a warning to be sure the user knows what's going on. This resolves: http://tracker.ceph.com/issues/5028 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5161e80..b67ecda 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3665,9 +3665,13 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); ceph_decode_64_safe(&p, end, overlap, out_err); - rbd_dev->parent_overlap = overlap; - rbd_dev->parent_spec = parent_spec; - parent_spec = NULL; /* rbd_dev now owns this */ + if (overlap) { + rbd_dev->parent_spec = parent_spec; + parent_spec = NULL; /* rbd_dev now owns this */ + rbd_dev->parent_overlap = overlap; + } else { + rbd_warn(rbd_dev, "ignoring parent of clone with overlap 0\n"); + } out: ret = 0; out_err: -- cgit v1.1 From 642a25375f4c863607d2170f4471aec8becf7788 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: rbd: get parent info on refresh Get parent info for format 2 images on every refresh (rather than just during the initial probe). This will be needed to detect the disappearance of the parent image in the event a mapped image becomes unlayered (i.e., flattened). Avoid leaking the previous parent spec on the second and subsequent times this information is requested by dropping the previous one (if any) before updating it. (Also, extract the pool id into a local variable before assigning it into the parent spec.) Switch to using a non-zero parent overlap value rather than the existence of a parent (a non-null parent_spec pointer) to determine whether to mark a request layered. It will soon be possible for a layered image to become unlayered while a request is in flight. This means that the layered flag for an image request indicates that there was a non-zero parent overlap at the time the image request was created. The parent overlap can change thereafter, which may lead to special handling at request submission or completion time. This and the next several patches are related to: http://tracker.ceph.com/issues/3763 NOTE: If an error occurs while refreshing the parent info (i.e., requesting it after initial probe), the old parent info will persist. This is not really correct, and is a scenario that needs to be addressed. For now we'll assert that the failure mode is unlikely, but the issue has been documented in tracker issue 5040. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 67 +++++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 30 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b67ecda..fcef63c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1873,7 +1873,7 @@ static struct rbd_img_request *rbd_img_request_create( } if (child_request) img_request_child_set(img_request); - if (rbd_dev->parent_spec) + if (rbd_dev->parent_overlap) img_request_layered_set(img_request); spin_lock_init(&img_request->completion_lock); img_request->next_completion = 0; @@ -3613,6 +3613,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) __le64 snapid; void *p; void *end; + u64 pool_id; char *image_id; u64 overlap; int ret; @@ -3643,18 +3644,19 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) p = reply_buf; end = reply_buf + ret; ret = -ERANGE; - ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); - if (parent_spec->pool_id == CEPH_NOPOOL) + ceph_decode_64_safe(&p, end, pool_id, out_err); + if (pool_id == CEPH_NOPOOL) goto out; /* No parent? No problem. */ /* The ceph file layout needs to fit pool id in 32 bits */ ret = -EIO; - if (parent_spec->pool_id > (u64)U32_MAX) { + if (pool_id > (u64)U32_MAX) { rbd_warn(NULL, "parent pool id too large (%llu > %u)\n", - (unsigned long long)parent_spec->pool_id, U32_MAX); + (unsigned long long)pool_id, U32_MAX); goto out_err; } + parent_spec->pool_id = pool_id; image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(image_id)) { @@ -3666,6 +3668,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) ceph_decode_64_safe(&p, end, overlap, out_err); if (overlap) { + rbd_spec_put(rbd_dev->parent_spec); rbd_dev->parent_spec = parent_spec; parent_spec = NULL; /* rbd_dev now owns this */ rbd_dev->parent_overlap = overlap; @@ -4034,17 +4037,43 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) goto out; } + /* + * If the image supports layering, get the parent info. We + * need to probe the first time regardless. Thereafter we + * only need to if there's a parent, to see if it has + * disappeared due to the mapped image getting flattened. + */ + if (rbd_dev->header.features & RBD_FEATURE_LAYERING && + (first_time || rbd_dev->parent_spec)) { + bool warn; + + ret = rbd_dev_v2_parent_info(rbd_dev); + if (ret) + goto out; + + /* + * Print a warning if this is the initial probe and + * the image has a parent. Don't print it if the + * image now being probed is itself a parent. We + * can tell at this point because we won't know its + * pool name yet (just its pool id). + */ + warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name; + if (first_time && warn) + rbd_warn(rbd_dev, "WARNING: kernel layering " + "is EXPERIMENTAL!"); + } + ret = rbd_dev_v2_image_size(rbd_dev); if (ret) goto out; + if (rbd_dev->spec->snap_id == CEPH_NOSNAP) if (rbd_dev->mapping.size != rbd_dev->header.image_size) rbd_dev->mapping.size = rbd_dev->header.image_size; ret = rbd_dev_v2_snap_context(rbd_dev); dout("rbd_dev_v2_snap_context returned %d\n", ret); - if (ret) - goto out; out: up_write(&rbd_dev->header_rwsem); @@ -4498,24 +4527,6 @@ static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) if (ret) goto out_err; - /* If the image supports layering, get the parent info */ - - if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { - ret = rbd_dev_v2_parent_info(rbd_dev); - if (ret) - goto out_err; - /* - * Print a warning if this image has a parent. - * Don't print it if the image now being probed - * is itself a parent. We can tell at this point - * because we won't know its pool name yet (just its - * pool id). - */ - if (rbd_dev->parent_spec && rbd_dev->spec->pool_name) - rbd_warn(rbd_dev, "WARNING: kernel layering " - "is EXPERIMENTAL!"); - } - /* If the image supports fancy striping, get its parameters */ if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { @@ -4527,11 +4538,7 @@ static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) return 0; out_err: - rbd_dev->parent_overlap = 0; - rbd_spec_put(rbd_dev->parent_spec); - rbd_dev->parent_spec = NULL; - kfree(rbd_dev->header_name); - rbd_dev->header_name = NULL; + rbd_dev->header.features = 0; kfree(rbd_dev->header.object_prefix); rbd_dev->header.object_prefix = NULL; -- cgit v1.1 From 8785b1d487f0a31afd2c802499786d3b355eccea Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 9 May 2013 10:08:49 -0500 Subject: rbd: don't release write request until necessary Previously when a layered write was going to involve a copyup request, the original osd request was released before submitting the parent full-object read. The osd request for the copyup would then be allocated in rbd_img_obj_parent_read_full_callback(). Shortly we will be handling the event of mapped layered images getting flattened, and when that occurs we need to resubmit the original request. We therefore don't want to release the osd request until we really konw we're going to replace it--in the callback function. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fcef63c..d861c71 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2194,13 +2194,17 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) if (result) goto out_err; - /* Allocate the new copyup osd request for the original request */ - + /* + * The original osd request is of no use to use any more. + * We need a new one that can hold the two ops in a copyup + * request. Allocate the new copyup osd request for the + * original request, and release the old one. + */ result = -ENOMEM; - rbd_assert(!orig_request->osd_req); osd_req = rbd_osd_req_create_copyup(orig_request); if (!osd_req) goto out_err; + rbd_osd_req_destroy(orig_request->osd_req); orig_request->osd_req = osd_req; orig_request->copyup_pages = pages; orig_request->copyup_page_count = page_count; @@ -2277,15 +2281,6 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) rbd_assert(rbd_dev->parent != NULL); /* - * First things first. The original osd request is of no - * use to use any more, we'll need a new one that can hold - * the two ops in a copyup request. We'll get that later, - * but for now we can release the old one. - */ - rbd_osd_req_destroy(obj_request->osd_req); - obj_request->osd_req = NULL; - - /* * Determine the byte range covered by the object in the * child image to which the original request was to be sent. */ -- cgit v1.1 From fb65d2284c117cfc28d30217d25a14a8e7a75a94 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 8 May 2013 22:50:04 -0500 Subject: rbd: define rbd_dev_unparent() Define rbd_dev_unparent() to encapsulate cleaning up parent data structures from a layered rbd image. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d861c71..9c2b20a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1837,6 +1837,17 @@ static void rbd_obj_request_destroy(struct kref *kref) kmem_cache_free(rbd_obj_request_cache, obj_request); } +/* It's OK to call this for a device with no parent */ + +static void rbd_spec_put(struct rbd_spec *spec); +static void rbd_dev_unparent(struct rbd_device *rbd_dev) +{ + rbd_dev_remove_parent(rbd_dev); + rbd_spec_put(rbd_dev->parent_spec); + rbd_dev->parent_spec = NULL; + rbd_dev->parent_overlap = 0; +} + /* * Caller is responsible for filling in the list of object requests * that comprises the image request, and the Linux request pointer @@ -4491,10 +4502,7 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) { struct rbd_image_header *header; - rbd_dev_remove_parent(rbd_dev); - rbd_spec_put(rbd_dev->parent_spec); - rbd_dev->parent_spec = NULL; - rbd_dev->parent_overlap = 0; + rbd_dev_unparent(rbd_dev); /* Free dynamic fields from the header, then zero it out */ @@ -4570,7 +4578,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) return 0; out_err: if (parent) { - rbd_spec_put(rbd_dev->parent_spec); + rbd_dev_unparent(rbd_dev); kfree(rbd_dev->header_name); rbd_dev_destroy(parent); } else { -- cgit v1.1 From e93f3152357ca75284284bef8eeea7d45fe1bab1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 8 May 2013 22:50:04 -0500 Subject: rbd: define parent image request routines Define rbd_parent_request_create() and rbd_parent_request_destroy() to handle the creation of parent image requests submitted for layered image objects. For simplicity, let rbd_img_request_put() handle dropping the reference to any image request (parent or not), and call whichever destructor is appropriate on the last put. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 78 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 23 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9c2b20a..1ffdfbf 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1359,13 +1359,18 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request) kref_put(&obj_request->kref, rbd_obj_request_destroy); } +static bool img_request_child_test(struct rbd_img_request *img_request); +static void rbd_parent_request_destroy(struct kref *kref); static void rbd_img_request_destroy(struct kref *kref); static void rbd_img_request_put(struct rbd_img_request *img_request) { rbd_assert(img_request != NULL); dout("%s: img %p (was %d)\n", __func__, img_request, atomic_read(&img_request->kref.refcount)); - kref_put(&img_request->kref, rbd_img_request_destroy); + if (img_request_child_test(img_request)) + kref_put(&img_request->kref, rbd_parent_request_destroy); + else + kref_put(&img_request->kref, rbd_img_request_destroy); } static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, @@ -1482,6 +1487,12 @@ static void img_request_child_set(struct rbd_img_request *img_request) smp_mb(); } +static void img_request_child_clear(struct rbd_img_request *img_request) +{ + clear_bit(IMG_REQ_CHILD, &img_request->flags); + smp_mb(); +} + static bool img_request_child_test(struct rbd_img_request *img_request) { smp_mb(); @@ -1856,8 +1867,7 @@ static void rbd_dev_unparent(struct rbd_device *rbd_dev) static struct rbd_img_request *rbd_img_request_create( struct rbd_device *rbd_dev, u64 offset, u64 length, - bool write_request, - bool child_request) + bool write_request) { struct rbd_img_request *img_request; @@ -1882,8 +1892,6 @@ static struct rbd_img_request *rbd_img_request_create( } else { img_request->snap_id = rbd_dev->spec->snap_id; } - if (child_request) - img_request_child_set(img_request); if (rbd_dev->parent_overlap) img_request_layered_set(img_request); spin_lock_init(&img_request->completion_lock); @@ -1918,12 +1926,46 @@ static void rbd_img_request_destroy(struct kref *kref) if (img_request_write_test(img_request)) ceph_put_snap_context(img_request->snapc); - if (img_request_child_test(img_request)) - rbd_obj_request_put(img_request->obj_request); - kmem_cache_free(rbd_img_request_cache, img_request); } +static struct rbd_img_request *rbd_parent_request_create( + struct rbd_obj_request *obj_request, + u64 img_offset, u64 length) +{ + struct rbd_img_request *parent_request; + struct rbd_device *rbd_dev; + + rbd_assert(obj_request->img_request); + rbd_dev = obj_request->img_request->rbd_dev; + + parent_request = rbd_img_request_create(rbd_dev->parent, + img_offset, length, false); + if (!parent_request) + return NULL; + + img_request_child_set(parent_request); + rbd_obj_request_get(obj_request); + parent_request->obj_request = obj_request; + + return parent_request; +} + +static void rbd_parent_request_destroy(struct kref *kref) +{ + struct rbd_img_request *parent_request; + struct rbd_obj_request *orig_request; + + parent_request = container_of(kref, struct rbd_img_request, kref); + orig_request = parent_request->obj_request; + + parent_request->obj_request = NULL; + rbd_obj_request_put(orig_request); + img_request_child_clear(parent_request); + + rbd_img_request_destroy(kref); +} + static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; @@ -2321,13 +2363,10 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) } result = -ENOMEM; - parent_request = rbd_img_request_create(rbd_dev->parent, - img_offset, length, - false, true); + parent_request = rbd_parent_request_create(obj_request, + img_offset, length); if (!parent_request) goto out_err; - rbd_obj_request_get(obj_request); - parent_request->obj_request = obj_request; result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); if (result) @@ -2580,7 +2619,6 @@ out: static void rbd_img_parent_read(struct rbd_obj_request *obj_request) { - struct rbd_device *rbd_dev; struct rbd_img_request *img_request; int result; @@ -2589,20 +2627,14 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request) rbd_assert(obj_request->result == (s32) -ENOENT); rbd_assert(obj_request_type_valid(obj_request->type)); - rbd_dev = obj_request->img_request->rbd_dev; - rbd_assert(rbd_dev->parent != NULL); /* rbd_read_finish(obj_request, obj_request->length); */ - img_request = rbd_img_request_create(rbd_dev->parent, + img_request = rbd_parent_request_create(obj_request, obj_request->img_offset, - obj_request->length, - false, true); + obj_request->length); result = -ENOMEM; if (!img_request) goto out_err; - rbd_obj_request_get(obj_request); - img_request->obj_request = obj_request; - if (obj_request->type == OBJ_REQUEST_BIO) result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, obj_request->bio_list); @@ -2913,7 +2945,7 @@ static void rbd_request_fn(struct request_queue *q) result = -ENOMEM; img_request = rbd_img_request_create(rbd_dev, offset, length, - write_request, false); + write_request); if (!img_request) goto end_request; -- cgit v1.1 From a2acd00e7964dbb1668a5956c9d0a4bdeb838c4a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 8 May 2013 22:50:04 -0500 Subject: rbd: reference count parent requests Keep a reference count for uses of the parent information for an rbd device. An initial reference is set in rbd_img_request_create() if the target image has a parent (with non-zero overlap). Each image request for an image with a non-zero parent overlap gets another reference when it's created, and that reference is dropped when the request is destroyed. The initial reference is dropped when the image gets torn down. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1ffdfbf..8b6091b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -55,6 +55,39 @@ #define SECTOR_SHIFT 9 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) +/* + * Increment the given counter and return its updated value. + * If the counter is already 0 it will not be incremented. + * If the counter is already at its maximum value returns + * -EINVAL without updating it. + */ +static int atomic_inc_return_safe(atomic_t *v) +{ + unsigned int counter; + + counter = (unsigned int)__atomic_add_unless(v, 1, 0); + if (counter <= (unsigned int)INT_MAX) + return (int)counter; + + atomic_dec(v); + + return -EINVAL; +} + +/* Decrement the counter. Return the resulting value, or -EINVAL */ +static int atomic_dec_return_safe(atomic_t *v) +{ + int counter; + + counter = atomic_dec_return(v); + if (counter >= 0) + return counter; + + atomic_inc(v); + + return -EINVAL; +} + #define RBD_DRV_NAME "rbd" #define RBD_DRV_NAME_LONG "rbd (rados block device)" @@ -312,6 +345,7 @@ struct rbd_device { struct rbd_spec *parent_spec; u64 parent_overlap; + atomic_t parent_ref; struct rbd_device *parent; /* protects updating the header */ @@ -361,6 +395,7 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); +static void rbd_spec_put(struct rbd_spec *spec); static struct bus_attribute rbd_bus_attrs[] = { __ATTR(add, S_IWUSR, NULL, rbd_add), @@ -1505,6 +1540,12 @@ static void img_request_layered_set(struct rbd_img_request *img_request) smp_mb(); } +static void img_request_layered_clear(struct rbd_img_request *img_request) +{ + clear_bit(IMG_REQ_LAYERED, &img_request->flags); + smp_mb(); +} + static bool img_request_layered_test(struct rbd_img_request *img_request) { smp_mb(); @@ -1860,6 +1901,58 @@ static void rbd_dev_unparent(struct rbd_device *rbd_dev) } /* + * Parent image reference counting is used to determine when an + * image's parent fields can be safely torn down--after there are no + * more in-flight requests to the parent image. When the last + * reference is dropped, cleaning them up is safe. + */ +static void rbd_dev_parent_put(struct rbd_device *rbd_dev) +{ + int counter; + + if (!rbd_dev->parent_spec) + return; + + counter = atomic_dec_return_safe(&rbd_dev->parent_ref); + if (counter > 0) + return; + + /* Last reference; clean up parent data structures */ + + if (!counter) + rbd_dev_unparent(rbd_dev); + else + rbd_warn(rbd_dev, "parent reference underflow\n"); +} + +/* + * If an image has a non-zero parent overlap, get a reference to its + * parent. + * + * Returns true if the rbd device has a parent with a non-zero + * overlap and a reference for it was successfully taken, or + * false otherwise. + */ +static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) +{ + int counter; + + if (!rbd_dev->parent_spec) + return false; + + counter = atomic_inc_return_safe(&rbd_dev->parent_ref); + if (counter > 0 && rbd_dev->parent_overlap) + return true; + + /* Image was flattened, but parent is not yet torn down */ + + if (counter < 0) + rbd_warn(rbd_dev, "parent reference overflow\n"); + + return false; +} + +/* * Caller is responsible for filling in the list of object requests * that comprises the image request, and the Linux request pointer * (if there is one). @@ -1892,7 +1985,7 @@ static struct rbd_img_request *rbd_img_request_create( } else { img_request->snap_id = rbd_dev->spec->snap_id; } - if (rbd_dev->parent_overlap) + if (rbd_dev_parent_get(rbd_dev)) img_request_layered_set(img_request); spin_lock_init(&img_request->completion_lock); img_request->next_completion = 0; @@ -1923,6 +2016,11 @@ static void rbd_img_request_destroy(struct kref *kref) rbd_img_obj_request_del(img_request, obj_request); rbd_assert(img_request->obj_request_count == 0); + if (img_request_layered_test(img_request)) { + img_request_layered_clear(img_request); + rbd_dev_parent_put(img_request->rbd_dev); + } + if (img_request_write_test(img_request)) ceph_put_snap_context(img_request->snapc); @@ -3502,6 +3600,7 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, spin_lock_init(&rbd_dev->lock); rbd_dev->flags = 0; + atomic_set(&rbd_dev->parent_ref, 0); INIT_LIST_HEAD(&rbd_dev->node); init_rwsem(&rbd_dev->header_rwsem); @@ -4534,7 +4633,7 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) { struct rbd_image_header *header; - rbd_dev_unparent(rbd_dev); + rbd_dev_parent_put(rbd_dev); /* Free dynamic fields from the header, then zero it out */ @@ -4606,6 +4705,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) if (ret < 0) goto out_err; rbd_dev->parent = parent; + atomic_set(&rbd_dev->parent_ref, 1); return 0; out_err: -- cgit v1.1 From 392a9dad7e777296fe79d97a6b3acd735ad2eb5f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: rbd: detect when clone image is flattened A format 2 clone image can be the subject of a "flatten" operation, during which all of its data gets "copied up" from its parent image, leaving the image fully populated. Once this is complete, the clone's association with the parent is abolished. Since this can occur when a clone is mapped, we need to detect when it has occurred and handle it accordingly. We know an image has been flattened when we know it at one time had a parent, but we have learned (via a "get_parent" object class method call) it no longer has one. There might be in-flight requests at the point we learn an image has been flattened, so we can't simply clean up parent data structures right away. Instead, we'll drop the initial parent reference when the parent has disappeared (rather than when the image gets destroyed), which will allow the last in-flight reference to clean things up when it's complete. We leverage the fact that a zero parent overlap renders an image effectively unlayered. We set the overlap to 0 at the point we detect the clone image has flattened, which allows the unlayered behavior to take effect immediately, while keeping other parent structures in place until in-flight requests to complete. This and the next few patches resolve: http://tracker.ceph.com/issues/3763 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8b6091b..9717e20 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1929,6 +1929,11 @@ static void rbd_dev_parent_put(struct rbd_device *rbd_dev) * If an image has a non-zero parent overlap, get a reference to its * parent. * + * We must get the reference before checking for the overlap to + * coordinate properly with zeroing the parent overlap in + * rbd_dev_v2_parent_info() when an image gets flattened. We + * drop it again if there is no overlap. + * * Returns true if the rbd device has a parent with a non-zero * overlap and a reference for it was successfully taken, or * false otherwise. @@ -3782,8 +3787,26 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) end = reply_buf + ret; ret = -ERANGE; ceph_decode_64_safe(&p, end, pool_id, out_err); - if (pool_id == CEPH_NOPOOL) + if (pool_id == CEPH_NOPOOL) { + /* + * Either the parent never existed, or we have + * record of it but the image got flattened so it no + * longer has a parent. When the parent of a + * layered image disappears we immediately set the + * overlap to 0. The effect of this is that all new + * requests will be treated as if the image had no + * parent. + */ + if (rbd_dev->parent_overlap) { + rbd_dev->parent_overlap = 0; + smp_mb(); + rbd_dev_parent_put(rbd_dev); + pr_info("%s: clone image has been flattened\n", + rbd_dev->disk->disk_name); + } + goto out; /* No parent? No problem. */ + } /* The ceph file layout needs to fit pool id in 32 bits */ @@ -4633,7 +4656,10 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) { struct rbd_image_header *header; - rbd_dev_parent_put(rbd_dev); + /* Drop parent reference unless it's already been done (or none) */ + + if (rbd_dev->parent_overlap) + rbd_dev_parent_put(rbd_dev); /* Free dynamic fields from the header, then zero it out */ -- cgit v1.1 From 02c74fbad9d4a5149756eb35be7300736e4904e9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: rbd: re-submit read request for flattened clone If a clone image gets flattened while a parent read request is underway, the original rbd object request needs to be resubmitted. The reason is that by the time we get the response to the parent read request, the data read from the parent may be out of date. In other words, we could see this sequence of events: rbd client parent image/osd ---------- ---------------- original object ENOENT; issue parent read respond to parent read child image flattened original image header refresh <--- original object written independently here parent read response received Add code to rbd_img_parent_read_callback() to detect when a clone's parent image has disappeared (as evidenced by its parent overlap becoming 0), and re-submit the original read request in that case. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9717e20..4edcb6d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2682,14 +2682,36 @@ static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) struct rbd_obj_request *obj_request; struct rbd_device *rbd_dev; u64 obj_end; + u64 img_xferred; + int img_result; rbd_assert(img_request_child_test(img_request)); + /* First get what we need from the image request and release it */ + obj_request = img_request->obj_request; + img_xferred = img_request->xferred; + img_result = img_request->result; + rbd_img_request_put(img_request); + + /* + * If the overlap has become 0 (most likely because the + * image has been flattened) we need to re-submit the + * original request. + */ rbd_assert(obj_request); rbd_assert(obj_request->img_request); + rbd_dev = obj_request->img_request->rbd_dev; + if (!rbd_dev->parent_overlap) { + struct ceph_osd_client *osdc; + + osdc = &rbd_dev->rbd_client->client->osdc; + img_result = rbd_obj_request_submit(osdc, obj_request); + if (!img_result) + return; + } - obj_request->result = img_request->result; + obj_request->result = img_result; if (obj_request->result) goto out; @@ -2702,7 +2724,6 @@ static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) */ rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); obj_end = obj_request->img_offset + obj_request->length; - rbd_dev = obj_request->img_request->rbd_dev; if (obj_end > rbd_dev->parent_overlap) { u64 xferred = 0; @@ -2710,12 +2731,11 @@ static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) xferred = rbd_dev->parent_overlap - obj_request->img_offset; - obj_request->xferred = min(img_request->xferred, xferred); + obj_request->xferred = min(img_xferred, xferred); } else { - obj_request->xferred = img_request->xferred; + obj_request->xferred = img_xferred; } out: - rbd_img_request_put(img_request); rbd_img_obj_request_read_callback(obj_request); rbd_obj_request_complete(obj_request); } -- cgit v1.1 From bbea1c1a31b1128d34b2b5b4f28276969cce14e9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: rbd: re-submit write request for flattened clone Add code to rbd_img_parent_read_full_callback() to detect when a clone's parent image has disappeared, and re-submit the original write request in that case. (See the previous commit for more reasoning about why this is appropriate.) Rename some variables in rbd_img_obj_parent_read_full_callback() to match the convention used in the previous patch. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4edcb6d..5c27318 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2319,7 +2319,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) struct rbd_device *rbd_dev; struct page **pages; u32 page_count; - int result; + int img_result; u64 parent_length; u64 offset; u64 length; @@ -2338,7 +2338,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) orig_request = img_request->obj_request; rbd_assert(orig_request != NULL); rbd_assert(obj_request_type_valid(orig_request->type)); - result = img_request->result; + img_result = img_request->result; parent_length = img_request->length; rbd_assert(parent_length == img_request->xferred); rbd_img_request_put(img_request); @@ -2347,7 +2347,22 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) rbd_dev = orig_request->img_request->rbd_dev; rbd_assert(rbd_dev); - if (result) + /* + * If the overlap has become 0 (most likely because the + * image has been flattened) we need to free the pages + * and re-submit the original write request. + */ + if (!rbd_dev->parent_overlap) { + struct ceph_osd_client *osdc; + + ceph_release_page_vector(pages, page_count); + osdc = &rbd_dev->rbd_client->client->osdc; + img_result = rbd_obj_request_submit(osdc, orig_request); + if (!img_result) + return; + } + + if (img_result) goto out_err; /* @@ -2356,7 +2371,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) * request. Allocate the new copyup osd request for the * original request, and release the old one. */ - result = -ENOMEM; + img_result = -ENOMEM; osd_req = rbd_osd_req_create_copyup(orig_request); if (!osd_req) goto out_err; @@ -2391,13 +2406,13 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) orig_request->callback = rbd_img_obj_copyup_callback; osdc = &rbd_dev->rbd_client->client->osdc; - result = rbd_obj_request_submit(osdc, orig_request); - if (!result) + img_result = rbd_obj_request_submit(osdc, orig_request); + if (!img_result) return; out_err: /* Record the error code and complete the request */ - orig_request->result = result; + orig_request->result = img_result; orig_request->xferred = 0; obj_request_done_set(orig_request); rbd_obj_request_complete(orig_request); -- cgit v1.1 From 638f5abed3f7d8a7fc24087bd760fa3d99f68a39 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: rbd: re-submit flattened write request (part 2) Add code to rbd_img_obj_exists_callback() to detect when a clone's parent image has disappeared, and re-submit the original write request in that case. Kill off some redundant assertions. This completes the resolution for: http://tracker.ceph.com/issues/3763 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5c27318..6b872f2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2516,6 +2516,7 @@ out_err: static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) { struct rbd_obj_request *orig_request; + struct rbd_device *rbd_dev; int result; rbd_assert(!obj_request_img_data_test(obj_request)); @@ -2538,8 +2539,21 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) obj_request->xferred, obj_request->length); rbd_obj_request_put(obj_request); - rbd_assert(orig_request); - rbd_assert(orig_request->img_request); + /* + * If the overlap has become 0 (most likely because the + * image has been flattened) we need to free the pages + * and re-submit the original write request. + */ + rbd_dev = orig_request->img_request->rbd_dev; + if (!rbd_dev->parent_overlap) { + struct ceph_osd_client *osdc; + + rbd_obj_request_put(orig_request); + osdc = &rbd_dev->rbd_client->client->osdc; + result = rbd_obj_request_submit(osdc, orig_request); + if (!result) + return; + } /* * Our only purpose here is to determine whether the object -- cgit v1.1 From 974a51a245c2c8bece21cf2d3cbfc8261260f729 Mon Sep 17 00:00:00 2001 From: Sam Bradshaw Date: Wed, 15 May 2013 10:04:34 +0200 Subject: mtip32xx: Fix NULL pointer dereference during module unload An open file-handle to one or more of the driver exported debugfs nodes causes raciness in recursive removal during module unload; sometimes a stale parent dentry is dereferenced when more than 1 pci device is present. Signed-off-by: Sam Bradshaw Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 847107e..e366c74 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3002,7 +3002,8 @@ static int mtip_hw_debugfs_init(struct driver_data *dd) static void mtip_hw_debugfs_exit(struct driver_data *dd) { - debugfs_remove_recursive(dd->dfs_node); + if (dd->dfs_node) + debugfs_remove_recursive(dd->dfs_node); } -- cgit v1.1 From 093c959307f2f5af72b24fdc4af7d4d0263f6eea Mon Sep 17 00:00:00 2001 From: Sam Bradshaw Date: Wed, 15 May 2013 10:09:05 +0200 Subject: mtip32xx: Correctly handle bio->bi_idx != 0 conditions Stacking drivers may append bvecs to existing bio's, resulting in non-zero bi_idx conditions. This patch counts the loops of bio_for_each_segment() rather than inheriting the bi_idx value to pass as a segment count to the hardware submission routine. Signed-off-by: Sam Bradshaw Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index e366c74..20dd52a 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3864,7 +3864,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) struct driver_data *dd = queue->queuedata; struct scatterlist *sg; struct bio_vec *bvec; - int nents = 0; + int i, nents = 0; int tag = 0, unaligned = 0; if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { @@ -3922,11 +3922,12 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) } /* Create the scatter list for this bio. */ - bio_for_each_segment(bvec, bio, nents) { + bio_for_each_segment(bvec, bio, i) { sg_set_page(&sg[nents], bvec->bv_page, bvec->bv_len, bvec->bv_offset); + nents++; } /* Issue the read/write. */ -- cgit v1.1 From 76e0310c2d5a4fb62f674af7dad16f544950fc13 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Tue, 14 May 2013 08:32:13 +0530 Subject: NVMe: Remove redundant version.h header include version.h header inclusion is not necessary as detected by checkversion.pl. Signed-off-by: Sachin Kamat Acked-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/nvme-scsi.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index fed54b0..64bde68 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include -- cgit v1.1 From 710a143dd870cd660b5ae26bb94b6da854376e91 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Mon, 13 May 2013 14:55:18 -0600 Subject: NVMe: Fix a signedness bug in nvme_trans_modesel_get_mp nvme_trans_modesel_get_mp() was defined with a unsigned return type, but can return signed values. Reported-by: Dan Carpenter Signed-off-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/nvme-scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 64bde68..102de2f 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -1653,7 +1653,7 @@ static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list, } } -static u16 nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, +static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *mode_page, u8 page_code) { int res = SNTI_TRANSLATION_SUCCESS; -- cgit v1.1 From 5460fc03105fbed01fe27aa572d9f65bb410a61d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 13 May 2013 17:59:50 +0300 Subject: NVMe: check for integer overflow in nvme_map_user_pages() You need to have CAP_SYS_ADMIN to trigger this overflow but it makes the static checkers complain so we should fix it. The worry is that "length" comes from copy_from_user() so we need to check that "length + offset" can't overflow. I also changed the min_t() cast to be unsigned instead of signed. Now that we cap "length" to INT_MAX it doesn't make a difference, but it's a little easier for reviewers to know that large values aren't cast to negative. Signed-off-by: Dan Carpenter Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 8efdfaa..4376375 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1206,7 +1206,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, if (addr & 3) return ERR_PTR(-EINVAL); - if (!length) + if (!length || length > INT_MAX - PAGE_SIZE) return ERR_PTR(-EINVAL); offset = offset_in_page(addr); @@ -1227,7 +1227,8 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, sg_init_table(sg, count); for (i = 0; i < count; i++) { sg_set_page(&sg[i], pages[i], - min_t(int, length, PAGE_SIZE - offset), offset); + min_t(unsigned, length, PAGE_SIZE - offset), + offset); length -= (PAGE_SIZE - offset); offset = 0; } -- cgit v1.1 From 1287dabd345f447bbe0f7a99fc95ab89bcfc0f5d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 13 May 2013 22:29:04 +0800 Subject: NVMe: fix error return code in nvme_submit_bio_queue() nvme_submit_flush_data() might overwrite the initialisation of the return value with 0, so move the -ENOMEM setting close to the usage. Signed-off-by: Wei Yongjun Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 4376375..d783f15 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -629,7 +629,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct nvme_command *cmnd; struct nvme_iod *iod; enum dma_data_direction dma_dir; - int cmdid, length, result = -ENOMEM; + int cmdid, length, result; u16 control; u32 dsmgmt; int psegs = bio_phys_segments(ns->queue, bio); @@ -640,6 +640,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return result; } + result = -ENOMEM; iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC); if (!iod) goto nomem; -- cgit v1.1 From 053ab702cc2702f25a97ead087ed344b864785b7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 30 Apr 2013 11:19:38 -0600 Subject: NVMe: Do not cancel command multiple times Cancelling an already cancelled command does not do anything, so check the command context before cancelling it, continuing if had already been cancelled so we do not log the same problem every second if a device stops responding. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index d783f15..42abf72 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -978,6 +978,8 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) if (timeout && !time_after(now, info[cmdid].timeout)) continue; + if (info[cmdid].ctx == CMD_CTX_CANCELLED) + continue; dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid); ctx = cancel_cmdid(nvmeq, cmdid, &fn); fn(nvmeq->dev, ctx, &cqe); -- cgit v1.1 From 7262cfca430a1a0e0707149af29ae86bc0ded230 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 16 May 2013 15:04:20 -0500 Subject: rbd: don't destroy ceph_opts in rbd_add() Whether rbd_client_create() successfully creates a new client or not, it takes responsibility for getting the ceph_opts structure it's passed destroyed. If successful, the structure becomes associated with the created client; if not, rbd_client_create() will destroy it. Previously, rbd_get_client() would call ceph_destroy_options() if rbd_get_client() failed, and that meant it got called twice. That led freeing various pointers more than once, which is never a good idea. This resolves: http://tracker.ceph.com/issues/4559 Cc: stable@vger.kernel.org # 3.8+ Reported-by: Dan van der Ster Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6b872f2..5f64ba7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -521,8 +521,8 @@ static const struct block_device_operations rbd_bd_ops = { }; /* - * Initialize an rbd client instance. - * We own *ceph_opts. + * Initialize an rbd client instance. Success or not, this function + * consumes ceph_opts. */ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) { @@ -677,7 +677,8 @@ static int parse_rbd_opts_token(char *c, void *private) /* * Get a ceph client with specific addr and configuration, if one does - * not exist create it. + * not exist create it. Either way, ceph_opts is consumed by this + * function. */ static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) { @@ -4994,7 +4995,6 @@ static ssize_t rbd_add(struct bus_type *bus, rc = PTR_ERR(rbdc); goto err_out_args; } - ceph_opts = NULL; /* rbd_dev client now owns this */ /* pick the pool */ osdc = &rbdc->client->osdc; @@ -5038,8 +5038,6 @@ err_out_rbd_dev: err_out_client: rbd_put_client(rbdc); err_out_args: - if (ceph_opts) - ceph_destroy_options(ceph_opts); kfree(rbd_opts); rbd_spec_put(spec); err_out_module: -- cgit v1.1 From 3abef3b3585bbc67d56fdc9c67761a900fb4b69d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 13 May 2013 20:35:37 -0500 Subject: rbd: fix cleanup in rbd_add() Bjorn Helgaas pointed out that a recent commit introduced a use-after-free condition in an error path for rbd_add(). He correctly stated: I think b536f69a3a5 "rbd: set up devices only for mapped images" introduced a use-after-free error in rbd_add(): ... If rbd_dev_device_setup() returns an error, we call rbd_dev_image_release(), which ultimately kfrees rbd_dev. Then we call rbd_dev_destroy(), which references fields in the already-freed rbd_dev struct before kfreeing it again. The simple fix is to return the error code after the call to rbd_dev_image_release(). Closer examination revealed that there's no need to clean up rbd_opts in that function, so fix that too. Update some other comments that have also become out of date. Reported-by: Bjorn Helgaas Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5f64ba7..3a897a5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4700,8 +4700,10 @@ out: return ret; } -/* Undo whatever state changes are made by v1 or v2 image probe */ - +/* + * Undo whatever state changes are made by v1 or v2 header info + * call. + */ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) { struct rbd_image_header *header; @@ -4905,9 +4907,10 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) int tmp; /* - * Get the id from the image id object. If it's not a - * format 2 image, we'll get ENOENT back, and we'll assume - * it's a format 1 image. + * Get the id from the image id object. Unless there's an + * error, rbd_dev->spec->image_id will be filled in with + * a dynamically-allocated string, and rbd_dev->image_format + * will be set to either 1 or 2. */ ret = rbd_dev_image_id(rbd_dev); if (ret) @@ -5029,16 +5032,18 @@ static ssize_t rbd_add(struct bus_type *bus, rbd_dev->mapping.read_only = read_only; rc = rbd_dev_device_setup(rbd_dev); - if (!rc) - return count; + if (rc) { + rbd_dev_image_release(rbd_dev); + goto err_out_module; + } + + return count; - rbd_dev_image_release(rbd_dev); err_out_rbd_dev: rbd_dev_destroy(rbd_dev); err_out_client: rbd_put_client(rbdc); err_out_args: - kfree(rbd_opts); rbd_spec_put(spec); err_out_module: module_put(THIS_MODULE); -- cgit v1.1 From cf90bc4830b858487fe4b9b9ecd0031e23ca3e83 Mon Sep 17 00:00:00 2001 From: Chayan Biswas Date: Wed, 22 May 2013 22:34:49 +0000 Subject: Return the result from user admin command IOCTL even in case of failure We copy the result to user if the command is completed from the controller even if it completes with failure (non-zero) status. A return status of < 0 indicates the command was not completed by the controller. The user application may expect the error code in the result field in case of failure. Signed-off-by: Chayan Biswas Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 42abf72..8493708 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1439,7 +1439,7 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, nvme_free_iod(dev, iod); } - if (!status && copy_to_user(&ucmd->result, &cmd.result, + if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result, sizeof(cmd.result))) status = -EFAULT; -- cgit v1.1 From 585dc0c2f68981c02a0bb6fc8fe191a3f513959c Mon Sep 17 00:00:00 2001 From: Gernot Vormayr Date: Fri, 24 May 2013 15:55:03 -0700 Subject: drivers/block/xsysace.c: fix id with missing port-number If the port number is missing from the device-tree the device gets named xs` instead of xsa. This fixes the check for missing ids. Tested on ml507 board. Signed-off-by: Gernot Vormayr Cc: Greg Kroah-Hartman Cc: Jens Axboe Cc: Grant Likely Cc: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/xsysace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index f8ef15f..3fd130f 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -1160,8 +1160,7 @@ static int ace_probe(struct platform_device *dev) dev_dbg(&dev->dev, "ace_probe(%p)\n", dev); /* device id and bus width */ - of_property_read_u32(dev->dev.of_node, "port-number", &id); - if (id < 0) + if (of_property_read_u32(dev->dev.of_node, "port-number", &id)) id = 0; if (of_find_property(dev->dev.of_node, "8-bit", NULL)) bus_width = ACE_BUS_WIDTH_8; -- cgit v1.1 From dfd20b2b174d3a9b258ea3b7a35ead33576587b1 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 24 May 2013 15:55:28 -0700 Subject: drivers/block/brd.c: fix brd_lookup_page() race The index on the page must be set before it is inserted in the radix tree. Otherwise there is a small race which can occur during lookup where the page can be found with the incorrect index. This will trigger the BUG_ON() in brd_lookup_page(). Signed-off-by: Brian Behlendorf Reported-by: Chris Wedgwood Cc: Jens Axboe Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/brd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/brd.c b/drivers/block/brd.c index f1a29f8..9bf4371 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -117,13 +117,13 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) spin_lock(&brd->brd_lock); idx = sector >> PAGE_SECTORS_SHIFT; + page->index = idx; if (radix_tree_insert(&brd->brd_pages, idx, page)) { __free_page(page); page = radix_tree_lookup(&brd->brd_pages, idx); BUG_ON(!page); BUG_ON(page->index != idx); - } else - page->index = idx; + } spin_unlock(&brd->brd_lock); radix_tree_preload_end(); -- cgit v1.1 From cf9f123b38345b2c2777e642eb9bb561f4b7de91 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 28 May 2013 16:46:46 -0400 Subject: NVMe: Use dma_set_mask() correctly In some circumstances setting a 64-bit DMA mask can fail, as explained in Documentation/DMA-API-HOWTO.txt. Use the recommended code sequence to set a 32-bit DMA mask if setting a 64-bit DMA mask fails. Reported-by: Chayan Biswas Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 8493708..a57d3bc 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1927,8 +1927,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&dev->namespaces); dev->pci_dev = pdev; pci_set_drvdata(pdev, dev); - dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); - dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); + + if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64))) + dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); + else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32))) + dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32)); + else + goto disable; + result = nvme_set_instance(dev); if (result) goto disable; -- cgit v1.1 From 104529661b645295903c5007ae632d2dd4029254 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 16 May 2013 11:11:08 +0300 Subject: pktcdvd: silence static checker warning Static checkers complain about widening the binary "not" operations here because sectors are u64 and "(pd)->settings.size" is unsigned int. It unintentionally clears the high 32 bits of the sector. This means that the driver won't work for devices with over 2TB of space. Since this is a DVD drive, we're unlikely to reach that limit, but we may as well silence the warning. Signed-off-by: Dan Carpenter Signed-off-by: Jiri Kosina --- drivers/block/pktcdvd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 3c08983..f5d0ea1 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -83,7 +83,8 @@ #define MAX_SPEED 0xffff -#define ZONE(sector, pd) (((sector) + (pd)->offset) & ~((pd)->settings.size - 1)) +#define ZONE(sector, pd) (((sector) + (pd)->offset) & \ + ~(sector_t)((pd)->settings.size - 1)) static DEFINE_MUTEX(pktcdvd_mutex); static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; -- cgit v1.1 From fa08a396647767abd24a9e7015cb177121d0cf15 Mon Sep 17 00:00:00 2001 From: Ramachandra Rao Gajula Date: Sat, 11 May 2013 15:19:31 -0700 Subject: NVMe: Add MSI support Some devices only have support for MSI, not MSI-X. While MSI is more limited, it still provides better performance than line-based interrupts. Signed-off-by: Ramachandra Gajula Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index a57d3bc..ce79a59 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1637,7 +1637,8 @@ static int set_queue_count(struct nvme_dev *dev, int count) static int nvme_setup_io_queues(struct nvme_dev *dev) { - int result, cpu, i, nr_io_queues, db_bar_size, q_depth; + struct pci_dev *pdev = dev->pci_dev; + int result, cpu, i, nr_io_queues, db_bar_size, q_depth, q_count; nr_io_queues = num_online_cpus(); result = set_queue_count(dev, nr_io_queues); @@ -1646,14 +1647,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (result < nr_io_queues) nr_io_queues = result; + q_count = nr_io_queues; /* Deregister the admin queue's interrupt */ free_irq(dev->entry[0].vector, dev->queues[0]); db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3)); if (db_bar_size > 8192) { iounmap(dev->bar); - dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0), - db_bar_size); + dev->bar = ioremap(pci_resource_start(pdev, 0), db_bar_size); dev->dbs = ((void __iomem *)dev->bar) + 4096; dev->queues[0]->q_db = dev->dbs; } @@ -1661,19 +1662,36 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) for (i = 0; i < nr_io_queues; i++) dev->entry[i].entry = i; for (;;) { - result = pci_enable_msix(dev->pci_dev, dev->entry, - nr_io_queues); + result = pci_enable_msix(pdev, dev->entry, nr_io_queues); if (result == 0) { break; } else if (result > 0) { nr_io_queues = result; continue; } else { - nr_io_queues = 1; + nr_io_queues = 0; break; } } + if (nr_io_queues == 0) { + nr_io_queues = q_count; + for (;;) { + result = pci_enable_msi_block(pdev, nr_io_queues); + if (result == 0) { + for (i = 0; i < nr_io_queues; i++) + dev->entry[i].vector = i + pdev->irq; + break; + } else if (result > 0) { + nr_io_queues = result; + continue; + } else { + nr_io_queues = 1; + break; + } + } + } + result = queue_request_irq(dev, dev->queues[0], "nvme admin"); /* XXX: handle failure here */ @@ -1854,7 +1872,10 @@ static void nvme_free_dev(struct kref *kref) { struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); nvme_dev_remove(dev); - pci_disable_msix(dev->pci_dev); + if (dev->pci_dev->msi_enabled) + pci_disable_msi(dev->pci_dev); + else if (dev->pci_dev->msix_enabled) + pci_disable_msix(dev->pci_dev); iounmap(dev->bar); nvme_release_instance(dev); nvme_release_prp_pools(dev); @@ -1987,7 +2008,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) unmap: iounmap(dev->bar); disable_msix: - pci_disable_msix(pdev); + if (dev->pci_dev->msi_enabled) + pci_disable_msi(dev->pci_dev); + else if (dev->pci_dev->msix_enabled) + pci_disable_msix(dev->pci_dev); nvme_release_instance(dev); nvme_release_prp_pools(dev); disable: -- cgit v1.1 From 03f47e888daf56c8e9046c674719a0bcc644eed5 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Wed, 12 Jun 2013 14:04:47 -0700 Subject: cciss: fix broken mutex usage in ioctl If a new logical drive is added and the CCISS_REGNEWD ioctl is invoked (as is normal with the Array Configuration Utility) the process will hang as below. It attempts to acquire the same mutex twice, once in do_ioctl() and once in cciss_unlocked_open(). The BKL was recursive, the mutex isn't. Linux version 3.10.0-rc2 (scameron@localhost.localdomain) (gcc version 4.4.7 20120313 (Red Hat 4.4.7-3) (GCC) ) #1 SMP Fri May 24 14:32:12 CDT 2013 [...] acu D 0000000000000001 0 3246 3191 0x00000080 Call Trace: schedule+0x29/0x70 schedule_preempt_disabled+0xe/0x10 __mutex_lock_slowpath+0x17b/0x220 mutex_lock+0x2b/0x50 cciss_unlocked_open+0x2f/0x110 [cciss] __blkdev_get+0xd3/0x470 blkdev_get+0x5c/0x1e0 register_disk+0x182/0x1a0 add_disk+0x17c/0x310 cciss_add_disk+0x13a/0x170 [cciss] cciss_update_drive_info+0x39b/0x480 [cciss] rebuild_lun_table+0x258/0x370 [cciss] cciss_ioctl+0x34f/0x470 [cciss] do_ioctl+0x49/0x70 [cciss] __blkdev_driver_ioctl+0x28/0x30 blkdev_ioctl+0x200/0x7b0 block_ioctl+0x3c/0x40 do_vfs_ioctl+0x89/0x350 SyS_ioctl+0xa1/0xb0 system_call_fastpath+0x16/0x1b This mutex usage was added into the ioctl path when the big kernel lock was removed. As it turns out, these paths are all thread safe anyway (or can easily be made so) and we don't want ioctl() to be single threaded in any case. Signed-off-by: Stephen M. Cameron Cc: Jens Axboe Cc: Mike Miller Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/cciss.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 6374dc1..62b6c2c 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -168,8 +168,6 @@ static irqreturn_t do_cciss_msix_intr(int irq, void *dev_id); static int cciss_open(struct block_device *bdev, fmode_t mode); static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode); static void cciss_release(struct gendisk *disk, fmode_t mode); -static int do_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg); static int cciss_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo); @@ -235,7 +233,7 @@ static const struct block_device_operations cciss_fops = { .owner = THIS_MODULE, .open = cciss_unlocked_open, .release = cciss_release, - .ioctl = do_ioctl, + .ioctl = cciss_ioctl, .getgeo = cciss_getgeo, #ifdef CONFIG_COMPAT .compat_ioctl = cciss_compat_ioctl, @@ -1143,16 +1141,6 @@ static void cciss_release(struct gendisk *disk, fmode_t mode) mutex_unlock(&cciss_mutex); } -static int do_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) -{ - int ret; - mutex_lock(&cciss_mutex); - ret = cciss_ioctl(bdev, mode, cmd, arg); - mutex_unlock(&cciss_mutex); - return ret; -} - #ifdef CONFIG_COMPAT static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode, @@ -1179,7 +1167,7 @@ static int cciss_compat_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_REGNEWD: case CCISS_RESCANDISK: case CCISS_GETLUNINFO: - return do_ioctl(bdev, mode, cmd, arg); + return cciss_ioctl(bdev, mode, cmd, arg); case CCISS_PASSTHRU32: return cciss_ioctl32_passthru(bdev, mode, cmd, arg); @@ -1219,7 +1207,7 @@ static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode, if (err) return -EFAULT; - err = do_ioctl(bdev, mode, CCISS_PASSTHRU, (unsigned long)p); + err = cciss_ioctl(bdev, mode, CCISS_PASSTHRU, (unsigned long)p); if (err) return err; err |= @@ -1261,7 +1249,7 @@ static int cciss_ioctl32_big_passthru(struct block_device *bdev, fmode_t mode, if (err) return -EFAULT; - err = do_ioctl(bdev, mode, CCISS_BIG_PASSTHRU, (unsigned long)p); + err = cciss_ioctl(bdev, mode, CCISS_BIG_PASSTHRU, (unsigned long)p); if (err) return err; err |= @@ -1311,11 +1299,14 @@ static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp) static int cciss_getintinfo(ctlr_info_t *h, void __user *argp) { cciss_coalint_struct intinfo; + unsigned long flags; if (!argp) return -EINVAL; + spin_lock_irqsave(&h->lock, flags); intinfo.delay = readl(&h->cfgtable->HostWrite.CoalIntDelay); intinfo.count = readl(&h->cfgtable->HostWrite.CoalIntCount); + spin_unlock_irqrestore(&h->lock, flags); if (copy_to_user (argp, &intinfo, sizeof(cciss_coalint_struct))) return -EFAULT; @@ -1356,12 +1347,15 @@ static int cciss_setintinfo(ctlr_info_t *h, void __user *argp) static int cciss_getnodename(ctlr_info_t *h, void __user *argp) { NodeName_type NodeName; + unsigned long flags; int i; if (!argp) return -EINVAL; + spin_lock_irqsave(&h->lock, flags); for (i = 0; i < 16; i++) NodeName[i] = readb(&h->cfgtable->ServerName[i]); + spin_unlock_irqrestore(&h->lock, flags); if (copy_to_user(argp, NodeName, sizeof(NodeName_type))) return -EFAULT; return 0; @@ -1398,10 +1392,13 @@ static int cciss_setnodename(ctlr_info_t *h, void __user *argp) static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp) { Heartbeat_type heartbeat; + unsigned long flags; if (!argp) return -EINVAL; + spin_lock_irqsave(&h->lock, flags); heartbeat = readl(&h->cfgtable->HeartBeat); + spin_unlock_irqrestore(&h->lock, flags); if (copy_to_user(argp, &heartbeat, sizeof(Heartbeat_type))) return -EFAULT; return 0; @@ -1410,10 +1407,13 @@ static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp) static int cciss_getbustypes(ctlr_info_t *h, void __user *argp) { BusTypes_type BusTypes; + unsigned long flags; if (!argp) return -EINVAL; + spin_lock_irqsave(&h->lock, flags); BusTypes = readl(&h->cfgtable->BusTypes); + spin_unlock_irqrestore(&h->lock, flags); if (copy_to_user(argp, &BusTypes, sizeof(BusTypes_type))) return -EFAULT; return 0; -- cgit v1.1 From 3a96d5cd7bdce45d5dded75c3a62d4fb98050280 Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Wed, 12 Jun 2013 19:15:06 -0700 Subject: rbd: use the correct length for format 2 object names Format 2 objects use 16 characters for the object name suffix to be able to express the full 64-bit range of object numbers. Format 1 images only use 12 characters for this. Using 12-character names for format 2 caused userspace and kernel rbd clients to read differently named objects, which made an image written by one client look empty to the other client. CC: stable@vger.kernel.org # 3.9+ Reported-by: Chris Dunlop Signed-off-by: Josh Durgin Reviewed-by: Sage Weil --- drivers/block/rbd.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3a897a5..cc7c60e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1038,12 +1038,16 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) char *name; u64 segment; int ret; + char *name_format; name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); if (!name) return NULL; segment = offset >> rbd_dev->header.obj_order; - ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", + name_format = "%s.%012llx"; + if (rbd_dev->image_format == 2) + name_format = "%s.%016llx"; + ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format, rbd_dev->header.object_prefix, segment); if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { pr_err("error formatting segment name for #%llu (%d)\n", -- cgit v1.1