From 24f1df60ce943aee107b3cb99b37a0152c9dd47a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 12 Jan 2018 17:22:10 +0100 Subject: rbd: set max_segment_size to UINT_MAX Commit 21acdf45f495 ("rbd: set max_segments to USHRT_MAX") removed the limit on max_segments. Remove the limit on max_segment_size as well. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8e40da0..4ca0a45 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4378,7 +4378,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); q->limits.max_sectors = queue_max_hw_sectors(q); blk_queue_max_segments(q, USHRT_MAX); - blk_queue_max_segment_size(q, segment_size); + blk_queue_max_segment_size(q, UINT_MAX); blk_queue_io_min(q, segment_size); blk_queue_io_opt(q, segment_size); -- cgit v1.1 From a1fbb5e7bbb56fccdf54bf4ab5086c6080ee5bfa Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 16 Jan 2018 12:15:02 +0100 Subject: rbd: start enums at 1 instead of 0 Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4ca0a45..883f17d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -217,12 +217,14 @@ struct rbd_obj_request; typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); enum obj_request_type { - OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES + OBJ_REQUEST_NODATA = 1, + OBJ_REQUEST_BIO, + OBJ_REQUEST_PAGES, }; enum obj_operation_type { + OBJ_OP_READ = 1, OBJ_OP_WRITE, - OBJ_OP_READ, OBJ_OP_DISCARD, }; -- cgit v1.1 From 5359a17d2706b86da2af83027343d5eb256f7670 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 20 Jan 2018 10:30:10 +0100 Subject: libceph, rbd: new bio handling code 
(aka don't clone bios) The reason we clone bios is to be able to give each object request (and consequently each ceph_osd_data/ceph_msg_data item) its own pointer to a (list of) bio(s). The messenger then initializes its cursor with cloned bio's ->bi_iter, so it knows where to start reading from/writing to. That's all the cloned bios are used for: to determine each object request's starting position in the provided data buffer. Introduce ceph_bio_iter to do exactly that -- store position within bio list (i.e. pointer to bio) + position within that bio (i.e. bvec_iter). Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 67 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 27 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 883f17d..8eaebf6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -218,7 +218,7 @@ typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); enum obj_request_type { OBJ_REQUEST_NODATA = 1, - OBJ_REQUEST_BIO, + OBJ_REQUEST_BIO, /* pointer into provided bio (list) */ OBJ_REQUEST_PAGES, }; @@ -270,7 +270,7 @@ struct rbd_obj_request { enum obj_request_type type; union { - struct bio *bio_list; + struct ceph_bio_iter bio_pos; struct { struct page **pages; u32 page_count; @@ -1255,6 +1255,27 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev, return length; } +static void zero_bvec(struct bio_vec *bv) +{ + void *buf; + unsigned long flags; + + buf = bvec_kmap_irq(bv, &flags); + memset(buf, 0, bv->bv_len); + flush_dcache_page(bv->bv_page); + bvec_kunmap_irq(buf, &flags); +} + +static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes) +{ + struct ceph_bio_iter it = *bio_pos; + + ceph_bio_iter_advance(&it, off); + ceph_bio_iter_advance_step(&it, bytes, ({ + zero_bvec(&bv); + })); +} + /* * bio helpers */ @@ -1719,13 +1740,14 @@ rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 
rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); if (obj_request->result == -ENOENT) { if (obj_request->type == OBJ_REQUEST_BIO) - zero_bio_chain(obj_request->bio_list, 0); + zero_bios(&obj_request->bio_pos, 0, length); else zero_pages(obj_request->pages, 0, length); obj_request->result = 0; } else if (xferred < length && !obj_request->result) { if (obj_request->type == OBJ_REQUEST_BIO) - zero_bio_chain(obj_request->bio_list, xferred); + zero_bios(&obj_request->bio_pos, xferred, + length - xferred); else zero_pages(obj_request->pages, xferred, length); } @@ -2036,11 +2058,8 @@ static void rbd_obj_request_destroy(struct kref *kref) rbd_assert(obj_request_type_valid(obj_request->type)); switch (obj_request->type) { case OBJ_REQUEST_NODATA: - break; /* Nothing to do */ case OBJ_REQUEST_BIO: - if (obj_request->bio_list) - bio_chain_put(obj_request->bio_list); - break; + break; /* Nothing to do */ case OBJ_REQUEST_PAGES: /* img_data requests don't own their page array */ if (obj_request->pages && @@ -2368,7 +2387,7 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, if (obj_request->type == OBJ_REQUEST_BIO) osd_req_op_extent_osd_data_bio(osd_request, num_ops, - obj_request->bio_list, length); + &obj_request->bio_pos, length); else if (obj_request->type == OBJ_REQUEST_PAGES) osd_req_op_extent_osd_data_pages(osd_request, num_ops, obj_request->pages, length, @@ -2396,8 +2415,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, struct rbd_device *rbd_dev = img_request->rbd_dev; struct rbd_obj_request *obj_request = NULL; struct rbd_obj_request *next_obj_request; - struct bio *bio_list = NULL; - unsigned int bio_offset = 0; + struct ceph_bio_iter bio_it; struct page **pages = NULL; enum obj_operation_type op_type; u64 img_offset; @@ -2412,9 +2430,9 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, op_type = rbd_img_request_op_type(img_request); if (type == OBJ_REQUEST_BIO) { - bio_list = data_desc; + 
bio_it = *(struct ceph_bio_iter *)data_desc; rbd_assert(img_offset == - bio_list->bi_iter.bi_sector << SECTOR_SHIFT); + bio_it.iter.bi_sector << SECTOR_SHIFT); } else if (type == OBJ_REQUEST_PAGES) { pages = data_desc; } @@ -2440,17 +2458,8 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, rbd_img_obj_request_add(img_request, obj_request); if (type == OBJ_REQUEST_BIO) { - unsigned int clone_size; - - rbd_assert(length <= (u64)UINT_MAX); - clone_size = (unsigned int)length; - obj_request->bio_list = - bio_chain_clone_range(&bio_list, - &bio_offset, - clone_size, - GFP_NOIO); - if (!obj_request->bio_list) - goto out_unwind; + obj_request->bio_pos = bio_it; + ceph_bio_iter_advance(&bio_it, length); } else if (type == OBJ_REQUEST_PAGES) { unsigned int page_count; @@ -2980,7 +2989,7 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request) if (obj_request->type == OBJ_REQUEST_BIO) result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, - obj_request->bio_list); + &obj_request->bio_pos); else result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, obj_request->pages); @@ -4093,9 +4102,13 @@ static void rbd_queue_workfn(struct work_struct *work) if (op_type == OBJ_OP_DISCARD) result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, NULL); - else + else { + struct ceph_bio_iter bio_it = { .bio = rq->bio, + .iter = rq->bio->bi_iter }; + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, - rq->bio); + &bio_it); + } if (result) goto err_img_request; -- cgit v1.1 From df6ba7015dd3a64a2e74353d1e7d19871af86f38 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 20 Jan 2018 10:30:10 +0100 Subject: rbd: remove bio cloning helpers Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- drivers/block/rbd.c | 141 ---------------------------------------------------- 1 file changed, 141 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8eaebf6..8b90473 100644 --- 
a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -444,8 +444,6 @@ static DEFINE_SPINLOCK(rbd_client_list_lock); static struct kmem_cache *rbd_img_request_cache; static struct kmem_cache *rbd_obj_request_cache; -static struct bio_set *rbd_bio_clone; - static int rbd_major; static DEFINE_IDA(rbd_dev_id_ida); @@ -1277,49 +1275,6 @@ static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes) } /* - * bio helpers - */ - -static void bio_chain_put(struct bio *chain) -{ - struct bio *tmp; - - while (chain) { - tmp = chain; - chain = chain->bi_next; - bio_put(tmp); - } -} - -/* - * zeros a bio chain, starting at specific offset - */ -static void zero_bio_chain(struct bio *chain, int start_ofs) -{ - struct bio_vec bv; - struct bvec_iter iter; - unsigned long flags; - void *buf; - int pos = 0; - - while (chain) { - bio_for_each_segment(bv, chain, iter) { - if (pos + bv.bv_len > start_ofs) { - int remainder = max(start_ofs - pos, 0); - buf = bvec_kmap_irq(&bv, &flags); - memset(buf + remainder, 0, - bv.bv_len - remainder); - flush_dcache_page(bv.bv_page); - bvec_kunmap_irq(buf, &flags); - } - pos += bv.bv_len; - } - - chain = chain->bi_next; - } -} - -/* * similar to zero_bio_chain(), zeros data defined by a page array, * starting at the given byte offset from the start of the array and * continuing up to the given end offset. The pages array is @@ -1352,90 +1307,6 @@ static void zero_pages(struct page **pages, u64 offset, u64 end) } /* - * Clone a portion of a bio, starting at the given byte offset - * and continuing for the number of bytes indicated. 
- */ -static struct bio *bio_clone_range(struct bio *bio_src, - unsigned int offset, - unsigned int len, - gfp_t gfpmask) -{ - struct bio *bio; - - bio = bio_clone_fast(bio_src, gfpmask, rbd_bio_clone); - if (!bio) - return NULL; /* ENOMEM */ - - bio_advance(bio, offset); - bio->bi_iter.bi_size = len; - - return bio; -} - -/* - * Clone a portion of a bio chain, starting at the given byte offset - * into the first bio in the source chain and continuing for the - * number of bytes indicated. The result is another bio chain of - * exactly the given length, or a null pointer on error. - * - * The bio_src and offset parameters are both in-out. On entry they - * refer to the first source bio and the offset into that bio where - * the start of data to be cloned is located. - * - * On return, bio_src is updated to refer to the bio in the source - * chain that contains first un-cloned byte, and *offset will - * contain the offset of that byte within that bio. - */ -static struct bio *bio_chain_clone_range(struct bio **bio_src, - unsigned int *offset, - unsigned int len, - gfp_t gfpmask) -{ - struct bio *bi = *bio_src; - unsigned int off = *offset; - struct bio *chain = NULL; - struct bio **end; - - /* Build up a chain of clone bios up to the limit */ - - if (!bi || off >= bi->bi_iter.bi_size || !len) - return NULL; /* Nothing to clone */ - - end = &chain; - while (len) { - unsigned int bi_size; - struct bio *bio; - - if (!bi) { - rbd_warn(NULL, "bio_chain exhausted with %u left", len); - goto out_err; /* EINVAL; ran out of bio's */ - } - bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); - bio = bio_clone_range(bi, off, bi_size, gfpmask); - if (!bio) - goto out_err; /* ENOMEM */ - - *end = bio; - end = &bio->bi_next; - - off += bi_size; - if (off == bi->bi_iter.bi_size) { - bi = bi->bi_next; - off = 0; - } - len -= bi_size; - } - *bio_src = bi; - *offset = off; - - return chain; -out_err: - bio_chain_put(chain); - - return NULL; -} - -/* * The default/initial 
value for all object request flags is 0. For * each flag, once its value is set to 1 it is never reset to 0 * again. @@ -6390,16 +6261,8 @@ static int rbd_slab_init(void) if (!rbd_obj_request_cache) goto out_err; - rbd_assert(!rbd_bio_clone); - rbd_bio_clone = bioset_create(BIO_POOL_SIZE, 0, 0); - if (!rbd_bio_clone) - goto out_err_clone; - return 0; -out_err_clone: - kmem_cache_destroy(rbd_obj_request_cache); - rbd_obj_request_cache = NULL; out_err: kmem_cache_destroy(rbd_img_request_cache); rbd_img_request_cache = NULL; @@ -6415,10 +6278,6 @@ static void rbd_slab_exit(void) rbd_assert(rbd_img_request_cache); kmem_cache_destroy(rbd_img_request_cache); rbd_img_request_cache = NULL; - - rbd_assert(rbd_bio_clone); - bioset_free(rbd_bio_clone); - rbd_bio_clone = NULL; } static int __init rbd_init(void) -- cgit v1.1 From 06fbb6993504974db6334a80b6796d6522ad45eb Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 20 Jan 2018 10:30:10 +0100 Subject: rbd: don't (ab)use obj_req->pages for stat requests obj_req->pages is for provided data buffers. stat requests are internal and should be NODATA. 
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- drivers/block/rbd.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8b90473..aa3f6a6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2645,11 +2645,9 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; struct rbd_obj_request *stat_request; struct page **pages; - u32 page_count; - size_t size; int ret; - stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES); + stat_request = rbd_obj_request_create(OBJ_REQUEST_NODATA); if (!stat_request) return -ENOMEM; @@ -2670,22 +2668,19 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) * le32 tv_nsec; * } mtime; */ - size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); - page_count = (u32)calc_pages_for(0, size); - pages = ceph_alloc_page_vector(page_count, GFP_NOIO); + pages = ceph_alloc_page_vector(1, GFP_NOIO); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto fail_stat_request; } osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0); - osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, - false, false); + osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, + 8 + sizeof(struct ceph_timespec), + 0, false, true); rbd_obj_request_get(obj_request); stat_request->obj_request = obj_request; - stat_request->pages = pages; - stat_request->page_count = page_count; stat_request->callback = rbd_img_obj_exists_callback; rbd_obj_request_submit(stat_request); -- cgit v1.1 From f9dcbc44cd317ee3c5e443db7f9a62f52689f08e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 20 Jan 2018 10:30:11 +0100 Subject: rbd: get rid of img_req->copyup_pages The initiating object request is the proper owner -- save a bit of space. 
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- drivers/block/rbd.c | 43 +++++++++---------------------------------- 1 file changed, 9 insertions(+), 34 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index aa3f6a6..722422e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -309,8 +309,6 @@ struct rbd_img_request { struct request *rq; /* block request */ struct rbd_obj_request *obj_request; /* obj req initiator */ }; - struct page **copyup_pages; - u32 copyup_page_count; spinlock_t completion_lock;/* protects next_completion */ u32 next_completion; rbd_img_callback_t callback; @@ -1940,6 +1938,9 @@ static void rbd_obj_request_destroy(struct kref *kref) break; } + ceph_release_page_vector(obj_request->copyup_pages, + obj_request->copyup_page_count); + kmem_cache_free(rbd_obj_request_cache, obj_request); } @@ -2372,8 +2373,6 @@ rbd_osd_copyup_callback(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; struct rbd_device *rbd_dev; - struct page **pages; - u32 page_count; dout("%s: obj %p\n", __func__, obj_request); @@ -2386,14 +2385,6 @@ rbd_osd_copyup_callback(struct rbd_obj_request *obj_request) rbd_dev = img_request->rbd_dev; rbd_assert(rbd_dev); - pages = obj_request->copyup_pages; - rbd_assert(pages != NULL); - obj_request->copyup_pages = NULL; - page_count = obj_request->copyup_page_count; - rbd_assert(page_count); - obj_request->copyup_page_count = 0; - ceph_release_page_vector(pages, page_count); - /* * We want the transfer count to reflect the size of the * original write request. 
There is no such thing as a @@ -2412,9 +2403,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) struct rbd_obj_request *orig_request; struct ceph_osd_request *osd_req; struct rbd_device *rbd_dev; - struct page **pages; enum obj_operation_type op_type; - u32 page_count; int img_result; u64 parent_length; @@ -2422,13 +2411,6 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) /* First get what we need from the image request */ - pages = img_request->copyup_pages; - rbd_assert(pages != NULL); - img_request->copyup_pages = NULL; - page_count = img_request->copyup_page_count; - rbd_assert(page_count); - img_request->copyup_page_count = 0; - orig_request = img_request->obj_request; rbd_assert(orig_request != NULL); rbd_assert(obj_request_type_valid(orig_request->type)); @@ -2447,7 +2429,6 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) * and re-submit the original write request. */ if (!rbd_dev->parent_overlap) { - ceph_release_page_vector(pages, page_count); rbd_obj_request_submit(orig_request); return; } @@ -2467,14 +2448,12 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) goto out_err; rbd_osd_req_destroy(orig_request->osd_req); orig_request->osd_req = osd_req; - orig_request->copyup_pages = pages; - orig_request->copyup_page_count = page_count; /* Initialize the copyup op */ osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); - osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, - false, false); + osd_req_op_cls_request_data_pages(osd_req, 0, orig_request->copyup_pages, + parent_length, 0, false, false); /* Add the other op(s) */ @@ -2487,7 +2466,6 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) return; out_err: - ceph_release_page_vector(pages, page_count); rbd_obj_request_error(orig_request, img_result); } @@ -2542,10 +2520,13 @@ static int rbd_img_obj_parent_read_full(struct 
rbd_obj_request *obj_request) pages = ceph_alloc_page_vector(page_count, GFP_NOIO); if (IS_ERR(pages)) { result = PTR_ERR(pages); - pages = NULL; goto out_err; } + rbd_assert(!obj_request->copyup_pages); + obj_request->copyup_pages = pages; + obj_request->copyup_page_count = page_count; + result = -ENOMEM; parent_request = rbd_parent_request_create(obj_request, img_offset, length); @@ -2556,19 +2537,13 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) if (result) goto out_err; - parent_request->copyup_pages = pages; - parent_request->copyup_page_count = page_count; parent_request->callback = rbd_img_obj_parent_read_full_callback; result = rbd_img_request_submit(parent_request); if (!result) return 0; - parent_request->copyup_pages = NULL; - parent_request->copyup_page_count = 0; out_err: - if (pages) - ceph_release_page_vector(pages, page_count); if (parent_request) rbd_img_request_put(parent_request); return result; -- cgit v1.1 From 7e07efb12db96c2f7c5fafeccada327d1f869e60 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 20 Jan 2018 10:30:11 +0100 Subject: rbd: move from raw pages to bvec data descriptors In preparation for rbd "fancy" striping which requires bio_vec arrays, wire up BVECS data type and kill off PAGES data type. There is nothing wrong with using page vectors for copyup requests -- it's just less iterator boilerplate code to write for the new striping framework. 
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- drivers/block/rbd.c | 155 ++++++++++++++++++++++++++-------------------------- 1 file changed, 77 insertions(+), 78 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 722422e..bff3e13 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -219,7 +219,7 @@ typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); enum obj_request_type { OBJ_REQUEST_NODATA = 1, OBJ_REQUEST_BIO, /* pointer into provided bio (list) */ - OBJ_REQUEST_PAGES, + OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */ }; enum obj_operation_type { @@ -272,12 +272,12 @@ struct rbd_obj_request { union { struct ceph_bio_iter bio_pos; struct { - struct page **pages; - u32 page_count; + struct ceph_bvec_iter bvec_pos; + u32 bvec_count; }; }; - struct page **copyup_pages; - u32 copyup_page_count; + struct bio_vec *copyup_bvecs; + u32 copyup_bvec_count; struct ceph_osd_request *osd_req; @@ -1272,36 +1272,14 @@ static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes) })); } -/* - * similar to zero_bio_chain(), zeros data defined by a page array, - * starting at the given byte offset from the start of the array and - * continuing up to the given end offset. The pages array is - * assumed to be big enough to hold all bytes up to the end. 
- */ -static void zero_pages(struct page **pages, u64 offset, u64 end) +static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) { - struct page **page = &pages[offset >> PAGE_SHIFT]; - - rbd_assert(end > offset); - rbd_assert(end - offset <= (u64)SIZE_MAX); - while (offset < end) { - size_t page_offset; - size_t length; - unsigned long flags; - void *kaddr; - - page_offset = offset & ~PAGE_MASK; - length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); - local_irq_save(flags); - kaddr = kmap_atomic(*page); - memset(kaddr + page_offset, 0, length); - flush_dcache_page(*page); - kunmap_atomic(kaddr); - local_irq_restore(flags); + struct ceph_bvec_iter it = *bvec_pos; - offset += length; - page++; - } + ceph_bvec_iter_advance(&it, off); + ceph_bvec_iter_advance_step(&it, bytes, ({ + zero_bvec(&bv); + })); } /* @@ -1461,7 +1439,7 @@ static bool obj_request_type_valid(enum obj_request_type type) switch (type) { case OBJ_REQUEST_NODATA: case OBJ_REQUEST_BIO: - case OBJ_REQUEST_PAGES: + case OBJ_REQUEST_BVECS: return true; default: return false; @@ -1611,14 +1589,15 @@ rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) if (obj_request->type == OBJ_REQUEST_BIO) zero_bios(&obj_request->bio_pos, 0, length); else - zero_pages(obj_request->pages, 0, length); + zero_bvecs(&obj_request->bvec_pos, 0, length); obj_request->result = 0; } else if (xferred < length && !obj_request->result) { if (obj_request->type == OBJ_REQUEST_BIO) zero_bios(&obj_request->bio_pos, xferred, length - xferred); else - zero_pages(obj_request->pages, xferred, length); + zero_bvecs(&obj_request->bvec_pos, xferred, + length - xferred); } obj_request->xferred = length; obj_request_done_set(obj_request); @@ -1913,6 +1892,7 @@ rbd_obj_request_create(enum obj_request_type type) static void rbd_obj_request_destroy(struct kref *kref) { struct rbd_obj_request *obj_request; + u32 i; obj_request = container_of(kref, struct rbd_obj_request, kref); @@ -1924,22 +1904,22 
@@ static void rbd_obj_request_destroy(struct kref *kref) if (obj_request->osd_req) rbd_osd_req_destroy(obj_request->osd_req); - rbd_assert(obj_request_type_valid(obj_request->type)); switch (obj_request->type) { case OBJ_REQUEST_NODATA: case OBJ_REQUEST_BIO: + case OBJ_REQUEST_BVECS: break; /* Nothing to do */ - case OBJ_REQUEST_PAGES: - /* img_data requests don't own their page array */ - if (obj_request->pages && - !obj_request_img_data_test(obj_request)) - ceph_release_page_vector(obj_request->pages, - obj_request->page_count); - break; + default: + rbd_assert(0); } - ceph_release_page_vector(obj_request->copyup_pages, - obj_request->copyup_page_count); + if (obj_request->copyup_bvecs) { + for (i = 0; i < obj_request->copyup_bvec_count; i++) { + if (obj_request->copyup_bvecs[i].bv_page) + __free_page(obj_request->copyup_bvecs[i].bv_page); + } + kfree(obj_request->copyup_bvecs); + } kmem_cache_free(rbd_obj_request_cache, obj_request); } @@ -2260,10 +2240,9 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, if (obj_request->type == OBJ_REQUEST_BIO) osd_req_op_extent_osd_data_bio(osd_request, num_ops, &obj_request->bio_pos, length); - else if (obj_request->type == OBJ_REQUEST_PAGES) - osd_req_op_extent_osd_data_pages(osd_request, num_ops, - obj_request->pages, length, - offset & ~PAGE_MASK, false, false); + else if (obj_request->type == OBJ_REQUEST_BVECS) + osd_req_op_extent_osd_data_bvec_pos(osd_request, num_ops, + &obj_request->bvec_pos); /* Discards are also writes */ if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) @@ -2288,7 +2267,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, struct rbd_obj_request *obj_request = NULL; struct rbd_obj_request *next_obj_request; struct ceph_bio_iter bio_it; - struct page **pages = NULL; + struct ceph_bvec_iter bvec_it; enum obj_operation_type op_type; u64 img_offset; u64 resid; @@ -2305,8 +2284,8 @@ static int rbd_img_request_fill(struct rbd_img_request 
*img_request, bio_it = *(struct ceph_bio_iter *)data_desc; rbd_assert(img_offset == bio_it.iter.bi_sector << SECTOR_SHIFT); - } else if (type == OBJ_REQUEST_PAGES) { - pages = data_desc; + } else if (type == OBJ_REQUEST_BVECS) { + bvec_it = *(struct ceph_bvec_iter *)data_desc; } while (resid) { @@ -2332,15 +2311,10 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, if (type == OBJ_REQUEST_BIO) { obj_request->bio_pos = bio_it; ceph_bio_iter_advance(&bio_it, length); - } else if (type == OBJ_REQUEST_PAGES) { - unsigned int page_count; - - obj_request->pages = pages; - page_count = (u32)calc_pages_for(offset, length); - obj_request->page_count = page_count; - if ((offset + length) & ~PAGE_MASK) - page_count--; /* more on last page */ - pages += page_count; + } else if (type == OBJ_REQUEST_BVECS) { + obj_request->bvec_pos = bvec_it; + ceph_bvec_iter_shorten(&obj_request->bvec_pos, length); + ceph_bvec_iter_advance(&bvec_it, length); } osd_req = rbd_osd_req_create(rbd_dev, op_type, @@ -2452,8 +2426,8 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) /* Initialize the copyup op */ osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); - osd_req_op_cls_request_data_pages(osd_req, 0, orig_request->copyup_pages, - parent_length, 0, false, false); + osd_req_op_cls_request_data_bvecs(osd_req, 0, orig_request->copyup_bvecs, + parent_length); /* Add the other op(s) */ @@ -2469,6 +2443,8 @@ out_err: rbd_obj_request_error(orig_request, img_result); } +static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap); + /* * Read from the parent image the range of data that covers the * entire target of the given object request. 
This is used for @@ -2487,10 +2463,9 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) { struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; struct rbd_img_request *parent_request = NULL; + struct ceph_bvec_iter bvec_it = { 0 }; u64 img_offset; u64 length; - struct page **pages = NULL; - u32 page_count; int result; rbd_assert(rbd_dev->parent != NULL); @@ -2516,16 +2491,9 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) * Allocate a page array big enough to receive the data read * from the parent. */ - page_count = (u32)calc_pages_for(0, length); - pages = ceph_alloc_page_vector(page_count, GFP_NOIO); - if (IS_ERR(pages)) { - result = PTR_ERR(pages); + result = setup_copyup_bvecs(obj_request, length); + if (result) goto out_err; - } - - rbd_assert(!obj_request->copyup_pages); - obj_request->copyup_pages = pages; - obj_request->copyup_page_count = page_count; result = -ENOMEM; parent_request = rbd_parent_request_create(obj_request, @@ -2533,7 +2501,10 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) if (!parent_request) goto out_err; - result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); + bvec_it.bvecs = obj_request->copyup_bvecs; + bvec_it.iter.bi_size = length; + result = rbd_img_request_fill(parent_request, OBJ_REQUEST_BVECS, + &bvec_it); if (result) goto out_err; @@ -2751,6 +2722,34 @@ out_put_ireq: return ret; } +static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) +{ + u32 i; + + rbd_assert(!obj_req->copyup_bvecs); + obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap); + obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count, + sizeof(*obj_req->copyup_bvecs), + GFP_NOIO); + if (!obj_req->copyup_bvecs) + return -ENOMEM; + + for (i = 0; i < obj_req->copyup_bvec_count; i++) { + unsigned int len = min(obj_overlap, (u64)PAGE_SIZE); + + obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO); + if 
(!obj_req->copyup_bvecs[i].bv_page) + return -ENOMEM; + + obj_req->copyup_bvecs[i].bv_offset = 0; + obj_req->copyup_bvecs[i].bv_len = len; + obj_overlap -= len; + } + + rbd_assert(!obj_overlap); + return 0; +} + static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) { struct rbd_obj_request *obj_request; @@ -2832,8 +2831,8 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request) result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, &obj_request->bio_pos); else - result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, - obj_request->pages); + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BVECS, + &obj_request->bvec_pos); if (result) goto out_err; -- cgit v1.1 From 3da691bf436690c4bb943d5d16e5934937625578 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 29 Jan 2018 14:04:08 +0100 Subject: rbd: new request handling code The notable changes are: - instead of explicitly stat'ing the object to see if it exists before issuing the write, send the write optimistically along with the stat in a single OSD request - zero copyup optimization - all object requests are associated with an image request and have a valid ->img_request pointer; there are no standalone (!IMG_DATA) object requests anymore - code is structured as a state machine (vs a bunch of callbacks with implicit state) Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 678 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 601 insertions(+), 77 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bff3e13..1bffad1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -235,11 +235,37 @@ enum obj_req_flags { OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ }; +/* + * Writes go through the following state machine to deal with + * layering: + * + * need copyup + * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP + * | ^ | + * v \------------------------------/ 
+ * done + * ^ + * | + * RBD_OBJ_WRITE_FLAT + * + * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether + * there is a parent or not. + */ +enum rbd_obj_write_state { + RBD_OBJ_WRITE_FLAT = 1, + RBD_OBJ_WRITE_GUARD, + RBD_OBJ_WRITE_COPYUP, +}; + struct rbd_obj_request { u64 object_no; u64 offset; /* object start byte */ u64 length; /* bytes from offset */ unsigned long flags; + union { + bool tried_parent; /* for reads */ + enum rbd_obj_write_state write_state; /* for writes */ + }; /* * An object request associated with an image will have its @@ -1283,6 +1309,27 @@ static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) } /* + * Zero a range in @obj_req data buffer defined by a bio (list) or + * bio_vec array. + * + * @off is relative to the start of the data buffer. + */ +static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, + u32 bytes) +{ + switch (obj_req->type) { + case OBJ_REQUEST_BIO: + zero_bios(&obj_req->bio_pos, off, bytes); + break; + case OBJ_REQUEST_BVECS: + zero_bvecs(&obj_req->bvec_pos, off, bytes); + break; + default: + rbd_assert(0); + } +} + +/* * The default/initial value for all object request flags is 0. For * each flag, once its value is set to 1 it is never reset to 0 * again. 
@@ -1567,6 +1614,35 @@ rbd_img_request_op_type(struct rbd_img_request *img_request) return OBJ_OP_READ; } +static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + + return !obj_req->offset && + obj_req->length == rbd_dev->layout.object_size; +} + +static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + + return obj_req->offset + obj_req->length == + rbd_dev->layout.object_size; +} + +static bool rbd_img_is_write(struct rbd_img_request *img_req) +{ + switch (rbd_img_request_op_type(img_req)) { + case OBJ_OP_READ: + return false; + case OBJ_OP_WRITE: + case OBJ_OP_DISCARD: + return true; + default: + rbd_assert(0); + } +} + static void rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) { @@ -1697,63 +1773,28 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) obj_request_done_set(obj_request); } +static void rbd_obj_handle_request(struct rbd_obj_request *obj_req); + static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) { - struct rbd_obj_request *obj_request = osd_req->r_priv; - u16 opcode; - - dout("%s: osd_req %p\n", __func__, osd_req); - rbd_assert(osd_req == obj_request->osd_req); - if (obj_request_img_data_test(obj_request)) { - rbd_assert(obj_request->img_request); - rbd_assert(obj_request->which != BAD_WHICH); - } else { - rbd_assert(obj_request->which == BAD_WHICH); - } + struct rbd_obj_request *obj_req = osd_req->r_priv; - if (osd_req->r_result < 0) - obj_request->result = osd_req->r_result; + dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req, + osd_req->r_result, obj_req); + rbd_assert(osd_req == obj_req->osd_req); - /* - * We support a 64-bit length, but ultimately it has to be - * passed to the block layer, which just supports a 32-bit - * length field. 
- */ - obj_request->xferred = osd_req->r_ops[0].outdata_len; - rbd_assert(obj_request->xferred < (u64)UINT_MAX); - - opcode = osd_req->r_ops[0].op; - switch (opcode) { - case CEPH_OSD_OP_READ: - rbd_osd_read_callback(obj_request); - break; - case CEPH_OSD_OP_SETALLOCHINT: - rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE || - osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL); - /* fall through */ - case CEPH_OSD_OP_WRITE: - case CEPH_OSD_OP_WRITEFULL: - rbd_osd_write_callback(obj_request); - break; - case CEPH_OSD_OP_STAT: - rbd_osd_stat_callback(obj_request); - break; - case CEPH_OSD_OP_DELETE: - case CEPH_OSD_OP_TRUNCATE: - case CEPH_OSD_OP_ZERO: - rbd_osd_discard_callback(obj_request); - break; - case CEPH_OSD_OP_CALL: - rbd_osd_call_callback(obj_request); - break; - default: - rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d", - obj_request->object_no, opcode); - break; - } + obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0; + if (!obj_req->result && !rbd_img_is_write(obj_req->img_request)) + obj_req->xferred = osd_req->r_result; + else + /* + * Writes aren't allowed to return a data payload. In some + * guarded write cases (e.g. stat + zero on an empty object) + * a stat response makes it through, but we don't care. + */ + obj_req->xferred = 0; - if (obj_request_done_test(obj_request)) - rbd_obj_request_complete(obj_request); + rbd_obj_handle_request(obj_req); } static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) @@ -1806,12 +1847,6 @@ err_req: return NULL; } -/* - * Create an osd request. A read request has one osd op (read). - * A write request has either one (watch) or two (hint+write) osd ops. - * (All rbd data writes are prefixed with an allocation hint op, but - * technically osd watch is a write request, hence this distinction.) 
- */ static struct ceph_osd_request *rbd_osd_req_create( struct rbd_device *rbd_dev, enum obj_operation_type op_type, @@ -1831,8 +1866,6 @@ static struct ceph_osd_request *rbd_osd_req_create( snapc = img_request->snapc; } - rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); - return __rbd_osd_req_create(rbd_dev, snapc, num_ops, (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ? CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request); @@ -2251,6 +2284,211 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, rbd_osd_req_format_read(obj_request); } +static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) +{ + switch (obj_req->type) { + case OBJ_REQUEST_BIO: + osd_req_op_extent_osd_data_bio(obj_req->osd_req, which, + &obj_req->bio_pos, + obj_req->length); + break; + case OBJ_REQUEST_BVECS: + rbd_assert(obj_req->bvec_pos.iter.bi_size == + obj_req->length); + osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which, + &obj_req->bvec_pos); + break; + default: + rbd_assert(0); + } +} + +static int rbd_obj_setup_read(struct rbd_obj_request *obj_req) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + + obj_req->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, obj_req); + if (!obj_req->osd_req) + return -ENOMEM; + + osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ, + obj_req->offset, obj_req->length, 0, 0); + rbd_osd_req_setup_data(obj_req, 0); + + rbd_osd_req_format_read(obj_req); + return 0; +} + +static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req, + unsigned int which) +{ + struct page **pages; + + /* + * The response data for a STAT call consists of: + * le64 length; + * struct { + * le32 tv_sec; + * le32 tv_nsec; + * } mtime; + */ + pages = ceph_alloc_page_vector(1, GFP_NOIO); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0); + osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages, 
+ 8 + sizeof(struct ceph_timespec), + 0, false, true); + return 0; +} + +static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req, + unsigned int which) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + u16 opcode; + + osd_req_op_alloc_hint_init(obj_req->osd_req, which++, + rbd_dev->layout.object_size, + rbd_dev->layout.object_size); + + if (rbd_obj_is_entire(obj_req)) + opcode = CEPH_OSD_OP_WRITEFULL; + else + opcode = CEPH_OSD_OP_WRITE; + + osd_req_op_extent_init(obj_req->osd_req, which, opcode, + obj_req->offset, obj_req->length, 0, 0); + rbd_osd_req_setup_data(obj_req, which++); + + rbd_assert(which == obj_req->osd_req->r_num_ops); + rbd_osd_req_format_write(obj_req); +} + +static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + unsigned int num_osd_ops, which = 0; + int ret; + + if (obj_request_overlaps_parent(obj_req)) { + obj_req->write_state = RBD_OBJ_WRITE_GUARD; + num_osd_ops = 3; /* stat + setallochint + write/writefull */ + } else { + obj_req->write_state = RBD_OBJ_WRITE_FLAT; + num_osd_ops = 2; /* setallochint + write/writefull */ + } + + obj_req->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, + num_osd_ops, obj_req); + if (!obj_req->osd_req) + return -ENOMEM; + + if (obj_request_overlaps_parent(obj_req)) { + ret = __rbd_obj_setup_stat(obj_req, which++); + if (ret) + return ret; + } + + __rbd_obj_setup_write(obj_req, which); + return 0; +} + +static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req, + unsigned int which) +{ + u16 opcode; + + if (rbd_obj_is_entire(obj_req)) { + if (obj_request_overlaps_parent(obj_req)) { + opcode = CEPH_OSD_OP_TRUNCATE; + } else { + osd_req_op_init(obj_req->osd_req, which++, + CEPH_OSD_OP_DELETE, 0); + opcode = 0; + } + } else if (rbd_obj_is_tail(obj_req)) { + opcode = CEPH_OSD_OP_TRUNCATE; + } else { + opcode = CEPH_OSD_OP_ZERO; + } + + if (opcode) + osd_req_op_extent_init(obj_req->osd_req, which++, 
opcode, + obj_req->offset, obj_req->length, + 0, 0); + + rbd_assert(which == obj_req->osd_req->r_num_ops); + rbd_osd_req_format_write(obj_req); +} + +static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + unsigned int num_osd_ops, which = 0; + int ret; + + if (rbd_obj_is_entire(obj_req)) { + obj_req->write_state = RBD_OBJ_WRITE_FLAT; + num_osd_ops = 1; /* truncate/delete */ + } else { + if (obj_request_overlaps_parent(obj_req)) { + obj_req->write_state = RBD_OBJ_WRITE_GUARD; + num_osd_ops = 2; /* stat + truncate/zero */ + } else { + obj_req->write_state = RBD_OBJ_WRITE_FLAT; + num_osd_ops = 1; /* truncate/zero */ + } + } + + obj_req->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_DISCARD, + num_osd_ops, obj_req); + if (!obj_req->osd_req) + return -ENOMEM; + + if (!rbd_obj_is_entire(obj_req) && + obj_request_overlaps_parent(obj_req)) { + ret = __rbd_obj_setup_stat(obj_req, which++); + if (ret) + return ret; + } + + __rbd_obj_setup_discard(obj_req, which); + return 0; +} + +/* + * For each object request in @img_req, allocate an OSD request, add + * individual OSD ops and prepare them for submission. The number of + * OSD ops depends on op_type and the overlap point (if any). + */ +static int __rbd_img_fill_request(struct rbd_img_request *img_req) +{ + struct rbd_obj_request *obj_req; + int ret; + + for_each_obj_request(img_req, obj_req) { + switch (rbd_img_request_op_type(img_req)) { + case OBJ_OP_READ: + ret = rbd_obj_setup_read(obj_req); + break; + case OBJ_OP_WRITE: + ret = rbd_obj_setup_write(obj_req); + break; + case OBJ_OP_DISCARD: + ret = rbd_obj_setup_discard(obj_req); + break; + default: + rbd_assert(0); + } + if (ret) + return ret; + } + + return 0; +} + /* * Split up an image request into one or more object requests, each * to a different object. 
The "type" parameter indicates whether @@ -2268,7 +2506,6 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, struct rbd_obj_request *next_obj_request; struct ceph_bio_iter bio_it; struct ceph_bvec_iter bvec_it; - enum obj_operation_type op_type; u64 img_offset; u64 resid; @@ -2278,7 +2515,6 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, img_offset = img_request->offset; resid = img_request->length; rbd_assert(resid > 0); - op_type = rbd_img_request_op_type(img_request); if (type == OBJ_REQUEST_BIO) { bio_it = *(struct ceph_bio_iter *)data_desc; @@ -2289,7 +2525,6 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, } while (resid) { - struct ceph_osd_request *osd_req; u64 object_no = img_offset >> rbd_dev->header.obj_order; u64 offset = rbd_segment_offset(rbd_dev, img_offset); u64 length = rbd_segment_length(rbd_dev, img_offset, resid); @@ -2317,23 +2552,14 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, ceph_bvec_iter_advance(&bvec_it, length); } - osd_req = rbd_osd_req_create(rbd_dev, op_type, - (op_type == OBJ_OP_WRITE) ? 
2 : 1, - obj_request); - if (!osd_req) - goto out_unwind; - - obj_request->osd_req = osd_req; obj_request->callback = rbd_img_obj_callback; obj_request->img_offset = img_offset; - rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); - img_offset += length; resid -= length; } - return 0; + return __rbd_img_fill_request(img_request); out_unwind: for_each_obj_request_safe(img_request, obj_request, next_obj_request) @@ -2712,16 +2938,171 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request) rbd_img_request_get(img_request); for_each_obj_request_safe(img_request, obj_request, next_obj_request) { - ret = rbd_img_obj_request_submit(obj_request); - if (ret) - goto out_put_ireq; + rbd_obj_request_submit(obj_request); } -out_put_ireq: rbd_img_request_put(img_request); return ret; } +static void rbd_img_end_child_request(struct rbd_img_request *img_req); + +static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req, + u64 img_offset, u32 bytes) +{ + struct rbd_img_request *img_req = obj_req->img_request; + struct rbd_img_request *child_img_req; + int ret; + + child_img_req = rbd_parent_request_create(obj_req, img_offset, bytes); + if (!child_img_req) + return -ENOMEM; + + child_img_req->callback = rbd_img_end_child_request; + + if (!rbd_img_is_write(img_req)) { + switch (obj_req->type) { + case OBJ_REQUEST_BIO: + ret = rbd_img_request_fill(child_img_req, + OBJ_REQUEST_BIO, + &obj_req->bio_pos); + break; + case OBJ_REQUEST_BVECS: + ret = rbd_img_request_fill(child_img_req, + OBJ_REQUEST_BVECS, + &obj_req->bvec_pos); + break; + default: + rbd_assert(0); + } + } else { + struct ceph_bvec_iter it = { + .bvecs = obj_req->copyup_bvecs, + .iter = { .bi_size = bytes }, + }; + + ret = rbd_img_request_fill(child_img_req, OBJ_REQUEST_BVECS, + &it); + } + if (ret) { + rbd_img_request_put(child_img_req); + return ret; + } + + rbd_img_request_submit(child_img_req); + return 0; +} + +static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req) +{ + 
struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + int ret; + + if (obj_req->result == -ENOENT && + obj_req->img_offset < rbd_dev->parent_overlap && + !obj_req->tried_parent) { + u64 obj_overlap = min(obj_req->length, + rbd_dev->parent_overlap - obj_req->img_offset); + + obj_req->tried_parent = true; + ret = rbd_obj_read_from_parent(obj_req, obj_req->img_offset, + obj_overlap); + if (ret) { + obj_req->result = ret; + return true; + } + return false; + } + + /* + * -ENOENT means a hole in the image -- zero-fill the entire + * length of the request. A short read also implies zero-fill + * to the end of the request. In both cases we update xferred + * count to indicate the whole request was satisfied. + */ + if (obj_req->result == -ENOENT || + (!obj_req->result && obj_req->xferred < obj_req->length)) { + rbd_assert(!obj_req->xferred || !obj_req->result); + rbd_obj_zero_range(obj_req, obj_req->xferred, + obj_req->length - obj_req->xferred); + obj_req->result = 0; + obj_req->xferred = obj_req->length; + } + + return true; +} + +/* + * copyup_bvecs pages are never highmem pages + */ +static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) +{ + struct ceph_bvec_iter it = { + .bvecs = bvecs, + .iter = { .bi_size = bytes }, + }; + + ceph_bvec_iter_advance_step(&it, bytes, ({ + if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0, + bv.bv_len)) + return false; + })); + return true; +} + +static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + unsigned int num_osd_ops = obj_req->osd_req->r_num_ops; + + dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); + rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT); + rbd_osd_req_destroy(obj_req->osd_req); + + /* + * Create a copyup request with the same number of OSD ops as + * the original request. The original request was stat + op(s), + * the new copyup request will be copyup + the same op(s). 
+ */ + obj_req->osd_req = rbd_osd_req_create(rbd_dev, + rbd_img_request_op_type(obj_req->img_request), + num_osd_ops, obj_req); + if (!obj_req->osd_req) + return -ENOMEM; + + /* + * Only send non-zero copyup data to save some I/O and network + * bandwidth -- zero copyup data is equivalent to the object not + * existing. + */ + if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) { + dout("%s obj_req %p detected zeroes\n", __func__, obj_req); + bytes = 0; + } + + osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd", + "copyup"); + osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0, + obj_req->copyup_bvecs, bytes); + + switch (rbd_img_request_op_type(obj_req->img_request)) { + case OBJ_OP_WRITE: + __rbd_obj_setup_write(obj_req, 1); + break; + case OBJ_OP_DISCARD: + rbd_assert(!rbd_obj_is_entire(obj_req)); + __rbd_obj_setup_discard(obj_req, 1); + break; + default: + rbd_assert(0); + } + + rbd_obj_request_submit(obj_req); + /* FIXME: in lieu of rbd_img_obj_callback() */ + rbd_img_request_put(obj_req->img_request); + return 0; +} + static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) { u32 i; @@ -2850,6 +3231,149 @@ out_err: obj_request_done_set(obj_request); } +static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + u64 img_offset; + u64 obj_overlap; + int ret; + + if (!obj_request_overlaps_parent(obj_req)) { + /* + * The overlap has become 0 (most likely because the + * image has been flattened). Use rbd_obj_issue_copyup() + * to re-submit the original write request -- the copyup + * operation itself will be a no-op, since someone must + * have populated the child object while we weren't + * looking. Move to WRITE_FLAT state as we'll be done + * with the operation once the null copyup completes. 
+ */ + obj_req->write_state = RBD_OBJ_WRITE_FLAT; + return rbd_obj_issue_copyup(obj_req, 0); + } + + /* + * Determine the byte range covered by the object in the + * child image to which the original request was to be sent. + */ + img_offset = obj_req->img_offset - obj_req->offset; + obj_overlap = rbd_dev->layout.object_size; + + /* + * There is no defined parent data beyond the parent + * overlap, so limit what we read at that boundary if + * necessary. + */ + if (img_offset + obj_overlap > rbd_dev->parent_overlap) { + rbd_assert(img_offset < rbd_dev->parent_overlap); + obj_overlap = rbd_dev->parent_overlap - img_offset; + } + + ret = setup_copyup_bvecs(obj_req, obj_overlap); + if (ret) + return ret; + + obj_req->write_state = RBD_OBJ_WRITE_COPYUP; + return rbd_obj_read_from_parent(obj_req, img_offset, obj_overlap); +} + +static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req) +{ + int ret; + +again: + switch (obj_req->write_state) { + case RBD_OBJ_WRITE_GUARD: + rbd_assert(!obj_req->xferred); + if (obj_req->result == -ENOENT) { + /* + * The target object doesn't exist. Read the data for + * the entire target object up to the overlap point (if + * any) from the parent, so we can use it for a copyup. + */ + ret = rbd_obj_handle_write_guard(obj_req); + if (ret) { + obj_req->result = ret; + return true; + } + return false; + } + /* fall through */ + case RBD_OBJ_WRITE_FLAT: + if (!obj_req->result) + /* + * There is no such thing as a successful short + * write -- indicate the whole request was satisfied. + */ + obj_req->xferred = obj_req->length; + return true; + case RBD_OBJ_WRITE_COPYUP: + obj_req->write_state = RBD_OBJ_WRITE_GUARD; + if (obj_req->result) + goto again; + + rbd_assert(obj_req->xferred); + ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred); + if (ret) { + obj_req->result = ret; + return true; + } + return false; + default: + rbd_assert(0); + } +} + +/* + * Returns true if @obj_req is completed, or false otherwise. 
+ */ +static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req) +{ + switch (rbd_img_request_op_type(obj_req->img_request)) { + case OBJ_OP_READ: + return rbd_obj_handle_read(obj_req); + case OBJ_OP_WRITE: + return rbd_obj_handle_write(obj_req); + case OBJ_OP_DISCARD: + if (rbd_obj_handle_write(obj_req)) { + /* + * Hide -ENOENT from delete/truncate/zero -- discarding + * a non-existent object is not a problem. + */ + if (obj_req->result == -ENOENT) { + obj_req->result = 0; + obj_req->xferred = obj_req->length; + } + return true; + } + return false; + default: + rbd_assert(0); + } +} + +static void rbd_img_end_child_request(struct rbd_img_request *img_req) +{ + struct rbd_obj_request *obj_req = img_req->obj_request; + + rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags)); + + obj_req->result = img_req->result; + obj_req->xferred = img_req->xferred; + rbd_img_request_put(img_req); + + rbd_obj_handle_request(obj_req); +} + +static void rbd_obj_handle_request(struct rbd_obj_request *obj_req) +{ + if (!__rbd_obj_handle_request(obj_req)) + return; + + obj_request_done_set(obj_req); + rbd_obj_request_complete(obj_req); +} + static const struct rbd_client_id rbd_empty_cid; static bool rbd_cid_equal(const struct rbd_client_id *lhs, -- cgit v1.1 From 51c3509e5e167cf0fdc82c81f4d85da46b1ee1ee Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 29 Jan 2018 14:04:08 +0100 Subject: rbd: remove old request handling code Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 734 +--------------------------------------------------- 1 file changed, 4 insertions(+), 730 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1bffad1..e7e99e7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -231,8 +231,6 @@ enum obj_operation_type { enum obj_req_flags { OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ - OBJ_REQ_KNOWN, /* EXISTS flag 
valid: no = 0, yes = 1 */ - OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ }; /* @@ -271,27 +269,15 @@ struct rbd_obj_request { * An object request associated with an image will have its * img_data flag set; a standalone object request will not. * - * A standalone object request will have which == BAD_WHICH - * and a null obj_request pointer. - * - * An object request initiated in support of a layered image - * object (to check for its existence before a write) will - * have which == BAD_WHICH and a non-null obj_request pointer. - * * Finally, an object request for rbd image data will have * which != BAD_WHICH, and will have a non-null img_request * pointer. The value of which will be in the range * 0..(img_request->obj_request_count-1). */ - union { - struct rbd_obj_request *obj_request; /* STAT op */ - struct { - struct rbd_img_request *img_request; - u64 img_offset; - /* links for img_request->obj_requests list */ - struct list_head links; - }; - }; + struct rbd_img_request *img_request; + u64 img_offset; + /* links for img_request->obj_requests list */ + struct list_head links; u32 which; /* posn image request list */ enum obj_request_type type; @@ -480,8 +466,6 @@ static bool single_major = true; module_param(single_major, bool, S_IRUGO); MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)"); -static int rbd_img_request_submit(struct rbd_img_request *img_request); - static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, @@ -610,9 +594,6 @@ void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 
# define rbd_assert(expr) ((void) 0) #endif /* !RBD_DEBUG */ -static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request); -static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); -static void rbd_img_parent_read(struct rbd_obj_request *obj_request); static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); static int rbd_dev_refresh(struct rbd_device *rbd_dev); @@ -1369,37 +1350,6 @@ static bool obj_request_done_test(struct rbd_obj_request *obj_request) return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; } -/* - * This sets the KNOWN flag after (possibly) setting the EXISTS - * flag. The latter is set based on the "exists" value provided. - * - * Note that for our purposes once an object exists it never goes - * away again. It's possible that the response from two existence - * checks are separated by the creation of the target object, and - * the first ("doesn't exist") response arrives *after* the second - * ("does exist"). In that case we ignore the second one. 
- */ -static void obj_request_existence_set(struct rbd_obj_request *obj_request, - bool exists) -{ - if (exists) - set_bit(OBJ_REQ_EXISTS, &obj_request->flags); - set_bit(OBJ_REQ_KNOWN, &obj_request->flags); - smp_mb(); -} - -static bool obj_request_known_test(struct rbd_obj_request *obj_request) -{ - smp_mb(); - return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; -} - -static bool obj_request_exists_test(struct rbd_obj_request *obj_request) -{ - smp_mb(); - return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; -} - static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) { struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; @@ -1643,42 +1593,6 @@ static bool rbd_img_is_write(struct rbd_img_request *img_req) } } -static void -rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) -{ - u64 xferred = obj_request->xferred; - u64 length = obj_request->length; - - dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, - obj_request, obj_request->img_request, obj_request->result, - xferred, length); - /* - * ENOENT means a hole in the image. We zero-fill the entire - * length of the request. A short read also implies zero-fill - * to the end of the request. An error requires the whole - * length of the request to be reported finished with an error - * to the block layer. In each case we update the xferred - * count to indicate the whole request was satisfied. 
- */ - rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); - if (obj_request->result == -ENOENT) { - if (obj_request->type == OBJ_REQUEST_BIO) - zero_bios(&obj_request->bio_pos, 0, length); - else - zero_bvecs(&obj_request->bvec_pos, 0, length); - obj_request->result = 0; - } else if (xferred < length && !obj_request->result) { - if (obj_request->type == OBJ_REQUEST_BIO) - zero_bios(&obj_request->bio_pos, xferred, - length - xferred); - else - zero_bvecs(&obj_request->bvec_pos, xferred, - length - xferred); - } - obj_request->xferred = length; - obj_request_done_set(obj_request); -} - static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) { dout("%s: obj %p cb %p\n", __func__, obj_request, @@ -1686,93 +1600,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) obj_request->callback(obj_request); } -static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err) -{ - obj_request->result = err; - obj_request->xferred = 0; - /* - * kludge - mirror rbd_obj_request_submit() to match a put in - * rbd_img_obj_callback() - */ - if (obj_request_img_data_test(obj_request)) { - WARN_ON(obj_request->callback != rbd_img_obj_callback); - rbd_img_request_get(obj_request->img_request); - } - obj_request_done_set(obj_request); - rbd_obj_request_complete(obj_request); -} - -static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) -{ - struct rbd_img_request *img_request = NULL; - struct rbd_device *rbd_dev = NULL; - bool layered = false; - - if (obj_request_img_data_test(obj_request)) { - img_request = obj_request->img_request; - layered = img_request && img_request_layered_test(img_request); - rbd_dev = img_request->rbd_dev; - } - - dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, - obj_request, img_request, obj_request->result, - obj_request->xferred, obj_request->length); - if (layered && obj_request->result == -ENOENT && - obj_request->img_offset < rbd_dev->parent_overlap) - 
rbd_img_parent_read(obj_request); - else if (img_request) - rbd_img_obj_request_read_callback(obj_request); - else - obj_request_done_set(obj_request); -} - -static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) -{ - dout("%s: obj %p result %d %llu\n", __func__, obj_request, - obj_request->result, obj_request->length); - /* - * There is no such thing as a successful short write. Set - * it to our originally-requested length. - */ - obj_request->xferred = obj_request->length; - obj_request_done_set(obj_request); -} - -static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) -{ - dout("%s: obj %p result %d %llu\n", __func__, obj_request, - obj_request->result, obj_request->length); - /* - * There is no such thing as a successful short discard. Set - * it to our originally-requested length. - */ - obj_request->xferred = obj_request->length; - /* discarding a non-existent object is not a problem */ - if (obj_request->result == -ENOENT) - obj_request->result = 0; - obj_request_done_set(obj_request); -} - -/* - * For a simple stat call there's nothing to do. We'll do more if - * this is part of a write sequence for a layered image. - */ -static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) -{ - dout("%s: obj %p\n", __func__, obj_request); - obj_request_done_set(obj_request); -} - -static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) -{ - dout("%s: obj %p\n", __func__, obj_request); - - if (obj_request_img_data_test(obj_request)) - rbd_osd_copyup_callback(obj_request); - else - obj_request_done_set(obj_request); -} - static void rbd_obj_handle_request(struct rbd_obj_request *obj_req); static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) @@ -1871,32 +1698,6 @@ static struct ceph_osd_request *rbd_osd_req_create( CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request); } -/* - * Create a copyup osd request based on the information in the object - * request supplied. 
A copyup request has two or three osd ops, a - * copyup method call, potentially a hint op, and a write or truncate - * or zero op. - */ -static struct ceph_osd_request * -rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) -{ - struct rbd_img_request *img_request; - int num_osd_ops = 3; - - rbd_assert(obj_request_img_data_test(obj_request)); - img_request = obj_request->img_request; - rbd_assert(img_request); - rbd_assert(img_request_write_test(img_request) || - img_request_discard_test(img_request)); - - if (img_request_discard_test(img_request)) - num_osd_ops = 2; - - return __rbd_osd_req_create(img_request->rbd_dev, - img_request->snapc, num_osd_ops, - CEPH_OSD_FLAG_WRITE, obj_request); -} - static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) { ceph_osdc_put_request(osd_req); @@ -2217,73 +2018,6 @@ out: rbd_img_request_complete(img_request); } -/* - * Add individual osd ops to the given ceph_osd_request and prepare - * them for submission. num_ops is the current number of - * osd operations already to the object request. 
- */ -static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, - struct ceph_osd_request *osd_request, - enum obj_operation_type op_type, - unsigned int num_ops) -{ - struct rbd_img_request *img_request = obj_request->img_request; - struct rbd_device *rbd_dev = img_request->rbd_dev; - u64 object_size = rbd_obj_bytes(&rbd_dev->header); - u64 offset = obj_request->offset; - u64 length = obj_request->length; - u64 img_end; - u16 opcode; - - if (op_type == OBJ_OP_DISCARD) { - if (!offset && length == object_size && - (!img_request_layered_test(img_request) || - !obj_request_overlaps_parent(obj_request))) { - opcode = CEPH_OSD_OP_DELETE; - } else if ((offset + length == object_size)) { - opcode = CEPH_OSD_OP_TRUNCATE; - } else { - down_read(&rbd_dev->header_rwsem); - img_end = rbd_dev->header.image_size; - up_read(&rbd_dev->header_rwsem); - - if (obj_request->img_offset + length == img_end) - opcode = CEPH_OSD_OP_TRUNCATE; - else - opcode = CEPH_OSD_OP_ZERO; - } - } else if (op_type == OBJ_OP_WRITE) { - if (!offset && length == object_size) - opcode = CEPH_OSD_OP_WRITEFULL; - else - opcode = CEPH_OSD_OP_WRITE; - osd_req_op_alloc_hint_init(osd_request, num_ops, - object_size, object_size); - num_ops++; - } else { - opcode = CEPH_OSD_OP_READ; - } - - if (opcode == CEPH_OSD_OP_DELETE) - osd_req_op_init(osd_request, num_ops, opcode, 0); - else - osd_req_op_extent_init(osd_request, num_ops, opcode, - offset, length, 0, 0); - - if (obj_request->type == OBJ_REQUEST_BIO) - osd_req_op_extent_osd_data_bio(osd_request, num_ops, - &obj_request->bio_pos, length); - else if (obj_request->type == OBJ_REQUEST_BVECS) - osd_req_op_extent_osd_data_bvec_pos(osd_request, num_ops, - &obj_request->bvec_pos); - - /* Discards are also writes */ - if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) - rbd_osd_req_format_write(obj_request); - else - rbd_osd_req_format_read(obj_request); -} - static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) { 
switch (obj_req->type) { @@ -2568,366 +2302,6 @@ out_unwind: return -ENOMEM; } -static void -rbd_osd_copyup_callback(struct rbd_obj_request *obj_request) -{ - struct rbd_img_request *img_request; - struct rbd_device *rbd_dev; - - dout("%s: obj %p\n", __func__, obj_request); - - rbd_assert(obj_request->type == OBJ_REQUEST_BIO || - obj_request->type == OBJ_REQUEST_NODATA); - rbd_assert(obj_request_img_data_test(obj_request)); - img_request = obj_request->img_request; - rbd_assert(img_request); - - rbd_dev = img_request->rbd_dev; - rbd_assert(rbd_dev); - - /* - * We want the transfer count to reflect the size of the - * original write request. There is no such thing as a - * successful short write, so if the request was successful - * we can just set it to the originally-requested length. - */ - if (!obj_request->result) - obj_request->xferred = obj_request->length; - - obj_request_done_set(obj_request); -} - -static void -rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) -{ - struct rbd_obj_request *orig_request; - struct ceph_osd_request *osd_req; - struct rbd_device *rbd_dev; - enum obj_operation_type op_type; - int img_result; - u64 parent_length; - - rbd_assert(img_request_child_test(img_request)); - - /* First get what we need from the image request */ - - orig_request = img_request->obj_request; - rbd_assert(orig_request != NULL); - rbd_assert(obj_request_type_valid(orig_request->type)); - img_result = img_request->result; - parent_length = img_request->length; - rbd_assert(img_result || parent_length == img_request->xferred); - rbd_img_request_put(img_request); - - rbd_assert(orig_request->img_request); - rbd_dev = orig_request->img_request->rbd_dev; - rbd_assert(rbd_dev); - - /* - * If the overlap has become 0 (most likely because the - * image has been flattened) we need to free the pages - * and re-submit the original write request. 
- */ - if (!rbd_dev->parent_overlap) { - rbd_obj_request_submit(orig_request); - return; - } - - if (img_result) - goto out_err; - - /* - * The original osd request is of no use to use any more. - * We need a new one that can hold the three ops in a copyup - * request. Allocate the new copyup osd request for the - * original request, and release the old one. - */ - img_result = -ENOMEM; - osd_req = rbd_osd_req_create_copyup(orig_request); - if (!osd_req) - goto out_err; - rbd_osd_req_destroy(orig_request->osd_req); - orig_request->osd_req = osd_req; - - /* Initialize the copyup op */ - - osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); - osd_req_op_cls_request_data_bvecs(osd_req, 0, orig_request->copyup_bvecs, - parent_length); - - /* Add the other op(s) */ - - op_type = rbd_img_request_op_type(orig_request->img_request); - rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1); - - /* All set, send it off. */ - - rbd_obj_request_submit(orig_request); - return; - -out_err: - rbd_obj_request_error(orig_request, img_result); -} - -static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap); - -/* - * Read from the parent image the range of data that covers the - * entire target of the given object request. This is used for - * satisfying a layered image write request when the target of an - * object request from the image request does not exist. - * - * A page array big enough to hold the returned data is allocated - * and supplied to rbd_img_request_fill() as the "data descriptor." - * When the read completes, this page array will be transferred to - * the original object request for the copyup operation. - * - * If an error occurs, it is recorded as the result of the original - * object request in rbd_img_obj_exists_callback(). 
- */ -static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) -{ - struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; - struct rbd_img_request *parent_request = NULL; - struct ceph_bvec_iter bvec_it = { 0 }; - u64 img_offset; - u64 length; - int result; - - rbd_assert(rbd_dev->parent != NULL); - - /* - * Determine the byte range covered by the object in the - * child image to which the original request was to be sent. - */ - img_offset = obj_request->img_offset - obj_request->offset; - length = rbd_obj_bytes(&rbd_dev->header); - - /* - * There is no defined parent data beyond the parent - * overlap, so limit what we read at that boundary if - * necessary. - */ - if (img_offset + length > rbd_dev->parent_overlap) { - rbd_assert(img_offset < rbd_dev->parent_overlap); - length = rbd_dev->parent_overlap - img_offset; - } - - /* - * Allocate a page array big enough to receive the data read - * from the parent. - */ - result = setup_copyup_bvecs(obj_request, length); - if (result) - goto out_err; - - result = -ENOMEM; - parent_request = rbd_parent_request_create(obj_request, - img_offset, length); - if (!parent_request) - goto out_err; - - bvec_it.bvecs = obj_request->copyup_bvecs; - bvec_it.iter.bi_size = length; - result = rbd_img_request_fill(parent_request, OBJ_REQUEST_BVECS, - &bvec_it); - if (result) - goto out_err; - - parent_request->callback = rbd_img_obj_parent_read_full_callback; - - result = rbd_img_request_submit(parent_request); - if (!result) - return 0; - -out_err: - if (parent_request) - rbd_img_request_put(parent_request); - return result; -} - -static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) -{ - struct rbd_obj_request *orig_request; - struct rbd_device *rbd_dev; - int result; - - rbd_assert(!obj_request_img_data_test(obj_request)); - - /* - * All we need from the object request is the original - * request and the result of the STAT op. Grab those, then - * we're done with the request. 
- */ - orig_request = obj_request->obj_request; - obj_request->obj_request = NULL; - rbd_obj_request_put(orig_request); - rbd_assert(orig_request); - rbd_assert(orig_request->img_request); - - result = obj_request->result; - obj_request->result = 0; - - dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, - obj_request, orig_request, result, - obj_request->xferred, obj_request->length); - rbd_obj_request_put(obj_request); - - /* - * If the overlap has become 0 (most likely because the - * image has been flattened) we need to re-submit the - * original request. - */ - rbd_dev = orig_request->img_request->rbd_dev; - if (!rbd_dev->parent_overlap) { - rbd_obj_request_submit(orig_request); - return; - } - - /* - * Our only purpose here is to determine whether the object - * exists, and we don't want to treat the non-existence as - * an error. If something else comes back, transfer the - * error to the original request and complete it now. - */ - if (!result) { - obj_request_existence_set(orig_request, true); - } else if (result == -ENOENT) { - obj_request_existence_set(orig_request, false); - } else { - goto fail_orig_request; - } - - /* - * Resubmit the original request now that we have recorded - * whether the target object exists. 
- */ - result = rbd_img_obj_request_submit(orig_request); - if (result) - goto fail_orig_request; - - return; - -fail_orig_request: - rbd_obj_request_error(orig_request, result); -} - -static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) -{ - struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; - struct rbd_obj_request *stat_request; - struct page **pages; - int ret; - - stat_request = rbd_obj_request_create(OBJ_REQUEST_NODATA); - if (!stat_request) - return -ENOMEM; - - stat_request->object_no = obj_request->object_no; - - stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, - stat_request); - if (!stat_request->osd_req) { - ret = -ENOMEM; - goto fail_stat_request; - } - - /* - * The response data for a STAT call consists of: - * le64 length; - * struct { - * le32 tv_sec; - * le32 tv_nsec; - * } mtime; - */ - pages = ceph_alloc_page_vector(1, GFP_NOIO); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); - goto fail_stat_request; - } - - osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0); - osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, - 8 + sizeof(struct ceph_timespec), - 0, false, true); - - rbd_obj_request_get(obj_request); - stat_request->obj_request = obj_request; - stat_request->callback = rbd_img_obj_exists_callback; - - rbd_obj_request_submit(stat_request); - return 0; - -fail_stat_request: - rbd_obj_request_put(stat_request); - return ret; -} - -static bool img_obj_request_simple(struct rbd_obj_request *obj_request) -{ - struct rbd_img_request *img_request = obj_request->img_request; - struct rbd_device *rbd_dev = img_request->rbd_dev; - - /* Reads */ - if (!img_request_write_test(img_request) && - !img_request_discard_test(img_request)) - return true; - - /* Non-layered writes */ - if (!img_request_layered_test(img_request)) - return true; - - /* - * Layered writes outside of the parent overlap range don't - * share any data with the parent. 
- */ - if (!obj_request_overlaps_parent(obj_request)) - return true; - - /* - * Entire-object layered writes - we will overwrite whatever - * parent data there is anyway. - */ - if (!obj_request->offset && - obj_request->length == rbd_obj_bytes(&rbd_dev->header)) - return true; - - /* - * If the object is known to already exist, its parent data has - * already been copied. - */ - if (obj_request_known_test(obj_request) && - obj_request_exists_test(obj_request)) - return true; - - return false; -} - -static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) -{ - rbd_assert(obj_request_img_data_test(obj_request)); - rbd_assert(obj_request_type_valid(obj_request->type)); - rbd_assert(obj_request->img_request); - - if (img_obj_request_simple(obj_request)) { - rbd_obj_request_submit(obj_request); - return 0; - } - - /* - * It's a layered write. The target object might exist but - * we may not know that yet. If we know it doesn't exist, - * start by reading the data for the full target object from - * the parent so we can use it for a copyup to the target. - */ - if (obj_request_known_test(obj_request)) - return rbd_img_obj_parent_read_full(obj_request); - - /* We don't know whether the target exists. Go find out. 
*/ - - return rbd_img_obj_exists_submit(obj_request); -} - static int rbd_img_request_submit(struct rbd_img_request *img_request) { struct rbd_obj_request *obj_request; @@ -3131,106 +2505,6 @@ static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) return 0; } -static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) -{ - struct rbd_obj_request *obj_request; - struct rbd_device *rbd_dev; - u64 obj_end; - u64 img_xferred; - int img_result; - - rbd_assert(img_request_child_test(img_request)); - - /* First get what we need from the image request and release it */ - - obj_request = img_request->obj_request; - img_xferred = img_request->xferred; - img_result = img_request->result; - rbd_img_request_put(img_request); - - /* - * If the overlap has become 0 (most likely because the - * image has been flattened) we need to re-submit the - * original request. - */ - rbd_assert(obj_request); - rbd_assert(obj_request->img_request); - rbd_dev = obj_request->img_request->rbd_dev; - if (!rbd_dev->parent_overlap) { - rbd_obj_request_submit(obj_request); - return; - } - - obj_request->result = img_result; - if (obj_request->result) - goto out; - - /* - * We need to zero anything beyond the parent overlap - * boundary. Since rbd_img_obj_request_read_callback() - * will zero anything beyond the end of a short read, an - * easy way to do this is to pretend the data from the - * parent came up short--ending at the overlap boundary. 
- */ - rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); - obj_end = obj_request->img_offset + obj_request->length; - if (obj_end > rbd_dev->parent_overlap) { - u64 xferred = 0; - - if (obj_request->img_offset < rbd_dev->parent_overlap) - xferred = rbd_dev->parent_overlap - - obj_request->img_offset; - - obj_request->xferred = min(img_xferred, xferred); - } else { - obj_request->xferred = img_xferred; - } -out: - rbd_img_obj_request_read_callback(obj_request); - rbd_obj_request_complete(obj_request); -} - -static void rbd_img_parent_read(struct rbd_obj_request *obj_request) -{ - struct rbd_img_request *img_request; - int result; - - rbd_assert(obj_request_img_data_test(obj_request)); - rbd_assert(obj_request->img_request != NULL); - rbd_assert(obj_request->result == (s32) -ENOENT); - rbd_assert(obj_request_type_valid(obj_request->type)); - - /* rbd_read_finish(obj_request, obj_request->length); */ - img_request = rbd_parent_request_create(obj_request, - obj_request->img_offset, - obj_request->length); - result = -ENOMEM; - if (!img_request) - goto out_err; - - if (obj_request->type == OBJ_REQUEST_BIO) - result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, - &obj_request->bio_pos); - else - result = rbd_img_request_fill(img_request, OBJ_REQUEST_BVECS, - &obj_request->bvec_pos); - if (result) - goto out_err; - - img_request->callback = rbd_img_parent_read_callback; - result = rbd_img_request_submit(img_request); - if (result) - goto out_err; - - return; -out_err: - if (img_request) - rbd_img_request_put(img_request); - obj_request->result = result; - obj_request->xferred = 0; - obj_request_done_set(obj_request); -} - static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) { struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; -- cgit v1.1 From a162b308dc30ddeb848a1445534f5b04e41e1ed5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 30 Jan 2018 17:52:10 +0100 Subject: rbd: simplify rbd_osd_req_create() No need to 
pass rbd_dev and op_type to rbd_osd_req_create(): there are no standalone (!IMG_DATA) object requests anymore and osd_req->r_flags can be set in rbd_osd_req_format_{read,write}(). Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 59 +++++++++++++---------------------------------------- 1 file changed, 14 insertions(+), 45 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e7e99e7..c426092 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1629,6 +1629,7 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) struct ceph_osd_request *osd_req = obj_request->osd_req; rbd_assert(obj_request_img_data_test(obj_request)); + osd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req->r_snapid = obj_request->img_request->snap_id; } @@ -1636,32 +1637,33 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) { struct ceph_osd_request *osd_req = obj_request->osd_req; + osd_req->r_flags = CEPH_OSD_FLAG_WRITE; ktime_get_real_ts(&osd_req->r_mtime); osd_req->r_data_offset = obj_request->offset; } static struct ceph_osd_request * -__rbd_osd_req_create(struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - int num_ops, unsigned int flags, - struct rbd_obj_request *obj_request) +rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops) { + struct rbd_img_request *img_req = obj_req->img_request; + struct rbd_device *rbd_dev = img_req->rbd_dev; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct ceph_osd_request *req; const char *name_format = rbd_dev->image_format == 1 ? RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; - req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); + req = ceph_osdc_alloc_request(osdc, + (rbd_img_is_write(img_req) ? 
img_req->snapc : NULL), + num_ops, false, GFP_NOIO); if (!req) return NULL; - req->r_flags = flags; req->r_callback = rbd_osd_req_callback; - req->r_priv = obj_request; + req->r_priv = obj_req; req->r_base_oloc.pool = rbd_dev->layout.pool_id; if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, - rbd_dev->header.object_prefix, obj_request->object_no)) + rbd_dev->header.object_prefix, obj_req->object_no)) goto err_req; if (ceph_osdc_alloc_messages(req, GFP_NOIO)) @@ -1674,30 +1676,6 @@ err_req: return NULL; } -static struct ceph_osd_request *rbd_osd_req_create( - struct rbd_device *rbd_dev, - enum obj_operation_type op_type, - unsigned int num_ops, - struct rbd_obj_request *obj_request) -{ - struct ceph_snap_context *snapc = NULL; - - if (obj_request_img_data_test(obj_request) && - (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { - struct rbd_img_request *img_request = obj_request->img_request; - if (op_type == OBJ_OP_WRITE) { - rbd_assert(img_request_write_test(img_request)); - } else { - rbd_assert(img_request_discard_test(img_request)); - } - snapc = img_request->snapc; - } - - return __rbd_osd_req_create(rbd_dev, snapc, num_ops, - (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ? 
- CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request); -} - static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) { ceph_osdc_put_request(osd_req); @@ -2039,9 +2017,7 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) static int rbd_obj_setup_read(struct rbd_obj_request *obj_req) { - struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; - - obj_req->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, obj_req); + obj_req->osd_req = rbd_osd_req_create(obj_req, 1); if (!obj_req->osd_req) return -ENOMEM; @@ -2102,7 +2078,6 @@ static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req, static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) { - struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; unsigned int num_osd_ops, which = 0; int ret; @@ -2114,8 +2089,7 @@ static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) num_osd_ops = 2; /* setallochint + write/writefull */ } - obj_req->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, - num_osd_ops, obj_req); + obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); if (!obj_req->osd_req) return -ENOMEM; @@ -2159,7 +2133,6 @@ static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req, static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) { - struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; unsigned int num_osd_ops, which = 0; int ret; @@ -2176,8 +2149,7 @@ static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) } } - obj_req->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_DISCARD, - num_osd_ops, obj_req); + obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); if (!obj_req->osd_req) return -ENOMEM; @@ -2426,7 +2398,6 @@ static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) { - struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; unsigned int num_osd_ops = obj_req->osd_req->r_num_ops; dout("%s 
obj_req %p bytes %u\n", __func__, obj_req, bytes); @@ -2438,9 +2409,7 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) * the original request. The original request was stat + op(s), * the new copyup request will be copyup + the same op(s). */ - obj_req->osd_req = rbd_osd_req_create(rbd_dev, - rbd_img_request_op_type(obj_req->img_request), - num_osd_ops, obj_req); + obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); if (!obj_req->osd_req) return -ENOMEM; -- cgit v1.1 From 9bb0248d9eb9438b991ba538e30eedb493cf1fb4 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 30 Jan 2018 17:52:10 +0100 Subject: rbd: add img_req->op_type field Store op_type in its own field instead of packing it into flags. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 75 +++++++++-------------------------------------------- 1 file changed, 12 insertions(+), 63 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c426092..e542fda 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -302,14 +302,13 @@ struct rbd_obj_request { }; enum img_req_flags { - IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ - IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ }; struct rbd_img_request { struct rbd_device *rbd_dev; + enum obj_operation_type op_type; u64 offset; /* starting image byte offset */ u64 length; /* byte count from offset */ unsigned long flags; @@ -1490,33 +1489,6 @@ static void rbd_img_request_complete(struct rbd_img_request *img_request) * is conditionally set to 1 at image request initialization time * and currently never change thereafter. 
*/ -static void img_request_write_set(struct rbd_img_request *img_request) -{ - set_bit(IMG_REQ_WRITE, &img_request->flags); - smp_mb(); -} - -static bool img_request_write_test(struct rbd_img_request *img_request) -{ - smp_mb(); - return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; -} - -/* - * Set the discard flag when the img_request is an discard request - */ -static void img_request_discard_set(struct rbd_img_request *img_request) -{ - set_bit(IMG_REQ_DISCARD, &img_request->flags); - smp_mb(); -} - -static bool img_request_discard_test(struct rbd_img_request *img_request) -{ - smp_mb(); - return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; -} - static void img_request_child_set(struct rbd_img_request *img_request) { set_bit(IMG_REQ_CHILD, &img_request->flags); @@ -1553,17 +1525,6 @@ static bool img_request_layered_test(struct rbd_img_request *img_request) return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; } -static enum obj_operation_type -rbd_img_request_op_type(struct rbd_img_request *img_request) -{ - if (img_request_write_test(img_request)) - return OBJ_OP_WRITE; - else if (img_request_discard_test(img_request)) - return OBJ_OP_DISCARD; - else - return OBJ_OP_READ; -} - static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req) { struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; @@ -1582,7 +1543,7 @@ static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) static bool rbd_img_is_write(struct rbd_img_request *img_req) { - switch (rbd_img_request_op_type(img_req)) { + switch (img_req->op_type) { case OBJ_OP_READ: return false; case OBJ_OP_WRITE: @@ -1816,17 +1777,14 @@ static struct rbd_img_request *rbd_img_request_create( return NULL; img_request->rbd_dev = rbd_dev; + img_request->op_type = op_type; img_request->offset = offset; img_request->length = length; - if (op_type == OBJ_OP_DISCARD) { - img_request_discard_set(img_request); - img_request->snapc = snapc; - } else if (op_type == OBJ_OP_WRITE) { - 
img_request_write_set(img_request); - img_request->snapc = snapc; - } else { + if (!rbd_img_is_write(img_request)) img_request->snap_id = rbd_dev->spec->snap_id; - } + else + img_request->snapc = snapc; + if (rbd_dev_parent_get(rbd_dev)) img_request_layered_set(img_request); @@ -1859,8 +1817,7 @@ static void rbd_img_request_destroy(struct kref *kref) rbd_dev_parent_put(img_request->rbd_dev); } - if (img_request_write_test(img_request) || - img_request_discard_test(img_request)) + if (rbd_img_is_write(img_request)) ceph_put_snap_context(img_request->snapc); kmem_cache_free(rbd_img_request_cache, img_request); @@ -1918,17 +1875,9 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) result = obj_request->result; if (result) { struct rbd_device *rbd_dev = img_request->rbd_dev; - enum obj_operation_type op_type; - - if (img_request_discard_test(img_request)) - op_type = OBJ_OP_DISCARD; - else if (img_request_write_test(img_request)) - op_type = OBJ_OP_WRITE; - else - op_type = OBJ_OP_READ; rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", - obj_op_name(op_type), obj_request->length, + obj_op_name(img_request->op_type), obj_request->length, obj_request->img_offset, obj_request->offset); rbd_warn(rbd_dev, " result %d xferred %x", result, xferred); @@ -2175,7 +2124,7 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req) int ret; for_each_obj_request(img_req, obj_req) { - switch (rbd_img_request_op_type(img_req)) { + switch (img_req->op_type) { case OBJ_OP_READ: ret = rbd_obj_setup_read(obj_req); break; @@ -2428,7 +2377,7 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0, obj_req->copyup_bvecs, bytes); - switch (rbd_img_request_op_type(obj_req->img_request)) { + switch (obj_req->img_request->op_type) { case OBJ_OP_WRITE: __rbd_obj_setup_write(obj_req, 1); break; @@ -2572,7 +2521,7 @@ again: */ static bool __rbd_obj_handle_request(struct rbd_obj_request 
*obj_req) { - switch (rbd_img_request_op_type(obj_req->img_request)) { + switch (obj_req->img_request->op_type) { case OBJ_OP_READ: return rbd_obj_handle_read(obj_req); case OBJ_OP_WRITE: -- cgit v1.1 From efbd1a1106f15f9260c7cb9a67f5c380a39b4fcc Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 30 Jan 2018 17:52:11 +0100 Subject: rbd: update rbd_img_request_submit() signature It should be void now. Also, object requests are unlinked only in image request destructor, which can't run before rbd_img_request_put(), so no need for _safe. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e542fda..015bd53 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2223,21 +2223,17 @@ out_unwind: return -ENOMEM; } -static int rbd_img_request_submit(struct rbd_img_request *img_request) +static void rbd_img_request_submit(struct rbd_img_request *img_request) { struct rbd_obj_request *obj_request; - struct rbd_obj_request *next_obj_request; - int ret = 0; dout("%s: img %p\n", __func__, img_request); rbd_img_request_get(img_request); - for_each_obj_request_safe(img_request, obj_request, next_obj_request) { + for_each_obj_request(img_request, obj_request) rbd_obj_request_submit(obj_request); - } rbd_img_request_put(img_request); - return ret; } static void rbd_img_end_child_request(struct rbd_img_request *img_req); @@ -3668,10 +3664,7 @@ static void rbd_queue_workfn(struct work_struct *work) if (result) goto err_img_request; - result = rbd_img_request_submit(img_request); - if (result) - goto err_img_request; - + rbd_img_request_submit(img_request); if (must_be_locked) up_read(&rbd_dev->lock_rwsem); return; -- cgit v1.1 From 7114edac357b8cc27cf95a4d7eed75d07c41970d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 1 Feb 2018 11:50:47 +0100 Subject: rbd: new request completion code Do away with partial 
request completions and all the associated complexity. Individual object requests no longer need to be completed in order -- when the last one becomes ready, we complete the entire higher level request all at once. This also wraps up the conversion to a state machine model and eliminates the recursion described in commit 6d69bb536bac ("rbd: prevent kernel stack blow up on rbd map"). Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 68 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 13 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 015bd53..2eb0abd 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -327,6 +327,7 @@ struct rbd_img_request { int result; /* first nonzero obj_request result */ u32 obj_request_count; + u32 pending_count; struct list_head obj_requests; /* rbd_obj_request structs */ struct kref kref; @@ -1406,6 +1407,7 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, obj_request_img_data_set(obj_request); rbd_assert(obj_request->which != BAD_WHICH); img_request->obj_request_count++; + img_request->pending_count++; list_add_tail(&obj_request->links, &img_request->obj_requests); dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, obj_request->which); @@ -1451,10 +1453,6 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__, obj_request, obj_request->object_no, obj_request->offset, obj_request->length, osd_req); - if (obj_request_img_data_test(obj_request)) { - WARN_ON(obj_request->callback != rbd_img_obj_callback); - rbd_img_request_get(obj_request->img_request); - } ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); } @@ -2236,8 +2234,6 @@ static void rbd_img_request_submit(struct rbd_img_request *img_request) rbd_img_request_put(img_request); } -static void rbd_img_end_child_request(struct 
rbd_img_request *img_req); - static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req, u64 img_offset, u32 bytes) { @@ -2249,8 +2245,6 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req, if (!child_img_req) return -ENOMEM; - child_img_req->callback = rbd_img_end_child_request; - if (!rbd_img_is_write(img_req)) { switch (obj_req->type) { case OBJ_REQUEST_BIO: @@ -2386,8 +2380,6 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) } rbd_obj_request_submit(obj_req); - /* FIXME: in lieu of rbd_img_obj_callback() */ - rbd_img_request_put(obj_req->img_request); return 0; } @@ -2540,6 +2532,29 @@ static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req) } } +static void rbd_obj_end_request(struct rbd_obj_request *obj_req) +{ + struct rbd_img_request *img_req = obj_req->img_request; + + rbd_assert((!obj_req->result && + obj_req->xferred == obj_req->length) || + (obj_req->result < 0 && !obj_req->xferred)); + if (!obj_req->result) { + img_req->xferred += obj_req->xferred; + return; + } + + rbd_warn(img_req->rbd_dev, + "%s at objno %llu %llu~%llu result %d xferred %llu", + obj_op_name(img_req->op_type), obj_req->object_no, + obj_req->offset, obj_req->length, obj_req->result, + obj_req->xferred); + if (!img_req->result) { + img_req->result = obj_req->result; + img_req->xferred = 0; + } +} + static void rbd_img_end_child_request(struct rbd_img_request *img_req) { struct rbd_obj_request *obj_req = img_req->obj_request; @@ -2549,17 +2564,44 @@ static void rbd_img_end_child_request(struct rbd_img_request *img_req) obj_req->result = img_req->result; obj_req->xferred = img_req->xferred; rbd_img_request_put(img_req); +} - rbd_obj_handle_request(obj_req); +static void rbd_img_end_request(struct rbd_img_request *img_req) +{ + rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags)); + rbd_assert((!img_req->result && + img_req->xferred == blk_rq_bytes(img_req->rq)) || + (img_req->result < 0 && !img_req->xferred)); + + 
blk_mq_end_request(img_req->rq, + errno_to_blk_status(img_req->result)); + rbd_img_request_put(img_req); } static void rbd_obj_handle_request(struct rbd_obj_request *obj_req) { + struct rbd_img_request *img_req; + +again: if (!__rbd_obj_handle_request(obj_req)) return; - obj_request_done_set(obj_req); - rbd_obj_request_complete(obj_req); + img_req = obj_req->img_request; + spin_lock(&img_req->completion_lock); + rbd_obj_end_request(obj_req); + rbd_assert(img_req->pending_count); + if (--img_req->pending_count) { + spin_unlock(&img_req->completion_lock); + return; + } + + spin_unlock(&img_req->completion_lock); + if (test_bit(IMG_REQ_CHILD, &img_req->flags)) { + obj_req = img_req->obj_request; + rbd_img_end_child_request(img_req); + goto again; + } + rbd_img_end_request(img_req); } static const struct rbd_client_id rbd_empty_cid; -- cgit v1.1 From 15961b44947d9d53bfec0a89b5ebbcf30afeb6ac Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 1 Feb 2018 11:50:47 +0100 Subject: rbd: remove old request completion code Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 175 +--------------------------------------------------- 1 file changed, 3 insertions(+), 172 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2eb0abd..959aa95 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -209,12 +209,6 @@ struct rbd_client { }; struct rbd_img_request; -typedef void (*rbd_img_callback_t)(struct rbd_img_request *); - -#define BAD_WHICH U32_MAX /* Good which or bad which, which? 
*/ - -struct rbd_obj_request; -typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); enum obj_request_type { OBJ_REQUEST_NODATA = 1, @@ -229,7 +223,6 @@ enum obj_operation_type { }; enum obj_req_flags { - OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ }; @@ -268,17 +261,11 @@ struct rbd_obj_request { /* * An object request associated with an image will have its * img_data flag set; a standalone object request will not. - * - * Finally, an object request for rbd image data will have - * which != BAD_WHICH, and will have a non-null img_request - * pointer. The value of which will be in the range - * 0..(img_request->obj_request_count-1). */ struct rbd_img_request *img_request; u64 img_offset; /* links for img_request->obj_requests list */ struct list_head links; - u32 which; /* posn image request list */ enum obj_request_type type; union { @@ -296,8 +283,6 @@ struct rbd_obj_request { u64 xferred; /* bytes transferred */ int result; - rbd_obj_callback_t callback; - struct kref kref; }; @@ -320,9 +305,7 @@ struct rbd_img_request { struct request *rq; /* block request */ struct rbd_obj_request *obj_request; /* obj req initiator */ }; - spinlock_t completion_lock;/* protects next_completion */ - u32 next_completion; - rbd_img_callback_t callback; + spinlock_t completion_lock; u64 xferred;/* aggregate bytes transferred */ int result; /* first nonzero obj_request result */ @@ -335,8 +318,6 @@ struct rbd_img_request { #define for_each_obj_request(ireq, oreq) \ list_for_each_entry(oreq, &(ireq)->obj_requests, links) -#define for_each_obj_request_from(ireq, oreq) \ - list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) #define for_each_obj_request_safe(ireq, oreq, n) \ list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) @@ -1332,24 +1313,6 @@ static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) return test_bit(OBJ_REQ_IMG_DATA, 
&obj_request->flags) != 0; } -static void obj_request_done_set(struct rbd_obj_request *obj_request) -{ - if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { - struct rbd_device *rbd_dev = NULL; - - if (obj_request_img_data_test(obj_request)) - rbd_dev = obj_request->img_request->rbd_dev; - rbd_warn(rbd_dev, "obj_request %p already marked done", - obj_request); - } -} - -static bool obj_request_done_test(struct rbd_obj_request *obj_request) -{ - smp_mb(); - return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; -} - static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) { struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; @@ -1402,33 +1365,24 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, /* Image request now owns object's original reference */ obj_request->img_request = img_request; - obj_request->which = img_request->obj_request_count; rbd_assert(!obj_request_img_data_test(obj_request)); obj_request_img_data_set(obj_request); - rbd_assert(obj_request->which != BAD_WHICH); img_request->obj_request_count++; img_request->pending_count++; list_add_tail(&obj_request->links, &img_request->obj_requests); - dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, - obj_request->which); + dout("%s: img %p obj %p\n", __func__, img_request, obj_request); } static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, struct rbd_obj_request *obj_request) { - rbd_assert(obj_request->which != BAD_WHICH); - - dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, - obj_request->which); + dout("%s: img %p obj %p\n", __func__, img_request, obj_request); list_del(&obj_request->links); rbd_assert(img_request->obj_request_count > 0); img_request->obj_request_count--; - rbd_assert(obj_request->which == img_request->obj_request_count); - obj_request->which = BAD_WHICH; rbd_assert(obj_request_img_data_test(obj_request)); rbd_assert(obj_request->img_request == 
img_request); obj_request->img_request = NULL; - obj_request->callback = NULL; rbd_obj_request_put(obj_request); } @@ -1444,8 +1398,6 @@ static bool obj_request_type_valid(enum obj_request_type type) } } -static void rbd_img_obj_callback(struct rbd_obj_request *obj_request); - static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) { struct ceph_osd_request *osd_req = obj_request->osd_req; @@ -1456,32 +1408,6 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); } -static void rbd_img_request_complete(struct rbd_img_request *img_request) -{ - - dout("%s: img %p\n", __func__, img_request); - - /* - * If no error occurred, compute the aggregate transfer - * count for the image request. We could instead use - * atomic64_cmpxchg() to update it as each object request - * completes; not clear which way is better off hand. - */ - if (!img_request->result) { - struct rbd_obj_request *obj_request; - u64 xferred = 0; - - for_each_obj_request(img_request, obj_request) - xferred += obj_request->xferred; - img_request->xferred = xferred; - } - - if (img_request->callback) - img_request->callback(img_request); - else - rbd_img_request_put(img_request); -} - /* * The default/initial value for all image request flags is 0. 
Each * is conditionally set to 1 at image request initialization time @@ -1552,13 +1478,6 @@ static bool rbd_img_is_write(struct rbd_img_request *img_req) } } -static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) -{ - dout("%s: obj %p cb %p\n", __func__, obj_request, - obj_request->callback); - obj_request->callback(obj_request); -} - static void rbd_obj_handle_request(struct rbd_obj_request *obj_req); static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) @@ -1651,7 +1570,6 @@ rbd_obj_request_create(enum obj_request_type type) if (!obj_request) return NULL; - obj_request->which = BAD_WHICH; obj_request->type = type; INIT_LIST_HEAD(&obj_request->links); kref_init(&obj_request->kref); @@ -1670,7 +1588,6 @@ static void rbd_obj_request_destroy(struct kref *kref) dout("%s: obj %p\n", __func__, obj_request); rbd_assert(obj_request->img_request == NULL); - rbd_assert(obj_request->which == BAD_WHICH); if (obj_request->osd_req) rbd_osd_req_destroy(obj_request->osd_req); @@ -1858,91 +1775,6 @@ static void rbd_parent_request_destroy(struct kref *kref) rbd_img_request_destroy(kref); } -static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) -{ - struct rbd_img_request *img_request; - unsigned int xferred; - int result; - bool more; - - rbd_assert(obj_request_img_data_test(obj_request)); - img_request = obj_request->img_request; - - rbd_assert(obj_request->xferred <= (u64)UINT_MAX); - xferred = (unsigned int)obj_request->xferred; - result = obj_request->result; - if (result) { - struct rbd_device *rbd_dev = img_request->rbd_dev; - - rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", - obj_op_name(img_request->op_type), obj_request->length, - obj_request->img_offset, obj_request->offset); - rbd_warn(rbd_dev, " result %d xferred %x", - result, xferred); - if (!img_request->result) - img_request->result = result; - /* - * Need to end I/O on the entire obj_request worth of - * bytes in case of error. 
- */ - xferred = obj_request->length; - } - - if (img_request_child_test(img_request)) { - rbd_assert(img_request->obj_request != NULL); - more = obj_request->which < img_request->obj_request_count - 1; - } else { - blk_status_t status = errno_to_blk_status(result); - - rbd_assert(img_request->rq != NULL); - - more = blk_update_request(img_request->rq, status, xferred); - if (!more) - __blk_mq_end_request(img_request->rq, status); - } - - return more; -} - -static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) -{ - struct rbd_img_request *img_request; - u32 which = obj_request->which; - bool more = true; - - rbd_assert(obj_request_img_data_test(obj_request)); - img_request = obj_request->img_request; - - dout("%s: img %p obj %p\n", __func__, img_request, obj_request); - rbd_assert(img_request != NULL); - rbd_assert(img_request->obj_request_count > 0); - rbd_assert(which != BAD_WHICH); - rbd_assert(which < img_request->obj_request_count); - - spin_lock_irq(&img_request->completion_lock); - if (which != img_request->next_completion) - goto out; - - for_each_obj_request_from(img_request, obj_request) { - rbd_assert(more); - rbd_assert(which < img_request->obj_request_count); - - if (!obj_request_done_test(obj_request)) - break; - more = rbd_img_obj_end_request(obj_request); - which++; - } - - rbd_assert(more ^ (which == img_request->obj_request_count)); - img_request->next_completion = which; -out: - spin_unlock_irq(&img_request->completion_lock); - rbd_img_request_put(img_request); - - if (!more) - rbd_img_request_complete(img_request); -} - static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) { switch (obj_req->type) { @@ -2205,7 +2037,6 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, ceph_bvec_iter_advance(&bvec_it, length); } - obj_request->callback = rbd_img_obj_callback; obj_request->img_offset = img_offset; img_offset += length; -- cgit v1.1 From 0be2d60ed888a25016a05148e52feea4bf401b0e Mon Sep 
17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 1 Feb 2018 11:50:47 +0100 Subject: rbd: remove obj_req->flags field There are no standalone (!IMG_DATA) object requests anymore. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 959aa95..7ec4d14 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -222,10 +222,6 @@ enum obj_operation_type { OBJ_OP_DISCARD, }; -enum obj_req_flags { - OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ -}; - /* * Writes go through the following state machine to deal with * layering: @@ -252,16 +248,11 @@ struct rbd_obj_request { u64 object_no; u64 offset; /* object start byte */ u64 length; /* bytes from offset */ - unsigned long flags; union { bool tried_parent; /* for reads */ enum rbd_obj_write_state write_state; /* for writes */ }; - /* - * An object request associated with an image will have its - * img_data flag set; a standalone object request will not. - */ struct rbd_img_request *img_request; u64 img_offset; /* links for img_request->obj_requests list */ @@ -1291,28 +1282,6 @@ static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, } } -/* - * The default/initial value for all object request flags is 0. For - * each flag, once its value is set to 1 it is never reset to 0 - * again. 
- */ -static void obj_request_img_data_set(struct rbd_obj_request *obj_request) -{ - if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { - struct rbd_device *rbd_dev; - - rbd_dev = obj_request->img_request->rbd_dev; - rbd_warn(rbd_dev, "obj_request %p already marked img_data", - obj_request); - } -} - -static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) -{ - smp_mb(); - return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; -} - static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) { struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; @@ -1365,8 +1334,6 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, /* Image request now owns object's original reference */ obj_request->img_request = img_request; - rbd_assert(!obj_request_img_data_test(obj_request)); - obj_request_img_data_set(obj_request); img_request->obj_request_count++; img_request->pending_count++; list_add_tail(&obj_request->links, &img_request->obj_requests); @@ -1380,7 +1347,6 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, list_del(&obj_request->links); rbd_assert(img_request->obj_request_count > 0); img_request->obj_request_count--; - rbd_assert(obj_request_img_data_test(obj_request)); rbd_assert(obj_request->img_request == img_request); obj_request->img_request = NULL; rbd_obj_request_put(obj_request); @@ -1506,7 +1472,6 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) { struct ceph_osd_request *osd_req = obj_request->osd_req; - rbd_assert(obj_request_img_data_test(obj_request)); osd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req->r_snapid = obj_request->img_request->snap_id; } -- cgit v1.1 From ecc633caebcc84a1469892e3f6f6f4b6a16f41af Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 1 Feb 2018 11:50:47 +0100 Subject: rbd: store data_type in img_req instead of obj_req All object requests are associated with an image request now -- 
avoid duplicating the same info in each object request. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7ec4d14..6ce9e0b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -258,7 +258,6 @@ struct rbd_obj_request { /* links for img_request->obj_requests list */ struct list_head links; - enum obj_request_type type; union { struct ceph_bio_iter bio_pos; struct { @@ -285,6 +284,7 @@ enum img_req_flags { struct rbd_img_request { struct rbd_device *rbd_dev; enum obj_operation_type op_type; + enum obj_request_type data_type; u64 offset; /* starting image byte offset */ u64 length; /* byte count from offset */ unsigned long flags; @@ -1270,7 +1270,7 @@ static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, u32 bytes) { - switch (obj_req->type) { + switch (obj_req->img_request->data_type) { case OBJ_REQUEST_BIO: zero_bios(&obj_req->bio_pos, off, bytes); break; @@ -1348,22 +1348,9 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, rbd_assert(img_request->obj_request_count > 0); img_request->obj_request_count--; rbd_assert(obj_request->img_request == img_request); - obj_request->img_request = NULL; rbd_obj_request_put(obj_request); } -static bool obj_request_type_valid(enum obj_request_type type) -{ - switch (type) { - case OBJ_REQUEST_NODATA: - case OBJ_REQUEST_BIO: - case OBJ_REQUEST_BVECS: - return true; - default: - return false; - } -} - static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) { struct ceph_osd_request *osd_req = obj_request->osd_req; @@ -1524,18 +1511,14 @@ static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) ceph_osdc_put_request(osd_req); } -static struct rbd_obj_request * -rbd_obj_request_create(enum 
obj_request_type type) +static struct rbd_obj_request *rbd_obj_request_create(void) { struct rbd_obj_request *obj_request; - rbd_assert(obj_request_type_valid(type)); - obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); if (!obj_request) return NULL; - obj_request->type = type; INIT_LIST_HEAD(&obj_request->links); kref_init(&obj_request->kref); @@ -1552,12 +1535,10 @@ static void rbd_obj_request_destroy(struct kref *kref) dout("%s: obj %p\n", __func__, obj_request); - rbd_assert(obj_request->img_request == NULL); - if (obj_request->osd_req) rbd_osd_req_destroy(obj_request->osd_req); - switch (obj_request->type) { + switch (obj_request->img_request->data_type) { case OBJ_REQUEST_NODATA: case OBJ_REQUEST_BIO: case OBJ_REQUEST_BVECS: @@ -1742,7 +1723,7 @@ static void rbd_parent_request_destroy(struct kref *kref) static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) { - switch (obj_req->type) { + switch (obj_req->img_request->data_type) { case OBJ_REQUEST_BIO: osd_req_op_extent_osd_data_bio(obj_req->osd_req, which, &obj_req->bio_pos, @@ -1979,7 +1960,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, u64 offset = rbd_segment_offset(rbd_dev, img_offset); u64 length = rbd_segment_length(rbd_dev, img_offset, resid); - obj_request = rbd_obj_request_create(type); + obj_request = rbd_obj_request_create(); if (!obj_request) goto out_unwind; @@ -2008,6 +1989,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, resid -= length; } + img_request->data_type = type; return __rbd_img_fill_request(img_request); out_unwind: @@ -2042,7 +2024,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req, return -ENOMEM; if (!rbd_img_is_write(img_req)) { - switch (obj_req->type) { + switch (img_req->data_type) { case OBJ_REQUEST_BIO: ret = rbd_img_request_fill(child_img_req, OBJ_REQUEST_BIO, -- cgit v1.1 From 43df3d35c0a558e461a1d7b3f0b21f5c43a5955f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov 
Date: Fri, 2 Feb 2018 15:23:22 +0100 Subject: rbd: incorporate ceph_object_extent obj_req->object_no -> obj_req->ex.oe_objno obj_req->offset -> obj_req->ex.oe_off obj_req->length -> obj_req->ex.oe_len ... and use ex for linking object requests to image requests. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 71 +++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 37 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6ce9e0b..568e974 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -245,9 +246,7 @@ enum rbd_obj_write_state { }; struct rbd_obj_request { - u64 object_no; - u64 offset; /* object start byte */ - u64 length; /* bytes from offset */ + struct ceph_object_extent ex; union { bool tried_parent; /* for reads */ enum rbd_obj_write_state write_state; /* for writes */ @@ -255,8 +254,6 @@ struct rbd_obj_request { struct rbd_img_request *img_request; u64 img_offset; - /* links for img_request->obj_requests list */ - struct list_head links; union { struct ceph_bio_iter bio_pos; @@ -300,17 +297,17 @@ struct rbd_img_request { u64 xferred;/* aggregate bytes transferred */ int result; /* first nonzero obj_request result */ + struct list_head object_extents; /* obj_req.ex structs */ u32 obj_request_count; u32 pending_count; - struct list_head obj_requests; /* rbd_obj_request structs */ struct kref kref; }; #define for_each_obj_request(ireq, oreq) \ - list_for_each_entry(oreq, &(ireq)->obj_requests, links) + list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item) #define for_each_obj_request_safe(ireq, oreq, n) \ - list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) + list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item) enum rbd_watch_state { RBD_WATCH_STATE_UNREGISTERED, @@ -1336,7 +1333,7 @@ static inline void 
rbd_img_obj_request_add(struct rbd_img_request *img_request, obj_request->img_request = img_request; img_request->obj_request_count++; img_request->pending_count++; - list_add_tail(&obj_request->links, &img_request->obj_requests); + list_add_tail(&obj_request->ex.oe_item, &img_request->object_extents); dout("%s: img %p obj %p\n", __func__, img_request, obj_request); } @@ -1344,7 +1341,7 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, struct rbd_obj_request *obj_request) { dout("%s: img %p obj %p\n", __func__, img_request, obj_request); - list_del(&obj_request->links); + list_del(&obj_request->ex.oe_item); rbd_assert(img_request->obj_request_count > 0); img_request->obj_request_count--; rbd_assert(obj_request->img_request == img_request); @@ -1356,8 +1353,8 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) struct ceph_osd_request *osd_req = obj_request->osd_req; dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__, - obj_request, obj_request->object_no, obj_request->offset, - obj_request->length, osd_req); + obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off, + obj_request->ex.oe_len, osd_req); ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); } @@ -1406,15 +1403,15 @@ static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req) { struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; - return !obj_req->offset && - obj_req->length == rbd_dev->layout.object_size; + return !obj_req->ex.oe_off && + obj_req->ex.oe_len == rbd_dev->layout.object_size; } static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) { struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; - return obj_req->offset + obj_req->length == + return obj_req->ex.oe_off + obj_req->ex.oe_len == rbd_dev->layout.object_size; } @@ -1469,7 +1466,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) osd_req->r_flags = CEPH_OSD_FLAG_WRITE; ktime_get_real_ts(&osd_req->r_mtime); - 
osd_req->r_data_offset = obj_request->offset; + osd_req->r_data_offset = obj_request->ex.oe_off; } static struct ceph_osd_request * @@ -1493,7 +1490,7 @@ rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops) req->r_base_oloc.pool = rbd_dev->layout.pool_id; if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, - rbd_dev->header.object_prefix, obj_req->object_no)) + rbd_dev->header.object_prefix, obj_req->ex.oe_objno)) goto err_req; if (ceph_osdc_alloc_messages(req, GFP_NOIO)) @@ -1519,7 +1516,7 @@ static struct rbd_obj_request *rbd_obj_request_create(void) if (!obj_request) return NULL; - INIT_LIST_HEAD(&obj_request->links); + ceph_object_extent_init(&obj_request->ex); kref_init(&obj_request->kref); dout("%s %p\n", __func__, obj_request); @@ -1650,7 +1647,7 @@ static struct rbd_img_request *rbd_img_request_create( img_request_layered_set(img_request); spin_lock_init(&img_request->completion_lock); - INIT_LIST_HEAD(&img_request->obj_requests); + INIT_LIST_HEAD(&img_request->object_extents); kref_init(&img_request->kref); dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, @@ -1727,11 +1724,11 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) case OBJ_REQUEST_BIO: osd_req_op_extent_osd_data_bio(obj_req->osd_req, which, &obj_req->bio_pos, - obj_req->length); + obj_req->ex.oe_len); break; case OBJ_REQUEST_BVECS: rbd_assert(obj_req->bvec_pos.iter.bi_size == - obj_req->length); + obj_req->ex.oe_len); osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which, &obj_req->bvec_pos); break; @@ -1747,7 +1744,7 @@ static int rbd_obj_setup_read(struct rbd_obj_request *obj_req) return -ENOMEM; osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ, - obj_req->offset, obj_req->length, 0, 0); + obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); rbd_osd_req_setup_data(obj_req, 0); rbd_osd_req_format_read(obj_req); @@ -1794,7 +1791,7 @@ static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req, 
opcode = CEPH_OSD_OP_WRITE; osd_req_op_extent_init(obj_req->osd_req, which, opcode, - obj_req->offset, obj_req->length, 0, 0); + obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); rbd_osd_req_setup_data(obj_req, which++); rbd_assert(which == obj_req->osd_req->r_num_ops); @@ -1849,7 +1846,7 @@ static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req, if (opcode) osd_req_op_extent_init(obj_req->osd_req, which++, opcode, - obj_req->offset, obj_req->length, + obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); rbd_assert(which == obj_req->osd_req->r_num_ops); @@ -1964,9 +1961,9 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, if (!obj_request) goto out_unwind; - obj_request->object_no = object_no; - obj_request->offset = offset; - obj_request->length = length; + obj_request->ex.oe_objno = object_no; + obj_request->ex.oe_off = offset; + obj_request->ex.oe_len = length; /* * set obj_request->img_request before creating the @@ -2064,7 +2061,7 @@ static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req) if (obj_req->result == -ENOENT && obj_req->img_offset < rbd_dev->parent_overlap && !obj_req->tried_parent) { - u64 obj_overlap = min(obj_req->length, + u64 obj_overlap = min(obj_req->ex.oe_len, rbd_dev->parent_overlap - obj_req->img_offset); obj_req->tried_parent = true; @@ -2084,12 +2081,12 @@ static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req) * count to indicate the whole request was satisfied. 
*/ if (obj_req->result == -ENOENT || - (!obj_req->result && obj_req->xferred < obj_req->length)) { + (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) { rbd_assert(!obj_req->xferred || !obj_req->result); rbd_obj_zero_range(obj_req, obj_req->xferred, - obj_req->length - obj_req->xferred); + obj_req->ex.oe_len - obj_req->xferred); obj_req->result = 0; - obj_req->xferred = obj_req->length; + obj_req->xferred = obj_req->ex.oe_len; } return true; @@ -2214,7 +2211,7 @@ static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) * Determine the byte range covered by the object in the * child image to which the original request was to be sent. */ - img_offset = obj_req->img_offset - obj_req->offset; + img_offset = obj_req->img_offset - obj_req->ex.oe_off; obj_overlap = rbd_dev->layout.object_size; /* @@ -2263,7 +2260,7 @@ again: * There is no such thing as a successful short * write -- indicate the whole request was satisfied. */ - obj_req->xferred = obj_req->length; + obj_req->xferred = obj_req->ex.oe_len; return true; case RBD_OBJ_WRITE_COPYUP: obj_req->write_state = RBD_OBJ_WRITE_GUARD; @@ -2300,7 +2297,7 @@ static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req) */ if (obj_req->result == -ENOENT) { obj_req->result = 0; - obj_req->xferred = obj_req->length; + obj_req->xferred = obj_req->ex.oe_len; } return true; } @@ -2315,7 +2312,7 @@ static void rbd_obj_end_request(struct rbd_obj_request *obj_req) struct rbd_img_request *img_req = obj_req->img_request; rbd_assert((!obj_req->result && - obj_req->xferred == obj_req->length) || + obj_req->xferred == obj_req->ex.oe_len) || (obj_req->result < 0 && !obj_req->xferred)); if (!obj_req->result) { img_req->xferred += obj_req->xferred; @@ -2324,8 +2321,8 @@ static void rbd_obj_end_request(struct rbd_obj_request *obj_req) rbd_warn(img_req->rbd_dev, "%s at objno %llu %llu~%llu result %d xferred %llu", - obj_op_name(img_req->op_type), obj_req->object_no, - obj_req->offset, obj_req->length, 
obj_req->result, + obj_op_name(img_req->op_type), obj_req->ex.oe_objno, + obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result, obj_req->xferred); if (!img_req->result) { img_req->result = obj_req->result; -- cgit v1.1 From 86bd7998fa2c1b18fda74cfa4674cfb49ae701c7 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 6 Feb 2018 19:26:33 +0100 Subject: rbd: move to obj_req->img_extents In preparation for rbd "fancy" striping, replace obj_req->img_offset with obj_req->img_extents. A single starting offset isn't sufficient because we want only one OSD request per object and will merge adjacent object extents in ceph_file_to_extents(). The final object extent may map into multiple different byte ranges in the image. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 150 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 98 insertions(+), 52 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 568e974..0aa95e0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -253,7 +253,8 @@ struct rbd_obj_request { }; struct rbd_img_request *img_request; - u64 img_offset; + struct ceph_file_extent *img_extents; + u32 num_img_extents; union { struct ceph_bio_iter bio_pos; @@ -1279,14 +1280,6 @@ static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, } } -static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) -{ - struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; - - return obj_request->img_offset < - round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header)); -} - static void rbd_obj_request_get(struct rbd_obj_request *obj_request) { dout("%s: obj %p (was %d)\n", __func__, obj_request, @@ -1415,6 +1408,12 @@ static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) rbd_dev->layout.object_size; } +static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req) +{ + return ceph_file_extents_bytes(obj_req->img_extents, + 
obj_req->num_img_extents); +} + static bool rbd_img_is_write(struct rbd_img_request *img_req) { switch (img_req->op_type) { @@ -1544,6 +1543,7 @@ static void rbd_obj_request_destroy(struct kref *kref) rbd_assert(0); } + kfree(obj_request->img_extents); if (obj_request->copyup_bvecs) { for (i = 0; i < obj_request->copyup_bvec_count; i++) { if (obj_request->copyup_bvecs[i].bv_page) @@ -1718,6 +1718,53 @@ static void rbd_parent_request_destroy(struct kref *kref) rbd_img_request_destroy(kref); } +static void prune_extents(struct ceph_file_extent *img_extents, + u32 *num_img_extents, u64 overlap) +{ + u32 cnt = *num_img_extents; + + /* drop extents completely beyond the overlap */ + while (cnt && img_extents[cnt - 1].fe_off >= overlap) + cnt--; + + if (cnt) { + struct ceph_file_extent *ex = &img_extents[cnt - 1]; + + /* trim final overlapping extent */ + if (ex->fe_off + ex->fe_len > overlap) + ex->fe_len = overlap - ex->fe_off; + } + + *num_img_extents = cnt; +} + +/* + * Determine the byte range(s) covered by either just the object extent + * or the entire object in the parent image. + */ +static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req, + bool entire) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + int ret; + + if (!rbd_dev->parent_overlap) + return 0; + + ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno, + entire ? 0 : obj_req->ex.oe_off, + entire ? 
rbd_dev->layout.object_size : + obj_req->ex.oe_len, + &obj_req->img_extents, + &obj_req->num_img_extents); + if (ret) + return ret; + + prune_extents(obj_req->img_extents, &obj_req->num_img_extents, + rbd_dev->parent_overlap); + return 0; +} + static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) { switch (obj_req->img_request->data_type) { @@ -1803,7 +1850,12 @@ static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) unsigned int num_osd_ops, which = 0; int ret; - if (obj_request_overlaps_parent(obj_req)) { + /* reverse map the entire object onto the parent */ + ret = rbd_obj_calc_img_extents(obj_req, true); + if (ret) + return ret; + + if (obj_req->num_img_extents) { obj_req->write_state = RBD_OBJ_WRITE_GUARD; num_osd_ops = 3; /* stat + setallochint + write/writefull */ } else { @@ -1815,7 +1867,7 @@ static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) if (!obj_req->osd_req) return -ENOMEM; - if (obj_request_overlaps_parent(obj_req)) { + if (obj_req->num_img_extents) { ret = __rbd_obj_setup_stat(obj_req, which++); if (ret) return ret; @@ -1831,7 +1883,7 @@ static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req, u16 opcode; if (rbd_obj_is_entire(obj_req)) { - if (obj_request_overlaps_parent(obj_req)) { + if (obj_req->num_img_extents) { opcode = CEPH_OSD_OP_TRUNCATE; } else { osd_req_op_init(obj_req->osd_req, which++, @@ -1858,11 +1910,16 @@ static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) unsigned int num_osd_ops, which = 0; int ret; + /* reverse map the entire object onto the parent */ + ret = rbd_obj_calc_img_extents(obj_req, true); + if (ret) + return ret; + if (rbd_obj_is_entire(obj_req)) { obj_req->write_state = RBD_OBJ_WRITE_FLAT; num_osd_ops = 1; /* truncate/delete */ } else { - if (obj_request_overlaps_parent(obj_req)) { + if (obj_req->num_img_extents) { obj_req->write_state = RBD_OBJ_WRITE_GUARD; num_osd_ops = 2; /* stat + truncate/zero */ } else { @@ -1875,8 +1932,7 @@ static int 
rbd_obj_setup_discard(struct rbd_obj_request *obj_req) if (!obj_req->osd_req) return -ENOMEM; - if (!rbd_obj_is_entire(obj_req) && - obj_request_overlaps_parent(obj_req)) { + if (!rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) { ret = __rbd_obj_setup_stat(obj_req, which++); if (ret) return ret; @@ -1980,8 +2036,6 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, ceph_bvec_iter_advance(&bvec_it, length); } - obj_request->img_offset = img_offset; - img_offset += length; resid -= length; } @@ -2009,14 +2063,15 @@ static void rbd_img_request_submit(struct rbd_img_request *img_request) rbd_img_request_put(img_request); } -static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req, - u64 img_offset, u32 bytes) +static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) { struct rbd_img_request *img_req = obj_req->img_request; struct rbd_img_request *child_img_req; int ret; - child_img_req = rbd_parent_request_create(obj_req, img_offset, bytes); + child_img_req = rbd_parent_request_create(obj_req, + obj_req->img_extents[0].fe_off, + obj_req->img_extents[0].fe_len); if (!child_img_req) return -ENOMEM; @@ -2038,7 +2093,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req, } else { struct ceph_bvec_iter it = { .bvecs = obj_req->copyup_bvecs, - .iter = { .bi_size = bytes }, + .iter = { .bi_size = obj_req->img_extents[0].fe_len }, }; ret = rbd_img_request_fill(child_img_req, OBJ_REQUEST_BVECS, @@ -2059,19 +2114,23 @@ static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req) int ret; if (obj_req->result == -ENOENT && - obj_req->img_offset < rbd_dev->parent_overlap && - !obj_req->tried_parent) { - u64 obj_overlap = min(obj_req->ex.oe_len, - rbd_dev->parent_overlap - obj_req->img_offset); - - obj_req->tried_parent = true; - ret = rbd_obj_read_from_parent(obj_req, obj_req->img_offset, - obj_overlap); + rbd_dev->parent_overlap && !obj_req->tried_parent) { + /* reverse map this object extent onto the parent 
*/ + ret = rbd_obj_calc_img_extents(obj_req, false); if (ret) { obj_req->result = ret; return true; } - return false; + + if (obj_req->num_img_extents) { + obj_req->tried_parent = true; + ret = rbd_obj_read_from_parent(obj_req); + if (ret) { + obj_req->result = ret; + return true; + } + return false; + } } /* @@ -2189,11 +2248,12 @@ static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) { struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; - u64 img_offset; - u64 obj_overlap; int ret; - if (!obj_request_overlaps_parent(obj_req)) { + rbd_assert(obj_req->num_img_extents); + prune_extents(obj_req->img_extents, &obj_req->num_img_extents, + rbd_dev->parent_overlap); + if (!obj_req->num_img_extents) { /* * The overlap has become 0 (most likely because the * image has been flattened). Use rbd_obj_issue_copyup() @@ -2207,29 +2267,12 @@ static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) return rbd_obj_issue_copyup(obj_req, 0); } - /* - * Determine the byte range covered by the object in the - * child image to which the original request was to be sent. - */ - img_offset = obj_req->img_offset - obj_req->ex.oe_off; - obj_overlap = rbd_dev->layout.object_size; - - /* - * There is no defined parent data beyond the parent - * overlap, so limit what we read at that boundary if - * necessary. 
- */ - if (img_offset + obj_overlap > rbd_dev->parent_overlap) { - rbd_assert(img_offset < rbd_dev->parent_overlap); - obj_overlap = rbd_dev->parent_overlap - img_offset; - } - - ret = setup_copyup_bvecs(obj_req, obj_overlap); + ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req)); if (ret) return ret; obj_req->write_state = RBD_OBJ_WRITE_COPYUP; - return rbd_obj_read_from_parent(obj_req, img_offset, obj_overlap); + return rbd_obj_read_from_parent(obj_req); } static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req) @@ -2335,6 +2378,9 @@ static void rbd_img_end_child_request(struct rbd_img_request *img_req) struct rbd_obj_request *obj_req = img_req->obj_request; rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags)); + rbd_assert((!img_req->result && + img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) || + (img_req->result < 0 && !img_req->xferred)); obj_req->result = img_req->result; obj_req->xferred = img_req->xferred; -- cgit v1.1 From 2bb1e56ec6450ce533c644c5bfa548dc34c551a0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 6 Feb 2018 19:26:34 +0100 Subject: rbd: create+truncate for whole-object layered discards A whole-object layered discard is implemented as a truncate rather than a delete: a dummy object is needed to prevent the CoW machinery from kicking in. However, a truncate on a non-existent object is a no-op. If the object doesn't exist in HEAD, a discard request is effectively ignored, which violates our "discard zeroes data" promise and breaks REQ_OP_WRITE_ZEROES implementation. A non-exclusive create on an existing object is also a no-op, so the fix is to do a compound create+truncate instead. 
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0aa95e0..fc94e2c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1884,6 +1884,8 @@ static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req, if (rbd_obj_is_entire(obj_req)) { if (obj_req->num_img_extents) { + osd_req_op_init(obj_req->osd_req, which++, + CEPH_OSD_OP_CREATE, 0); opcode = CEPH_OSD_OP_TRUNCATE; } else { osd_req_op_init(obj_req->osd_req, which++, @@ -1917,7 +1919,10 @@ static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) if (rbd_obj_is_entire(obj_req)) { obj_req->write_state = RBD_OBJ_WRITE_FLAT; - num_osd_ops = 1; /* truncate/delete */ + if (obj_req->num_img_extents) + num_osd_ops = 2; /* create + truncate */ + else + num_osd_ops = 1; /* delete */ } else { if (obj_req->num_img_extents) { obj_req->write_state = RBD_OBJ_WRITE_GUARD; -- cgit v1.1 From 5a237819aa4e0421a17966e9baf91b9caedaf61d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 6 Feb 2018 19:26:34 +0100 Subject: rbd: switch to common striping framework Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 191 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 168 insertions(+), 23 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fc94e2c..24f169f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1326,7 +1326,6 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, obj_request->img_request = img_request; img_request->obj_request_count++; img_request->pending_count++; - list_add_tail(&obj_request->ex.oe_item, &img_request->object_extents); dout("%s: img %p obj %p\n", __func__, img_request, obj_request); } @@ -2055,6 +2054,158 @@ out_unwind: return -ENOMEM; } +union rbd_img_fill_iter { + struct ceph_bio_iter bio_iter; + struct 
ceph_bvec_iter bvec_iter; +}; + +struct rbd_img_fill_ctx { + enum obj_request_type pos_type; + union rbd_img_fill_iter *pos; + union rbd_img_fill_iter iter; + ceph_object_extent_fn_t set_pos_fn; +}; + +static struct ceph_object_extent *alloc_object_extent(void *arg) +{ + struct rbd_img_request *img_req = arg; + struct rbd_obj_request *obj_req; + + obj_req = rbd_obj_request_create(); + if (!obj_req) + return NULL; + + rbd_img_obj_request_add(img_req, obj_req); + return &obj_req->ex; +} + +/* + * Map a list of image extents to a list of object extents, create the + * corresponding object requests (normally each to a different object, + * but not always) and add them to @img_req. For each object request, + * set up its data descriptor to point to the corresponding chunk of + * @fctx->pos data buffer. + * + * @fctx->pos data buffer is assumed to be large enough. + */ +static int rbd_img_fill_request(struct rbd_img_request *img_req, + struct ceph_file_extent *img_extents, + u32 num_img_extents, + struct rbd_img_fill_ctx *fctx) +{ + u32 i; + int ret; + + img_req->data_type = fctx->pos_type; + + /* + * Create object requests and set each object request's starting + * position in the provided bio (list) or bio_vec array. 
+ */ + fctx->iter = *fctx->pos; + for (i = 0; i < num_img_extents; i++) { + ret = ceph_file_to_extents(&img_req->rbd_dev->layout, + img_extents[i].fe_off, + img_extents[i].fe_len, + &img_req->object_extents, + alloc_object_extent, img_req, + fctx->set_pos_fn, &fctx->iter); + if (ret) + return ret; + } + + return __rbd_img_fill_request(img_req); +} + +static int rbd_img_fill_nodata(struct rbd_img_request *img_req, + u64 off, u64 len) +{ + struct ceph_file_extent ex = { off, len }; + union rbd_img_fill_iter dummy; + struct rbd_img_fill_ctx fctx = { + .pos_type = OBJ_REQUEST_NODATA, + .pos = &dummy, + }; + + return rbd_img_fill_request(img_req, &ex, 1, &fctx); +} + +static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) +{ + struct rbd_obj_request *obj_req = + container_of(ex, struct rbd_obj_request, ex); + struct ceph_bio_iter *it = arg; + + dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); + obj_req->bio_pos = *it; + ceph_bio_iter_advance(it, bytes); +} + +static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req, + struct ceph_file_extent *img_extents, + u32 num_img_extents, + struct ceph_bio_iter *bio_pos) +{ + struct rbd_img_fill_ctx fctx = { + .pos_type = OBJ_REQUEST_BIO, + .pos = (union rbd_img_fill_iter *)bio_pos, + .set_pos_fn = set_bio_pos, + }; + + return rbd_img_fill_request(img_req, img_extents, num_img_extents, + &fctx); +} + +static int rbd_img_fill_from_bio(struct rbd_img_request *img_req, + u64 off, u64 len, struct bio *bio) +{ + struct ceph_file_extent ex = { off, len }; + struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter }; + + return __rbd_img_fill_from_bio(img_req, &ex, 1, &it); +} + +static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) +{ + struct rbd_obj_request *obj_req = + container_of(ex, struct rbd_obj_request, ex); + struct ceph_bvec_iter *it = arg; + + obj_req->bvec_pos = *it; + ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes); + ceph_bvec_iter_advance(it, 
bytes); +} + +static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, + struct ceph_file_extent *img_extents, + u32 num_img_extents, + struct ceph_bvec_iter *bvec_pos) +{ + struct rbd_img_fill_ctx fctx = { + .pos_type = OBJ_REQUEST_BVECS, + .pos = (union rbd_img_fill_iter *)bvec_pos, + .set_pos_fn = set_bvec_pos, + }; + + return rbd_img_fill_request(img_req, img_extents, num_img_extents, + &fctx); +} + +static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, + struct ceph_file_extent *img_extents, + u32 num_img_extents, + struct bio_vec *bvecs) +{ + struct ceph_bvec_iter it = { + .bvecs = bvecs, + .iter = { .bi_size = ceph_file_extents_bytes(img_extents, + num_img_extents) }, + }; + + return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents, + &it); +} + static void rbd_img_request_submit(struct rbd_img_request *img_request) { struct rbd_obj_request *obj_request; @@ -2083,26 +2234,25 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) if (!rbd_img_is_write(img_req)) { switch (img_req->data_type) { case OBJ_REQUEST_BIO: - ret = rbd_img_request_fill(child_img_req, - OBJ_REQUEST_BIO, - &obj_req->bio_pos); + ret = __rbd_img_fill_from_bio(child_img_req, + obj_req->img_extents, + obj_req->num_img_extents, + &obj_req->bio_pos); break; case OBJ_REQUEST_BVECS: - ret = rbd_img_request_fill(child_img_req, - OBJ_REQUEST_BVECS, - &obj_req->bvec_pos); + ret = __rbd_img_fill_from_bvecs(child_img_req, + obj_req->img_extents, + obj_req->num_img_extents, + &obj_req->bvec_pos); break; default: rbd_assert(0); } } else { - struct ceph_bvec_iter it = { - .bvecs = obj_req->copyup_bvecs, - .iter = { .bi_size = obj_req->img_extents[0].fe_len }, - }; - - ret = rbd_img_request_fill(child_img_req, OBJ_REQUEST_BVECS, - &it); + ret = rbd_img_fill_from_bvecs(child_img_req, + obj_req->img_extents, + obj_req->num_img_extents, + obj_req->copyup_bvecs); } if (ret) { rbd_img_request_put(child_img_req); @@ -3520,15 +3670,10 @@ static void 
rbd_queue_workfn(struct work_struct *work) snapc = NULL; /* img_request consumes a ref */ if (op_type == OBJ_OP_DISCARD) - result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, - NULL); - else { - struct ceph_bio_iter bio_it = { .bio = rq->bio, - .iter = rq->bio->bi_iter }; - - result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, - &bio_it); - } + result = rbd_img_fill_nodata(img_request, offset, length); + else + result = rbd_img_fill_from_bio(img_request, offset, length, + rq->bio); if (result) goto err_img_request; -- cgit v1.1 From 0420c5dd2ef308b69a86b44a217390f5612bab58 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 6 Feb 2018 19:26:34 +0100 Subject: rbd: remove rbd_img_request_fill() and helpers Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 98 ----------------------------------------------------- 1 file changed, 98 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 24f169f..a22d265 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1207,27 +1207,6 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) rbd_dev->mapping.features = 0; } -static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) -{ - u64 segment_size = rbd_obj_bytes(&rbd_dev->header); - - return offset & (segment_size - 1); -} - -static u64 rbd_segment_length(struct rbd_device *rbd_dev, - u64 offset, u64 length) -{ - u64 segment_size = rbd_obj_bytes(&rbd_dev->header); - - offset &= segment_size - 1; - - rbd_assert(length <= U64_MAX - offset); - if (offset + length > segment_size) - length = segment_size - offset; - - return length; -} - static void zero_bvec(struct bio_vec *bv) { void *buf; @@ -1977,83 +1956,6 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req) return 0; } -/* - * Split up an image request into one or more object requests, each - * to a different object. 
The "type" parameter indicates whether - * "data_desc" is the pointer to the head of a list of bio - * structures, or the base of a page array. In either case this - * function assumes data_desc describes memory sufficient to hold - * all data described by the image request. - */ -static int rbd_img_request_fill(struct rbd_img_request *img_request, - enum obj_request_type type, - void *data_desc) -{ - struct rbd_device *rbd_dev = img_request->rbd_dev; - struct rbd_obj_request *obj_request = NULL; - struct rbd_obj_request *next_obj_request; - struct ceph_bio_iter bio_it; - struct ceph_bvec_iter bvec_it; - u64 img_offset; - u64 resid; - - dout("%s: img %p type %d data_desc %p\n", __func__, img_request, - (int)type, data_desc); - - img_offset = img_request->offset; - resid = img_request->length; - rbd_assert(resid > 0); - - if (type == OBJ_REQUEST_BIO) { - bio_it = *(struct ceph_bio_iter *)data_desc; - rbd_assert(img_offset == - bio_it.iter.bi_sector << SECTOR_SHIFT); - } else if (type == OBJ_REQUEST_BVECS) { - bvec_it = *(struct ceph_bvec_iter *)data_desc; - } - - while (resid) { - u64 object_no = img_offset >> rbd_dev->header.obj_order; - u64 offset = rbd_segment_offset(rbd_dev, img_offset); - u64 length = rbd_segment_length(rbd_dev, img_offset, resid); - - obj_request = rbd_obj_request_create(); - if (!obj_request) - goto out_unwind; - - obj_request->ex.oe_objno = object_no; - obj_request->ex.oe_off = offset; - obj_request->ex.oe_len = length; - - /* - * set obj_request->img_request before creating the - * osd_request so that it gets the right snapc - */ - rbd_img_obj_request_add(img_request, obj_request); - - if (type == OBJ_REQUEST_BIO) { - obj_request->bio_pos = bio_it; - ceph_bio_iter_advance(&bio_it, length); - } else if (type == OBJ_REQUEST_BVECS) { - obj_request->bvec_pos = bvec_it; - ceph_bvec_iter_shorten(&obj_request->bvec_pos, length); - ceph_bvec_iter_advance(&bvec_it, length); - } - - img_offset += length; - resid -= length; - } - - 
img_request->data_type = type; - return __rbd_img_fill_request(img_request); - -out_unwind: - for_each_obj_request_safe(img_request, obj_request, next_obj_request) - rbd_img_obj_request_del(img_request, obj_request); - - return -ENOMEM; -} - union rbd_img_fill_iter { struct ceph_bio_iter bio_iter; struct ceph_bvec_iter bvec_iter; -- cgit v1.1 From dfd9875f11008183c26fea5fdf23e6740fe8aa5a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 6 Feb 2018 19:26:35 +0100 Subject: rbd: get rid of img_req->{offset,length} These are set, but no longer used. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a22d265..b3e310a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -283,8 +283,6 @@ struct rbd_img_request { struct rbd_device *rbd_dev; enum obj_operation_type op_type; enum obj_request_type data_type; - u64 offset; /* starting image byte offset */ - u64 length; /* byte count from offset */ unsigned long flags; union { u64 snap_id; /* for reads */ @@ -1602,7 +1600,6 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) */ static struct rbd_img_request *rbd_img_request_create( struct rbd_device *rbd_dev, - u64 offset, u64 length, enum obj_operation_type op_type, struct ceph_snap_context *snapc) { @@ -1614,8 +1611,6 @@ static struct rbd_img_request *rbd_img_request_create( img_request->rbd_dev = rbd_dev; img_request->op_type = op_type; - img_request->offset = offset; - img_request->length = length; if (!rbd_img_is_write(img_request)) img_request->snap_id = rbd_dev->spec->snap_id; else @@ -1628,9 +1623,8 @@ static struct rbd_img_request *rbd_img_request_create( INIT_LIST_HEAD(&img_request->object_extents); kref_init(&img_request->kref); - dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, - obj_op_name(op_type), offset, length, img_request); - + dout("%s: 
rbd_dev %p %s -> img %p\n", __func__, rbd_dev, + obj_op_name(op_type), img_request); return img_request; } @@ -1659,9 +1653,8 @@ static void rbd_img_request_destroy(struct kref *kref) kmem_cache_free(rbd_img_request_cache, img_request); } -static struct rbd_img_request *rbd_parent_request_create( - struct rbd_obj_request *obj_request, - u64 img_offset, u64 length) +static struct rbd_img_request * +rbd_parent_request_create(struct rbd_obj_request *obj_request) { struct rbd_img_request *parent_request; struct rbd_device *rbd_dev; @@ -1669,8 +1662,8 @@ static struct rbd_img_request *rbd_parent_request_create( rbd_assert(obj_request->img_request); rbd_dev = obj_request->img_request->rbd_dev; - parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, - length, OBJ_OP_READ, NULL); + parent_request = rbd_img_request_create(rbd_dev->parent, OBJ_OP_READ, + NULL); if (!parent_request) return NULL; @@ -2127,9 +2120,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) struct rbd_img_request *child_img_req; int ret; - child_img_req = rbd_parent_request_create(obj_req, - obj_req->img_extents[0].fe_off, - obj_req->img_extents[0].fe_len); + child_img_req = rbd_parent_request_create(obj_req); if (!child_img_req) return -ENOMEM; @@ -3562,8 +3553,7 @@ static void rbd_queue_workfn(struct work_struct *work) } } - img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, - snapc); + img_request = rbd_img_request_create(rbd_dev, op_type, snapc); if (!img_request) { result = -ENOMEM; goto err_unlock; -- cgit v1.1 From e93aca0abb8b9f8fd23675dc9110b7517964657a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 6 Feb 2018 19:26:35 +0100 Subject: rbd: remove rbd_parent_request_{create,destroy}() rbd_parent_request_create() takes a ref on obj_req for child_img_req. There is no point in doing that because child_img_req is created on behalf of obj_req -- obj_req is the initiator and can't be completed before child_img_req. 
Open-code the rest of rbd_parent_request_create() and remove it along with rbd_parent_request_destroy(). Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 74 +++++------------------------------------------------ 1 file changed, 6 insertions(+), 68 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b3e310a..5fa4e1a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1257,13 +1257,6 @@ static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, } } -static void rbd_obj_request_get(struct rbd_obj_request *obj_request) -{ - dout("%s: obj %p (was %d)\n", __func__, obj_request, - kref_read(&obj_request->kref)); - kref_get(&obj_request->kref); -} - static void rbd_obj_request_destroy(struct kref *kref); static void rbd_obj_request_put(struct rbd_obj_request *obj_request) { @@ -1280,18 +1273,13 @@ static void rbd_img_request_get(struct rbd_img_request *img_request) kref_get(&img_request->kref); } -static bool img_request_child_test(struct rbd_img_request *img_request); -static void rbd_parent_request_destroy(struct kref *kref); static void rbd_img_request_destroy(struct kref *kref); static void rbd_img_request_put(struct rbd_img_request *img_request) { rbd_assert(img_request != NULL); dout("%s: img %p (was %d)\n", __func__, img_request, kref_read(&img_request->kref)); - if (img_request_child_test(img_request)) - kref_put(&img_request->kref, rbd_parent_request_destroy); - else - kref_put(&img_request->kref, rbd_img_request_destroy); + kref_put(&img_request->kref, rbd_img_request_destroy); } static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, @@ -1332,24 +1320,6 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) * is conditionally set to 1 at image request initialization time * and currently never change thereafter. 
*/ -static void img_request_child_set(struct rbd_img_request *img_request) -{ - set_bit(IMG_REQ_CHILD, &img_request->flags); - smp_mb(); -} - -static void img_request_child_clear(struct rbd_img_request *img_request) -{ - clear_bit(IMG_REQ_CHILD, &img_request->flags); - smp_mb(); -} - -static bool img_request_child_test(struct rbd_img_request *img_request) -{ - smp_mb(); - return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; -} - static void img_request_layered_set(struct rbd_img_request *img_request) { set_bit(IMG_REQ_LAYERED, &img_request->flags); @@ -1653,42 +1623,6 @@ static void rbd_img_request_destroy(struct kref *kref) kmem_cache_free(rbd_img_request_cache, img_request); } -static struct rbd_img_request * -rbd_parent_request_create(struct rbd_obj_request *obj_request) -{ - struct rbd_img_request *parent_request; - struct rbd_device *rbd_dev; - - rbd_assert(obj_request->img_request); - rbd_dev = obj_request->img_request->rbd_dev; - - parent_request = rbd_img_request_create(rbd_dev->parent, OBJ_OP_READ, - NULL); - if (!parent_request) - return NULL; - - img_request_child_set(parent_request); - rbd_obj_request_get(obj_request); - parent_request->obj_request = obj_request; - - return parent_request; -} - -static void rbd_parent_request_destroy(struct kref *kref) -{ - struct rbd_img_request *parent_request; - struct rbd_obj_request *orig_request; - - parent_request = container_of(kref, struct rbd_img_request, kref); - orig_request = parent_request->obj_request; - - parent_request->obj_request = NULL; - rbd_obj_request_put(orig_request); - img_request_child_clear(parent_request); - - rbd_img_request_destroy(kref); -} - static void prune_extents(struct ceph_file_extent *img_extents, u32 *num_img_extents, u64 overlap) { @@ -2120,10 +2054,14 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) struct rbd_img_request *child_img_req; int ret; - child_img_req = rbd_parent_request_create(obj_req); + child_img_req = 
rbd_img_request_create(img_req->rbd_dev->parent, + OBJ_OP_READ, NULL); if (!child_img_req) return -ENOMEM; + __set_bit(IMG_REQ_CHILD, &child_img_req->flags); + child_img_req->obj_request = obj_req; + if (!rbd_img_is_write(img_req)) { switch (img_req->data_type) { case OBJ_REQUEST_BIO: -- cgit v1.1 From afb978884c3ec17227626eb371130a97671e5238 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 6 Feb 2018 19:26:35 +0100 Subject: rbd: introduce OWN_BVECS data type If the layout is "fancy", we need to be able to rearrange the provided bio_vecs in stripe unit chunks to make it possible for the messenger to read/write directly from/to the provided data buffer, without employing a temporary data buffer for assembling the result. Higher level bio_vec arrays are generally immutable, so this requires copying into a private array. Only the bio_vecs themselves are shuffled around, not the actual data. OWN_BVECS doesn't own any pages. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 156 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 149 insertions(+), 7 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5fa4e1a..056865cf 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -215,6 +215,7 @@ enum obj_request_type { OBJ_REQUEST_NODATA = 1, OBJ_REQUEST_BIO, /* pointer into provided bio (list) */ OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */ + OBJ_REQUEST_OWN_BVECS, /* private bio_vec array, doesn't own pages */ }; enum obj_operation_type { @@ -261,6 +262,7 @@ struct rbd_obj_request { struct { struct ceph_bvec_iter bvec_pos; u32 bvec_count; + u32 bvec_idx; }; }; struct bio_vec *copyup_bvecs; @@ -1238,7 +1240,7 @@ static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) /* * Zero a range in @obj_req data buffer defined by a bio (list) or - * bio_vec array. + * (private) bio_vec array. * * @off is relative to the start of the data buffer. 
*/ @@ -1250,6 +1252,7 @@ static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, zero_bios(&obj_req->bio_pos, off, bytes); break; case OBJ_REQUEST_BVECS: + case OBJ_REQUEST_OWN_BVECS: zero_bvecs(&obj_req->bvec_pos, off, bytes); break; default: @@ -1485,6 +1488,9 @@ static void rbd_obj_request_destroy(struct kref *kref) case OBJ_REQUEST_BIO: case OBJ_REQUEST_BVECS: break; /* Nothing to do */ + case OBJ_REQUEST_OWN_BVECS: + kfree(obj_request->bvec_pos.bvecs); + break; default: rbd_assert(0); } @@ -1679,8 +1685,10 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) obj_req->ex.oe_len); break; case OBJ_REQUEST_BVECS: + case OBJ_REQUEST_OWN_BVECS: rbd_assert(obj_req->bvec_pos.iter.bi_size == obj_req->ex.oe_len); + rbd_assert(obj_req->bvec_idx == obj_req->bvec_count); osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which, &obj_req->bvec_pos); break; @@ -1893,6 +1901,8 @@ struct rbd_img_fill_ctx { union rbd_img_fill_iter *pos; union rbd_img_fill_iter iter; ceph_object_extent_fn_t set_pos_fn; + ceph_object_extent_fn_t count_fn; + ceph_object_extent_fn_t copy_fn; }; static struct ceph_object_extent *alloc_object_extent(void *arg) @@ -1909,12 +1919,57 @@ static struct ceph_object_extent *alloc_object_extent(void *arg) } /* + * While su != os && sc == 1 is technically not fancy (it's the same + * layout as su == os && sc == 1), we can't use the nocopy path for it + * because ->set_pos_fn() should be called only once per object. + * ceph_file_to_extents() invokes action_fn once per stripe unit, so + * treat su != os && sc == 1 as fancy. 
+ */ +static bool rbd_layout_is_fancy(struct ceph_file_layout *l) +{ + return l->stripe_unit != l->object_size; +} + +static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req, + struct ceph_file_extent *img_extents, + u32 num_img_extents, + struct rbd_img_fill_ctx *fctx) +{ + u32 i; + int ret; + + img_req->data_type = fctx->pos_type; + + /* + * Create object requests and set each object request's starting + * position in the provided bio (list) or bio_vec array. + */ + fctx->iter = *fctx->pos; + for (i = 0; i < num_img_extents; i++) { + ret = ceph_file_to_extents(&img_req->rbd_dev->layout, + img_extents[i].fe_off, + img_extents[i].fe_len, + &img_req->object_extents, + alloc_object_extent, img_req, + fctx->set_pos_fn, &fctx->iter); + if (ret) + return ret; + } + + return __rbd_img_fill_request(img_req); +} + +/* * Map a list of image extents to a list of object extents, create the * corresponding object requests (normally each to a different object, * but not always) and add them to @img_req. For each object request, - * set up its data descriptor to point to the corresponding chunk of + * set up its data descriptor to point to the corresponding chunk(s) of * @fctx->pos data buffer. * + * Because ceph_file_to_extents() will merge adjacent object extents + * together, each object request's data descriptor may point to multiple + * different chunks of @fctx->pos data buffer. + * * @fctx->pos data buffer is assumed to be large enough. 
*/ static int rbd_img_fill_request(struct rbd_img_request *img_req, @@ -1922,23 +1977,56 @@ static int rbd_img_fill_request(struct rbd_img_request *img_req, u32 num_img_extents, struct rbd_img_fill_ctx *fctx) { + struct rbd_device *rbd_dev = img_req->rbd_dev; + struct rbd_obj_request *obj_req; u32 i; int ret; - img_req->data_type = fctx->pos_type; + if (fctx->pos_type == OBJ_REQUEST_NODATA || + !rbd_layout_is_fancy(&rbd_dev->layout)) + return rbd_img_fill_request_nocopy(img_req, img_extents, + num_img_extents, fctx); + + img_req->data_type = OBJ_REQUEST_OWN_BVECS; /* - * Create object requests and set each object request's starting - * position in the provided bio (list) or bio_vec array. + * Create object requests and determine ->bvec_count for each object + * request. Note that ->bvec_count sum over all object requests may + * be greater than the number of bio_vecs in the provided bio (list) + * or bio_vec array because when mapped, those bio_vecs can straddle + * stripe unit boundaries. */ fctx->iter = *fctx->pos; for (i = 0; i < num_img_extents; i++) { - ret = ceph_file_to_extents(&img_req->rbd_dev->layout, + ret = ceph_file_to_extents(&rbd_dev->layout, img_extents[i].fe_off, img_extents[i].fe_len, &img_req->object_extents, alloc_object_extent, img_req, - fctx->set_pos_fn, &fctx->iter); + fctx->count_fn, &fctx->iter); + if (ret) + return ret; + } + + for_each_obj_request(img_req, obj_req) { + obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count, + sizeof(*obj_req->bvec_pos.bvecs), + GFP_NOIO); + if (!obj_req->bvec_pos.bvecs) + return -ENOMEM; + } + + /* + * Fill in each object request's private bio_vec array, splitting and + * rearranging the provided bio_vecs in stripe unit chunks as needed. 
+ */ + fctx->iter = *fctx->pos; + for (i = 0; i < num_img_extents; i++) { + ret = ceph_iterate_extents(&rbd_dev->layout, + img_extents[i].fe_off, + img_extents[i].fe_len, + &img_req->object_extents, + fctx->copy_fn, &fctx->iter); if (ret) return ret; } @@ -1970,6 +2058,32 @@ static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) ceph_bio_iter_advance(it, bytes); } +static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) +{ + struct rbd_obj_request *obj_req = + container_of(ex, struct rbd_obj_request, ex); + struct ceph_bio_iter *it = arg; + + dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); + ceph_bio_iter_advance_step(it, bytes, ({ + obj_req->bvec_count++; + })); + +} + +static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) +{ + struct rbd_obj_request *obj_req = + container_of(ex, struct rbd_obj_request, ex); + struct ceph_bio_iter *it = arg; + + dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); + ceph_bio_iter_advance_step(it, bytes, ({ + obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; + obj_req->bvec_pos.iter.bi_size += bv.bv_len; + })); +} + static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req, struct ceph_file_extent *img_extents, u32 num_img_extents, @@ -1979,6 +2093,8 @@ static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req, .pos_type = OBJ_REQUEST_BIO, .pos = (union rbd_img_fill_iter *)bio_pos, .set_pos_fn = set_bio_pos, + .count_fn = count_bio_bvecs, + .copy_fn = copy_bio_bvecs, }; return rbd_img_fill_request(img_req, img_extents, num_img_extents, @@ -2005,6 +2121,29 @@ static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) ceph_bvec_iter_advance(it, bytes); } +static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) +{ + struct rbd_obj_request *obj_req = + container_of(ex, struct rbd_obj_request, ex); + struct ceph_bvec_iter *it = arg; + + ceph_bvec_iter_advance_step(it, bytes, ({ 
+ obj_req->bvec_count++; + })); +} + +static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) +{ + struct rbd_obj_request *obj_req = + container_of(ex, struct rbd_obj_request, ex); + struct ceph_bvec_iter *it = arg; + + ceph_bvec_iter_advance_step(it, bytes, ({ + obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; + obj_req->bvec_pos.iter.bi_size += bv.bv_len; + })); +} + static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, struct ceph_file_extent *img_extents, u32 num_img_extents, @@ -2014,6 +2153,8 @@ static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, .pos_type = OBJ_REQUEST_BVECS, .pos = (union rbd_img_fill_iter *)bvec_pos, .set_pos_fn = set_bvec_pos, + .count_fn = count_bvecs, + .copy_fn = copy_bvecs, }; return rbd_img_fill_request(img_req, img_extents, num_img_extents, @@ -2071,6 +2212,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) &obj_req->bio_pos); break; case OBJ_REQUEST_BVECS: + case OBJ_REQUEST_OWN_BVECS: ret = __rbd_img_fill_from_bvecs(child_img_req, obj_req->img_extents, obj_req->num_img_extents, -- cgit v1.1 From b13318521776304a37c8cb3e2a3e613d228a38f3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 7 Feb 2018 12:09:12 +0100 Subject: rbd: allow "fancy" striping Signed-off-by: Ilya Dryomov Acked-by: Jason Dillaman --- drivers/block/rbd.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 056865cf..9a7f172 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4615,9 +4615,6 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) } __attribute__ ((packed)) striping_info_buf = { 0 }; size_t size = sizeof (striping_info_buf); void *p; - u64 obj_size; - u64 stripe_unit; - u64 stripe_count; int ret; ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, @@ -4629,31 +4626,9 @@ static int rbd_dev_v2_striping_info(struct 
rbd_device *rbd_dev) if (ret < size) return -ERANGE; - /* - * We don't actually support the "fancy striping" feature - * (STRIPINGV2) yet, but if the striping sizes are the - * defaults the behavior is the same as before. So find - * out, and only fail if the image has non-default values. - */ - ret = -EINVAL; - obj_size = rbd_obj_bytes(&rbd_dev->header); p = &striping_info_buf; - stripe_unit = ceph_decode_64(&p); - if (stripe_unit != obj_size) { - rbd_warn(rbd_dev, "unsupported stripe unit " - "(got %llu want %llu)", - stripe_unit, obj_size); - return -EINVAL; - } - stripe_count = ceph_decode_64(&p); - if (stripe_count != 1) { - rbd_warn(rbd_dev, "unsupported stripe count " - "(got %llu want 1)", stripe_count); - return -EINVAL; - } - rbd_dev->header.stripe_unit = stripe_unit; - rbd_dev->header.stripe_count = stripe_count; - + rbd_dev->header.stripe_unit = ceph_decode_64(&p); + rbd_dev->header.stripe_count = ceph_decode_64(&p); return 0; } -- cgit v1.1 From 0a4a1e68d861848d09ab4b4b280d13584ad8ca45 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 12 Feb 2018 16:00:36 +0100 Subject: rbd: remove redundant declaration of rbd_spec_put() Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9a7f172..5416b44 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -445,7 +445,6 @@ static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, size_t count); static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); -static void rbd_spec_put(struct rbd_spec *spec); static int rbd_dev_id_to_minor(int dev_id) { -- cgit v1.1 From 5feb0d8d2f10c3f39f3d3a754dded74bb430a5e6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 22 Feb 2018 13:19:04 +0100 Subject: rbd: move rbd_get_client() below rbd_put_client() ... 
to avoid a forward declaration in the next commit. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5416b44..a306192 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -840,26 +840,6 @@ static char* obj_op_name(enum obj_operation_type op_type) } /* - * Get a ceph client with specific addr and configuration, if one does - * not exist create it. Either way, ceph_opts is consumed by this - * function. - */ -static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) -{ - struct rbd_client *rbdc; - - mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); - rbdc = rbd_client_find(ceph_opts); - if (rbdc) /* using an existing client */ - ceph_destroy_options(ceph_opts); - else - rbdc = rbd_client_create(ceph_opts); - mutex_unlock(&client_mutex); - - return rbdc; -} - -/* * Destroy ceph client * * Caller must hold rbd_client_list_lock. @@ -887,6 +867,26 @@ static void rbd_put_client(struct rbd_client *rbdc) kref_put(&rbdc->kref, rbd_client_release); } +/* + * Get a ceph client with specific addr and configuration, if one does + * not exist create it. Either way, ceph_opts is consumed by this + * function. 
+ */ +static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) +{ + struct rbd_client *rbdc; + + mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); + rbdc = rbd_client_find(ceph_opts); + if (rbdc) /* using an existing client */ + ceph_destroy_options(ceph_opts); + else + rbdc = rbd_client_create(ceph_opts); + mutex_unlock(&client_mutex); + + return rbdc; +} + static bool rbd_image_format_valid(u32 image_format) { return image_format == 1 || image_format == 2; -- cgit v1.1 From dd4358550fc5244d4757eae40e23d87894fe5273 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 22 Feb 2018 13:43:24 +0100 Subject: rbd: get the latest osdmap when using an existing client Currently we request the latest osdmap only if ceph_pg_poolid_by_name() fails with -ENOENT. This is effective with newly created pools, but we also want to avoid attempting to map from pools that were recently deleted and report "pool does not exist" instead. (Such an attempt eventually fails in the OSD client after map check code kicks in, but the error message is confusing.) Request the latest osdmap unconditionally after bumping a ref on an existing client in rbd_client_find(). 
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 69 +++++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 36 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a306192..a382fce 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -867,6 +867,23 @@ static void rbd_put_client(struct rbd_client *rbdc) kref_put(&rbdc->kref, rbd_client_release); } +static int wait_for_latest_osdmap(struct ceph_client *client) +{ + u64 newest_epoch; + int ret; + + ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch); + if (ret) + return ret; + + if (client->osdc.osdmap->epoch >= newest_epoch) + return 0; + + ceph_osdc_maybe_request_map(&client->osdc); + return ceph_monc_wait_osdmap(&client->monc, newest_epoch, + client->options->mount_timeout); +} + /* * Get a ceph client with specific addr and configuration, if one does * not exist create it. Either way, ceph_opts is consumed by this @@ -875,13 +892,26 @@ static void rbd_put_client(struct rbd_client *rbdc) static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) { struct rbd_client *rbdc; + int ret; mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); rbdc = rbd_client_find(ceph_opts); - if (rbdc) /* using an existing client */ + if (rbdc) { ceph_destroy_options(ceph_opts); - else + + /* + * Using an existing client. Make sure ->pg_pools is up to + * date before we look up the pool id in do_rbd_add(). + */ + ret = wait_for_latest_osdmap(rbdc->client); + if (ret) { + rbd_warn(NULL, "failed to get latest osdmap: %d", ret); + rbd_put_client(rbdc); + rbdc = ERR_PTR(ret); + } + } else { rbdc = rbd_client_create(ceph_opts); + } mutex_unlock(&client_mutex); return rbdc; @@ -5185,39 +5215,6 @@ out_err: return ret; } -/* - * Return pool id (>= 0) or a negative error code. 
- */ -static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) -{ - struct ceph_options *opts = rbdc->client->options; - u64 newest_epoch; - int tries = 0; - int ret; - -again: - ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); - if (ret == -ENOENT && tries++ < 1) { - ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap", - &newest_epoch); - if (ret < 0) - return ret; - - if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { - ceph_osdc_maybe_request_map(&rbdc->client->osdc); - (void) ceph_monc_wait_osdmap(&rbdc->client->monc, - newest_epoch, - opts->mount_timeout); - goto again; - } else { - /* the osdmap we have is new enough */ - return -ENOENT; - } - } - - return ret; -} - static void rbd_dev_image_unlock(struct rbd_device *rbd_dev) { down_write(&rbd_dev->lock_rwsem); @@ -5646,7 +5643,7 @@ static ssize_t do_rbd_add(struct bus_type *bus, } /* pick the pool */ - rc = rbd_add_get_pool_id(rbdc, spec->pool_name); + rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name); if (rc < 0) { if (rc == -ENOENT) pr_info("pool %s does not exist\n", spec->pool_name); -- cgit v1.1 From f6870cc9a36623d1dcb0aceade9e8a4785a4283a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 19 Mar 2018 13:33:10 +0000 Subject: rbd: fix spelling mistake: "reregisteration" -> "reregistration" Trivial fix to spelling mistake in rbd_warn message text. 
Signed-off-by: Colin Ian King Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a382fce..aab513f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3479,7 +3479,7 @@ static void rbd_reregister_watch(struct work_struct *work) ret = rbd_dev_refresh(rbd_dev); if (ret) - rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); + rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret); } /* -- cgit v1.1 From 08a79102aa373e03ce704621fd84567605214465 Mon Sep 17 00:00:00 2001 From: Kyle Spiers Date: Sat, 17 Mar 2018 09:44:01 -0700 Subject: rbd: remove VLA usage As part of the effort to remove VLAs from the kernel[1], this moves the literal values into the stack array calculation instead of using a variable for the sizing. The resulting size can be found from sizeof(buf). [1] https://lkml.org/lkml/2018/3/7/621 Signed-off-by: Kyle Spiers Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index aab513f..e60a638 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2683,8 +2683,8 @@ static int __rbd_notify_op_lock(struct rbd_device *rbd_dev, { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_client_id cid = rbd_get_cid(rbd_dev); - int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN; - char buf[buf_size]; + char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN]; + int buf_size = sizeof(buf); void *p = buf; dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op); @@ -3202,8 +3202,8 @@ static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id, u64 cookie, s32 *result) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - int buf_size = 4 + 
CEPH_ENCODING_START_BLK_LEN; - char buf[buf_size]; + char buf[4 + CEPH_ENCODING_START_BLK_LEN]; + int buf_size = sizeof(buf); int ret; if (result) { -- cgit v1.1