diff options
author | mm <mm@FreeBSD.org> | 2010-05-13 20:32:56 +0000 |
---|---|---|
committer | mm <mm@FreeBSD.org> | 2010-05-13 20:32:56 +0000 |
commit | 6f4ba1587b481253ce8265f50114ebe438759779 (patch) | |
tree | 1cb19e0cf3e413fa2f84c4f8c268f6161be50805 | |
parent | cdb02238eea11b2e4495d0b09780e3669bce9e4c (diff) | |
download | FreeBSD-src-6f4ba1587b481253ce8265f50114ebe438759779.zip FreeBSD-src-6f4ba1587b481253ce8265f50114ebe438759779.tar.gz |
Import OpenSolaris revision 7837:001de5627df3
It includes the following changes:
- parallel reads in traversal code (Bug ID 6333409)
- faster traversal for zfs send (Bug ID 6418042)
- traversal code cleanup (Bug ID 6725675)
- fix for two scrub-related bugs (Bug IDs 6729696, 6730101)
- fix assertion in dbuf_verify (Bug ID 6752226)
- fix panic during zfs send with i/o errors (Bug ID 6577985)
- replace P2CROSS with P2BOUNDARY (Bug ID 6725680)
List of OpenSolaris Bug IDs:
6333409, 6418042, 6757112, 6725668, 6725675, 6725680,
6725698, 6729696, 6730101, 6752226, 6577985, 6755042
Approved by: pjd, delphij (mentor)
Obtained from: OpenSolaris (multiple Bug IDs)
MFC after: 1 week
23 files changed, 712 insertions, 1580 deletions
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c index 9e1e106..f0b4ba4 100644 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c @@ -50,6 +50,7 @@ #include <sys/zio_checksum.h> #include <sys/zio_compress.h> #include <sys/zfs_fuid.h> +#include <sys/arc.h> #undef ZFS_MAXNAMELEN #undef verify #include <libzfs.h> @@ -62,8 +63,6 @@ typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); extern void dump_intent_log(zilog_t *); uint64_t *zopt_object = NULL; int zopt_objects = 0; -int zdb_advance = ADVANCE_PRE; -zbookmark_t zdb_noread = { 0, 0, ZB_NO_LEVEL, 0 }; libzfs_handle_t *g_zfs; boolean_t zdb_sig_user_data = B_TRUE; int zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256; @@ -88,8 +87,8 @@ static void usage(void) { (void) fprintf(stderr, - "Usage: %s [-udibcsvL] [-U cachefile_path] [-O order] " - "[-B os:obj:level:blkid] [-S user:cksumalg] " + "Usage: %s [-udibcsv] [-U cachefile_path] " + "[-S user:cksumalg] " "dataset [object...]\n" " %s -C [pool]\n" " %s -l dev\n" @@ -109,13 +108,8 @@ usage(void) "dump blkptr signatures\n"); (void) fprintf(stderr, " -v verbose (applies to all others)\n"); (void) fprintf(stderr, " -l dump label contents\n"); - (void) fprintf(stderr, " -L live pool (allows some errors)\n"); - (void) fprintf(stderr, " -O [!]<pre|post|prune|data|holes> " - "visitation order\n"); (void) fprintf(stderr, " -U cachefile_path -- use alternate " "cachefile\n"); - (void) fprintf(stderr, " -B objset:object:level:blkid -- " - "simulate bad block\n"); (void) fprintf(stderr, " -R read and display block from a " "device\n"); (void) fprintf(stderr, " -e Pool is exported/destroyed/" @@ -138,7 +132,7 @@ fatal(const char *fmt, ...) 
va_end(ap); (void) fprintf(stderr, "\n"); - exit(1); + abort(); } static void @@ -571,7 +565,7 @@ dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) } static uint64_t -blkid2offset(dnode_phys_t *dnp, int level, uint64_t blkid) +blkid2offset(const dnode_phys_t *dnp, int level, uint64_t blkid) { if (level < 0) return (blkid); @@ -602,115 +596,104 @@ sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas) (u_longlong_t)bp->blk_birth); } -/* ARGSUSED */ -static int -zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) +static void +print_indirect(blkptr_t *bp, const zbookmark_t *zb, + const dnode_phys_t *dnp) { - zbookmark_t *zb = &bc->bc_bookmark; - blkptr_t *bp = &bc->bc_blkptr; - void *data = bc->bc_data; - dnode_phys_t *dnp = bc->bc_dnode; - char blkbuf[BP_SPRINTF_LEN + 80]; + char blkbuf[BP_SPRINTF_LEN]; int l; - if (bc->bc_errno) { - (void) sprintf(blkbuf, - "Error %d reading <%llu, %llu, %lld, %llu>: ", - bc->bc_errno, - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (u_longlong_t)zb->zb_level, - (u_longlong_t)zb->zb_blkid); - goto out; - } - - if (zb->zb_level == -1) { - ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); - ASSERT3U(BP_GET_LEVEL(bp), ==, 0); - } else { - ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); - ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); - } - - if (zb->zb_level > 0) { - uint64_t fill = 0; - blkptr_t *bpx, *bpend; - - for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx); - bpx < bpend; bpx++) { - if (bpx->blk_birth != 0) { - fill += bpx->blk_fill; - } else { - ASSERT(bpx->blk_fill == 0); - } - } - ASSERT3U(fill, ==, bp->blk_fill); - } - - if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) { - uint64_t fill = 0; - dnode_phys_t *dnx, *dnend; + ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); + ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); - for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT); - dnx < dnend; dnx++) { - if (dnx->dn_type != DMU_OT_NONE) - fill++; - } - 
ASSERT3U(fill, ==, bp->blk_fill); - } - - (void) sprintf(blkbuf, "%16llx ", + (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid)); ASSERT(zb->zb_level >= 0); for (l = dnp->dn_nlevels - 1; l >= -1; l--) { if (l == zb->zb_level) { - (void) sprintf(blkbuf + strlen(blkbuf), "L%llx", - (u_longlong_t)zb->zb_level); + (void) printf("L%llx", (u_longlong_t)zb->zb_level); } else { - (void) sprintf(blkbuf + strlen(blkbuf), " "); + (void) printf(" "); } } -out: - if (bp->blk_birth == 0) { - (void) sprintf(blkbuf + strlen(blkbuf), "<hole>"); - (void) printf("%s\n", blkbuf); - } else { - sprintf_blkptr_compact(blkbuf + strlen(blkbuf), bp, - dump_opt['d'] > 5 ? 1 : 0); - (void) printf("%s\n", blkbuf); + sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0); + (void) printf("%s\n", blkbuf); +} + +#define SET_BOOKMARK(zb, objset, object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +static int +visit_indirect(spa_t *spa, const dnode_phys_t *dnp, + blkptr_t *bp, const zbookmark_t *zb) +{ + int err; + + if (bp->blk_birth == 0) + return (0); + + print_indirect(bp, zb, dnp); + + if (BP_GET_LEVEL(bp) > 0) { + uint32_t flags = ARC_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + arc_buf_t *buf; + uint64_t fill = 0; + + err = arc_read_nolock(NULL, spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) + return (err); + + /* recursively visit blocks below this */ + cbp = buf->b_data; + for (i = 0; i < epb; i++, cbp++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + err = visit_indirect(spa, dnp, cbp, &czb); + if (err) + break; + fill += cbp->blk_fill; + } + ASSERT3U(fill, ==, bp->blk_fill); + (void) arc_buf_remove_ref(buf, &buf); } - return (bc->bc_errno ? 
ERESTART : 0); + return (err); } /*ARGSUSED*/ static void -dump_indirect(objset_t *os, uint64_t object, void *data, size_t size) +dump_indirect(dnode_t *dn) { - traverse_handle_t *th; - uint64_t objset = dmu_objset_id(os); - int advance = zdb_advance; + dnode_phys_t *dnp = dn->dn_phys; + int j; + zbookmark_t czb; (void) printf("Indirect blocks:\n"); - if (object == 0) - advance |= ADVANCE_DATA; - - th = traverse_init(dmu_objset_spa(os), zdb_indirect_cb, NULL, advance, - ZIO_FLAG_CANFAIL); - th->th_noread = zdb_noread; - - traverse_add_dnode(th, 0, -1ULL, objset, object); - - while (traverse_more(th) == EAGAIN) - continue; + SET_BOOKMARK(&czb, dmu_objset_id(&dn->dn_objset->os), + dn->dn_object, dnp->dn_nlevels - 1, 0); + for (j = 0; j < dnp->dn_nblkptr; j++) { + czb.zb_blkid = j; + (void) visit_indirect(dmu_objset_spa(&dn->dn_objset->os), dnp, + &dnp->dn_blkptr[j], &czb); + } (void) printf("\n"); - - traverse_fini(th); } /*ARGSUSED*/ @@ -1093,7 +1076,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) } if (verbosity >= 5) - dump_indirect(os, object, NULL, 0); + dump_indirect(dn); if (verbosity >= 5) { /* @@ -1458,18 +1441,17 @@ typedef struct zdb_blkstats { #define DMU_OT_DEFERRED DMU_OT_NONE #define DMU_OT_TOTAL DMU_OT_NUMTYPES -#define ZB_TOTAL ZB_MAXLEVEL +#define ZB_TOTAL DN_MAX_LEVELS typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][DMU_OT_TOTAL + 1]; uint64_t zcb_errors[256]; - traverse_blk_cache_t *zcb_cache; int zcb_readfails; int zcb_haderrors; } zdb_cb_t; static void -zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type) +zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type) { for (int i = 0; i < 4; i++) { int l = (i < 2) ? 
BP_GET_LEVEL(bp) : ZB_TOTAL; @@ -1485,7 +1467,7 @@ zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type) if (dump_opt['S']) { boolean_t print_sig; - print_sig = !zdb_sig_user_data || (BP_GET_LEVEL(bp) == 0 && + print_sig = !zdb_sig_user_data || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) == DMU_OT_PLAIN_FILE_CONTENTS); if (BP_GET_CHECKSUM(bp) < zdb_sig_cksumalg) @@ -1507,56 +1489,55 @@ zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type) } } - if (!dump_opt['L']) - VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp, - NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0); + VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp, + NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0); } static int -zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) +zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, + const dnode_phys_t *dnp, void *arg) { - zbookmark_t *zb = &bc->bc_bookmark; zdb_cb_t *zcb = arg; - blkptr_t *bp = &bc->bc_blkptr; - dmu_object_type_t type = BP_GET_TYPE(bp); char blkbuf[BP_SPRINTF_LEN]; - int error = 0; - ASSERT(!BP_IS_HOLE(bp)); + if (bp == NULL) + return (0); - zdb_count_block(spa, zcb, bp, type); + zdb_count_block(spa, zcb, bp, BP_GET_TYPE(bp)); - if (bc->bc_errno) { - if (zcb->zcb_readfails++ < 10 && dump_opt['L']) { - uberblock_t ub; - vdev_uberblock_load(NULL, spa->spa_root_vdev, &ub); - if (ub.ub_txg != 0) - spa->spa_ubsync = ub; - error = EAGAIN; - } else { + if (dump_opt['c'] || dump_opt['S']) { + int ioerr, size; + void *data; + + size = BP_GET_LSIZE(bp); + data = malloc(size); + ioerr = zio_wait(zio_read(NULL, spa, bp, data, size, + NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB, zb)); + free(data); + + /* We expect io errors on intent log */ + if (ioerr && BP_GET_TYPE(bp) != DMU_OT_INTENT_LOG) { zcb->zcb_haderrors = 1; - zcb->zcb_errors[bc->bc_errno]++; - error = ERESTART; - } + zcb->zcb_errors[ioerr]++; - if (dump_opt['b'] >= 3 || (dump_opt['b'] >= 2 && bc->bc_errno)) - 
sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp); - else - blkbuf[0] = '\0'; - - if (!dump_opt['S']) { - (void) printf("zdb_blkptr_cb: Got error %d reading " - "<%llu, %llu, %lld, %llx> %s -- %s\n", - bc->bc_errno, - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (u_longlong_t)zb->zb_level, - (u_longlong_t)zb->zb_blkid, - blkbuf, - error == EAGAIN ? "retrying" : "skipping"); + if (dump_opt['b'] >= 2) + sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp); + else + blkbuf[0] = '\0'; + + if (!dump_opt['S']) { + (void) printf("zdb_blkptr_cb: " + "Got error %d reading " + "<%llu, %llu, %lld, %llx> %s -- skipping\n", + ioerr, + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, + (u_longlong_t)zb->zb_blkid, + blkbuf); + } } - - return (error); } zcb->zcb_readfails = 0; @@ -1566,8 +1547,8 @@ zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) (void) printf("objset %llu object %llu offset 0x%llx %s\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, - (u_longlong_t)blkid2offset(bc->bc_dnode, - zb->zb_level, zb->zb_blkid), blkbuf); + (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid), + blkbuf); } return (0); @@ -1576,22 +1557,12 @@ zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) static int dump_block_stats(spa_t *spa) { - traverse_handle_t *th; zdb_cb_t zcb = { 0 }; - traverse_blk_cache_t dummy_cache = { 0 }; zdb_blkstats_t *zb, *tzb; uint64_t alloc, space, logalloc; vdev_t *rvd = spa->spa_root_vdev; int leaks = 0; - int advance = zdb_advance; - int c, e, flags; - - zcb.zcb_cache = &dummy_cache; - - if (dump_opt['c'] || dump_opt['S']) - advance |= ADVANCE_DATA; - - advance |= ADVANCE_PRUNE | ADVANCE_ZIL; + int c, e; if (!dump_opt['S']) { (void) printf("\nTraversing all blocks to %sverify" @@ -1607,8 +1578,7 @@ dump_block_stats(spa_t *spa) * it's not part of any space map) is a double allocation, * reference to a freed block, or an unclaimed log block. 
*/ - if (!dump_opt['L']) - zdb_leak_init(spa); + zdb_leak_init(spa); /* * If there's a deferred-free bplist, process that first. @@ -1634,22 +1604,7 @@ dump_block_stats(spa_t *spa) bplist_close(bpl); } - /* - * Now traverse the pool. If we're reading all data to verify - * checksums, do a scrubbing read so that we validate all copies. - */ - flags = ZIO_FLAG_CANFAIL; - if (advance & ADVANCE_DATA) - flags |= ZIO_FLAG_SCRUB; - th = traverse_init(spa, zdb_blkptr_cb, &zcb, advance, flags); - th->th_noread = zdb_noread; - - traverse_add_pool(th, 0, spa_first_txg(spa) + TXG_CONCURRENT_STATES); - - while (traverse_more(th) == EAGAIN) - continue; - - traverse_fini(th); + zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb); if (zcb.zcb_haderrors && !dump_opt['S']) { (void) printf("\nError counts:\n\n"); @@ -1665,8 +1620,7 @@ dump_block_stats(spa_t *spa) /* * Report any leaked segments. */ - if (!dump_opt['L']) - zdb_leak_fini(spa); + zdb_leak_fini(spa); /* * If we're interested in printing out the blkptr signatures, @@ -1676,10 +1630,6 @@ dump_block_stats(spa_t *spa) if (dump_opt['S']) return (zcb.zcb_haderrors ? 
3 : 0); - if (dump_opt['L']) - (void) printf("\n\n *** Live pool traversal; " - "block counts are only approximate ***\n\n"); - alloc = spa_get_alloc(spa); space = spa_get_space(spa); @@ -2285,7 +2235,6 @@ main(int argc, char **argv) int dump_all = 1; int verbose = 0; int error; - int flag, set; int exported = 0; char *vdev_dir = NULL; @@ -2294,7 +2243,7 @@ main(int argc, char **argv) dprintf_setup(&argc, argv); - while ((c = getopt(argc, argv, "udibcsvCLO:B:S:U:lRep:")) != -1) { + while ((c = getopt(argc, argv, "udibcsvCS:U:lRep:")) != -1) { switch (c) { case 'u': case 'd': @@ -2308,49 +2257,6 @@ main(int argc, char **argv) dump_opt[c]++; dump_all = 0; break; - case 'L': - dump_opt[c]++; - break; - case 'O': - endstr = optarg; - if (endstr[0] == '!') { - endstr++; - set = 0; - } else { - set = 1; - } - if (strcmp(endstr, "post") == 0) { - flag = ADVANCE_PRE; - set = !set; - } else if (strcmp(endstr, "pre") == 0) { - flag = ADVANCE_PRE; - } else if (strcmp(endstr, "prune") == 0) { - flag = ADVANCE_PRUNE; - } else if (strcmp(endstr, "data") == 0) { - flag = ADVANCE_DATA; - } else if (strcmp(endstr, "holes") == 0) { - flag = ADVANCE_HOLES; - } else { - usage(); - } - if (set) - zdb_advance |= flag; - else - zdb_advance &= ~flag; - break; - case 'B': - endstr = optarg - 1; - zdb_noread.zb_objset = strtoull(endstr + 1, &endstr, 0); - zdb_noread.zb_object = strtoull(endstr + 1, &endstr, 0); - zdb_noread.zb_level = strtol(endstr + 1, &endstr, 0); - zdb_noread.zb_blkid = strtoull(endstr + 1, &endstr, 16); - (void) printf("simulating bad block " - "<%llu, %llu, %lld, %llx>\n", - (u_longlong_t)zdb_noread.zb_objset, - (u_longlong_t)zdb_noread.zb_object, - (u_longlong_t)zdb_noread.zb_level, - (u_longlong_t)zdb_noread.zb_blkid); - break; case 'v': verbose++; break; @@ -2387,21 +2293,17 @@ main(int argc, char **argv) } } - if (vdev_dir != NULL && exported == 0) - (void) fatal("-p option requires use of -e\n"); + if (vdev_dir != NULL && exported == 0) { + (void) fprintf(stderr, 
"-p option requires use of -e\n"); + usage(); + } kernel_init(FREAD); g_zfs = libzfs_init(); ASSERT(g_zfs != NULL); - /* - * Disable vdev caching. If we don't do this, live pool traversal - * won't make progress because it will never see disk updates. - */ - zfs_vdev_cache_size = 0; - for (c = 0; c < 256; c++) { - if (dump_all && c != 'L' && c != 'l' && c != 'R') + if (dump_all && c != 'l' && c != 'R') dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c index f3e7d64..ff55c29 100644 --- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c +++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c @@ -77,7 +77,6 @@ #include <sys/dmu.h> #include <sys/txg.h> #include <sys/zap.h> -#include <sys/dmu_traverse.h> #include <sys/dmu_objset.h> #include <sys/poll.h> #include <sys/stat.h> @@ -151,7 +150,6 @@ typedef struct ztest_args { hrtime_t za_start; hrtime_t za_stop; hrtime_t za_kill; - traverse_handle_t *za_th; /* * Thread-local variables can go here to aid debugging. */ @@ -206,7 +204,6 @@ ztest_info_t ztest_info[] = { { ztest_dmu_object_alloc_free, 1, &zopt_always }, { ztest_zap, 30, &zopt_always }, { ztest_zap_parallel, 100, &zopt_always }, - { ztest_traverse, 1, &zopt_often }, { ztest_dsl_prop_get_set, 1, &zopt_sometimes }, { ztest_dmu_objset_create_destroy, 1, &zopt_sometimes }, { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, @@ -1447,152 +1444,6 @@ ztest_dmu_snapshot_create_destroy(ztest_args_t *za) (void) rw_unlock(&ztest_shared->zs_name_lock); } -#define ZTEST_TRAVERSE_BLOCKS 1000 - -static int -ztest_blk_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) -{ - ztest_args_t *za = arg; - zbookmark_t *zb = &bc->bc_bookmark; - blkptr_t *bp = &bc->bc_blkptr; - dnode_phys_t *dnp = bc->bc_dnode; - traverse_handle_t *th = za->za_th; - uint64_t size = BP_GET_LSIZE(bp); - - /* - * Level -1 indicates the objset_phys_t or something in its intent log. 
- */ - if (zb->zb_level == -1) { - if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - ASSERT3U(zb->zb_object, ==, 0); - ASSERT3U(zb->zb_blkid, ==, 0); - ASSERT3U(size, ==, sizeof (objset_phys_t)); - za->za_zil_seq = 0; - } else if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { - ASSERT3U(zb->zb_object, ==, 0); - ASSERT3U(zb->zb_blkid, >, za->za_zil_seq); - za->za_zil_seq = zb->zb_blkid; - } else { - ASSERT3U(zb->zb_object, !=, 0); /* lr_write_t */ - } - - return (0); - } - - ASSERT(dnp != NULL); - - if (bc->bc_errno) - return (ERESTART); - - /* - * Once in a while, abort the traverse. We only do this to odd - * instance numbers to ensure that even ones can run to completion. - */ - if ((za->za_instance & 1) && ztest_random(10000) == 0) - return (EINTR); - - if (bp->blk_birth == 0) { - ASSERT(th->th_advance & ADVANCE_HOLES); - return (0); - } - - if (zb->zb_level == 0 && !(th->th_advance & ADVANCE_DATA) && - bc == &th->th_cache[ZB_DN_CACHE][0]) { - ASSERT(bc->bc_data == NULL); - return (0); - } - - ASSERT(bc->bc_data != NULL); - - /* - * This is an expensive question, so don't ask it too often. - */ - if (((za->za_random ^ th->th_callbacks) & 0xff) == 0) { - void *xbuf = umem_alloc(size, UMEM_NOFAIL); - if (arc_tryread(spa, bp, xbuf) == 0) { - ASSERT(bcmp(bc->bc_data, xbuf, size) == 0); - } - umem_free(xbuf, size); - } - - if (zb->zb_level > 0) { - ASSERT3U(size, ==, 1ULL << dnp->dn_indblkshift); - return (0); - } - - ASSERT(zb->zb_level == 0); - ASSERT3U(size, ==, dnp->dn_datablkszsec << DEV_BSHIFT); - - return (0); -} - -/* - * Verify that live pool traversal works. 
- */ -void -ztest_traverse(ztest_args_t *za) -{ - spa_t *spa = za->za_spa; - traverse_handle_t *th = za->za_th; - int rc, advance; - uint64_t cbstart, cblimit; - - if (th == NULL) { - advance = 0; - - if (ztest_random(2) == 0) - advance |= ADVANCE_PRE; - - if (ztest_random(2) == 0) - advance |= ADVANCE_PRUNE; - - if (ztest_random(2) == 0) - advance |= ADVANCE_DATA; - - if (ztest_random(2) == 0) - advance |= ADVANCE_HOLES; - - if (ztest_random(2) == 0) - advance |= ADVANCE_ZIL; - - th = za->za_th = traverse_init(spa, ztest_blk_cb, za, advance, - ZIO_FLAG_CANFAIL); - - traverse_add_pool(th, 0, -1ULL); - } - - advance = th->th_advance; - cbstart = th->th_callbacks; - cblimit = cbstart + ((advance & ADVANCE_DATA) ? 100 : 1000); - - while ((rc = traverse_more(th)) == EAGAIN && th->th_callbacks < cblimit) - continue; - - if (zopt_verbose >= 5) - (void) printf("traverse %s%s%s%s %llu blocks to " - "<%llu, %llu, %lld, %llx>%s\n", - (advance & ADVANCE_PRE) ? "pre" : "post", - (advance & ADVANCE_PRUNE) ? "|prune" : "", - (advance & ADVANCE_DATA) ? "|data" : "", - (advance & ADVANCE_HOLES) ? "|holes" : "", - (u_longlong_t)(th->th_callbacks - cbstart), - (u_longlong_t)th->th_lastcb.zb_objset, - (u_longlong_t)th->th_lastcb.zb_object, - (u_longlong_t)th->th_lastcb.zb_level, - (u_longlong_t)th->th_lastcb.zb_blkid, - rc == 0 ? " [done]" : - rc == EINTR ? " [aborted]" : - rc == EAGAIN ? "" : - strerror(rc)); - - if (rc != EAGAIN) { - if (rc != 0 && rc != EINTR) - fatal(0, "traverse_more(%p) = %d", th, rc); - traverse_fini(th); - za->za_th = NULL; - } -} - /* * Verify dsl_dataset_promote handles EBUSY */ @@ -3067,12 +2918,12 @@ ztest_verify_blocks(char *pool) isa = strdup(isa); /* LINTED */ (void) sprintf(bin, - "/usr/sbin%.*s/zdb -bc%s%s -U /tmp/zpool.cache -O %s %s", + "/usr/sbin%.*s/zdb -bc%s%s -U /tmp/zpool.cache %s", isalen, isa, zopt_verbose >= 3 ? "s" : "", zopt_verbose >= 4 ? "v" : "", - ztest_random(2) == 0 ? 
"pre" : "post", pool); + pool); free(isa); if (zopt_verbose >= 5) @@ -3438,8 +3289,6 @@ ztest_run(char *pool) while (--t >= 0) { VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0); - if (za[t].za_th) - traverse_fini(za[t].za_th); if (t < zopt_datasets) { zil_close(za[t].za_zilog); dmu_objset_close(za[t].za_os); diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c b/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c index a13cd76..6365c6c 100644 --- a/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c +++ b/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <assert.h> #include <fcntl.h> #include <poll.h> @@ -842,6 +840,8 @@ kernel_init(int mode) VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1); VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1); + system_taskq_init(); + spa_init(mode); } diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h index 4ef6472..ee202c8 100644 --- a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h +++ b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h @@ -334,11 +334,14 @@ typedef void (task_func_t)(void *); #define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */ #define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ +extern taskq_t *system_taskq; + extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern void taskq_destroy(taskq_t *); extern void taskq_wait(taskq_t *); extern int taskq_member(taskq_t *, void *); +extern void system_taskq_init(void); #define XVA_MAPSIZE 3 #define XVA_MAGIC 0x78766174 diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c b/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c index ccf5b4d..93acdcf 100644 --- 
a/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c +++ b/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c @@ -19,15 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> int taskq_now; +taskq_t *system_taskq; typedef struct task { struct task *task_next; @@ -253,3 +252,10 @@ taskq_member(taskq_t *tq, void *t) return (0); } + +void +system_taskq_init(void) +{ + system_taskq = taskq_create("system_taskq", 64, minclsyspri, 4, 512, + TASKQ_DYNAMIC | TASKQ_PREPOPULATE); +} diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h index ef13487..04c74a3 100644 --- a/sys/cddl/boot/zfs/zfsimpl.h +++ b/sys/cddl/boot/zfs/zfsimpl.h @@ -66,7 +66,7 @@ #define P2ROUNDUP(x, align) (-(-(x) & -(align))) #define P2END(x, align) (-(~(x) & -(align))) #define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align))) -#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) +#define P2BOUNDARY(off, len, align) (((off) ^ ((off) + (len) - 1)) > (align) - 1) /* * General-purpose 32-bit and 64-bit bitfield encodings. diff --git a/sys/cddl/compat/opensolaris/sys/sysmacros.h b/sys/cddl/compat/opensolaris/sys/sysmacros.h index 3c1e9b1..0afc9ca 100644 --- a/sys/cddl/compat/opensolaris/sys/sysmacros.h +++ b/sys/cddl/compat/opensolaris/sys/sysmacros.h @@ -43,6 +43,10 @@ extern "C" { #define ABS(a) ((a) < 0 ? -(a) : (a)) #endif +#ifndef SIGNOF +#define SIGNOF(a) ((a) < 0 ? -1 : (a) > 0) +#endif + /* * Macro for checking power of 2 address alignment. 
*/ @@ -63,7 +67,7 @@ extern "C" { #define P2ROUNDUP(x, align) (-(-(x) & -(align))) #define P2END(x, align) (-(~(x) & -(align))) #define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align))) -#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) +#define P2BOUNDARY(off, len, align) (((off) ^ ((off) + (len) - 1)) > (align) - 1) /* * Determine whether two numbers have the same high-order bit. */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 053c1e1..69ad489 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -308,20 +308,18 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } - if (db->db_level == 0) { - /* we can be momentarily larger in dnode_set_blksz() */ - if (db->db_blkid != DB_BONUS_BLKID && dn) { - ASSERT3U(db->db.db_size, >=, dn->dn_datablksz); - } - if (db->db.db_object == DMU_META_DNODE_OBJECT) { - dbuf_dirty_record_t *dr = db->db_data_pending; - /* - * it should only be modified in syncing - * context, so make sure we only have - * one copy of the data. - */ - ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); - } + /* + * We can't assert that db_size matches dn_datablksz because it + * can be momentarily different when another thread is doing + * dnode_set_blksz(). + */ + if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { + dbuf_dirty_record_t *dr = db->db_data_pending; + /* + * It should only be modified in syncing context, so + * make sure we only have one copy of the data. 
+ */ + ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); } /* verify db->db_blkptr */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index 5c97cd7..6effae8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dmu_impl.h> #include <sys/dmu_tx.h> @@ -172,66 +170,59 @@ dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) static int -backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) +backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, + const dnode_phys_t *dnp, void *arg) { struct backuparg *ba = arg; - uint64_t object = bc->bc_bookmark.zb_object; - int level = bc->bc_bookmark.zb_level; - uint64_t blkid = bc->bc_bookmark.zb_blkid; - blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL; dmu_object_type_t type = bp ? 
BP_GET_TYPE(bp) : DMU_OT_NONE; - void *data = bc->bc_data; int err = 0; if (issig(JUSTLOOKING) && issig(FORREAL)) return (EINTR); - ASSERT(data || bp == NULL); - - if (bp == NULL && object == 0) { - uint64_t span = BP_SPAN(bc->bc_dnode, level); - uint64_t dnobj = (blkid * span) >> DNODE_SHIFT; + if (bp == NULL && zb->zb_object == 0) { + uint64_t span = BP_SPAN(dnp, zb->zb_level); + uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); } else if (bp == NULL) { - uint64_t span = BP_SPAN(bc->bc_dnode, level); - err = dump_free(ba, object, blkid * span, span); - } else if (data && level == 0 && type == DMU_OT_DNODE) { - dnode_phys_t *blk = data; + uint64_t span = BP_SPAN(dnp, zb->zb_level); + err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span); + } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { + return (0); + } else if (type == DMU_OT_DNODE) { + dnode_phys_t *blk; int i; int blksz = BP_GET_LSIZE(bp); + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf; + if (arc_read_nolock(NULL, spa, bp, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + return (EIO); + + blk = abuf->b_data; for (i = 0; i < blksz >> DNODE_SHIFT; i++) { - uint64_t dnobj = - (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; + uint64_t dnobj = (zb->zb_blkid << + (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; err = dump_dnode(ba, dnobj, blk+i); if (err) break; } - } else if (level == 0 && - type != DMU_OT_DNODE && type != DMU_OT_OBJSET) { + (void) arc_buf_remove_ref(abuf, &abuf); + } else { /* it's a level-0 block of a regular object */ + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); - if (data == NULL) { - uint32_t aflags = ARC_WAIT; - arc_buf_t *abuf; - zbookmark_t zb; - - zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object; - zb.zb_object = object; - zb.zb_level = level; - zb.zb_blkid = blkid; - (void) arc_read_nolock(NULL, spa, bp, - arc_getbuf_func, &abuf, 
ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); - - if (abuf) { - err = dump_data(ba, type, object, blkid * blksz, - blksz, abuf->b_data); - (void) arc_buf_remove_ref(abuf, &abuf); - } - } else { - err = dump_data(ba, type, object, blkid * blksz, - blksz, data); - } + + if (arc_read_nolock(NULL, spa, bp, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + return (EIO); + + err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz, + blksz, abuf->b_data); + (void) arc_buf_remove_ref(abuf, &abuf); } ASSERT(err == 0 || err == EINTR); @@ -311,8 +302,7 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, return (ba.err); } - err = traverse_dsl_dataset(ds, fromtxg, - ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK, + err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, backup_cb, &ba); if (err) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index 43bf82e..5e177c5d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/dmu_objset.h> #include <sys/dmu_traverse.h> @@ -35,510 +33,88 @@ #include <sys/spa.h> #include <sys/zio.h> #include <sys/dmu_impl.h> -#include <sys/zvol.h> - -#define BP_SPAN_SHIFT(level, width) ((level) * (width)) - -#define BP_EQUAL(b1, b2) \ - (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \ - (b1)->blk_birth == (b2)->blk_birth) - -/* - * Compare two bookmarks. - * - * For ADVANCE_PRE, the visitation order is: - * - * objset 0, 1, 2, ..., ZB_MAXOBJSET. - * object 0, 1, 2, ..., ZB_MAXOBJECT. - * blkoff 0, 1, 2, ... - * level ZB_MAXLEVEL, ..., 2, 1, 0. 
- * - * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid - * ordering vector is: - * - * < objset, object, blkoff, -level > - * - * For ADVANCE_POST, the starting offsets aren't sequential but ending - * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are. - * The visitation order is: - * - * objset 1, 2, ..., ZB_MAXOBJSET, 0. - * object 1, 2, ..., ZB_MAXOBJECT, 0. - * blkoff 1, 2, ... - * level 0, 1, 2, ..., ZB_MAXLEVEL. - * - * and thus a valid ordering vector is: - * - * < objset - 1, object - 1, blkoff, level > - * - * Both orderings can be expressed as: - * - * < objset + bias, object + bias, blkoff, level ^ bias > - * - * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST) - * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift). - * - * Special case: an objset's osphys is represented as level -1 of object 0. - * It is always either the very first or very last block we visit in an objset. - * Therefore, if either bookmark's level is -1, level alone determines order. - */ -static int -compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp, - int advance) -{ - int bias = (advance & ADVANCE_PRE) ? 
0 : -1; - uint64_t sblkoff, eblkoff; - int slevel, elevel, wshift; - - if (szb->zb_objset + bias < ezb->zb_objset + bias) - return (-1); - - if (szb->zb_objset + bias > ezb->zb_objset + bias) - return (1); - - slevel = szb->zb_level; - elevel = ezb->zb_level; - - if ((slevel | elevel) < 0) - return ((slevel ^ bias) - (elevel ^ bias)); - - if (szb->zb_object + bias < ezb->zb_object + bias) - return (-1); - - if (szb->zb_object + bias > ezb->zb_object + bias) - return (1); - - if (dnp == NULL) - return (0); - - wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; - - sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift); - eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift); - - if (sblkoff < eblkoff) - return (-1); - - if (sblkoff > eblkoff) - return (1); - - return ((elevel ^ bias) - (slevel ^ bias)); -} - -#define SET_BOOKMARK(zb, objset, object, level, blkid) \ -{ \ - (zb)->zb_objset = objset; \ - (zb)->zb_object = object; \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - -#define SET_BOOKMARK_LB(zb, level, blkid) \ -{ \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - -static int -advance_objset(zseg_t *zseg, uint64_t objset, int advance) -{ - zbookmark_t *zb = &zseg->seg_start; - - if (advance & ADVANCE_PRE) { - if (objset >= ZB_MAXOBJSET) - return (ERANGE); - SET_BOOKMARK(zb, objset, 0, -1, 0); - } else { - if (objset >= ZB_MAXOBJSET) - objset = 0; - SET_BOOKMARK(zb, objset, 1, 0, 0); - } - - if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) - return (ERANGE); - - return (EAGAIN); -} - -static int -advance_object(zseg_t *zseg, uint64_t object, int advance) -{ - zbookmark_t *zb = &zseg->seg_start; - - if (advance & ADVANCE_PRE) { - if (object >= ZB_MAXOBJECT) { - SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0); - } else { - SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0); - } - } else { - if (zb->zb_object == 0) { - SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0); - } else { - if (object >= ZB_MAXOBJECT) 
- object = 0; - SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0); - } - } - - if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) - return (ERANGE); - - return (EAGAIN); -} - -static int -advance_from_osphys(zseg_t *zseg, int advance) -{ - zbookmark_t *zb = &zseg->seg_start; - - ASSERT(zb->zb_object == 0); - ASSERT(zb->zb_level == -1); - ASSERT(zb->zb_blkid == 0); - - if (advance & ADVANCE_PRE) { - SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0); - } else { - if (zb->zb_objset == 0) - return (ERANGE); - SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0); - } - - if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) - return (ERANGE); - - return (EAGAIN); -} - -static int -advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance) -{ - zbookmark_t *zb = &zseg->seg_start; - int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; - int maxlevel = dnp->dn_nlevels - 1; - int level = zb->zb_level; - uint64_t blkid = zb->zb_blkid; - - if (advance & ADVANCE_PRE) { - if (level > 0 && rc == 0) { - level--; - blkid <<= wshift; - } else { - blkid++; - - if ((blkid << BP_SPAN_SHIFT(level, wshift)) > - dnp->dn_maxblkid) - return (ERANGE); - - while (level < maxlevel) { - if (P2PHASE(blkid, 1ULL << wshift)) - break; - blkid >>= wshift; - level++; - } - } - } else { - if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) { - blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift); - level = 0; - } else { - blkid >>= wshift; - level++; - } - - while ((blkid << BP_SPAN_SHIFT(level, wshift)) > - dnp->dn_maxblkid) { - if (level == maxlevel) - return (ERANGE); - blkid >>= wshift; - level++; - } - } - SET_BOOKMARK_LB(zb, level, blkid); - - if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0) - return (ERANGE); - - return (EAGAIN); -} - -/* - * The traverse_callback function will call the function specified in th_func. 
- * In the event of an error the callee, specified by th_func, must return - * one of the following errors: - * - * EINTR - Indicates that the callee wants the traversal to - * abort immediately. - * ERESTART - The callee has acknowledged the error and would - * like to continue. - */ -static int -traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc) -{ - /* - * Before we issue the callback, prune against maxtxg. - * - * We prune against mintxg before we get here because it's a big win. - * If a given block was born in txg 37, then we know that the entire - * subtree below that block must have been born in txg 37 or earlier. - * We can therefore lop off huge branches of the tree as we go. - * - * There's no corresponding optimization for maxtxg because knowing - * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's - * children. In fact, the copy-on-write design of ZFS ensures that - * top-level blocks will pretty much always be new. - * - * Therefore, in the name of simplicity we don't prune against - * maxtxg until the last possible moment -- that being right now. - */ - if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg) - return (0); - - /* - * Debugging: verify that the order we visit things agrees with the - * order defined by compare_bookmark(). We don't check this for - * log blocks because there's no defined ordering for them; they're - * always visited (or not) as part of visiting the objset_phys_t. 
- */ - if (bc->bc_errno == 0 && bc != &th->th_zil_cache) { - zbookmark_t *zb = &bc->bc_bookmark; - zbookmark_t *szb = &zseg->seg_start; - zbookmark_t *ezb = &zseg->seg_end; - zbookmark_t *lzb = &th->th_lastcb; - dnode_phys_t *dnp = bc->bc_dnode; - - ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0); - ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0); - ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 || - lzb->zb_level == ZB_NO_LEVEL); - *lzb = *zb; - } - - th->th_callbacks++; - return (th->th_func(bc, th->th_spa, th->th_arg)); -} - -static int -traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp, - dnode_phys_t *dnp) -{ - zbookmark_t *zb = &bc->bc_bookmark; - int error; - - th->th_hits++; - - bc->bc_dnode = dnp; - bc->bc_errno = 0; - - if (BP_EQUAL(&bc->bc_blkptr, bp)) - return (0); - - bc->bc_blkptr = *bp; - - if (bc->bc_data == NULL) - return (0); - - if (BP_IS_HOLE(bp)) { - ASSERT(th->th_advance & ADVANCE_HOLES); - return (0); - } - - if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) { - error = EIO; - } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) { - error = 0; - th->th_arc_hits++; - } else { - error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data, - BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, - th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb)); - - if (BP_SHOULD_BYTESWAP(bp) && error == 0) - (zb->zb_level > 0 ? 
byteswap_uint64_array : - dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data, - BP_GET_LSIZE(bp)); - th->th_reads++; - } - - if (error) { - bc->bc_errno = error; - error = traverse_callback(th, NULL, bc); - ASSERT(error == EAGAIN || error == EINTR || error == ERESTART); - bc->bc_blkptr.blk_birth = -1ULL; - } - - dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n", - bc - &th->th_cache[0][0], error, - zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); - - return (error); -} - -static int -find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth) -{ - zbookmark_t *zb = &zseg->seg_start; - traverse_blk_cache_t *bc; - blkptr_t *bp = dnp->dn_blkptr; - int i, first, level; - int nbp = dnp->dn_nblkptr; - int minlevel = zb->zb_level; - int maxlevel = dnp->dn_nlevels - 1; - int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; - int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift); - uint64_t blkid = zb->zb_blkid >> bp_shift; - int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE; - int rc; - - if (minlevel > maxlevel || blkid >= nbp) - return (ERANGE); - - for (level = maxlevel; level >= minlevel; level--) { - first = P2PHASE(blkid, 1ULL << wshift); - - for (i = first; i < nbp; i++) - if (bp[i].blk_birth > zseg->seg_mintxg || - BP_IS_HOLE(&bp[i]) && do_holes) - break; - - if (i != first) { - i--; - SET_BOOKMARK_LB(zb, level, blkid + (i - first)); - return (ENOTBLK); - } - - bc = &th->th_cache[depth][level]; - - SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object, - level, blkid); - - if (rc = traverse_read(th, bc, bp + i, dnp)) { - if (rc != EAGAIN) { - SET_BOOKMARK_LB(zb, level, blkid); - } - return (rc); - } - - if (BP_IS_HOLE(&bp[i])) { - SET_BOOKMARK_LB(zb, level, blkid); - th->th_lastcb.zb_level = ZB_NO_LEVEL; - return (0); - } - - nbp = 1 << wshift; - bp = bc->bc_data; - bp_shift -= wshift; - blkid = zb->zb_blkid >> bp_shift; - } - - return (0); +#include <sys/callb.h> + +#define SET_BOOKMARK(zb, objset, 
object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ } -static int -get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn, - uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth) -{ - zseg_t zseg; - zbookmark_t *zb = &zseg.seg_start; - uint64_t object = *objectp; - int i, rc; - - SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK); - SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID); - - zseg.seg_mintxg = txg; - zseg.seg_maxtxg = -1ULL; - - for (;;) { - rc = find_block(th, &zseg, mdn, depth); - - if (rc == EAGAIN || rc == EINTR || rc == ERANGE) - break; - - if (rc == 0 && zb->zb_level == 0) { - dnode_phys_t *dnp = th->th_cache[depth][0].bc_data; - for (i = 0; i < DNODES_PER_BLOCK; i++) { - object = (zb->zb_blkid * DNODES_PER_BLOCK) + i; - if (object >= *objectp && - dnp[i].dn_type != DMU_OT_NONE && - (type == -1 || dnp[i].dn_type == type)) { - *objectp = object; - *dnpp = &dnp[i]; - return (0); - } - } - } - - rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE); - - if (rc == ERANGE) - break; - } - - if (rc == ERANGE) - *objectp = ZB_MAXOBJECT; - - return (rc); -} +struct prefetch_data { + kmutex_t pd_mtx; + kcondvar_t pd_cv; + int pd_blks_max; + int pd_blks_fetched; + int pd_flags; + boolean_t pd_cancel; + boolean_t pd_exited; +}; + +struct traverse_data { + spa_t *td_spa; + uint64_t td_objset; + blkptr_t *td_rootbp; + uint64_t td_min_txg; + int td_flags; + struct prefetch_data *td_pfd; + blkptr_cb_t *td_func; + void *td_arg; +}; /* ARGSUSED */ static void traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { - traverse_handle_t *th = arg; - traverse_blk_cache_t *bc = &th->th_zil_cache; - zbookmark_t *zb = &bc->bc_bookmark; - zseg_t *zseg = list_head(&th->th_seglist); + struct traverse_data *td = arg; + zbookmark_t zb; - if (bp->blk_birth <= zseg->seg_mintxg) + if (bp->blk_birth == 0) return; - if 
(claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) { - zb->zb_object = 0; - zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; - bc->bc_blkptr = *bp; - (void) traverse_callback(th, zseg, bc); - } + if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) + return; + + zb.zb_objset = td->td_objset; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; + VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg)); } /* ARGSUSED */ static void traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { - traverse_handle_t *th = arg; - traverse_blk_cache_t *bc = &th->th_zil_cache; - zbookmark_t *zb = &bc->bc_bookmark; - zseg_t *zseg = list_head(&th->th_seglist); + struct traverse_data *td = arg; if (lrc->lrc_txtype == TX_WRITE) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; + zbookmark_t zb; - if (bp->blk_birth <= zseg->seg_mintxg) + if (bp->blk_birth == 0) return; - if (claim_txg != 0 && bp->blk_birth >= claim_txg) { - zb->zb_object = lr->lr_foid; - zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); - bc->bc_blkptr = *bp; - (void) traverse_callback(th, zseg, bc); - } + if (claim_txg == 0 || bp->blk_birth < claim_txg) + return; + + zb.zb_objset = td->td_objset; + zb.zb_object = lr->lr_foid; + zb.zb_level = BP_GET_LEVEL(bp); + zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); + VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg)); } } static void -traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc) +traverse_zil(struct traverse_data *td, zil_header_t *zh) { - spa_t *spa = th->th_spa; - dsl_pool_t *dp = spa_get_dsl(spa); - objset_phys_t *osphys = bc->bc_data; - zil_header_t *zh = &osphys->os_zil_header; uint64_t claim_txg = zh->zh_claim_txg; zilog_t *zilog; - ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]); - ASSERT(bc->bc_bookmark.zb_level == -1); - /* * We only want to visit blocks that have been claimed but not yet * replayed (or, in 
read-only mode, blocks that *would* be claimed). @@ -546,375 +122,290 @@ traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc) if (claim_txg == 0 && (spa_mode & FWRITE)) return; - th->th_zil_cache.bc_bookmark = bc->bc_bookmark; - - zilog = zil_alloc(dp->dp_meta_objset, zh); + zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); - (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th, + (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, claim_txg); zil_free(zilog); } static int -traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) +traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, + arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) { - zbookmark_t *zb = &zseg->seg_start; - traverse_blk_cache_t *bc; - dnode_phys_t *dn, *dn_tmp; - int worklimit = 100; - int rc; - - dprintf("<%llu, %llu, %d, %llx>\n", - zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); - - bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1]; - dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; - - SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0); - - rc = traverse_read(th, bc, mosbp, dn); - - if (rc) /* If we get ERESTART, we've got nowhere left to go */ - return (rc == ERESTART ? 
EINTR : rc); - - ASSERT(dn->dn_nlevels < ZB_MAXLEVEL); - - if (zb->zb_objset != 0) { - uint64_t objset = zb->zb_objset; - dsl_dataset_phys_t *dsp; - - rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0, - DMU_OT_DSL_DATASET, ZB_MOS_CACHE); - - if (objset != zb->zb_objset) - rc = advance_objset(zseg, objset, th->th_advance); + int err = 0; + arc_buf_t *buf = NULL; + struct prefetch_data *pd = td->td_pfd; - if (rc != 0) - return (rc); - - dsp = DN_BONUS(dn_tmp); - - bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]; - dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; - - SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0); - - /* - * If we're traversing an open snapshot, we know that it - * can't be deleted (because it's open) and it can't change - * (because it's a snapshot). Therefore, once we've gotten - * from the uberblock down to the snapshot's objset_phys_t, - * we no longer need to synchronize with spa_sync(); we're - * traversing a completely static block tree from here on. - */ - if (th->th_advance & ADVANCE_NOLOCK) { - ASSERT(th->th_locked); - rw_exit(spa_traverse_rwlock(th->th_spa)); - th->th_locked = 0; - } - - if (BP_IS_HOLE(&dsp->ds_bp)) - rc = ERESTART; - else - rc = traverse_read(th, bc, &dsp->ds_bp, dn); - - if (rc != 0) { - if (rc == ERESTART) - rc = advance_objset(zseg, zb->zb_objset + 1, - th->th_advance); - return (rc); - } - - if (th->th_advance & ADVANCE_PRUNE) - zseg->seg_mintxg = - MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg); + if (bp->blk_birth == 0) { + err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg); + return (err); } - if (zb->zb_level == -1) { - ASSERT(zb->zb_object == 0); - ASSERT(zb->zb_blkid == 0); - ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET); - - if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) { - rc = traverse_callback(th, zseg, bc); - if (rc) { - ASSERT(rc == EINTR); - return (rc); - } - if ((th->th_advance & ADVANCE_ZIL) && - zb->zb_objset != 0) - traverse_zil(th, bc); - } + if (bp->blk_birth <= td->td_min_txg) + 
return (0); - return (advance_from_osphys(zseg, th->th_advance)); + if (pd && !pd->pd_exited && + ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) || + BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) { + mutex_enter(&pd->pd_mtx); + ASSERT(pd->pd_blks_fetched >= 0); + while (pd->pd_blks_fetched == 0 && !pd->pd_exited) + cv_wait(&pd->pd_cv, &pd->pd_mtx); + pd->pd_blks_fetched--; + cv_broadcast(&pd->pd_cv); + mutex_exit(&pd->pd_mtx); } - if (zb->zb_object != 0) { - uint64_t object = zb->zb_object; - - rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp, - zseg->seg_mintxg, -1, ZB_MDN_CACHE); - - if (object != zb->zb_object) - rc = advance_object(zseg, object, th->th_advance); - - if (rc != 0) - return (rc); - - dn = dn_tmp; + if (td->td_flags & TRAVERSE_PRE) { + err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg); + if (err) + return (err); } - if (zb->zb_level == ZB_MAXLEVEL) - zb->zb_level = dn->dn_nlevels - 1; - - for (;;) { - rc = find_block(th, zseg, dn, ZB_DN_CACHE); - - if (rc == EAGAIN || rc == EINTR || rc == ERANGE) - break; - - if (rc == 0) { - bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level]; - ASSERT(bc->bc_dnode == dn); - ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth); - rc = traverse_callback(th, zseg, bc); - if (rc) { - ASSERT(rc == EINTR); - return (rc); - } - if (BP_IS_HOLE(&bc->bc_blkptr)) { - ASSERT(th->th_advance & ADVANCE_HOLES); - rc = ENOTBLK; + if (BP_GET_LEVEL(bp) > 0) { + uint32_t flags = ARC_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + + err = arc_read(NULL, td->td_spa, bp, pbuf, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) + return (err); + + /* recursively visitbp() blocks below this */ + cbp = buf->b_data; + for (i = 0; i < epb; i++, cbp++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + err = traverse_visitbp(td, dnp, buf, cbp, &czb); + if (err) + break; + } + } else 
if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + uint32_t flags = ARC_WAIT; + int i, j; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + err = arc_read(NULL, td->td_spa, bp, pbuf, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) + return (err); + + /* recursively visitbp() blocks below this */ + dnp = buf->b_data; + for (i = 0; i < epb && err == 0; i++, dnp++) { + for (j = 0; j < dnp->dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, + zb->zb_blkid * epb + i, + dnp->dn_nlevels - 1, j); + err = traverse_visitbp(td, dnp, buf, + (blkptr_t *)&dnp->dn_blkptr[j], &czb); + if (err) + break; } } - - rc = advance_block(zseg, dn, rc, th->th_advance); - - if (rc == ERANGE) - break; - + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + uint32_t flags = ARC_WAIT; + objset_phys_t *osp; + int j; + + err = arc_read_nolock(NULL, td->td_spa, bp, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) + return (err); + + osp = buf->b_data; /* - * Give spa_sync() a chance to run. + * traverse_zil is just here for zdb's leak checking. + * For other consumers, there will be no ZIL blocks. */ - if (th->th_locked && spa_traverse_wanted(th->th_spa)) { - th->th_syncs++; - return (EAGAIN); - } - - if (--worklimit == 0) - return (EAGAIN); - } - - if (rc == ERANGE) - rc = advance_object(zseg, zb->zb_object + 1, th->th_advance); - - return (rc); -} + traverse_zil(td, &osp->os_zil_header); -/* - * It is the caller's responsibility to ensure that the dsl_dataset_t - * doesn't go away during traversal. 
- */ -int -traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance, - blkptr_cb_t func, void *arg) -{ - spa_t *spa = ds->ds_dir->dd_pool->dp_spa; - traverse_handle_t *th; - int err; + for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { + zbookmark_t czb; - th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED); + SET_BOOKMARK(&czb, zb->zb_objset, 0, + osp->os_meta_dnode.dn_nlevels - 1, j); + err = traverse_visitbp(td, &osp->os_meta_dnode, buf, + (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j], + &czb); + if (err) + break; + } + } - traverse_add_objset(th, txg_start, -1ULL, ds->ds_object); + if (buf) + (void) arc_buf_remove_ref(buf, &buf); - while ((err = traverse_more(th)) == EAGAIN) - continue; + if (err == 0 && (td->td_flags & TRAVERSE_POST)) + err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg); - traverse_fini(th); return (err); } -int -traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg) +/* ARGSUSED */ +static int +traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, + const dnode_phys_t *dnp, void *arg) { - spa_t *spa = dmu_objset_spa(os); - traverse_handle_t *th; - int err; - - th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL); - - traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ); - - while ((err = traverse_more(th)) == EAGAIN) - continue; + struct prefetch_data *pfd = arg; + uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; - traverse_fini(th); - return (err); -} + ASSERT(pfd->pd_blks_fetched >= 0); + if (pfd->pd_cancel) + return (EINTR); -int -traverse_more(traverse_handle_t *th) -{ - zseg_t *zseg = list_head(&th->th_seglist); - uint64_t save_txg; /* XXX won't be necessary with real itinerary */ - krwlock_t *rw = spa_traverse_rwlock(th->th_spa); - blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa); - int rc; - - if (zseg == NULL) + if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || + BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) return (0); - 
th->th_restarts++; - - save_txg = zseg->seg_mintxg; - - rw_enter(rw, RW_READER); - th->th_locked = 1; - - rc = traverse_segment(th, zseg, mosbp); - ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR); + mutex_enter(&pfd->pd_mtx); + while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max) + cv_wait(&pfd->pd_cv, &pfd->pd_mtx); + pfd->pd_blks_fetched++; + cv_broadcast(&pfd->pd_cv); + mutex_exit(&pfd->pd_mtx); - if (th->th_locked) - rw_exit(rw); - th->th_locked = 0; - - zseg->seg_mintxg = save_txg; - - if (rc == ERANGE) { - list_remove(&th->th_seglist, zseg); - kmem_free(zseg, sizeof (*zseg)); - return (EAGAIN); - } + (void) arc_read_nolock(NULL, spa, bp, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, zb); - return (rc); + return (0); } -/* - * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves - * are not included. The blocks covered by this segment will all have - * mintxg < birth < maxtxg. - */ static void -traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, - uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid, - uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid) +traverse_prefetch_thread(void *arg) { - zseg_t *zseg; - - zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP); + struct traverse_data *td_main = arg; + struct traverse_data td = *td_main; + zbookmark_t czb; - zseg->seg_mintxg = mintxg; - zseg->seg_maxtxg = maxtxg; + td.td_func = traverse_prefetcher; + td.td_arg = td_main->td_pfd; + td.td_pfd = NULL; - zseg->seg_start.zb_objset = sobjset; - zseg->seg_start.zb_object = sobject; - zseg->seg_start.zb_level = slevel; - zseg->seg_start.zb_blkid = sblkid; + SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0); + (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb); - zseg->seg_end.zb_objset = eobjset; - zseg->seg_end.zb_object = eobject; - zseg->seg_end.zb_level = elevel; - zseg->seg_end.zb_blkid = eblkid; - - 
list_insert_tail(&th->th_seglist, zseg); + mutex_enter(&td_main->td_pfd->pd_mtx); + td_main->td_pfd->pd_exited = B_TRUE; + cv_broadcast(&td_main->td_pfd->pd_cv); + mutex_exit(&td_main->td_pfd->pd_mtx); } -void -traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, - uint64_t objset, uint64_t object) +/* + * NB: dataset must not be changing on-disk (eg, is a snapshot or we are + * in syncing context). + */ +static int +traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { - if (th->th_advance & ADVANCE_PRE) - traverse_add_segment(th, mintxg, maxtxg, - objset, object, ZB_MAXLEVEL, 0, - objset, object, 0, ZB_MAXBLKID); - else - traverse_add_segment(th, mintxg, maxtxg, - objset, object, 0, 0, - objset, object, 0, ZB_MAXBLKID); -} + struct traverse_data td; + struct prefetch_data pd = { 0 }; + zbookmark_t czb; + int err; -void -traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, - uint64_t objset) -{ - if (th->th_advance & ADVANCE_PRE) - traverse_add_segment(th, mintxg, maxtxg, - objset, 0, -1, 0, - objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID); - else - traverse_add_segment(th, mintxg, maxtxg, - objset, 1, 0, 0, - objset, 0, -1, 0); -} + td.td_spa = spa; + td.td_objset = objset; + td.td_rootbp = rootbp; + td.td_min_txg = txg_start; + td.td_func = func; + td.td_arg = arg; + td.td_pfd = &pd; + td.td_flags = flags; + + pd.pd_blks_max = 100; + pd.pd_flags = flags; + mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); + + if (!(flags & TRAVERSE_PREFETCH) || + 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, + &td, TQ_NOQUEUE)) + pd.pd_exited = B_TRUE; + + SET_BOOKMARK(&czb, objset, 0, -1, 0); + err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb); + + mutex_enter(&pd.pd_mtx); + pd.pd_cancel = B_TRUE; + cv_broadcast(&pd.pd_cv); + while (!pd.pd_exited) + cv_wait(&pd.pd_cv, &pd.pd_mtx); + mutex_exit(&pd.pd_mtx); 
+ + mutex_destroy(&pd.pd_mtx); + cv_destroy(&pd.pd_cv); -void -traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg) -{ - if (th->th_advance & ADVANCE_PRE) - traverse_add_segment(th, mintxg, maxtxg, - 0, 0, -1, 0, - ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID); - else - traverse_add_segment(th, mintxg, maxtxg, - 1, 1, 0, 0, - 0, 0, -1, 0); + return (err); } -traverse_handle_t * -traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance, - int zio_flags) +/* + * NB: dataset must not be changing on-disk (eg, is a snapshot or we are + * in syncing context). + */ +int +traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, + blkptr_cb_t func, void *arg) { - traverse_handle_t *th; - int d, l; - - th = kmem_zalloc(sizeof (*th), KM_SLEEP); - - th->th_spa = spa; - th->th_func = func; - th->th_arg = arg; - th->th_advance = advance; - th->th_lastcb.zb_level = ZB_NO_LEVEL; - th->th_noread.zb_level = ZB_NO_LEVEL; - th->th_zio_flags = zio_flags; - - list_create(&th->th_seglist, sizeof (zseg_t), - offsetof(zseg_t, seg_node)); - - for (d = 0; d < ZB_DEPTH; d++) { - for (l = 0; l < ZB_MAXLEVEL; l++) { - if ((advance & ADVANCE_DATA) || - l != 0 || d != ZB_DN_CACHE) - th->th_cache[d][l].bc_data = - zio_buf_alloc(SPA_MAXBLOCKSIZE); - } - } - - return (th); + return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object, + &ds->ds_phys->ds_bp, txg_start, flags, func, arg)); } -void -traverse_fini(traverse_handle_t *th) +/* + * NB: pool must not be changing on-disk (eg, from zdb or sync context). 
+ */ +int +traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg) { - int d, l; - zseg_t *zseg; - - for (d = 0; d < ZB_DEPTH; d++) - for (l = 0; l < ZB_MAXLEVEL; l++) - if (th->th_cache[d][l].bc_data != NULL) - zio_buf_free(th->th_cache[d][l].bc_data, - SPA_MAXBLOCKSIZE); - - while ((zseg = list_head(&th->th_seglist)) != NULL) { - list_remove(&th->th_seglist, zseg); - kmem_free(zseg, sizeof (*zseg)); + int err; + uint64_t obj; + dsl_pool_t *dp = spa_get_dsl(spa); + objset_t *mos = dp->dp_meta_objset; + + /* visit the MOS */ + err = traverse_impl(spa, 0, spa_get_rootblkptr(spa), + 0, TRAVERSE_PRE, func, arg); + if (err) + return (err); + + /* visit each dataset */ + for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) { + dmu_object_info_t doi; + + err = dmu_object_info(mos, obj, &doi); + if (err) + return (err); + + if (doi.doi_type == DMU_OT_DSL_DATASET) { + dsl_dataset_t *ds; + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + if (err) + return (err); + err = traverse_dataset(ds, + ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE, + func, arg); + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + } } - - list_destroy(&th->th_seglist); - - dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n", - th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks, - th->th_syncs, th->th_restarts); - - kmem_free(th, sizeof (*th)); + if (err == ESRCH) + err = 0; + return (err); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index a9f9c54..30e0836 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -1163,12 +1163,13 @@ struct killarg { /* ARGSUSED */ static int -kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg) +kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, 
+ const dnode_phys_t *dnp, void *arg) { struct killarg *ka = arg; - blkptr_t *bp = &bc->bc_blkptr; - ASSERT3U(bc->bc_errno, ==, 0); + if (bp == NULL) + return (0); ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx); @@ -1196,7 +1197,7 @@ dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) return (EINVAL); /* - * If we made changes this txg, traverse_dsl_dataset won't find + * If we made changes this txg, traverse_dataset won't find * them. Try again. */ if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) @@ -1263,8 +1264,8 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ka.ds = ds; ka.zio = zio; ka.tx = tx; - (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, - ADVANCE_POST, kill_blkptr, &ka); + (void) traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, + TRAVERSE_POST, kill_blkptr, &ka); (void) zio_wait(zio); } @@ -1657,8 +1658,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) ka.ds = ds; ka.zio = zio; ka.tx = tx; - err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, - ADVANCE_POST, kill_blkptr, &ka); + err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, + TRAVERSE_POST, kill_blkptr, &ka); ASSERT3U(err, ==, 0); ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); @@ -2850,6 +2851,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) csa->cds->ds_phys->ds_deadlist_obj)); VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, csa->ohds->ds_phys->ds_deadlist_obj)); + + dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx); } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c index 58a79ca..e5823c5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c @@ -232,6 +232,8 @@ 
dsl_pool_close(dsl_pool_t *dp) mutex_destroy(&dp->dp_lock); mutex_destroy(&dp->dp_scrub_cancel_lock); taskq_destroy(dp->dp_vnrele_taskq); + if (dp->dp_blkstats) + kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); kmem_free(dp, sizeof (dsl_pool_t)); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c index 5f675b7..950a91f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c @@ -107,6 +107,12 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) /* back to the generic stuff */ + if (dp->dp_blkstats == NULL) { + dp->dp_blkstats = + kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + } + bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) ot = DMU_OT_ZAP_OTHER; @@ -575,6 +581,37 @@ dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) } } +void +dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds1->ds_dir->dd_pool; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) { + dp->dp_scrub_bookmark.zb_objset = ds2->ds_object; + } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) { + dp->dp_scrub_bookmark.zb_objset = ds1->ds_object; + } + + if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds1->ds_object, tx) == 0) { + int err = zap_add_int(dp->dp_meta_objset, + dp->dp_scrub_queue_obj, ds2->ds_object, tx); + VERIFY(err == 0 || err == EEXIST); + if (err == EEXIST) { + /* Both were there to begin with */ + VERIFY(0 == zap_add_int(dp->dp_meta_objset, + dp->dp_scrub_queue_obj, ds1->ds_object, tx)); + } + } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds2->ds_object, tx) == 0) { + VERIFY(0 == zap_add_int(dp->dp_meta_objset, + dp->dp_scrub_queue_obj, 
ds1->ds_object, tx)); + } +} + struct enqueue_clones_arg { dmu_tx_t *tx; uint64_t originobj; @@ -817,6 +854,52 @@ dsl_pool_scrub_restart(dsl_pool_t *dp) */ static void +count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) +{ + int i; + + /* + * If we resume after a reboot, zab will be NULL; don't record + * incomplete stats in that case. + */ + if (zab == NULL) + return; + + for (i = 0; i < 4; i++) { + int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; + int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; + zfs_blkstat_t *zb = &zab->zab_type[l][t]; + int equal; + + zb->zb_count++; + zb->zb_asize += BP_GET_ASIZE(bp); + zb->zb_lsize += BP_GET_LSIZE(bp); + zb->zb_psize += BP_GET_PSIZE(bp); + zb->zb_gangs += BP_COUNT_GANG(bp); + + switch (BP_GET_NDVAS(bp)) { + case 2: + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + zb->zb_ditto_2_of_2_samevdev++; + break; + case 3: + equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + + (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2])) + + (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2])); + if (equal == 1) + zb->zb_ditto_2_of_3_samevdev++; + else if (equal == 3) + zb->zb_ditto_3_of_3_samevdev++; + break; + } + } +} + +static void dsl_pool_scrub_clean_done(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -844,6 +927,8 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp, int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; int zio_priority; + count_block(dp->dp_blkstats, bp); + if (dp->dp_scrub_isresilver == 0) { /* It's a scrub */ zio_flags |= ZIO_FLAG_SCRUB; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 888b882..54c7c46 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -4075,11 +4075,7 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_config_syncing = NULL; } - spa->spa_traverse_wanted = B_TRUE; - 
rw_enter(&spa->spa_traverse_lock, RW_WRITER); - spa->spa_traverse_wanted = B_FALSE; spa->spa_ubsync = spa->spa_uberblock; - rw_exit(&spa->spa_traverse_lock); /* * Clean up the ZIL records for the synced txg. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index 7a41d4f..5735d31 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -428,8 +428,6 @@ spa_add(const char *name, const char *altroot) spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); - rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); - mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_async_root_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); @@ -513,8 +511,6 @@ spa_remove(spa_t *spa) spa_config_lock_destroy(spa); - rw_destroy(&spa->spa_traverse_lock); - cv_destroy(&spa->spa_async_cv); cv_destroy(&spa->spa_async_root_cv); cv_destroy(&spa->spa_scrub_io_cv); @@ -1127,16 +1123,10 @@ zfs_panic_recover(const char *fmt, ...) 
* ========================================================================== */ -krwlock_t * -spa_traverse_rwlock(spa_t *spa) -{ - return (&spa->spa_traverse_lock); -} - boolean_t -spa_traverse_wanted(spa_t *spa) +spa_shutting_down(spa_t *spa) { - return (spa->spa_traverse_wanted); + return (spa->spa_async_suspended); } dsl_pool_t * @@ -1205,7 +1195,7 @@ spa_first_txg(spa_t *spa) return (spa->spa_first_txg); } -int +pool_state_t spa_state(spa_t *spa) { return (spa->spa_state); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h index 05e5ffd..3e02689 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h @@ -26,93 +26,29 @@ #ifndef _SYS_DMU_TRAVERSE_H #define _SYS_DMU_TRAVERSE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/zio.h> -#include <sys/dmu.h> -#include <sys/dnode.h> -#include <sys/arc.h> #ifdef __cplusplus extern "C" { #endif -#define ADVANCE_POST 0 /* post-order traversal */ -#define ADVANCE_PRE 0x01 /* pre-order traversal */ -#define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */ -#define ADVANCE_DATA 0x04 /* read user data blocks */ -#define ADVANCE_HOLES 0x08 /* visit holes */ -#define ADVANCE_ZIL 0x10 /* visit intent log blocks */ -#define ADVANCE_NOLOCK 0x20 /* Don't grab SPA sync lock */ - -#define ZB_NO_LEVEL -2 -#define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */ -#define ZB_MAXBLKID (1ULL << 62) -#define ZB_MAXOBJSET (1ULL << 62) -#define ZB_MAXOBJECT (1ULL << 62) - -#define ZB_MOS_CACHE 0 -#define ZB_MDN_CACHE 1 -#define ZB_DN_CACHE 2 -#define ZB_DEPTH 3 - -typedef struct zseg { - uint64_t seg_mintxg; - uint64_t seg_maxtxg; - zbookmark_t seg_start; - zbookmark_t seg_end; - list_node_t seg_node; -} zseg_t; - -typedef struct traverse_blk_cache { - zbookmark_t bc_bookmark; - blkptr_t 
bc_blkptr; - void *bc_data; - dnode_phys_t *bc_dnode; - int bc_errno; - int bc_pad1; - uint64_t bc_pad2; -} traverse_blk_cache_t; - -typedef int (blkptr_cb_t)(traverse_blk_cache_t *bc, spa_t *spa, void *arg); - -struct traverse_handle { - spa_t *th_spa; - blkptr_cb_t *th_func; - void *th_arg; - uint16_t th_advance; - uint16_t th_locked; - int th_zio_flags; - list_t th_seglist; - traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL]; - traverse_blk_cache_t th_zil_cache; - uint64_t th_hits; - uint64_t th_arc_hits; - uint64_t th_reads; - uint64_t th_callbacks; - uint64_t th_syncs; - uint64_t th_restarts; - zbookmark_t th_noread; - zbookmark_t th_lastcb; -}; - -int traverse_dsl_dataset(struct dsl_dataset *ds, uint64_t txg_start, - int advance, blkptr_cb_t func, void *arg); -int traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg); +struct dnode_phys; +struct dsl_dataset; -traverse_handle_t *traverse_init(spa_t *spa, blkptr_cb_t *func, void *arg, - int advance, int zio_flags); -void traverse_fini(traverse_handle_t *th); +typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp, + const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg); -void traverse_add_dnode(traverse_handle_t *th, - uint64_t mintxg, uint64_t maxtxg, uint64_t objset, uint64_t object); -void traverse_add_objset(traverse_handle_t *th, - uint64_t mintxg, uint64_t maxtxg, uint64_t objset); -void traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg); +#define TRAVERSE_PRE (1<<0) +#define TRAVERSE_POST (1<<1) +#define TRAVERSE_PREFETCH_METADATA (1<<2) +#define TRAVERSE_PREFETCH_DATA (1<<3) +#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) -int traverse_more(traverse_handle_t *th); +int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, + int flags, blkptr_cb_t func, void *arg); +int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg); #ifdef __cplusplus } diff --git 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h index dcf5a44..ef1b904 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h @@ -31,6 +31,7 @@ #include <sys/txg_impl.h> #include <sys/zfs_context.h> #include <sys/zio.h> +#include <sys/dnode.h> #ifdef __cplusplus extern "C" { @@ -48,6 +49,25 @@ enum scrub_func { SCRUB_FUNC_NUMFUNCS }; +/* These macros are for indexing into the zfs_all_blkstats_t. */ +#define DMU_OT_DEFERRED DMU_OT_NONE +#define DMU_OT_TOTAL DMU_OT_NUMTYPES + +typedef struct zfs_blkstat { + uint64_t zb_count; + uint64_t zb_asize; + uint64_t zb_lsize; + uint64_t zb_psize; + uint64_t zb_gangs; + uint64_t zb_ditto_2_of_2_samevdev; + uint64_t zb_ditto_2_of_3_samevdev; + uint64_t zb_ditto_3_of_3_samevdev; +} zfs_blkstat_t; + +typedef struct zfs_all_blkstats { + zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1]; +} zfs_all_blkstats_t; + typedef struct dsl_pool { /* Immutable */ @@ -95,6 +115,8 @@ typedef struct dsl_pool { * nobody else could possibly have it for write. 
*/ krwlock_t dp_config_rwlock; + + zfs_all_blkstats_t *dp_blkstats; } dsl_pool_t; int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); @@ -112,6 +134,8 @@ int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, zio_done_func_t *done, void *private, uint32_t arc_flags); void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, + struct dmu_tx *tx); void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index b0b758b..1cfa7ec 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -44,7 +44,6 @@ typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; typedef struct zilog zilog_t; -typedef struct traverse_handle traverse_handle_t; typedef struct spa_aux_vdev spa_aux_vdev_t; struct dsl_pool; @@ -438,8 +437,7 @@ extern void spa_vdev_state_enter(spa_t *spa); extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); /* Accessor functions */ -extern krwlock_t *spa_traverse_rwlock(spa_t *spa); -extern boolean_t spa_traverse_wanted(spa_t *spa); +extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); @@ -450,7 +448,7 @@ extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); -extern int spa_state(spa_t *spa); +extern pool_state_t spa_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern 
uint64_t spa_get_alloc(spa_t *spa); extern uint64_t spa_get_space(spa_t *spa); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index ab41ba6..8aeb414 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -101,9 +101,8 @@ struct spa { nvlist_t *spa_config_syncing; /* currently syncing config */ uint64_t spa_config_txg; /* txg of last config change */ int spa_sync_pass; /* iterate-to-convergence */ - int spa_state; /* pool state */ + pool_state_t spa_state; /* pool state */ int spa_inject_ref; /* injection references */ - uint8_t spa_traverse_wanted; /* traverse lock wanted */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; @@ -125,7 +124,6 @@ struct spa { uint64_t spa_syncing_txg; /* txg currently syncing */ uint64_t spa_sync_bplist_obj; /* object for deferred frees */ bplist_t spa_sync_bplist; /* deferred-free bplist */ - krwlock_t spa_traverse_lock; /* traverse vs. 
spa_sync() */ uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h index a58be84..7413c66 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h @@ -26,8 +26,6 @@ #ifndef _SYS_TXG_IMPL_H #define _SYS_TXG_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/txg.h> @@ -66,7 +64,6 @@ typedef struct tx_state { kthread_t *tx_sync_thread; kthread_t *tx_quiesce_thread; - kthread_t *tx_timelimit_thread; } tx_state_t; #ifdef __cplusplus diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c index 8650fa1..da1e83e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c @@ -309,12 +309,14 @@ txg_sync_thread(void *arg) uint64_t txg; /* - * We sync when there's someone waiting on us, or the - * quiesce thread has handed off a txg to us, or we have - * reached our timeout. + * We sync when we're scrubbing, there's someone waiting + * on us, or the quiesce thread has handed off a txg to + * us, or we have reached our timeout. */ timer = (delta >= timeout ? 
0 : timeout - delta); - while (!tx->tx_exiting && timer > 0 && + while ((dp->dp_scrub_func == SCRUB_FUNC_NONE || + spa_shutting_down(dp->dp_spa)) && + !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && tx->tx_quiesced_txg == 0) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c index aa8f6f0..88c15b7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c @@ -279,7 +279,7 @@ vdev_cache_read(zio_t *zio) /* * If the I/O straddles two or more cache blocks, don't cache it. */ - if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS)) + if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) return (EXDEV); ASSERT(cache_phase + zio->io_size <= VCBS); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c index b07a8c1..79a9966 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -94,23 +94,13 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); static kmutex_t zvol_state_lock; static uint32_t zvol_minors; -#define NUM_EXTENTS ((SPA_MAXBLOCKSIZE) / sizeof (zvol_extent_t)) - typedef struct zvol_extent { + list_node_t ze_node; dva_t ze_dva; /* dva associated with this extent */ - uint64_t ze_stride; /* extent stride */ - uint64_t ze_size; /* number of blocks in extent */ + uint64_t ze_nblks; /* number of blocks in extent */ } zvol_extent_t; /* - * The list of extents associated with the dump device - */ -typedef struct zvol_ext_list { - zvol_extent_t zl_extents[NUM_EXTENTS]; - struct zvol_ext_list *zl_next; -} zvol_ext_list_t; - -/* * The in-core state of each volume. 
*/ typedef struct zvol_state { @@ -124,7 +114,7 @@ typedef struct zvol_state { uint32_t zv_mode; /* DS_MODE_* flags at open time */ uint32_t zv_total_opens; /* total open count */ zilog_t *zv_zilog; /* ZIL handle */ - zvol_ext_list_t *zv_list; /* List of extents for dump */ + list_t zv_extents; /* List of extents for dump */ uint64_t zv_txg_assign; /* txg to assign during ZIL replay */ znode_t zv_znode; /* for range locking */ int zv_state; @@ -350,12 +340,12 @@ static void zvol_serve_one(zvol_state_t *zv, struct bio *bp) { uint64_t off, volsize; - size_t size, resid; + size_t resid; char *addr; objset_t *os; rl_t *rl; int error = 0; - boolean_t reading; + boolean_t doread = (bp->bio_cmd == BIO_READ); off = bp->bio_offset; volsize = zv->zv_volsize; @@ -373,18 +363,16 @@ zvol_serve_one(zvol_state_t *zv, struct bio *bp) * we can't change the data whilst calculating the checksum. * A better approach than a per zvol rwlock would be to lock ranges. */ - reading = (bp->bio_cmd == BIO_READ); rl = zfs_range_lock(&zv->zv_znode, off, resid, - reading ? RL_READER : RL_WRITER); + doread ? 
RL_READER : RL_WRITER); while (resid != 0 && off < volsize) { - - size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */ + size_t size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */ if (size > volsize - off) /* don't write past the end */ size = volsize - off; - if (reading) { + if (doread) { error = dmu_read(os, ZVOL_OBJ, off, size, addr); } else { dmu_tx_t *tx = dmu_tx_create(os); @@ -457,128 +445,81 @@ zvol_worker(void *arg) } } -void -zvol_init_extent(zvol_extent_t *ze, blkptr_t *bp) -{ - ze->ze_dva = bp->blk_dva[0]; /* structure assignment */ - ze->ze_stride = 0; - ze->ze_size = 1; -} - /* extent mapping arg */ struct maparg { - zvol_ext_list_t *ma_list; - zvol_extent_t *ma_extent; - int ma_gang; + zvol_state_t *ma_zv; + uint64_t ma_blks; }; /*ARGSUSED*/ static int -zvol_map_block(traverse_blk_cache_t *bc, spa_t *spa, void *arg) +zvol_map_block(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, + const dnode_phys_t *dnp, void *arg) { - zbookmark_t *zb = &bc->bc_bookmark; - blkptr_t *bp = &bc->bc_blkptr; - void *data = bc->bc_data; - dnode_phys_t *dnp = bc->bc_dnode; - struct maparg *ma = (struct maparg *)arg; - uint64_t stride; - - /* If there is an error, then keep trying to make progress */ - if (bc->bc_errno) - return (ERESTART); - -#ifdef ZFS_DEBUG - if (zb->zb_level == -1) { - ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); - ASSERT3U(BP_GET_LEVEL(bp), ==, 0); - } else { - ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); - ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); - } + struct maparg *ma = arg; + zvol_extent_t *ze; + int bs = ma->ma_zv->zv_volblocksize; - if (zb->zb_level > 0) { - uint64_t fill = 0; - blkptr_t *bpx, *bpend; + if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0) + return (0); - for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx); - bpx < bpend; bpx++) { - if (bpx->blk_birth != 0) { - fill += bpx->blk_fill; - } else { - ASSERT(bpx->blk_fill == 0); - } - } - ASSERT3U(fill, ==, bp->blk_fill); - } + 
VERIFY3U(ma->ma_blks, ==, zb->zb_blkid); + ma->ma_blks++; - if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) { - uint64_t fill = 0; - dnode_phys_t *dnx, *dnend; + /* Abort immediately if we have encountered gang blocks */ + if (BP_IS_GANG(bp)) + return (EFRAGS); - for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT); - dnx < dnend; dnx++) { - if (dnx->dn_type != DMU_OT_NONE) - fill++; - } - ASSERT3U(fill, ==, bp->blk_fill); + /* + * See if the block is at the end of the previous extent. + */ + ze = list_tail(&ma->ma_zv->zv_extents); + if (ze && + DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) && + DVA_GET_OFFSET(BP_IDENTITY(bp)) == + DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) { + ze->ze_nblks++; + return (0); } -#endif - if (zb->zb_level || dnp->dn_type == DMU_OT_DNODE) - return (0); + dprintf_bp(bp, "%s", "next blkptr:"); - /* Abort immediately if we have encountered gang blocks */ - if (BP_IS_GANG(bp)) { - ma->ma_gang++; - return (EINTR); - } + /* start a new extent */ + ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP); + ze->ze_dva = bp->blk_dva[0]; /* structure assignment */ + ze->ze_nblks = 1; + list_insert_tail(&ma->ma_zv->zv_extents, ze); + return (0); +} - /* first time? 
*/ - if (ma->ma_extent->ze_size == 0) { - zvol_init_extent(ma->ma_extent, bp); - return (0); - } +static void +zvol_free_extents(zvol_state_t *zv) +{ + zvol_extent_t *ze; - stride = (DVA_GET_OFFSET(&bp->blk_dva[0])) - - ((DVA_GET_OFFSET(&ma->ma_extent->ze_dva)) + - (ma->ma_extent->ze_size - 1) * (ma->ma_extent->ze_stride)); - if (DVA_GET_VDEV(BP_IDENTITY(bp)) == - DVA_GET_VDEV(&ma->ma_extent->ze_dva)) { - if (ma->ma_extent->ze_stride == 0) { - /* second block in this extent */ - ma->ma_extent->ze_stride = stride; - ma->ma_extent->ze_size++; - return (0); - } else if (ma->ma_extent->ze_stride == stride) { - /* - * the block we allocated has the same - * stride - */ - ma->ma_extent->ze_size++; - return (0); - } + while (ze = list_head(&zv->zv_extents)) { + list_remove(&zv->zv_extents, ze); + kmem_free(ze, sizeof (zvol_extent_t)); } +} - /* - * dtrace -n 'zfs-dprintf - * /stringof(arg0) == "zvol.c"/ - * { - * printf("%s: %s", stringof(arg1), stringof(arg3)) - * } ' - */ - dprintf("ma_extent 0x%lx mrstride 0x%lx stride %lx\n", - ma->ma_extent->ze_size, ma->ma_extent->ze_stride, stride); - dprintf_bp(bp, "%s", "next blkptr:"); - /* start a new extent */ - if (ma->ma_extent == &ma->ma_list->zl_extents[NUM_EXTENTS - 1]) { - ma->ma_list->zl_next = kmem_zalloc(sizeof (zvol_ext_list_t), - KM_SLEEP); - ma->ma_list = ma->ma_list->zl_next; - ma->ma_extent = &ma->ma_list->zl_extents[0]; - } else { - ma->ma_extent++; +static int +zvol_get_lbas(zvol_state_t *zv) +{ + struct maparg ma; + int err; + + ma.ma_zv = zv; + ma.ma_blks = 0; + zvol_free_extents(zv); + + err = traverse_dataset(dmu_objset_ds(zv->zv_objset), 0, + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma); + if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) { + zvol_free_extents(zv); + return (err ? 
err : EIO); } - zvol_init_extent(ma->ma_extent, bp); + return (0); } @@ -676,106 +617,6 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { }; /* - * reconstruct dva that gets us to the desired offset (offset - * is in bytes) - */ -int -zvol_get_dva(zvol_state_t *zv, uint64_t offset, dva_t *dva) -{ - zvol_ext_list_t *zl; - zvol_extent_t *ze; - int idx; - uint64_t tmp; - - if ((zl = zv->zv_list) == NULL) - return (EIO); - idx = 0; - ze = &zl->zl_extents[0]; - while (offset >= ze->ze_size * zv->zv_volblocksize) { - offset -= ze->ze_size * zv->zv_volblocksize; - - if (idx == NUM_EXTENTS - 1) { - /* we've reached the end of this array */ - ASSERT(zl->zl_next != NULL); - if (zl->zl_next == NULL) - return (-1); - zl = zl->zl_next; - ze = &zl->zl_extents[0]; - idx = 0; - } else { - ze++; - idx++; - } - } - DVA_SET_VDEV(dva, DVA_GET_VDEV(&ze->ze_dva)); - tmp = DVA_GET_OFFSET((&ze->ze_dva)); - tmp += (ze->ze_stride * (offset / zv->zv_volblocksize)); - DVA_SET_OFFSET(dva, tmp); - return (0); -} - -static void -zvol_free_extents(zvol_state_t *zv) -{ - zvol_ext_list_t *zl; - zvol_ext_list_t *tmp; - - if (zv->zv_list != NULL) { - zl = zv->zv_list; - while (zl != NULL) { - tmp = zl->zl_next; - kmem_free(zl, sizeof (zvol_ext_list_t)); - zl = tmp; - } - zv->zv_list = NULL; - } -} - -int -zvol_get_lbas(zvol_state_t *zv) -{ - struct maparg ma; - zvol_ext_list_t *zl; - zvol_extent_t *ze; - uint64_t blocks = 0; - int err; - - ma.ma_list = zl = kmem_zalloc(sizeof (zvol_ext_list_t), KM_SLEEP); - ma.ma_extent = &ma.ma_list->zl_extents[0]; - ma.ma_gang = 0; - zv->zv_list = ma.ma_list; - - err = traverse_zvol(zv->zv_objset, ADVANCE_PRE, zvol_map_block, &ma); - if (err == EINTR && ma.ma_gang) { - /* - * We currently don't support dump devices when the pool - * is so fragmented that our allocation has resulted in - * gang blocks. 
- */ - zvol_free_extents(zv); - return (EFRAGS); - } - ASSERT3U(err, ==, 0); - - ze = &zl->zl_extents[0]; - while (ze) { - blocks += ze->ze_size; - if (ze == &zl->zl_extents[NUM_EXTENTS - 1]) { - zl = zl->zl_next; - ze = &zl->zl_extents[0]; - } else { - ze++; - } - } - if (blocks != (zv->zv_volsize / zv->zv_volblocksize)) { - zvol_free_extents(zv); - return (EIO); - } - - return (0); -} - -/* * Create a minor node (plus a whole lot more) for the specified volume. */ int @@ -830,6 +671,8 @@ zvol_create_minor(const char *name, major_t maj) mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, sizeof (rl_t), offsetof(rl_t, r_node)); + list_create(&zv->zv_extents, sizeof (zvol_extent_t), + offsetof(zvol_extent_t, ze_node)); /* get and cache the blocksize */ error = dmu_object_info(os, ZVOL_OBJ, &doi); ASSERT(error == 0); @@ -1091,6 +934,8 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize) if (error == ENOTSUP) error = EBUSY; dmu_tx_commit(tx); + if (error == 0) + zv->zv_volblocksize = volblocksize; } end: mutex_exit(&zvol_state_lock); @@ -1225,7 +1070,6 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) int error = 0; objset_t *os = zv->zv_objset; nvlist_t *nv = NULL; - uint64_t checksum, compress, refresrv; ASSERT(MUTEX_HELD(&zvol_state_lock)); @@ -1248,12 +1092,16 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &zv->zv_volsize, tx); } else { + uint64_t checksum, compress, refresrv, vbs; + error = dsl_prop_get_integer(zv->zv_name, zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL); error = error ? error : dsl_prop_get_integer(zv->zv_name, zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL); error = error ? error : dsl_prop_get_integer(zv->zv_name, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL); + error = error ? 
error : dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL); error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, @@ -1263,6 +1111,9 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv, tx); + error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, + &vbs, tx); } dmu_tx_commit(tx); @@ -1288,6 +1139,9 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_CHECKSUM), ZIO_CHECKSUM_OFF) == 0); + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + SPA_MAXBLOCKSIZE) == 0); error = zfs_set_prop_nvlist(zv->zv_name, nv); nvlist_free(nv); @@ -1367,7 +1221,7 @@ zvol_dump_fini(zvol_state_t *zv) objset_t *os = zv->zv_objset; nvlist_t *nv; int error = 0; - uint64_t checksum, compress, refresrv; + uint64_t checksum, compress, refresrv, vbs; /* * Attempt to restore the zvol back to its pre-dumpified state. @@ -1392,6 +1246,8 @@ zvol_dump_fini(zvol_state_t *zv) zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress); (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv); + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs); VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); (void) nvlist_add_uint64(nv, @@ -1400,6 +1256,8 @@ zvol_dump_fini(zvol_state_t *zv) zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress); (void) nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv); + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), vbs); (void) zfs_set_prop_nvlist(zv->zv_name, nv); nvlist_free(nv); |