summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authormm <mm@FreeBSD.org>2010-05-13 20:32:56 +0000
committermm <mm@FreeBSD.org>2010-05-13 20:32:56 +0000
commit6f4ba1587b481253ce8265f50114ebe438759779 (patch)
tree1cb19e0cf3e413fa2f84c4f8c268f6161be50805
parentcdb02238eea11b2e4495d0b09780e3669bce9e4c (diff)
downloadFreeBSD-src-6f4ba1587b481253ce8265f50114ebe438759779.zip
FreeBSD-src-6f4ba1587b481253ce8265f50114ebe438759779.tar.gz
Import OpenSolaris revision 7837:001de5627df3
It includes the following changes:
- parallel reads in traversal code (Bug ID 6333409)
- faster traversal for zfs send (Bug ID 6418042)
- traversal code cleanup (Bug ID 6725675)
- fix for two scrub related bugs (Bug ID 6729696, 6730101)
- fix assertion in dbuf_verify (Bug ID 6752226)
- fix panic during zfs send with i/o errors (Bug ID 6577985)
- replace P2CROSS with P2BOUNDARY (Bug ID 6725680)

List of OpenSolaris Bug IDs: 6333409, 6418042, 6757112, 6725668, 6725675,
6725680, 6725698, 6729696, 6730101, 6752226, 6577985, 6755042

Approved by:	pjd, delphij (mentor)
Obtained from:	OpenSolaris (multiple Bug IDs)
MFC after:	1 week
-rw-r--r--cddl/contrib/opensolaris/cmd/zdb/zdb.c358
-rw-r--r--cddl/contrib/opensolaris/cmd/ztest/ztest.c155
-rw-r--r--cddl/contrib/opensolaris/lib/libzpool/common/kernel.c4
-rw-r--r--cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h3
-rw-r--r--cddl/contrib/opensolaris/lib/libzpool/common/taskq.c12
-rw-r--r--sys/cddl/boot/zfs/zfsimpl.h2
-rw-r--r--sys/cddl/compat/opensolaris/sys/sysmacros.h6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c26
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c80
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c1075
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c19
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c85
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c4
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c16
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h88
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h24
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h4
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h3
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c308
23 files changed, 712 insertions, 1580 deletions
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
index 9e1e106..f0b4ba4 100644
--- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
@@ -50,6 +50,7 @@
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/zfs_fuid.h>
+#include <sys/arc.h>
#undef ZFS_MAXNAMELEN
#undef verify
#include <libzfs.h>
@@ -62,8 +63,6 @@ typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
extern void dump_intent_log(zilog_t *);
uint64_t *zopt_object = NULL;
int zopt_objects = 0;
-int zdb_advance = ADVANCE_PRE;
-zbookmark_t zdb_noread = { 0, 0, ZB_NO_LEVEL, 0 };
libzfs_handle_t *g_zfs;
boolean_t zdb_sig_user_data = B_TRUE;
int zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256;
@@ -88,8 +87,8 @@ static void
usage(void)
{
(void) fprintf(stderr,
- "Usage: %s [-udibcsvL] [-U cachefile_path] [-O order] "
- "[-B os:obj:level:blkid] [-S user:cksumalg] "
+ "Usage: %s [-udibcsv] [-U cachefile_path] "
+ "[-S user:cksumalg] "
"dataset [object...]\n"
" %s -C [pool]\n"
" %s -l dev\n"
@@ -109,13 +108,8 @@ usage(void)
"dump blkptr signatures\n");
(void) fprintf(stderr, " -v verbose (applies to all others)\n");
(void) fprintf(stderr, " -l dump label contents\n");
- (void) fprintf(stderr, " -L live pool (allows some errors)\n");
- (void) fprintf(stderr, " -O [!]<pre|post|prune|data|holes> "
- "visitation order\n");
(void) fprintf(stderr, " -U cachefile_path -- use alternate "
"cachefile\n");
- (void) fprintf(stderr, " -B objset:object:level:blkid -- "
- "simulate bad block\n");
(void) fprintf(stderr, " -R read and display block from a "
"device\n");
(void) fprintf(stderr, " -e Pool is exported/destroyed/"
@@ -138,7 +132,7 @@ fatal(const char *fmt, ...)
va_end(ap);
(void) fprintf(stderr, "\n");
- exit(1);
+ abort();
}
static void
@@ -571,7 +565,7 @@ dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
}
static uint64_t
-blkid2offset(dnode_phys_t *dnp, int level, uint64_t blkid)
+blkid2offset(const dnode_phys_t *dnp, int level, uint64_t blkid)
{
if (level < 0)
return (blkid);
@@ -602,115 +596,104 @@ sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas)
(u_longlong_t)bp->blk_birth);
}
-/* ARGSUSED */
-static int
-zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
+static void
+print_indirect(blkptr_t *bp, const zbookmark_t *zb,
+ const dnode_phys_t *dnp)
{
- zbookmark_t *zb = &bc->bc_bookmark;
- blkptr_t *bp = &bc->bc_blkptr;
- void *data = bc->bc_data;
- dnode_phys_t *dnp = bc->bc_dnode;
- char blkbuf[BP_SPRINTF_LEN + 80];
+ char blkbuf[BP_SPRINTF_LEN];
int l;
- if (bc->bc_errno) {
- (void) sprintf(blkbuf,
- "Error %d reading <%llu, %llu, %lld, %llu>: ",
- bc->bc_errno,
- (u_longlong_t)zb->zb_objset,
- (u_longlong_t)zb->zb_object,
- (u_longlong_t)zb->zb_level,
- (u_longlong_t)zb->zb_blkid);
- goto out;
- }
-
- if (zb->zb_level == -1) {
- ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
- ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
- } else {
- ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
- ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
- }
-
- if (zb->zb_level > 0) {
- uint64_t fill = 0;
- blkptr_t *bpx, *bpend;
-
- for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx);
- bpx < bpend; bpx++) {
- if (bpx->blk_birth != 0) {
- fill += bpx->blk_fill;
- } else {
- ASSERT(bpx->blk_fill == 0);
- }
- }
- ASSERT3U(fill, ==, bp->blk_fill);
- }
-
- if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) {
- uint64_t fill = 0;
- dnode_phys_t *dnx, *dnend;
+ ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
+ ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
- for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT);
- dnx < dnend; dnx++) {
- if (dnx->dn_type != DMU_OT_NONE)
- fill++;
- }
- ASSERT3U(fill, ==, bp->blk_fill);
- }
-
- (void) sprintf(blkbuf, "%16llx ",
+ (void) printf("%16llx ",
(u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid));
ASSERT(zb->zb_level >= 0);
for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
if (l == zb->zb_level) {
- (void) sprintf(blkbuf + strlen(blkbuf), "L%llx",
- (u_longlong_t)zb->zb_level);
+ (void) printf("L%llx", (u_longlong_t)zb->zb_level);
} else {
- (void) sprintf(blkbuf + strlen(blkbuf), " ");
+ (void) printf(" ");
}
}
-out:
- if (bp->blk_birth == 0) {
- (void) sprintf(blkbuf + strlen(blkbuf), "<hole>");
- (void) printf("%s\n", blkbuf);
- } else {
- sprintf_blkptr_compact(blkbuf + strlen(blkbuf), bp,
- dump_opt['d'] > 5 ? 1 : 0);
- (void) printf("%s\n", blkbuf);
+ sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0);
+ (void) printf("%s\n", blkbuf);
+}
+
+#define SET_BOOKMARK(zb, objset, object, level, blkid) \
+{ \
+ (zb)->zb_objset = objset; \
+ (zb)->zb_object = object; \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
+}
+
+static int
+visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
+ blkptr_t *bp, const zbookmark_t *zb)
+{
+ int err;
+
+ if (bp->blk_birth == 0)
+ return (0);
+
+ print_indirect(bp, zb, dnp);
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ uint32_t flags = ARC_WAIT;
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ arc_buf_t *buf;
+ uint64_t fill = 0;
+
+ err = arc_read_nolock(NULL, spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err)
+ return (err);
+
+ /* recursively visit blocks below this */
+ cbp = buf->b_data;
+ for (i = 0; i < epb; i++, cbp++) {
+ zbookmark_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ err = visit_indirect(spa, dnp, cbp, &czb);
+ if (err)
+ break;
+ fill += cbp->blk_fill;
+ }
+ ASSERT3U(fill, ==, bp->blk_fill);
+ (void) arc_buf_remove_ref(buf, &buf);
}
- return (bc->bc_errno ? ERESTART : 0);
+ return (err);
}
/*ARGSUSED*/
static void
-dump_indirect(objset_t *os, uint64_t object, void *data, size_t size)
+dump_indirect(dnode_t *dn)
{
- traverse_handle_t *th;
- uint64_t objset = dmu_objset_id(os);
- int advance = zdb_advance;
+ dnode_phys_t *dnp = dn->dn_phys;
+ int j;
+ zbookmark_t czb;
(void) printf("Indirect blocks:\n");
- if (object == 0)
- advance |= ADVANCE_DATA;
-
- th = traverse_init(dmu_objset_spa(os), zdb_indirect_cb, NULL, advance,
- ZIO_FLAG_CANFAIL);
- th->th_noread = zdb_noread;
-
- traverse_add_dnode(th, 0, -1ULL, objset, object);
-
- while (traverse_more(th) == EAGAIN)
- continue;
+ SET_BOOKMARK(&czb, dmu_objset_id(&dn->dn_objset->os),
+ dn->dn_object, dnp->dn_nlevels - 1, 0);
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ czb.zb_blkid = j;
+ (void) visit_indirect(dmu_objset_spa(&dn->dn_objset->os), dnp,
+ &dnp->dn_blkptr[j], &czb);
+ }
(void) printf("\n");
-
- traverse_fini(th);
}
/*ARGSUSED*/
@@ -1093,7 +1076,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
}
if (verbosity >= 5)
- dump_indirect(os, object, NULL, 0);
+ dump_indirect(dn);
if (verbosity >= 5) {
/*
@@ -1458,18 +1441,17 @@ typedef struct zdb_blkstats {
#define DMU_OT_DEFERRED DMU_OT_NONE
#define DMU_OT_TOTAL DMU_OT_NUMTYPES
-#define ZB_TOTAL ZB_MAXLEVEL
+#define ZB_TOTAL DN_MAX_LEVELS
typedef struct zdb_cb {
zdb_blkstats_t zcb_type[ZB_TOTAL + 1][DMU_OT_TOTAL + 1];
uint64_t zcb_errors[256];
- traverse_blk_cache_t *zcb_cache;
int zcb_readfails;
int zcb_haderrors;
} zdb_cb_t;
static void
-zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type)
+zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type)
{
for (int i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
@@ -1485,7 +1467,7 @@ zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type)
if (dump_opt['S']) {
boolean_t print_sig;
- print_sig = !zdb_sig_user_data || (BP_GET_LEVEL(bp) == 0 &&
+ print_sig = !zdb_sig_user_data || (BP_GET_LEVEL(bp) == 0 &&
BP_GET_TYPE(bp) == DMU_OT_PLAIN_FILE_CONTENTS);
if (BP_GET_CHECKSUM(bp) < zdb_sig_cksumalg)
@@ -1507,56 +1489,55 @@ zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type)
}
}
- if (!dump_opt['L'])
- VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp,
- NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0);
+ VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp,
+ NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0);
}
static int
-zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
+ const dnode_phys_t *dnp, void *arg)
{
- zbookmark_t *zb = &bc->bc_bookmark;
zdb_cb_t *zcb = arg;
- blkptr_t *bp = &bc->bc_blkptr;
- dmu_object_type_t type = BP_GET_TYPE(bp);
char blkbuf[BP_SPRINTF_LEN];
- int error = 0;
- ASSERT(!BP_IS_HOLE(bp));
+ if (bp == NULL)
+ return (0);
- zdb_count_block(spa, zcb, bp, type);
+ zdb_count_block(spa, zcb, bp, BP_GET_TYPE(bp));
- if (bc->bc_errno) {
- if (zcb->zcb_readfails++ < 10 && dump_opt['L']) {
- uberblock_t ub;
- vdev_uberblock_load(NULL, spa->spa_root_vdev, &ub);
- if (ub.ub_txg != 0)
- spa->spa_ubsync = ub;
- error = EAGAIN;
- } else {
+ if (dump_opt['c'] || dump_opt['S']) {
+ int ioerr, size;
+ void *data;
+
+ size = BP_GET_LSIZE(bp);
+ data = malloc(size);
+ ioerr = zio_wait(zio_read(NULL, spa, bp, data, size,
+ NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB, zb));
+ free(data);
+
+ /* We expect io errors on intent log */
+ if (ioerr && BP_GET_TYPE(bp) != DMU_OT_INTENT_LOG) {
zcb->zcb_haderrors = 1;
- zcb->zcb_errors[bc->bc_errno]++;
- error = ERESTART;
- }
+ zcb->zcb_errors[ioerr]++;
- if (dump_opt['b'] >= 3 || (dump_opt['b'] >= 2 && bc->bc_errno))
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
- else
- blkbuf[0] = '\0';
-
- if (!dump_opt['S']) {
- (void) printf("zdb_blkptr_cb: Got error %d reading "
- "<%llu, %llu, %lld, %llx> %s -- %s\n",
- bc->bc_errno,
- (u_longlong_t)zb->zb_objset,
- (u_longlong_t)zb->zb_object,
- (u_longlong_t)zb->zb_level,
- (u_longlong_t)zb->zb_blkid,
- blkbuf,
- error == EAGAIN ? "retrying" : "skipping");
+ if (dump_opt['b'] >= 2)
+ sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
+ else
+ blkbuf[0] = '\0';
+
+ if (!dump_opt['S']) {
+ (void) printf("zdb_blkptr_cb: "
+ "Got error %d reading "
+ "<%llu, %llu, %lld, %llx> %s -- skipping\n",
+ ioerr,
+ (u_longlong_t)zb->zb_objset,
+ (u_longlong_t)zb->zb_object,
+ (u_longlong_t)zb->zb_level,
+ (u_longlong_t)zb->zb_blkid,
+ blkbuf);
+ }
}
-
- return (error);
}
zcb->zcb_readfails = 0;
@@ -1566,8 +1547,8 @@ zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
(void) printf("objset %llu object %llu offset 0x%llx %s\n",
(u_longlong_t)zb->zb_objset,
(u_longlong_t)zb->zb_object,
- (u_longlong_t)blkid2offset(bc->bc_dnode,
- zb->zb_level, zb->zb_blkid), blkbuf);
+ (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid),
+ blkbuf);
}
return (0);
@@ -1576,22 +1557,12 @@ zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
static int
dump_block_stats(spa_t *spa)
{
- traverse_handle_t *th;
zdb_cb_t zcb = { 0 };
- traverse_blk_cache_t dummy_cache = { 0 };
zdb_blkstats_t *zb, *tzb;
uint64_t alloc, space, logalloc;
vdev_t *rvd = spa->spa_root_vdev;
int leaks = 0;
- int advance = zdb_advance;
- int c, e, flags;
-
- zcb.zcb_cache = &dummy_cache;
-
- if (dump_opt['c'] || dump_opt['S'])
- advance |= ADVANCE_DATA;
-
- advance |= ADVANCE_PRUNE | ADVANCE_ZIL;
+ int c, e;
if (!dump_opt['S']) {
(void) printf("\nTraversing all blocks to %sverify"
@@ -1607,8 +1578,7 @@ dump_block_stats(spa_t *spa)
* it's not part of any space map) is a double allocation,
* reference to a freed block, or an unclaimed log block.
*/
- if (!dump_opt['L'])
- zdb_leak_init(spa);
+ zdb_leak_init(spa);
/*
* If there's a deferred-free bplist, process that first.
@@ -1634,22 +1604,7 @@ dump_block_stats(spa_t *spa)
bplist_close(bpl);
}
- /*
- * Now traverse the pool. If we're reading all data to verify
- * checksums, do a scrubbing read so that we validate all copies.
- */
- flags = ZIO_FLAG_CANFAIL;
- if (advance & ADVANCE_DATA)
- flags |= ZIO_FLAG_SCRUB;
- th = traverse_init(spa, zdb_blkptr_cb, &zcb, advance, flags);
- th->th_noread = zdb_noread;
-
- traverse_add_pool(th, 0, spa_first_txg(spa) + TXG_CONCURRENT_STATES);
-
- while (traverse_more(th) == EAGAIN)
- continue;
-
- traverse_fini(th);
+ zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb);
if (zcb.zcb_haderrors && !dump_opt['S']) {
(void) printf("\nError counts:\n\n");
@@ -1665,8 +1620,7 @@ dump_block_stats(spa_t *spa)
/*
* Report any leaked segments.
*/
- if (!dump_opt['L'])
- zdb_leak_fini(spa);
+ zdb_leak_fini(spa);
/*
* If we're interested in printing out the blkptr signatures,
@@ -1676,10 +1630,6 @@ dump_block_stats(spa_t *spa)
if (dump_opt['S'])
return (zcb.zcb_haderrors ? 3 : 0);
- if (dump_opt['L'])
- (void) printf("\n\n *** Live pool traversal; "
- "block counts are only approximate ***\n\n");
-
alloc = spa_get_alloc(spa);
space = spa_get_space(spa);
@@ -2285,7 +2235,6 @@ main(int argc, char **argv)
int dump_all = 1;
int verbose = 0;
int error;
- int flag, set;
int exported = 0;
char *vdev_dir = NULL;
@@ -2294,7 +2243,7 @@ main(int argc, char **argv)
dprintf_setup(&argc, argv);
- while ((c = getopt(argc, argv, "udibcsvCLO:B:S:U:lRep:")) != -1) {
+ while ((c = getopt(argc, argv, "udibcsvCS:U:lRep:")) != -1) {
switch (c) {
case 'u':
case 'd':
@@ -2308,49 +2257,6 @@ main(int argc, char **argv)
dump_opt[c]++;
dump_all = 0;
break;
- case 'L':
- dump_opt[c]++;
- break;
- case 'O':
- endstr = optarg;
- if (endstr[0] == '!') {
- endstr++;
- set = 0;
- } else {
- set = 1;
- }
- if (strcmp(endstr, "post") == 0) {
- flag = ADVANCE_PRE;
- set = !set;
- } else if (strcmp(endstr, "pre") == 0) {
- flag = ADVANCE_PRE;
- } else if (strcmp(endstr, "prune") == 0) {
- flag = ADVANCE_PRUNE;
- } else if (strcmp(endstr, "data") == 0) {
- flag = ADVANCE_DATA;
- } else if (strcmp(endstr, "holes") == 0) {
- flag = ADVANCE_HOLES;
- } else {
- usage();
- }
- if (set)
- zdb_advance |= flag;
- else
- zdb_advance &= ~flag;
- break;
- case 'B':
- endstr = optarg - 1;
- zdb_noread.zb_objset = strtoull(endstr + 1, &endstr, 0);
- zdb_noread.zb_object = strtoull(endstr + 1, &endstr, 0);
- zdb_noread.zb_level = strtol(endstr + 1, &endstr, 0);
- zdb_noread.zb_blkid = strtoull(endstr + 1, &endstr, 16);
- (void) printf("simulating bad block "
- "<%llu, %llu, %lld, %llx>\n",
- (u_longlong_t)zdb_noread.zb_objset,
- (u_longlong_t)zdb_noread.zb_object,
- (u_longlong_t)zdb_noread.zb_level,
- (u_longlong_t)zdb_noread.zb_blkid);
- break;
case 'v':
verbose++;
break;
@@ -2387,21 +2293,17 @@ main(int argc, char **argv)
}
}
- if (vdev_dir != NULL && exported == 0)
- (void) fatal("-p option requires use of -e\n");
+ if (vdev_dir != NULL && exported == 0) {
+ (void) fprintf(stderr, "-p option requires use of -e\n");
+ usage();
+ }
kernel_init(FREAD);
g_zfs = libzfs_init();
ASSERT(g_zfs != NULL);
- /*
- * Disable vdev caching. If we don't do this, live pool traversal
- * won't make progress because it will never see disk updates.
- */
- zfs_vdev_cache_size = 0;
-
for (c = 0; c < 256; c++) {
- if (dump_all && c != 'L' && c != 'l' && c != 'R')
+ if (dump_all && c != 'l' && c != 'R')
dump_opt[c] = 1;
if (dump_opt[c])
dump_opt[c] += verbose;
diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c
index f3e7d64..ff55c29 100644
--- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c
+++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c
@@ -77,7 +77,6 @@
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/zap.h>
-#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
@@ -151,7 +150,6 @@ typedef struct ztest_args {
hrtime_t za_start;
hrtime_t za_stop;
hrtime_t za_kill;
- traverse_handle_t *za_th;
/*
* Thread-local variables can go here to aid debugging.
*/
@@ -206,7 +204,6 @@ ztest_info_t ztest_info[] = {
{ ztest_dmu_object_alloc_free, 1, &zopt_always },
{ ztest_zap, 30, &zopt_always },
{ ztest_zap_parallel, 100, &zopt_always },
- { ztest_traverse, 1, &zopt_often },
{ ztest_dsl_prop_get_set, 1, &zopt_sometimes },
{ ztest_dmu_objset_create_destroy, 1, &zopt_sometimes },
{ ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes },
@@ -1447,152 +1444,6 @@ ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
(void) rw_unlock(&ztest_shared->zs_name_lock);
}
-#define ZTEST_TRAVERSE_BLOCKS 1000
-
-static int
-ztest_blk_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
-{
- ztest_args_t *za = arg;
- zbookmark_t *zb = &bc->bc_bookmark;
- blkptr_t *bp = &bc->bc_blkptr;
- dnode_phys_t *dnp = bc->bc_dnode;
- traverse_handle_t *th = za->za_th;
- uint64_t size = BP_GET_LSIZE(bp);
-
- /*
- * Level -1 indicates the objset_phys_t or something in its intent log.
- */
- if (zb->zb_level == -1) {
- if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
- ASSERT3U(zb->zb_object, ==, 0);
- ASSERT3U(zb->zb_blkid, ==, 0);
- ASSERT3U(size, ==, sizeof (objset_phys_t));
- za->za_zil_seq = 0;
- } else if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
- ASSERT3U(zb->zb_object, ==, 0);
- ASSERT3U(zb->zb_blkid, >, za->za_zil_seq);
- za->za_zil_seq = zb->zb_blkid;
- } else {
- ASSERT3U(zb->zb_object, !=, 0); /* lr_write_t */
- }
-
- return (0);
- }
-
- ASSERT(dnp != NULL);
-
- if (bc->bc_errno)
- return (ERESTART);
-
- /*
- * Once in a while, abort the traverse. We only do this to odd
- * instance numbers to ensure that even ones can run to completion.
- */
- if ((za->za_instance & 1) && ztest_random(10000) == 0)
- return (EINTR);
-
- if (bp->blk_birth == 0) {
- ASSERT(th->th_advance & ADVANCE_HOLES);
- return (0);
- }
-
- if (zb->zb_level == 0 && !(th->th_advance & ADVANCE_DATA) &&
- bc == &th->th_cache[ZB_DN_CACHE][0]) {
- ASSERT(bc->bc_data == NULL);
- return (0);
- }
-
- ASSERT(bc->bc_data != NULL);
-
- /*
- * This is an expensive question, so don't ask it too often.
- */
- if (((za->za_random ^ th->th_callbacks) & 0xff) == 0) {
- void *xbuf = umem_alloc(size, UMEM_NOFAIL);
- if (arc_tryread(spa, bp, xbuf) == 0) {
- ASSERT(bcmp(bc->bc_data, xbuf, size) == 0);
- }
- umem_free(xbuf, size);
- }
-
- if (zb->zb_level > 0) {
- ASSERT3U(size, ==, 1ULL << dnp->dn_indblkshift);
- return (0);
- }
-
- ASSERT(zb->zb_level == 0);
- ASSERT3U(size, ==, dnp->dn_datablkszsec << DEV_BSHIFT);
-
- return (0);
-}
-
-/*
- * Verify that live pool traversal works.
- */
-void
-ztest_traverse(ztest_args_t *za)
-{
- spa_t *spa = za->za_spa;
- traverse_handle_t *th = za->za_th;
- int rc, advance;
- uint64_t cbstart, cblimit;
-
- if (th == NULL) {
- advance = 0;
-
- if (ztest_random(2) == 0)
- advance |= ADVANCE_PRE;
-
- if (ztest_random(2) == 0)
- advance |= ADVANCE_PRUNE;
-
- if (ztest_random(2) == 0)
- advance |= ADVANCE_DATA;
-
- if (ztest_random(2) == 0)
- advance |= ADVANCE_HOLES;
-
- if (ztest_random(2) == 0)
- advance |= ADVANCE_ZIL;
-
- th = za->za_th = traverse_init(spa, ztest_blk_cb, za, advance,
- ZIO_FLAG_CANFAIL);
-
- traverse_add_pool(th, 0, -1ULL);
- }
-
- advance = th->th_advance;
- cbstart = th->th_callbacks;
- cblimit = cbstart + ((advance & ADVANCE_DATA) ? 100 : 1000);
-
- while ((rc = traverse_more(th)) == EAGAIN && th->th_callbacks < cblimit)
- continue;
-
- if (zopt_verbose >= 5)
- (void) printf("traverse %s%s%s%s %llu blocks to "
- "<%llu, %llu, %lld, %llx>%s\n",
- (advance & ADVANCE_PRE) ? "pre" : "post",
- (advance & ADVANCE_PRUNE) ? "|prune" : "",
- (advance & ADVANCE_DATA) ? "|data" : "",
- (advance & ADVANCE_HOLES) ? "|holes" : "",
- (u_longlong_t)(th->th_callbacks - cbstart),
- (u_longlong_t)th->th_lastcb.zb_objset,
- (u_longlong_t)th->th_lastcb.zb_object,
- (u_longlong_t)th->th_lastcb.zb_level,
- (u_longlong_t)th->th_lastcb.zb_blkid,
- rc == 0 ? " [done]" :
- rc == EINTR ? " [aborted]" :
- rc == EAGAIN ? "" :
- strerror(rc));
-
- if (rc != EAGAIN) {
- if (rc != 0 && rc != EINTR)
- fatal(0, "traverse_more(%p) = %d", th, rc);
- traverse_fini(th);
- za->za_th = NULL;
- }
-}
-
/*
* Verify dsl_dataset_promote handles EBUSY
*/
@@ -3067,12 +2918,12 @@ ztest_verify_blocks(char *pool)
isa = strdup(isa);
/* LINTED */
(void) sprintf(bin,
- "/usr/sbin%.*s/zdb -bc%s%s -U /tmp/zpool.cache -O %s %s",
+ "/usr/sbin%.*s/zdb -bc%s%s -U /tmp/zpool.cache %s",
isalen,
isa,
zopt_verbose >= 3 ? "s" : "",
zopt_verbose >= 4 ? "v" : "",
- ztest_random(2) == 0 ? "pre" : "post", pool);
+ pool);
free(isa);
if (zopt_verbose >= 5)
@@ -3438,8 +3289,6 @@ ztest_run(char *pool)
while (--t >= 0) {
VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0);
- if (za[t].za_th)
- traverse_fini(za[t].za_th);
if (t < zopt_datasets) {
zil_close(za[t].za_zilog);
dmu_objset_close(za[t].za_os);
diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c b/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
index a13cd76..6365c6c 100644
--- a/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
+++ b/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <assert.h>
#include <fcntl.h>
#include <poll.h>
@@ -842,6 +840,8 @@ kernel_init(int mode)
VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1);
VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1);
+ system_taskq_init();
+
spa_init(mode);
}
diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
index 4ef6472..ee202c8 100644
--- a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
+++ b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
@@ -334,11 +334,14 @@ typedef void (task_func_t)(void *);
#define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */
#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */
+extern taskq_t *system_taskq;
+
extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
extern void taskq_destroy(taskq_t *);
extern void taskq_wait(taskq_t *);
extern int taskq_member(taskq_t *, void *);
+extern void system_taskq_init(void);
#define XVA_MAPSIZE 3
#define XVA_MAGIC 0x78766174
diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c b/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c
index ccf5b4d..93acdcf 100644
--- a/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c
+++ b/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c
@@ -19,15 +19,14 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
int taskq_now;
+taskq_t *system_taskq;
typedef struct task {
struct task *task_next;
@@ -253,3 +252,10 @@ taskq_member(taskq_t *tq, void *t)
return (0);
}
+
+void
+system_taskq_init(void)
+{
+ system_taskq = taskq_create("system_taskq", 64, minclsyspri, 4, 512,
+ TASKQ_DYNAMIC | TASKQ_PREPOPULATE);
+}
diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h
index ef13487..04c74a3 100644
--- a/sys/cddl/boot/zfs/zfsimpl.h
+++ b/sys/cddl/boot/zfs/zfsimpl.h
@@ -66,7 +66,7 @@
#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
#define P2END(x, align) (-(~(x) & -(align)))
#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align)))
-#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1)
+#define P2BOUNDARY(off, len, align) (((off) ^ ((off) + (len) - 1)) > (align) - 1)
/*
* General-purpose 32-bit and 64-bit bitfield encodings.
diff --git a/sys/cddl/compat/opensolaris/sys/sysmacros.h b/sys/cddl/compat/opensolaris/sys/sysmacros.h
index 3c1e9b1..0afc9ca 100644
--- a/sys/cddl/compat/opensolaris/sys/sysmacros.h
+++ b/sys/cddl/compat/opensolaris/sys/sysmacros.h
@@ -43,6 +43,10 @@ extern "C" {
#define ABS(a) ((a) < 0 ? -(a) : (a))
#endif
+#ifndef SIGNOF
+#define SIGNOF(a) ((a) < 0 ? -1 : (a) > 0)
+#endif
+
/*
* Macro for checking power of 2 address alignment.
*/
@@ -63,7 +67,7 @@ extern "C" {
#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
#define P2END(x, align) (-(~(x) & -(align)))
#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align)))
-#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1)
+#define P2BOUNDARY(off, len, align) (((off) ^ ((off) + (len) - 1)) > (align) - 1)
/*
* Determine whether two numbers have the same high-order bit.
*/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index 053c1e1..69ad489 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -308,20 +308,18 @@ dbuf_verify(dmu_buf_impl_t *db)
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
}
- if (db->db_level == 0) {
- /* we can be momentarily larger in dnode_set_blksz() */
- if (db->db_blkid != DB_BONUS_BLKID && dn) {
- ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
- }
- if (db->db.db_object == DMU_META_DNODE_OBJECT) {
- dbuf_dirty_record_t *dr = db->db_data_pending;
- /*
- * it should only be modified in syncing
- * context, so make sure we only have
- * one copy of the data.
- */
- ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
- }
+ /*
+ * We can't assert that db_size matches dn_datablksz because it
+ * can be momentarily different when another thread is doing
+ * dnode_set_blksz().
+ */
+ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+ /*
+ * It should only be modified in syncing context, so
+ * make sure we only have one copy of the data.
+ */
+ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
}
/* verify db->db_blkptr */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
index 5c97cd7..6effae8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
@@ -172,66 +170,59 @@ dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
static int
-backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
+ const dnode_phys_t *dnp, void *arg)
{
struct backuparg *ba = arg;
- uint64_t object = bc->bc_bookmark.zb_object;
- int level = bc->bc_bookmark.zb_level;
- uint64_t blkid = bc->bc_bookmark.zb_blkid;
- blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
- void *data = bc->bc_data;
int err = 0;
if (issig(JUSTLOOKING) && issig(FORREAL))
return (EINTR);
- ASSERT(data || bp == NULL);
-
- if (bp == NULL && object == 0) {
- uint64_t span = BP_SPAN(bc->bc_dnode, level);
- uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
+ if (bp == NULL && zb->zb_object == 0) {
+ uint64_t span = BP_SPAN(dnp, zb->zb_level);
+ uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
} else if (bp == NULL) {
- uint64_t span = BP_SPAN(bc->bc_dnode, level);
- err = dump_free(ba, object, blkid * span, span);
- } else if (data && level == 0 && type == DMU_OT_DNODE) {
- dnode_phys_t *blk = data;
+ uint64_t span = BP_SPAN(dnp, zb->zb_level);
+ err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span);
+ } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
+ return (0);
+ } else if (type == DMU_OT_DNODE) {
+ dnode_phys_t *blk;
int i;
int blksz = BP_GET_LSIZE(bp);
+ uint32_t aflags = ARC_WAIT;
+ arc_buf_t *abuf;
+ if (arc_read_nolock(NULL, spa, bp,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
+ return (EIO);
+
+ blk = abuf->b_data;
for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
- uint64_t dnobj =
- (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
+ uint64_t dnobj = (zb->zb_blkid <<
+ (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
err = dump_dnode(ba, dnobj, blk+i);
if (err)
break;
}
- } else if (level == 0 &&
- type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
+ (void) arc_buf_remove_ref(abuf, &abuf);
+ } else { /* it's a level-0 block of a regular object */
+ uint32_t aflags = ARC_WAIT;
+ arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
- if (data == NULL) {
- uint32_t aflags = ARC_WAIT;
- arc_buf_t *abuf;
- zbookmark_t zb;
-
- zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object;
- zb.zb_object = object;
- zb.zb_level = level;
- zb.zb_blkid = blkid;
- (void) arc_read_nolock(NULL, spa, bp,
- arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
- ZIO_FLAG_MUSTSUCCEED, &aflags, &zb);
-
- if (abuf) {
- err = dump_data(ba, type, object, blkid * blksz,
- blksz, abuf->b_data);
- (void) arc_buf_remove_ref(abuf, &abuf);
- }
- } else {
- err = dump_data(ba, type, object, blkid * blksz,
- blksz, data);
- }
+
+ if (arc_read_nolock(NULL, spa, bp,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
+ return (EIO);
+
+ err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz,
+ blksz, abuf->b_data);
+ (void) arc_buf_remove_ref(abuf, &abuf);
}
ASSERT(err == 0 || err == EINTR);
@@ -311,8 +302,7 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
return (ba.err);
}
- err = traverse_dsl_dataset(ds, fromtxg,
- ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
+ err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
backup_cb, &ba);
if (err) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
index 43bf82e..5e177c5d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
@@ -35,510 +33,88 @@
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
-#include <sys/zvol.h>
-
-#define BP_SPAN_SHIFT(level, width) ((level) * (width))
-
-#define BP_EQUAL(b1, b2) \
- (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \
- (b1)->blk_birth == (b2)->blk_birth)
-
-/*
- * Compare two bookmarks.
- *
- * For ADVANCE_PRE, the visitation order is:
- *
- * objset 0, 1, 2, ..., ZB_MAXOBJSET.
- * object 0, 1, 2, ..., ZB_MAXOBJECT.
- * blkoff 0, 1, 2, ...
- * level ZB_MAXLEVEL, ..., 2, 1, 0.
- *
- * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
- * ordering vector is:
- *
- * < objset, object, blkoff, -level >
- *
- * For ADVANCE_POST, the starting offsets aren't sequential but ending
- * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
- * The visitation order is:
- *
- * objset 1, 2, ..., ZB_MAXOBJSET, 0.
- * object 1, 2, ..., ZB_MAXOBJECT, 0.
- * blkoff 1, 2, ...
- * level 0, 1, 2, ..., ZB_MAXLEVEL.
- *
- * and thus a valid ordering vector is:
- *
- * < objset - 1, object - 1, blkoff, level >
- *
- * Both orderings can be expressed as:
- *
- * < objset + bias, object + bias, blkoff, level ^ bias >
- *
- * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
- * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
- *
- * Special case: an objset's osphys is represented as level -1 of object 0.
- * It is always either the very first or very last block we visit in an objset.
- * Therefore, if either bookmark's level is -1, level alone determines order.
- */
-static int
-compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
- int advance)
-{
- int bias = (advance & ADVANCE_PRE) ? 0 : -1;
- uint64_t sblkoff, eblkoff;
- int slevel, elevel, wshift;
-
- if (szb->zb_objset + bias < ezb->zb_objset + bias)
- return (-1);
-
- if (szb->zb_objset + bias > ezb->zb_objset + bias)
- return (1);
-
- slevel = szb->zb_level;
- elevel = ezb->zb_level;
-
- if ((slevel | elevel) < 0)
- return ((slevel ^ bias) - (elevel ^ bias));
-
- if (szb->zb_object + bias < ezb->zb_object + bias)
- return (-1);
-
- if (szb->zb_object + bias > ezb->zb_object + bias)
- return (1);
-
- if (dnp == NULL)
- return (0);
-
- wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
- eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
-
- if (sblkoff < eblkoff)
- return (-1);
-
- if (sblkoff > eblkoff)
- return (1);
-
- return ((elevel ^ bias) - (slevel ^ bias));
-}
-
-#define SET_BOOKMARK(zb, objset, object, level, blkid) \
-{ \
- (zb)->zb_objset = objset; \
- (zb)->zb_object = object; \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
-
-#define SET_BOOKMARK_LB(zb, level, blkid) \
-{ \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
-
-static int
-advance_objset(zseg_t *zseg, uint64_t objset, int advance)
-{
- zbookmark_t *zb = &zseg->seg_start;
-
- if (advance & ADVANCE_PRE) {
- if (objset >= ZB_MAXOBJSET)
- return (ERANGE);
- SET_BOOKMARK(zb, objset, 0, -1, 0);
- } else {
- if (objset >= ZB_MAXOBJSET)
- objset = 0;
- SET_BOOKMARK(zb, objset, 1, 0, 0);
- }
-
- if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
- return (ERANGE);
-
- return (EAGAIN);
-}
-
-static int
-advance_object(zseg_t *zseg, uint64_t object, int advance)
-{
- zbookmark_t *zb = &zseg->seg_start;
-
- if (advance & ADVANCE_PRE) {
- if (object >= ZB_MAXOBJECT) {
- SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
- } else {
- SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
- }
- } else {
- if (zb->zb_object == 0) {
- SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
- } else {
- if (object >= ZB_MAXOBJECT)
- object = 0;
- SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
- }
- }
-
- if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
- return (ERANGE);
-
- return (EAGAIN);
-}
-
-static int
-advance_from_osphys(zseg_t *zseg, int advance)
-{
- zbookmark_t *zb = &zseg->seg_start;
-
- ASSERT(zb->zb_object == 0);
- ASSERT(zb->zb_level == -1);
- ASSERT(zb->zb_blkid == 0);
-
- if (advance & ADVANCE_PRE) {
- SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
- } else {
- if (zb->zb_objset == 0)
- return (ERANGE);
- SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
- }
-
- if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
- return (ERANGE);
-
- return (EAGAIN);
-}
-
-static int
-advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
-{
- zbookmark_t *zb = &zseg->seg_start;
- int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
- int maxlevel = dnp->dn_nlevels - 1;
- int level = zb->zb_level;
- uint64_t blkid = zb->zb_blkid;
-
- if (advance & ADVANCE_PRE) {
- if (level > 0 && rc == 0) {
- level--;
- blkid <<= wshift;
- } else {
- blkid++;
-
- if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
- dnp->dn_maxblkid)
- return (ERANGE);
-
- while (level < maxlevel) {
- if (P2PHASE(blkid, 1ULL << wshift))
- break;
- blkid >>= wshift;
- level++;
- }
- }
- } else {
- if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
- blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
- level = 0;
- } else {
- blkid >>= wshift;
- level++;
- }
-
- while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
- dnp->dn_maxblkid) {
- if (level == maxlevel)
- return (ERANGE);
- blkid >>= wshift;
- level++;
- }
- }
- SET_BOOKMARK_LB(zb, level, blkid);
-
- if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
- return (ERANGE);
-
- return (EAGAIN);
-}
-
-/*
- * The traverse_callback function will call the function specified in th_func.
- * In the event of an error the callee, specified by th_func, must return
- * one of the following errors:
- *
- * EINTR - Indicates that the callee wants the traversal to
- * abort immediately.
- * ERESTART - The callee has acknowledged the error and would
- * like to continue.
- */
-static int
-traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
-{
- /*
- * Before we issue the callback, prune against maxtxg.
- *
- * We prune against mintxg before we get here because it's a big win.
- * If a given block was born in txg 37, then we know that the entire
- * subtree below that block must have been born in txg 37 or earlier.
- * We can therefore lop off huge branches of the tree as we go.
- *
- * There's no corresponding optimization for maxtxg because knowing
- * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
- * children. In fact, the copy-on-write design of ZFS ensures that
- * top-level blocks will pretty much always be new.
- *
- * Therefore, in the name of simplicity we don't prune against
- * maxtxg until the last possible moment -- that being right now.
- */
- if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
- return (0);
-
- /*
- * Debugging: verify that the order we visit things agrees with the
- * order defined by compare_bookmark(). We don't check this for
- * log blocks because there's no defined ordering for them; they're
- * always visited (or not) as part of visiting the objset_phys_t.
- */
- if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
- zbookmark_t *zb = &bc->bc_bookmark;
- zbookmark_t *szb = &zseg->seg_start;
- zbookmark_t *ezb = &zseg->seg_end;
- zbookmark_t *lzb = &th->th_lastcb;
- dnode_phys_t *dnp = bc->bc_dnode;
-
- ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
- ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
- ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
- lzb->zb_level == ZB_NO_LEVEL);
- *lzb = *zb;
- }
-
- th->th_callbacks++;
- return (th->th_func(bc, th->th_spa, th->th_arg));
-}
-
-static int
-traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
- dnode_phys_t *dnp)
-{
- zbookmark_t *zb = &bc->bc_bookmark;
- int error;
-
- th->th_hits++;
-
- bc->bc_dnode = dnp;
- bc->bc_errno = 0;
-
- if (BP_EQUAL(&bc->bc_blkptr, bp))
- return (0);
-
- bc->bc_blkptr = *bp;
-
- if (bc->bc_data == NULL)
- return (0);
-
- if (BP_IS_HOLE(bp)) {
- ASSERT(th->th_advance & ADVANCE_HOLES);
- return (0);
- }
-
- if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
- error = EIO;
- } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
- error = 0;
- th->th_arc_hits++;
- } else {
- error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
- BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
- th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));
-
- if (BP_SHOULD_BYTESWAP(bp) && error == 0)
- (zb->zb_level > 0 ? byteswap_uint64_array :
- dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
- BP_GET_LSIZE(bp));
- th->th_reads++;
- }
-
- if (error) {
- bc->bc_errno = error;
- error = traverse_callback(th, NULL, bc);
- ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
- bc->bc_blkptr.blk_birth = -1ULL;
- }
-
- dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
- bc - &th->th_cache[0][0], error,
- zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
-
- return (error);
-}
-
-static int
-find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
-{
- zbookmark_t *zb = &zseg->seg_start;
- traverse_blk_cache_t *bc;
- blkptr_t *bp = dnp->dn_blkptr;
- int i, first, level;
- int nbp = dnp->dn_nblkptr;
- int minlevel = zb->zb_level;
- int maxlevel = dnp->dn_nlevels - 1;
- int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
- int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
- uint64_t blkid = zb->zb_blkid >> bp_shift;
- int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
- int rc;
-
- if (minlevel > maxlevel || blkid >= nbp)
- return (ERANGE);
-
- for (level = maxlevel; level >= minlevel; level--) {
- first = P2PHASE(blkid, 1ULL << wshift);
-
- for (i = first; i < nbp; i++)
- if (bp[i].blk_birth > zseg->seg_mintxg ||
- BP_IS_HOLE(&bp[i]) && do_holes)
- break;
-
- if (i != first) {
- i--;
- SET_BOOKMARK_LB(zb, level, blkid + (i - first));
- return (ENOTBLK);
- }
-
- bc = &th->th_cache[depth][level];
-
- SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
- level, blkid);
-
- if (rc = traverse_read(th, bc, bp + i, dnp)) {
- if (rc != EAGAIN) {
- SET_BOOKMARK_LB(zb, level, blkid);
- }
- return (rc);
- }
-
- if (BP_IS_HOLE(&bp[i])) {
- SET_BOOKMARK_LB(zb, level, blkid);
- th->th_lastcb.zb_level = ZB_NO_LEVEL;
- return (0);
- }
-
- nbp = 1 << wshift;
- bp = bc->bc_data;
- bp_shift -= wshift;
- blkid = zb->zb_blkid >> bp_shift;
- }
-
- return (0);
+#include <sys/callb.h>
+
+#define SET_BOOKMARK(zb, objset, object, level, blkid) \
+{ \
+ (zb)->zb_objset = objset; \
+ (zb)->zb_object = object; \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
}
-static int
-get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
- uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
-{
- zseg_t zseg;
- zbookmark_t *zb = &zseg.seg_start;
- uint64_t object = *objectp;
- int i, rc;
-
- SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
- SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
-
- zseg.seg_mintxg = txg;
- zseg.seg_maxtxg = -1ULL;
-
- for (;;) {
- rc = find_block(th, &zseg, mdn, depth);
-
- if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
- break;
-
- if (rc == 0 && zb->zb_level == 0) {
- dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
- for (i = 0; i < DNODES_PER_BLOCK; i++) {
- object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
- if (object >= *objectp &&
- dnp[i].dn_type != DMU_OT_NONE &&
- (type == -1 || dnp[i].dn_type == type)) {
- *objectp = object;
- *dnpp = &dnp[i];
- return (0);
- }
- }
- }
-
- rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
-
- if (rc == ERANGE)
- break;
- }
-
- if (rc == ERANGE)
- *objectp = ZB_MAXOBJECT;
-
- return (rc);
-}
+struct prefetch_data {
+ kmutex_t pd_mtx;
+ kcondvar_t pd_cv;
+ int pd_blks_max;
+ int pd_blks_fetched;
+ int pd_flags;
+ boolean_t pd_cancel;
+ boolean_t pd_exited;
+};
+
+struct traverse_data {
+ spa_t *td_spa;
+ uint64_t td_objset;
+ blkptr_t *td_rootbp;
+ uint64_t td_min_txg;
+ int td_flags;
+ struct prefetch_data *td_pfd;
+ blkptr_cb_t *td_func;
+ void *td_arg;
+};
/* ARGSUSED */
static void
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
- traverse_handle_t *th = arg;
- traverse_blk_cache_t *bc = &th->th_zil_cache;
- zbookmark_t *zb = &bc->bc_bookmark;
- zseg_t *zseg = list_head(&th->th_seglist);
+ struct traverse_data *td = arg;
+ zbookmark_t zb;
- if (bp->blk_birth <= zseg->seg_mintxg)
+ if (bp->blk_birth == 0)
return;
- if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
- zb->zb_object = 0;
- zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
- bc->bc_blkptr = *bp;
- (void) traverse_callback(th, zseg, bc);
- }
+ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
+ return;
+
+ zb.zb_objset = td->td_objset;
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+ VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
}
/* ARGSUSED */
static void
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
- traverse_handle_t *th = arg;
- traverse_blk_cache_t *bc = &th->th_zil_cache;
- zbookmark_t *zb = &bc->bc_bookmark;
- zseg_t *zseg = list_head(&th->th_seglist);
+ struct traverse_data *td = arg;
if (lrc->lrc_txtype == TX_WRITE) {
lr_write_t *lr = (lr_write_t *)lrc;
blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_t zb;
- if (bp->blk_birth <= zseg->seg_mintxg)
+ if (bp->blk_birth == 0)
return;
- if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
- zb->zb_object = lr->lr_foid;
- zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
- bc->bc_blkptr = *bp;
- (void) traverse_callback(th, zseg, bc);
- }
+ if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ return;
+
+ zb.zb_objset = td->td_objset;
+ zb.zb_object = lr->lr_foid;
+ zb.zb_level = BP_GET_LEVEL(bp);
+ zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
+ VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
}
}
static void
-traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
+traverse_zil(struct traverse_data *td, zil_header_t *zh)
{
- spa_t *spa = th->th_spa;
- dsl_pool_t *dp = spa_get_dsl(spa);
- objset_phys_t *osphys = bc->bc_data;
- zil_header_t *zh = &osphys->os_zil_header;
uint64_t claim_txg = zh->zh_claim_txg;
zilog_t *zilog;
- ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
- ASSERT(bc->bc_bookmark.zb_level == -1);
-
/*
* We only want to visit blocks that have been claimed but not yet
* replayed (or, in read-only mode, blocks that *would* be claimed).
@@ -546,375 +122,290 @@ traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
if (claim_txg == 0 && (spa_mode & FWRITE))
return;
- th->th_zil_cache.bc_bookmark = bc->bc_bookmark;
-
- zilog = zil_alloc(dp->dp_meta_objset, zh);
+ zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
- (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
+ (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
claim_txg);
zil_free(zilog);
}
static int
-traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
+traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
+ arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
{
- zbookmark_t *zb = &zseg->seg_start;
- traverse_blk_cache_t *bc;
- dnode_phys_t *dn, *dn_tmp;
- int worklimit = 100;
- int rc;
-
- dprintf("<%llu, %llu, %d, %llx>\n",
- zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
-
- bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
- dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
-
- SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
-
- rc = traverse_read(th, bc, mosbp, dn);
-
- if (rc) /* If we get ERESTART, we've got nowhere left to go */
- return (rc == ERESTART ? EINTR : rc);
-
- ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
-
- if (zb->zb_objset != 0) {
- uint64_t objset = zb->zb_objset;
- dsl_dataset_phys_t *dsp;
-
- rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
- DMU_OT_DSL_DATASET, ZB_MOS_CACHE);
-
- if (objset != zb->zb_objset)
- rc = advance_objset(zseg, objset, th->th_advance);
+ int err = 0;
+ arc_buf_t *buf = NULL;
+ struct prefetch_data *pd = td->td_pfd;
- if (rc != 0)
- return (rc);
-
- dsp = DN_BONUS(dn_tmp);
-
- bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
- dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
-
- SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
-
- /*
- * If we're traversing an open snapshot, we know that it
- * can't be deleted (because it's open) and it can't change
- * (because it's a snapshot). Therefore, once we've gotten
- * from the uberblock down to the snapshot's objset_phys_t,
- * we no longer need to synchronize with spa_sync(); we're
- * traversing a completely static block tree from here on.
- */
- if (th->th_advance & ADVANCE_NOLOCK) {
- ASSERT(th->th_locked);
- rw_exit(spa_traverse_rwlock(th->th_spa));
- th->th_locked = 0;
- }
-
- if (BP_IS_HOLE(&dsp->ds_bp))
- rc = ERESTART;
- else
- rc = traverse_read(th, bc, &dsp->ds_bp, dn);
-
- if (rc != 0) {
- if (rc == ERESTART)
- rc = advance_objset(zseg, zb->zb_objset + 1,
- th->th_advance);
- return (rc);
- }
-
- if (th->th_advance & ADVANCE_PRUNE)
- zseg->seg_mintxg =
- MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
+ if (bp->blk_birth == 0) {
+ err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg);
+ return (err);
}
- if (zb->zb_level == -1) {
- ASSERT(zb->zb_object == 0);
- ASSERT(zb->zb_blkid == 0);
- ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);
-
- if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
- rc = traverse_callback(th, zseg, bc);
- if (rc) {
- ASSERT(rc == EINTR);
- return (rc);
- }
- if ((th->th_advance & ADVANCE_ZIL) &&
- zb->zb_objset != 0)
- traverse_zil(th, bc);
- }
+ if (bp->blk_birth <= td->td_min_txg)
+ return (0);
- return (advance_from_osphys(zseg, th->th_advance));
+ if (pd && !pd->pd_exited &&
+ ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
+ BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
+ mutex_enter(&pd->pd_mtx);
+ ASSERT(pd->pd_blks_fetched >= 0);
+ while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
+ cv_wait(&pd->pd_cv, &pd->pd_mtx);
+ pd->pd_blks_fetched--;
+ cv_broadcast(&pd->pd_cv);
+ mutex_exit(&pd->pd_mtx);
}
- if (zb->zb_object != 0) {
- uint64_t object = zb->zb_object;
-
- rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
- zseg->seg_mintxg, -1, ZB_MDN_CACHE);
-
- if (object != zb->zb_object)
- rc = advance_object(zseg, object, th->th_advance);
-
- if (rc != 0)
- return (rc);
-
- dn = dn_tmp;
+ if (td->td_flags & TRAVERSE_PRE) {
+ err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+ if (err)
+ return (err);
}
- if (zb->zb_level == ZB_MAXLEVEL)
- zb->zb_level = dn->dn_nlevels - 1;
-
- for (;;) {
- rc = find_block(th, zseg, dn, ZB_DN_CACHE);
-
- if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
- break;
-
- if (rc == 0) {
- bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
- ASSERT(bc->bc_dnode == dn);
- ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
- rc = traverse_callback(th, zseg, bc);
- if (rc) {
- ASSERT(rc == EINTR);
- return (rc);
- }
- if (BP_IS_HOLE(&bc->bc_blkptr)) {
- ASSERT(th->th_advance & ADVANCE_HOLES);
- rc = ENOTBLK;
+ if (BP_GET_LEVEL(bp) > 0) {
+ uint32_t flags = ARC_WAIT;
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+
+ err = arc_read(NULL, td->td_spa, bp, pbuf,
+ arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err)
+ return (err);
+
+ /* recursively visitbp() blocks below this */
+ cbp = buf->b_data;
+ for (i = 0; i < epb; i++, cbp++) {
+ zbookmark_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ err = traverse_visitbp(td, dnp, buf, cbp, &czb);
+ if (err)
+ break;
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ uint32_t flags = ARC_WAIT;
+ int i, j;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+ err = arc_read(NULL, td->td_spa, bp, pbuf,
+ arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err)
+ return (err);
+
+ /* recursively visitbp() blocks below this */
+ dnp = buf->b_data;
+ for (i = 0; i < epb && err == 0; i++, dnp++) {
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zbookmark_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset,
+ zb->zb_blkid * epb + i,
+ dnp->dn_nlevels - 1, j);
+ err = traverse_visitbp(td, dnp, buf,
+ (blkptr_t *)&dnp->dn_blkptr[j], &czb);
+ if (err)
+ break;
}
}
-
- rc = advance_block(zseg, dn, rc, th->th_advance);
-
- if (rc == ERANGE)
- break;
-
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ uint32_t flags = ARC_WAIT;
+ objset_phys_t *osp;
+ int j;
+
+ err = arc_read_nolock(NULL, td->td_spa, bp,
+ arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err)
+ return (err);
+
+ osp = buf->b_data;
/*
- * Give spa_sync() a chance to run.
+ * traverse_zil is just here for zdb's leak checking.
+ * For other consumers, there will be no ZIL blocks.
*/
- if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
- th->th_syncs++;
- return (EAGAIN);
- }
-
- if (--worklimit == 0)
- return (EAGAIN);
- }
-
- if (rc == ERANGE)
- rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
-
- return (rc);
-}
+ traverse_zil(td, &osp->os_zil_header);
-/*
- * It is the caller's responsibility to ensure that the dsl_dataset_t
- * doesn't go away during traversal.
- */
-int
-traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
- blkptr_cb_t func, void *arg)
-{
- spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
- traverse_handle_t *th;
- int err;
+ for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
+ zbookmark_t czb;
- th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
+ SET_BOOKMARK(&czb, zb->zb_objset, 0,
+ osp->os_meta_dnode.dn_nlevels - 1, j);
+ err = traverse_visitbp(td, &osp->os_meta_dnode, buf,
+ (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j],
+ &czb);
+ if (err)
+ break;
+ }
+ }
- traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
+ if (buf)
+ (void) arc_buf_remove_ref(buf, &buf);
- while ((err = traverse_more(th)) == EAGAIN)
- continue;
+ if (err == 0 && (td->td_flags & TRAVERSE_POST))
+ err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
- traverse_fini(th);
return (err);
}
-int
-traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg)
+/* ARGSUSED */
+static int
+traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
+ const dnode_phys_t *dnp, void *arg)
{
- spa_t *spa = dmu_objset_spa(os);
- traverse_handle_t *th;
- int err;
-
- th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL);
-
- traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ);
-
- while ((err = traverse_more(th)) == EAGAIN)
- continue;
+ struct prefetch_data *pfd = arg;
+ uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
- traverse_fini(th);
- return (err);
-}
+ ASSERT(pfd->pd_blks_fetched >= 0);
+ if (pfd->pd_cancel)
+ return (EINTR);
-int
-traverse_more(traverse_handle_t *th)
-{
- zseg_t *zseg = list_head(&th->th_seglist);
- uint64_t save_txg; /* XXX won't be necessary with real itinerary */
- krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
- blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
- int rc;
-
- if (zseg == NULL)
+ if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
+ BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0))
return (0);
- th->th_restarts++;
-
- save_txg = zseg->seg_mintxg;
-
- rw_enter(rw, RW_READER);
- th->th_locked = 1;
-
- rc = traverse_segment(th, zseg, mosbp);
- ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
+ mutex_enter(&pfd->pd_mtx);
+ while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
+ cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
+ pfd->pd_blks_fetched++;
+ cv_broadcast(&pfd->pd_cv);
+ mutex_exit(&pfd->pd_mtx);
- if (th->th_locked)
- rw_exit(rw);
- th->th_locked = 0;
-
- zseg->seg_mintxg = save_txg;
-
- if (rc == ERANGE) {
- list_remove(&th->th_seglist, zseg);
- kmem_free(zseg, sizeof (*zseg));
- return (EAGAIN);
- }
+ (void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &aflags, zb);
- return (rc);
+ return (0);
}
-/*
- * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
- * are not included. The blocks covered by this segment will all have
- * mintxg < birth < maxtxg.
- */
static void
-traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
- uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
- uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
+traverse_prefetch_thread(void *arg)
{
- zseg_t *zseg;
-
- zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
+ struct traverse_data *td_main = arg;
+ struct traverse_data td = *td_main;
+ zbookmark_t czb;
- zseg->seg_mintxg = mintxg;
- zseg->seg_maxtxg = maxtxg;
+ td.td_func = traverse_prefetcher;
+ td.td_arg = td_main->td_pfd;
+ td.td_pfd = NULL;
- zseg->seg_start.zb_objset = sobjset;
- zseg->seg_start.zb_object = sobject;
- zseg->seg_start.zb_level = slevel;
- zseg->seg_start.zb_blkid = sblkid;
+ SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0);
+ (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
- zseg->seg_end.zb_objset = eobjset;
- zseg->seg_end.zb_object = eobject;
- zseg->seg_end.zb_level = elevel;
- zseg->seg_end.zb_blkid = eblkid;
-
- list_insert_tail(&th->th_seglist, zseg);
+ mutex_enter(&td_main->td_pfd->pd_mtx);
+ td_main->td_pfd->pd_exited = B_TRUE;
+ cv_broadcast(&td_main->td_pfd->pd_cv);
+ mutex_exit(&td_main->td_pfd->pd_mtx);
}
-void
-traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
- uint64_t objset, uint64_t object)
+/*
+ * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
+ * in syncing context).
+ */
+static int
+traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
+ uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
{
- if (th->th_advance & ADVANCE_PRE)
- traverse_add_segment(th, mintxg, maxtxg,
- objset, object, ZB_MAXLEVEL, 0,
- objset, object, 0, ZB_MAXBLKID);
- else
- traverse_add_segment(th, mintxg, maxtxg,
- objset, object, 0, 0,
- objset, object, 0, ZB_MAXBLKID);
-}
+ struct traverse_data td;
+ struct prefetch_data pd = { 0 };
+ zbookmark_t czb;
+ int err;
-void
-traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
- uint64_t objset)
-{
- if (th->th_advance & ADVANCE_PRE)
- traverse_add_segment(th, mintxg, maxtxg,
- objset, 0, -1, 0,
- objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
- else
- traverse_add_segment(th, mintxg, maxtxg,
- objset, 1, 0, 0,
- objset, 0, -1, 0);
-}
+ td.td_spa = spa;
+ td.td_objset = objset;
+ td.td_rootbp = rootbp;
+ td.td_min_txg = txg_start;
+ td.td_func = func;
+ td.td_arg = arg;
+ td.td_pfd = &pd;
+ td.td_flags = flags;
+
+ pd.pd_blks_max = 100;
+ pd.pd_flags = flags;
+ mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
+
+ if (!(flags & TRAVERSE_PREFETCH) ||
+ 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
+ &td, TQ_NOQUEUE))
+ pd.pd_exited = B_TRUE;
+
+ SET_BOOKMARK(&czb, objset, 0, -1, 0);
+ err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
+
+ mutex_enter(&pd.pd_mtx);
+ pd.pd_cancel = B_TRUE;
+ cv_broadcast(&pd.pd_cv);
+ while (!pd.pd_exited)
+ cv_wait(&pd.pd_cv, &pd.pd_mtx);
+ mutex_exit(&pd.pd_mtx);
+
+ mutex_destroy(&pd.pd_mtx);
+ cv_destroy(&pd.pd_cv);
-void
-traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
-{
- if (th->th_advance & ADVANCE_PRE)
- traverse_add_segment(th, mintxg, maxtxg,
- 0, 0, -1, 0,
- ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
- else
- traverse_add_segment(th, mintxg, maxtxg,
- 1, 1, 0, 0,
- 0, 0, -1, 0);
+ return (err);
}
-traverse_handle_t *
-traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
- int zio_flags)
+/*
+ * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
+ * in syncing context).
+ */
+int
+traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
+ blkptr_cb_t func, void *arg)
{
- traverse_handle_t *th;
- int d, l;
-
- th = kmem_zalloc(sizeof (*th), KM_SLEEP);
-
- th->th_spa = spa;
- th->th_func = func;
- th->th_arg = arg;
- th->th_advance = advance;
- th->th_lastcb.zb_level = ZB_NO_LEVEL;
- th->th_noread.zb_level = ZB_NO_LEVEL;
- th->th_zio_flags = zio_flags;
-
- list_create(&th->th_seglist, sizeof (zseg_t),
- offsetof(zseg_t, seg_node));
-
- for (d = 0; d < ZB_DEPTH; d++) {
- for (l = 0; l < ZB_MAXLEVEL; l++) {
- if ((advance & ADVANCE_DATA) ||
- l != 0 || d != ZB_DN_CACHE)
- th->th_cache[d][l].bc_data =
- zio_buf_alloc(SPA_MAXBLOCKSIZE);
- }
- }
-
- return (th);
+ return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object,
+ &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
}
-void
-traverse_fini(traverse_handle_t *th)
+/*
+ * NB: pool must not be changing on-disk (eg, from zdb or sync context).
+ */
+int
+traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg)
{
- int d, l;
- zseg_t *zseg;
-
- for (d = 0; d < ZB_DEPTH; d++)
- for (l = 0; l < ZB_MAXLEVEL; l++)
- if (th->th_cache[d][l].bc_data != NULL)
- zio_buf_free(th->th_cache[d][l].bc_data,
- SPA_MAXBLOCKSIZE);
-
- while ((zseg = list_head(&th->th_seglist)) != NULL) {
- list_remove(&th->th_seglist, zseg);
- kmem_free(zseg, sizeof (*zseg));
+ int err;
+ uint64_t obj;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ objset_t *mos = dp->dp_meta_objset;
+
+ /* visit the MOS */
+ err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
+ 0, TRAVERSE_PRE, func, arg);
+ if (err)
+ return (err);
+
+ /* visit each dataset */
+ for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) {
+ dmu_object_info_t doi;
+
+ err = dmu_object_info(mos, obj, &doi);
+ if (err)
+ return (err);
+
+ if (doi.doi_type == DMU_OT_DSL_DATASET) {
+ dsl_dataset_t *ds;
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ if (err)
+ return (err);
+ err = traverse_dataset(ds,
+ ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE,
+ func, arg);
+ dsl_dataset_rele(ds, FTAG);
+ if (err)
+ return (err);
+ }
}
-
- list_destroy(&th->th_seglist);
-
- dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
- th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
- th->th_syncs, th->th_restarts);
-
- kmem_free(th, sizeof (*th));
+ if (err == ESRCH)
+ err = 0;
+ return (err);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
index a9f9c54..30e0836 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -1163,12 +1163,13 @@ struct killarg {
/* ARGSUSED */
static int
-kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
+ const dnode_phys_t *dnp, void *arg)
{
struct killarg *ka = arg;
- blkptr_t *bp = &bc->bc_blkptr;
- ASSERT3U(bc->bc_errno, ==, 0);
+ if (bp == NULL)
+ return (0);
ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
(void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
@@ -1196,7 +1197,7 @@ dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (EINVAL);
/*
- * If we made changes this txg, traverse_dsl_dataset won't find
+ * If we made changes this txg, traverse_dataset won't find
* them. Try again.
*/
if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
@@ -1263,8 +1264,8 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
ka.ds = ds;
ka.zio = zio;
ka.tx = tx;
- (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
- ADVANCE_POST, kill_blkptr, &ka);
+ (void) traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
+ TRAVERSE_POST, kill_blkptr, &ka);
(void) zio_wait(zio);
}
@@ -1657,8 +1658,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
ka.ds = ds;
ka.zio = zio;
ka.tx = tx;
- err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
- ADVANCE_POST, kill_blkptr, &ka);
+ err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
+ TRAVERSE_POST, kill_blkptr, &ka);
ASSERT3U(err, ==, 0);
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
ds->ds_phys->ds_unique_bytes == 0);
@@ -2850,6 +2851,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
csa->cds->ds_phys->ds_deadlist_obj));
VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
csa->ohds->ds_phys->ds_deadlist_obj));
+
+ dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx);
}
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
index 58a79ca..e5823c5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -232,6 +232,8 @@ dsl_pool_close(dsl_pool_t *dp)
mutex_destroy(&dp->dp_lock);
mutex_destroy(&dp->dp_scrub_cancel_lock);
taskq_destroy(dp->dp_vnrele_taskq);
+ if (dp->dp_blkstats)
+ kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
kmem_free(dp, sizeof (dsl_pool_t));
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c
index 5f675b7..950a91f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c
@@ -107,6 +107,12 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
/* back to the generic stuff */
+ if (dp->dp_blkstats == NULL) {
+ dp->dp_blkstats =
+ kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+ }
+ bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+
if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
ot = DMU_OT_ZAP_OTHER;
@@ -575,6 +581,37 @@ dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
}
}
+void
+dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+
+ if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
+ return;
+
+ if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) {
+ dp->dp_scrub_bookmark.zb_objset = ds2->ds_object;
+ } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) {
+ dp->dp_scrub_bookmark.zb_objset = ds1->ds_object;
+ }
+
+ if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+ ds1->ds_object, tx) == 0) {
+ int err = zap_add_int(dp->dp_meta_objset,
+ dp->dp_scrub_queue_obj, ds2->ds_object, tx);
+ VERIFY(err == 0 || err == EEXIST);
+ if (err == EEXIST) {
+ /* Both were there to begin with */
+ VERIFY(0 == zap_add_int(dp->dp_meta_objset,
+ dp->dp_scrub_queue_obj, ds1->ds_object, tx));
+ }
+ } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+ ds2->ds_object, tx) == 0) {
+ VERIFY(0 == zap_add_int(dp->dp_meta_objset,
+ dp->dp_scrub_queue_obj, ds1->ds_object, tx));
+ }
+}
+
struct enqueue_clones_arg {
dmu_tx_t *tx;
uint64_t originobj;
@@ -817,6 +854,52 @@ dsl_pool_scrub_restart(dsl_pool_t *dp)
*/
static void
+count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+{
+ int i;
+
+ /*
+ * If we resume after a reboot, zab will be NULL; don't record
+ * incomplete stats in that case.
+ */
+ if (zab == NULL)
+ return;
+
+ for (i = 0; i < 4; i++) {
+ int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
+ int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+ zfs_blkstat_t *zb = &zab->zab_type[l][t];
+ int equal;
+
+ zb->zb_count++;
+ zb->zb_asize += BP_GET_ASIZE(bp);
+ zb->zb_lsize += BP_GET_LSIZE(bp);
+ zb->zb_psize += BP_GET_PSIZE(bp);
+ zb->zb_gangs += BP_COUNT_GANG(bp);
+
+ switch (BP_GET_NDVAS(bp)) {
+ case 2:
+ if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1]))
+ zb->zb_ditto_2_of_2_samevdev++;
+ break;
+ case 3:
+ equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1])) +
+ (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2])) +
+ (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]));
+ if (equal == 1)
+ zb->zb_ditto_2_of_3_samevdev++;
+ else if (equal == 3)
+ zb->zb_ditto_3_of_3_samevdev++;
+ break;
+ }
+ }
+}
+
+static void
dsl_pool_scrub_clean_done(zio_t *zio)
{
spa_t *spa = zio->io_spa;
@@ -844,6 +927,8 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
int zio_priority;
+ count_block(dp->dp_blkstats, bp);
+
if (dp->dp_scrub_isresilver == 0) {
/* It's a scrub */
zio_flags |= ZIO_FLAG_SCRUB;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index 888b882..54c7c46 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -4075,11 +4075,7 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_config_syncing = NULL;
}
- spa->spa_traverse_wanted = B_TRUE;
- rw_enter(&spa->spa_traverse_lock, RW_WRITER);
- spa->spa_traverse_wanted = B_FALSE;
spa->spa_ubsync = spa->spa_uberblock;
- rw_exit(&spa->spa_traverse_lock);
/*
* Clean up the ZIL records for the synced txg.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
index 7a41d4f..5735d31 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -428,8 +428,6 @@ spa_add(const char *name, const char *altroot)
spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
- rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
-
mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_async_root_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -513,8 +511,6 @@ spa_remove(spa_t *spa)
spa_config_lock_destroy(spa);
- rw_destroy(&spa->spa_traverse_lock);
-
cv_destroy(&spa->spa_async_cv);
cv_destroy(&spa->spa_async_root_cv);
cv_destroy(&spa->spa_scrub_io_cv);
@@ -1127,16 +1123,10 @@ zfs_panic_recover(const char *fmt, ...)
* ==========================================================================
*/
-krwlock_t *
-spa_traverse_rwlock(spa_t *spa)
-{
- return (&spa->spa_traverse_lock);
-}
-
boolean_t
-spa_traverse_wanted(spa_t *spa)
+spa_shutting_down(spa_t *spa)
{
- return (spa->spa_traverse_wanted);
+ return (spa->spa_async_suspended);
}
dsl_pool_t *
@@ -1205,7 +1195,7 @@ spa_first_txg(spa_t *spa)
return (spa->spa_first_txg);
}
-int
+pool_state_t
spa_state(spa_t *spa)
{
return (spa->spa_state);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
index 05e5ffd..3e02689 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -26,93 +26,29 @@
#ifndef _SYS_DMU_TRAVERSE_H
#define _SYS_DMU_TRAVERSE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/zio.h>
-#include <sys/dmu.h>
-#include <sys/dnode.h>
-#include <sys/arc.h>
#ifdef __cplusplus
extern "C" {
#endif
-#define ADVANCE_POST 0 /* post-order traversal */
-#define ADVANCE_PRE 0x01 /* pre-order traversal */
-#define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */
-#define ADVANCE_DATA 0x04 /* read user data blocks */
-#define ADVANCE_HOLES 0x08 /* visit holes */
-#define ADVANCE_ZIL 0x10 /* visit intent log blocks */
-#define ADVANCE_NOLOCK 0x20 /* Don't grab SPA sync lock */
-
-#define ZB_NO_LEVEL -2
-#define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */
-#define ZB_MAXBLKID (1ULL << 62)
-#define ZB_MAXOBJSET (1ULL << 62)
-#define ZB_MAXOBJECT (1ULL << 62)
-
-#define ZB_MOS_CACHE 0
-#define ZB_MDN_CACHE 1
-#define ZB_DN_CACHE 2
-#define ZB_DEPTH 3
-
-typedef struct zseg {
- uint64_t seg_mintxg;
- uint64_t seg_maxtxg;
- zbookmark_t seg_start;
- zbookmark_t seg_end;
- list_node_t seg_node;
-} zseg_t;
-
-typedef struct traverse_blk_cache {
- zbookmark_t bc_bookmark;
- blkptr_t bc_blkptr;
- void *bc_data;
- dnode_phys_t *bc_dnode;
- int bc_errno;
- int bc_pad1;
- uint64_t bc_pad2;
-} traverse_blk_cache_t;
-
-typedef int (blkptr_cb_t)(traverse_blk_cache_t *bc, spa_t *spa, void *arg);
-
-struct traverse_handle {
- spa_t *th_spa;
- blkptr_cb_t *th_func;
- void *th_arg;
- uint16_t th_advance;
- uint16_t th_locked;
- int th_zio_flags;
- list_t th_seglist;
- traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL];
- traverse_blk_cache_t th_zil_cache;
- uint64_t th_hits;
- uint64_t th_arc_hits;
- uint64_t th_reads;
- uint64_t th_callbacks;
- uint64_t th_syncs;
- uint64_t th_restarts;
- zbookmark_t th_noread;
- zbookmark_t th_lastcb;
-};
-
-int traverse_dsl_dataset(struct dsl_dataset *ds, uint64_t txg_start,
- int advance, blkptr_cb_t func, void *arg);
-int traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg);
+struct dnode_phys;
+struct dsl_dataset;
-traverse_handle_t *traverse_init(spa_t *spa, blkptr_cb_t *func, void *arg,
- int advance, int zio_flags);
-void traverse_fini(traverse_handle_t *th);
+typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp,
+ const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg);
-void traverse_add_dnode(traverse_handle_t *th,
- uint64_t mintxg, uint64_t maxtxg, uint64_t objset, uint64_t object);
-void traverse_add_objset(traverse_handle_t *th,
- uint64_t mintxg, uint64_t maxtxg, uint64_t objset);
-void traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg);
+#define TRAVERSE_PRE (1<<0)
+#define TRAVERSE_POST (1<<1)
+#define TRAVERSE_PREFETCH_METADATA (1<<2)
+#define TRAVERSE_PREFETCH_DATA (1<<3)
+#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA)
-int traverse_more(traverse_handle_t *th);
+int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start,
+ int flags, blkptr_cb_t func, void *arg);
+int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
index dcf5a44..ef1b904 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
@@ -31,6 +31,7 @@
#include <sys/txg_impl.h>
#include <sys/zfs_context.h>
#include <sys/zio.h>
+#include <sys/dnode.h>
#ifdef __cplusplus
extern "C" {
@@ -48,6 +49,25 @@ enum scrub_func {
SCRUB_FUNC_NUMFUNCS
};
+/* These macros are for indexing into the zfs_all_blkstats_t. */
+#define DMU_OT_DEFERRED DMU_OT_NONE
+#define DMU_OT_TOTAL DMU_OT_NUMTYPES
+
+typedef struct zfs_blkstat {
+ uint64_t zb_count;
+ uint64_t zb_asize;
+ uint64_t zb_lsize;
+ uint64_t zb_psize;
+ uint64_t zb_gangs;
+ uint64_t zb_ditto_2_of_2_samevdev;
+ uint64_t zb_ditto_2_of_3_samevdev;
+ uint64_t zb_ditto_3_of_3_samevdev;
+} zfs_blkstat_t;
+
+typedef struct zfs_all_blkstats {
+ zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
+} zfs_all_blkstats_t;
+
typedef struct dsl_pool {
/* Immutable */
@@ -95,6 +115,8 @@ typedef struct dsl_pool {
* nobody else could possibly have it for write.
*/
krwlock_t dp_config_rwlock;
+
+ zfs_all_blkstats_t *dp_blkstats;
} dsl_pool_t;
int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
@@ -112,6 +134,8 @@ int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
zio_done_func_t *done, void *private, uint32_t arc_flags);
void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
+ struct dmu_tx *tx);
void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
index b0b758b..1cfa7ec 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
@@ -44,7 +44,6 @@ typedef struct spa spa_t;
typedef struct vdev vdev_t;
typedef struct metaslab metaslab_t;
typedef struct zilog zilog_t;
-typedef struct traverse_handle traverse_handle_t;
typedef struct spa_aux_vdev spa_aux_vdev_t;
struct dsl_pool;
@@ -438,8 +437,7 @@ extern void spa_vdev_state_enter(spa_t *spa);
extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
/* Accessor functions */
-extern krwlock_t *spa_traverse_rwlock(spa_t *spa);
-extern boolean_t spa_traverse_wanted(spa_t *spa);
+extern boolean_t spa_shutting_down(spa_t *spa);
extern struct dsl_pool *spa_get_dsl(spa_t *spa);
extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
@@ -450,7 +448,7 @@ extern uint64_t spa_guid(spa_t *spa);
extern uint64_t spa_last_synced_txg(spa_t *spa);
extern uint64_t spa_first_txg(spa_t *spa);
extern uint64_t spa_version(spa_t *spa);
-extern int spa_state(spa_t *spa);
+extern pool_state_t spa_state(spa_t *spa);
extern uint64_t spa_freeze_txg(spa_t *spa);
extern uint64_t spa_get_alloc(spa_t *spa);
extern uint64_t spa_get_space(spa_t *spa);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
index ab41ba6..8aeb414 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
@@ -101,9 +101,8 @@ struct spa {
nvlist_t *spa_config_syncing; /* currently syncing config */
uint64_t spa_config_txg; /* txg of last config change */
int spa_sync_pass; /* iterate-to-convergence */
- int spa_state; /* pool state */
+ pool_state_t spa_state; /* pool state */
int spa_inject_ref; /* injection references */
- uint8_t spa_traverse_wanted; /* traverse lock wanted */
uint8_t spa_sync_on; /* sync threads are running */
spa_load_state_t spa_load_state; /* current load operation */
taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
@@ -125,7 +124,6 @@ struct spa {
uint64_t spa_syncing_txg; /* txg currently syncing */
uint64_t spa_sync_bplist_obj; /* object for deferred frees */
bplist_t spa_sync_bplist; /* deferred-free bplist */
- krwlock_t spa_traverse_lock; /* traverse vs. spa_sync() */
uberblock_t spa_ubsync; /* last synced uberblock */
uberblock_t spa_uberblock; /* current uberblock */
kmutex_t spa_scrub_lock; /* resilver/scrub lock */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
index a58be84..7413c66 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
@@ -26,8 +26,6 @@
#ifndef _SYS_TXG_IMPL_H
#define _SYS_TXG_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/spa.h>
#include <sys/txg.h>
@@ -66,7 +64,6 @@ typedef struct tx_state {
kthread_t *tx_sync_thread;
kthread_t *tx_quiesce_thread;
- kthread_t *tx_timelimit_thread;
} tx_state_t;
#ifdef __cplusplus
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
index 8650fa1..da1e83e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
@@ -309,12 +309,14 @@ txg_sync_thread(void *arg)
uint64_t txg;
/*
- * We sync when there's someone waiting on us, or the
- * quiesce thread has handed off a txg to us, or we have
- * reached our timeout.
+ * We sync when we're scrubbing, there's someone waiting
+ * on us, or the quiesce thread has handed off a txg to
+ * us, or we have reached our timeout.
*/
timer = (delta >= timeout ? 0 : timeout - delta);
- while (!tx->tx_exiting && timer > 0 &&
+ while ((dp->dp_scrub_func == SCRUB_FUNC_NONE ||
+ spa_shutting_down(dp->dp_spa)) &&
+ !tx->tx_exiting && timer > 0 &&
tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
tx->tx_quiesced_txg == 0) {
dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
index aa8f6f0..88c15b7 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
@@ -279,7 +279,7 @@ vdev_cache_read(zio_t *zio)
/*
* If the I/O straddles two or more cache blocks, don't cache it.
*/
- if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
+ if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
return (EXDEV);
ASSERT(cache_phase + zio->io_size <= VCBS);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
index b07a8c1..79a9966 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -94,23 +94,13 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
static kmutex_t zvol_state_lock;
static uint32_t zvol_minors;
-#define NUM_EXTENTS ((SPA_MAXBLOCKSIZE) / sizeof (zvol_extent_t))
-
typedef struct zvol_extent {
+ list_node_t ze_node;
dva_t ze_dva; /* dva associated with this extent */
- uint64_t ze_stride; /* extent stride */
- uint64_t ze_size; /* number of blocks in extent */
+ uint64_t ze_nblks; /* number of blocks in extent */
} zvol_extent_t;
/*
- * The list of extents associated with the dump device
- */
-typedef struct zvol_ext_list {
- zvol_extent_t zl_extents[NUM_EXTENTS];
- struct zvol_ext_list *zl_next;
-} zvol_ext_list_t;
-
-/*
* The in-core state of each volume.
*/
typedef struct zvol_state {
@@ -124,7 +114,7 @@ typedef struct zvol_state {
uint32_t zv_mode; /* DS_MODE_* flags at open time */
uint32_t zv_total_opens; /* total open count */
zilog_t *zv_zilog; /* ZIL handle */
- zvol_ext_list_t *zv_list; /* List of extents for dump */
+ list_t zv_extents; /* List of extents for dump */
uint64_t zv_txg_assign; /* txg to assign during ZIL replay */
znode_t zv_znode; /* for range locking */
int zv_state;
@@ -350,12 +340,12 @@ static void
zvol_serve_one(zvol_state_t *zv, struct bio *bp)
{
uint64_t off, volsize;
- size_t size, resid;
+ size_t resid;
char *addr;
objset_t *os;
rl_t *rl;
int error = 0;
- boolean_t reading;
+ boolean_t doread = (bp->bio_cmd == BIO_READ);
off = bp->bio_offset;
volsize = zv->zv_volsize;
@@ -373,18 +363,16 @@ zvol_serve_one(zvol_state_t *zv, struct bio *bp)
* we can't change the data whilst calculating the checksum.
* A better approach than a per zvol rwlock would be to lock ranges.
*/
- reading = (bp->bio_cmd == BIO_READ);
rl = zfs_range_lock(&zv->zv_znode, off, resid,
- reading ? RL_READER : RL_WRITER);
+ doread ? RL_READER : RL_WRITER);
while (resid != 0 && off < volsize) {
-
- size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */
+ size_t size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */
if (size > volsize - off) /* don't write past the end */
size = volsize - off;
- if (reading) {
+ if (doread) {
error = dmu_read(os, ZVOL_OBJ, off, size, addr);
} else {
dmu_tx_t *tx = dmu_tx_create(os);
@@ -457,128 +445,81 @@ zvol_worker(void *arg)
}
}
-void
-zvol_init_extent(zvol_extent_t *ze, blkptr_t *bp)
-{
- ze->ze_dva = bp->blk_dva[0]; /* structure assignment */
- ze->ze_stride = 0;
- ze->ze_size = 1;
-}
-
/* extent mapping arg */
struct maparg {
- zvol_ext_list_t *ma_list;
- zvol_extent_t *ma_extent;
- int ma_gang;
+ zvol_state_t *ma_zv;
+ uint64_t ma_blks;
};
/*ARGSUSED*/
static int
-zvol_map_block(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+zvol_map_block(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
+ const dnode_phys_t *dnp, void *arg)
{
- zbookmark_t *zb = &bc->bc_bookmark;
- blkptr_t *bp = &bc->bc_blkptr;
- void *data = bc->bc_data;
- dnode_phys_t *dnp = bc->bc_dnode;
- struct maparg *ma = (struct maparg *)arg;
- uint64_t stride;
-
- /* If there is an error, then keep trying to make progress */
- if (bc->bc_errno)
- return (ERESTART);
-
-#ifdef ZFS_DEBUG
- if (zb->zb_level == -1) {
- ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
- ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
- } else {
- ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
- ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
- }
+ struct maparg *ma = arg;
+ zvol_extent_t *ze;
+ int bs = ma->ma_zv->zv_volblocksize;
- if (zb->zb_level > 0) {
- uint64_t fill = 0;
- blkptr_t *bpx, *bpend;
+ if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
+ return (0);
- for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx);
- bpx < bpend; bpx++) {
- if (bpx->blk_birth != 0) {
- fill += bpx->blk_fill;
- } else {
- ASSERT(bpx->blk_fill == 0);
- }
- }
- ASSERT3U(fill, ==, bp->blk_fill);
- }
+ VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
+ ma->ma_blks++;
- if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) {
- uint64_t fill = 0;
- dnode_phys_t *dnx, *dnend;
+ /* Abort immediately if we have encountered gang blocks */
+ if (BP_IS_GANG(bp))
+ return (EFRAGS);
- for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT);
- dnx < dnend; dnx++) {
- if (dnx->dn_type != DMU_OT_NONE)
- fill++;
- }
- ASSERT3U(fill, ==, bp->blk_fill);
+ /*
+ * See if the block is at the end of the previous extent.
+ */
+ ze = list_tail(&ma->ma_zv->zv_extents);
+ if (ze &&
+ DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
+ DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
+ DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
+ ze->ze_nblks++;
+ return (0);
}
-#endif
- if (zb->zb_level || dnp->dn_type == DMU_OT_DNODE)
- return (0);
+ dprintf_bp(bp, "%s", "next blkptr:");
- /* Abort immediately if we have encountered gang blocks */
- if (BP_IS_GANG(bp)) {
- ma->ma_gang++;
- return (EINTR);
- }
+ /* start a new extent */
+ ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
+ ze->ze_dva = bp->blk_dva[0]; /* structure assignment */
+ ze->ze_nblks = 1;
+ list_insert_tail(&ma->ma_zv->zv_extents, ze);
+ return (0);
+}
- /* first time? */
- if (ma->ma_extent->ze_size == 0) {
- zvol_init_extent(ma->ma_extent, bp);
- return (0);
- }
+static void
+zvol_free_extents(zvol_state_t *zv)
+{
+ zvol_extent_t *ze;
- stride = (DVA_GET_OFFSET(&bp->blk_dva[0])) -
- ((DVA_GET_OFFSET(&ma->ma_extent->ze_dva)) +
- (ma->ma_extent->ze_size - 1) * (ma->ma_extent->ze_stride));
- if (DVA_GET_VDEV(BP_IDENTITY(bp)) ==
- DVA_GET_VDEV(&ma->ma_extent->ze_dva)) {
- if (ma->ma_extent->ze_stride == 0) {
- /* second block in this extent */
- ma->ma_extent->ze_stride = stride;
- ma->ma_extent->ze_size++;
- return (0);
- } else if (ma->ma_extent->ze_stride == stride) {
- /*
- * the block we allocated has the same
- * stride
- */
- ma->ma_extent->ze_size++;
- return (0);
- }
+ while (ze = list_head(&zv->zv_extents)) {
+ list_remove(&zv->zv_extents, ze);
+ kmem_free(ze, sizeof (zvol_extent_t));
}
+}
- /*
- * dtrace -n 'zfs-dprintf
- * /stringof(arg0) == "zvol.c"/
- * {
- * printf("%s: %s", stringof(arg1), stringof(arg3))
- * } '
- */
- dprintf("ma_extent 0x%lx mrstride 0x%lx stride %lx\n",
- ma->ma_extent->ze_size, ma->ma_extent->ze_stride, stride);
- dprintf_bp(bp, "%s", "next blkptr:");
- /* start a new extent */
- if (ma->ma_extent == &ma->ma_list->zl_extents[NUM_EXTENTS - 1]) {
- ma->ma_list->zl_next = kmem_zalloc(sizeof (zvol_ext_list_t),
- KM_SLEEP);
- ma->ma_list = ma->ma_list->zl_next;
- ma->ma_extent = &ma->ma_list->zl_extents[0];
- } else {
- ma->ma_extent++;
+static int
+zvol_get_lbas(zvol_state_t *zv)
+{
+ struct maparg ma;
+ int err;
+
+ ma.ma_zv = zv;
+ ma.ma_blks = 0;
+ zvol_free_extents(zv);
+
+ err = traverse_dataset(dmu_objset_ds(zv->zv_objset), 0,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
+ if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
+ zvol_free_extents(zv);
+ return (err ? err : EIO);
}
- zvol_init_extent(ma->ma_extent, bp);
+
return (0);
}
@@ -676,106 +617,6 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
};
/*
- * reconstruct dva that gets us to the desired offset (offset
- * is in bytes)
- */
-int
-zvol_get_dva(zvol_state_t *zv, uint64_t offset, dva_t *dva)
-{
- zvol_ext_list_t *zl;
- zvol_extent_t *ze;
- int idx;
- uint64_t tmp;
-
- if ((zl = zv->zv_list) == NULL)
- return (EIO);
- idx = 0;
- ze = &zl->zl_extents[0];
- while (offset >= ze->ze_size * zv->zv_volblocksize) {
- offset -= ze->ze_size * zv->zv_volblocksize;
-
- if (idx == NUM_EXTENTS - 1) {
- /* we've reached the end of this array */
- ASSERT(zl->zl_next != NULL);
- if (zl->zl_next == NULL)
- return (-1);
- zl = zl->zl_next;
- ze = &zl->zl_extents[0];
- idx = 0;
- } else {
- ze++;
- idx++;
- }
- }
- DVA_SET_VDEV(dva, DVA_GET_VDEV(&ze->ze_dva));
- tmp = DVA_GET_OFFSET((&ze->ze_dva));
- tmp += (ze->ze_stride * (offset / zv->zv_volblocksize));
- DVA_SET_OFFSET(dva, tmp);
- return (0);
-}
-
-static void
-zvol_free_extents(zvol_state_t *zv)
-{
- zvol_ext_list_t *zl;
- zvol_ext_list_t *tmp;
-
- if (zv->zv_list != NULL) {
- zl = zv->zv_list;
- while (zl != NULL) {
- tmp = zl->zl_next;
- kmem_free(zl, sizeof (zvol_ext_list_t));
- zl = tmp;
- }
- zv->zv_list = NULL;
- }
-}
-
-int
-zvol_get_lbas(zvol_state_t *zv)
-{
- struct maparg ma;
- zvol_ext_list_t *zl;
- zvol_extent_t *ze;
- uint64_t blocks = 0;
- int err;
-
- ma.ma_list = zl = kmem_zalloc(sizeof (zvol_ext_list_t), KM_SLEEP);
- ma.ma_extent = &ma.ma_list->zl_extents[0];
- ma.ma_gang = 0;
- zv->zv_list = ma.ma_list;
-
- err = traverse_zvol(zv->zv_objset, ADVANCE_PRE, zvol_map_block, &ma);
- if (err == EINTR && ma.ma_gang) {
- /*
- * We currently don't support dump devices when the pool
- * is so fragmented that our allocation has resulted in
- * gang blocks.
- */
- zvol_free_extents(zv);
- return (EFRAGS);
- }
- ASSERT3U(err, ==, 0);
-
- ze = &zl->zl_extents[0];
- while (ze) {
- blocks += ze->ze_size;
- if (ze == &zl->zl_extents[NUM_EXTENTS - 1]) {
- zl = zl->zl_next;
- ze = &zl->zl_extents[0];
- } else {
- ze++;
- }
- }
- if (blocks != (zv->zv_volsize / zv->zv_volblocksize)) {
- zvol_free_extents(zv);
- return (EIO);
- }
-
- return (0);
-}
-
-/*
* Create a minor node (plus a whole lot more) for the specified volume.
*/
int
@@ -830,6 +671,8 @@ zvol_create_minor(const char *name, major_t maj)
mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));
+ list_create(&zv->zv_extents, sizeof (zvol_extent_t),
+ offsetof(zvol_extent_t, ze_node));
/* get and cache the blocksize */
error = dmu_object_info(os, ZVOL_OBJ, &doi);
ASSERT(error == 0);
@@ -1091,6 +934,8 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize)
if (error == ENOTSUP)
error = EBUSY;
dmu_tx_commit(tx);
+ if (error == 0)
+ zv->zv_volblocksize = volblocksize;
}
end:
mutex_exit(&zvol_state_lock);
@@ -1225,7 +1070,6 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
int error = 0;
objset_t *os = zv->zv_objset;
nvlist_t *nv = NULL;
- uint64_t checksum, compress, refresrv;
ASSERT(MUTEX_HELD(&zvol_state_lock));
@@ -1248,12 +1092,16 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
&zv->zv_volsize, tx);
} else {
+ uint64_t checksum, compress, refresrv, vbs;
+
error = dsl_prop_get_integer(zv->zv_name,
zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
error = error ? error : dsl_prop_get_integer(zv->zv_name,
zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
error = error ? error : dsl_prop_get_integer(zv->zv_name,
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
+ error = error ? error : dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
@@ -1263,6 +1111,9 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
&refresrv, tx);
+ error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
+ &vbs, tx);
}
dmu_tx_commit(tx);
@@ -1288,6 +1139,9 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
ZIO_CHECKSUM_OFF) == 0);
+ VERIFY(nvlist_add_uint64(nv,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+ SPA_MAXBLOCKSIZE) == 0);
error = zfs_set_prop_nvlist(zv->zv_name, nv);
nvlist_free(nv);
@@ -1367,7 +1221,7 @@ zvol_dump_fini(zvol_state_t *zv)
objset_t *os = zv->zv_objset;
nvlist_t *nv;
int error = 0;
- uint64_t checksum, compress, refresrv;
+ uint64_t checksum, compress, refresrv, vbs;
/*
* Attempt to restore the zvol back to its pre-dumpified state.
@@ -1392,6 +1246,8 @@ zvol_dump_fini(zvol_state_t *zv)
zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
+ (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
(void) nvlist_add_uint64(nv,
@@ -1400,6 +1256,8 @@ zvol_dump_fini(zvol_state_t *zv)
zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
(void) nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
+ (void) nvlist_add_uint64(nv,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), vbs);
(void) zfs_set_prop_nvlist(zv->zv_name, nv);
nvlist_free(nv);
OpenPOWER on IntegriCloud