diff options
Diffstat (limited to 'sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c')
-rw-r--r-- | sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c | 888 |
1 files changed, 888 insertions, 0 deletions
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c new file mode 100644 index 0000000..3d2bc3e --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -0,0 +1,888 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dnode.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_impl.h> + +#define BP_SPAN_SHIFT(level, width) ((level) * (width)) + +#define BP_EQUAL(b1, b2) \ + (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \ + (b1)->blk_birth == (b2)->blk_birth) + +/* + * Compare two bookmarks. + * + * For ADVANCE_PRE, the visitation order is: + * + * objset 0, 1, 2, ..., ZB_MAXOBJSET. + * object 0, 1, 2, ..., ZB_MAXOBJECT. + * blkoff 0, 1, 2, ... + * level ZB_MAXLEVEL, ..., 2, 1, 0. + * + * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid + * ordering vector is: + * + * < objset, object, blkoff, -level > + * + * For ADVANCE_POST, the starting offsets aren't sequential but ending + * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are. + * The visitation order is: + * + * objset 1, 2, ..., ZB_MAXOBJSET, 0. + * object 1, 2, ..., ZB_MAXOBJECT, 0. + * blkoff 1, 2, ... + * level 0, 1, 2, ..., ZB_MAXLEVEL. + * + * and thus a valid ordering vector is: + * + * < objset - 1, object - 1, blkoff, level > + * + * Both orderings can be expressed as: + * + * < objset + bias, object + bias, blkoff, level ^ bias > + * + * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST) + * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift). + * + * Special case: an objset's osphys is represented as level -1 of object 0. + * It is always either the very first or very last block we visit in an objset. + * Therefore, if either bookmark's level is -1, level alone determines order. + */ +static int +compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp, + int advance) +{ + int bias = (advance & ADVANCE_PRE) ? 0 : -1; + uint64_t sblkoff, eblkoff; + int slevel, elevel, wshift; + + if (szb->zb_objset + bias < ezb->zb_objset + bias) + return (-1); + + if (szb->zb_objset + bias > ezb->zb_objset + bias) + return (1); + + slevel = szb->zb_level; + elevel = ezb->zb_level; + + if ((slevel | elevel) < 0) + return ((slevel ^ bias) - (elevel ^ bias)); + + if (szb->zb_object + bias < ezb->zb_object + bias) + return (-1); + + if (szb->zb_object + bias > ezb->zb_object + bias) + return (1); + + if (dnp == NULL) + return (0); + + wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; + + sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift); + eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift); + + if (sblkoff < eblkoff) + return (-1); + + if (sblkoff > eblkoff) + return (1); + + return ((elevel ^ bias) - (slevel ^ bias)); +} + +#define SET_BOOKMARK(zb, objset, object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +#define SET_BOOKMARK_LB(zb, level, blkid) \ +{ \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +static int +advance_objset(zseg_t *zseg, uint64_t objset, int advance) +{ + zbookmark_t *zb = &zseg->seg_start; + + if (advance & ADVANCE_PRE) { + if (objset >= ZB_MAXOBJSET) + return (ERANGE); + SET_BOOKMARK(zb, objset, 0, -1, 0); + } else { + if (objset >= ZB_MAXOBJSET) + objset = 0; + SET_BOOKMARK(zb, objset, 1, 0, 0); + } + + if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) + return (ERANGE); + + return (EAGAIN); +} + +static int +advance_object(zseg_t *zseg, uint64_t object, int advance) +{ + zbookmark_t *zb = &zseg->seg_start; + + if (advance & ADVANCE_PRE) { + if (object >= ZB_MAXOBJECT) { + SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0); + } else { + SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0); + } + } else { + if (zb->zb_object == 0) { + SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0); + } else { + if (object >= ZB_MAXOBJECT) + object = 0; + SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0); + } + } + + if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) + return (ERANGE); + + return (EAGAIN); +} + +static int +advance_from_osphys(zseg_t *zseg, int advance) +{ + zbookmark_t *zb = &zseg->seg_start; + + ASSERT(zb->zb_object == 0); + ASSERT(zb->zb_level == -1); + ASSERT(zb->zb_blkid == 0); + + if (advance & ADVANCE_PRE) { + SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0); + } else { + if (zb->zb_objset == 0) + return (ERANGE); + SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0); + } + + if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) + return (ERANGE); + + return (EAGAIN); +} + +static int +advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance) +{ + zbookmark_t *zb = &zseg->seg_start; + int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; + int maxlevel = dnp->dn_nlevels - 1; + int level = zb->zb_level; + uint64_t blkid = zb->zb_blkid; + + if (advance & ADVANCE_PRE) { + if (level > 0 && rc == 0) { + level--; + blkid <<= wshift; + } else { + blkid++; + + if ((blkid << BP_SPAN_SHIFT(level, wshift)) > + dnp->dn_maxblkid) + return (ERANGE); + + while (level < maxlevel) { + if (P2PHASE(blkid, 1ULL << wshift)) + break; + blkid >>= wshift; + level++; + } + } + } else { + if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) { + blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift); + level = 0; + } else { + blkid >>= wshift; + level++; + } + + while ((blkid << BP_SPAN_SHIFT(level, wshift)) > + dnp->dn_maxblkid) { + if (level == maxlevel) + return (ERANGE); + blkid >>= wshift; + level++; + } + } + SET_BOOKMARK_LB(zb, level, blkid); + + if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0) + return (ERANGE); + + return (EAGAIN); +} + +static int +traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc) +{ + /* + * Before we issue the callback, prune against maxtxg. + * + * We prune against mintxg before we get here because it's a big win. + * If a given block was born in txg 37, then we know that the entire + * subtree below that block must have been born in txg 37 or earlier. + * We can therefore lop off huge branches of the tree as we go. + * + * There's no corresponding optimization for maxtxg because knowing + * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's + * children. In fact, the copy-on-write design of ZFS ensures that + * top-level blocks will pretty much always be new. + * + * Therefore, in the name of simplicity we don't prune against + * maxtxg until the last possible moment -- that being right now. + */ + if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg) + return (0); + + /* + * Debugging: verify that the order we visit things agrees with the + * order defined by compare_bookmark(). We don't check this for + * log blocks because there's no defined ordering for them; they're + * always visited (or not) as part of visiting the objset_phys_t. + */ + if (bc->bc_errno == 0 && bc != &th->th_zil_cache) { + zbookmark_t *zb = &bc->bc_bookmark; + zbookmark_t *szb = &zseg->seg_start; + zbookmark_t *ezb = &zseg->seg_end; + zbookmark_t *lzb = &th->th_lastcb; + dnode_phys_t *dnp = bc->bc_dnode; + + ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0); + ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0); + ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 || + lzb->zb_level == ZB_NO_LEVEL); + *lzb = *zb; + } + + th->th_callbacks++; + return (th->th_func(bc, th->th_spa, th->th_arg)); +} + +static int +traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp, + dnode_phys_t *dnp) +{ + zbookmark_t *zb = &bc->bc_bookmark; + int error; + + th->th_hits++; + + bc->bc_dnode = dnp; + bc->bc_errno = 0; + + if (BP_EQUAL(&bc->bc_blkptr, bp)) + return (0); + + bc->bc_blkptr = *bp; + + if (bc->bc_data == NULL) + return (0); + + if (BP_IS_HOLE(bp)) { + ASSERT(th->th_advance & ADVANCE_HOLES); + return (0); + } + + if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) { + error = EIO; + } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) { + error = 0; + th->th_arc_hits++; + } else { + error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data, + BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, + th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb)); + + if (BP_SHOULD_BYTESWAP(bp) && error == 0) + (zb->zb_level > 0 ? byteswap_uint64_array : + dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data, + BP_GET_LSIZE(bp)); + th->th_reads++; + } + + if (error) { + bc->bc_errno = error; + error = traverse_callback(th, NULL, bc); + ASSERT(error == EAGAIN || error == EINTR || error == ERESTART); + bc->bc_blkptr.blk_birth = -1ULL; + } + + dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n", + bc - &th->th_cache[0][0], error, + zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); + + return (error); +} + +static int +find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth) +{ + zbookmark_t *zb = &zseg->seg_start; + traverse_blk_cache_t *bc; + blkptr_t *bp = dnp->dn_blkptr; + int i, first, level; + int nbp = dnp->dn_nblkptr; + int minlevel = zb->zb_level; + int maxlevel = dnp->dn_nlevels - 1; + int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; + int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift); + uint64_t blkid = zb->zb_blkid >> bp_shift; + int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE; + int rc; + + if (minlevel > maxlevel || blkid >= nbp) + return (ERANGE); + + for (level = maxlevel; level >= minlevel; level--) { + first = P2PHASE(blkid, 1ULL << wshift); + + for (i = first; i < nbp; i++) + if (bp[i].blk_birth > zseg->seg_mintxg || + BP_IS_HOLE(&bp[i]) && do_holes) + break; + + if (i != first) { + i--; + SET_BOOKMARK_LB(zb, level, blkid + (i - first)); + return (ENOTBLK); + } + + bc = &th->th_cache[depth][level]; + + SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object, + level, blkid); + + if (rc = traverse_read(th, bc, bp + i, dnp)) { + if (rc != EAGAIN) { + SET_BOOKMARK_LB(zb, level, blkid); + } + return (rc); + } + + if (BP_IS_HOLE(&bp[i])) { + SET_BOOKMARK_LB(zb, level, blkid); + th->th_lastcb.zb_level = ZB_NO_LEVEL; + return (0); + } + + nbp = 1 << wshift; + bp = bc->bc_data; + bp_shift -= wshift; + blkid = zb->zb_blkid >> bp_shift; + } + + return (0); +} + +static int +get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn, + uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth) +{ + zseg_t zseg; + zbookmark_t *zb = &zseg.seg_start; + uint64_t object = *objectp; + int i, rc; + + SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK); + SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID); + + zseg.seg_mintxg = txg; + zseg.seg_maxtxg = -1ULL; + + for (;;) { + rc = find_block(th, &zseg, mdn, depth); + + if (rc == EAGAIN || rc == EINTR || rc == ERANGE) + break; + + if (rc == 0 && zb->zb_level == 0) { + dnode_phys_t *dnp = th->th_cache[depth][0].bc_data; + for (i = 0; i < DNODES_PER_BLOCK; i++) { + object = (zb->zb_blkid * DNODES_PER_BLOCK) + i; + if (object >= *objectp && + dnp[i].dn_type != DMU_OT_NONE && + (type == -1 || dnp[i].dn_type == type)) { + *objectp = object; + *dnpp = &dnp[i]; + return (0); + } + } + } + + rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE); + + if (rc == ERANGE) + break; + } + + if (rc == ERANGE) + *objectp = ZB_MAXOBJECT; + + return (rc); +} + +/* ARGSUSED */ +static void +traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + traverse_handle_t *th = arg; + traverse_blk_cache_t *bc = &th->th_zil_cache; + zbookmark_t *zb = &bc->bc_bookmark; + zseg_t *zseg = list_head(&th->th_seglist); + + if (bp->blk_birth <= zseg->seg_mintxg) + return; + + if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) { + zb->zb_object = 0; + zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; + bc->bc_blkptr = *bp; + (void) traverse_callback(th, zseg, bc); + } +} + +/* ARGSUSED */ +static void +traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +{ + traverse_handle_t *th = arg; + traverse_blk_cache_t *bc = &th->th_zil_cache; + zbookmark_t *zb = &bc->bc_bookmark; + zseg_t *zseg = list_head(&th->th_seglist); + + if (lrc->lrc_txtype == TX_WRITE) { + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + + if (bp->blk_birth <= zseg->seg_mintxg) + return; + + if (claim_txg != 0 && bp->blk_birth >= claim_txg) { + zb->zb_object = lr->lr_foid; + zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); + bc->bc_blkptr = *bp; + (void) traverse_callback(th, zseg, bc); + } + } +} + +static void +traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc) +{ + spa_t *spa = th->th_spa; + dsl_pool_t *dp = spa_get_dsl(spa); + objset_phys_t *osphys = bc->bc_data; + zil_header_t *zh = &osphys->os_zil_header; + uint64_t claim_txg = zh->zh_claim_txg; + zilog_t *zilog; + + ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]); + ASSERT(bc->bc_bookmark.zb_level == -1); + + /* + * We only want to visit blocks that have been claimed but not yet + * replayed (or, in read-only mode, blocks that *would* be claimed). + */ + if (claim_txg == 0 && (spa_mode & FWRITE)) + return; + + th->th_zil_cache.bc_bookmark = bc->bc_bookmark; + + zilog = zil_alloc(dp->dp_meta_objset, zh); + + (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th, + claim_txg); + + zil_free(zilog); +} + +static int +traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) +{ + zbookmark_t *zb = &zseg->seg_start; + traverse_blk_cache_t *bc; + dnode_phys_t *dn, *dn_tmp; + int worklimit = 100; + int rc; + + dprintf("<%llu, %llu, %d, %llx>\n", + zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); + + bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1]; + dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; + + SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0); + + rc = traverse_read(th, bc, mosbp, dn); + + if (rc) /* If we get ERESTART, we've got nowhere left to go */ + return (rc == ERESTART ? EINTR : rc); + + ASSERT(dn->dn_nlevels < ZB_MAXLEVEL); + + if (zb->zb_objset != 0) { + uint64_t objset = zb->zb_objset; + dsl_dataset_phys_t *dsp; + + rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0, + DMU_OT_DSL_DATASET, ZB_MOS_CACHE); + + if (objset != zb->zb_objset) + rc = advance_objset(zseg, objset, th->th_advance); + + if (rc != 0) + return (rc); + + dsp = DN_BONUS(dn_tmp); + + bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]; + dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; + + SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0); + + /* + * If we're traversing an open snapshot, we know that it + * can't be deleted (because it's open) and it can't change + * (because it's a snapshot). Therefore, once we've gotten + * from the uberblock down to the snapshot's objset_phys_t, + * we no longer need to synchronize with spa_sync(); we're + * traversing a completely static block tree from here on. + */ + if (th->th_advance & ADVANCE_NOLOCK) { + ASSERT(th->th_locked); + rw_exit(spa_traverse_rwlock(th->th_spa)); + th->th_locked = 0; + } + + rc = traverse_read(th, bc, &dsp->ds_bp, dn); + + if (rc != 0) { + if (rc == ERESTART) + rc = advance_objset(zseg, zb->zb_objset + 1, + th->th_advance); + return (rc); + } + + if (th->th_advance & ADVANCE_PRUNE) + zseg->seg_mintxg = + MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg); + } + + if (zb->zb_level == -1) { + ASSERT(zb->zb_object == 0); + ASSERT(zb->zb_blkid == 0); + ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET); + + if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) { + rc = traverse_callback(th, zseg, bc); + if (rc) { + ASSERT(rc == EINTR); + return (rc); + } + if ((th->th_advance & ADVANCE_ZIL) && + zb->zb_objset != 0) + traverse_zil(th, bc); + } + + return (advance_from_osphys(zseg, th->th_advance)); + } + + if (zb->zb_object != 0) { + uint64_t object = zb->zb_object; + + rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp, + zseg->seg_mintxg, -1, ZB_MDN_CACHE); + + if (object != zb->zb_object) + rc = advance_object(zseg, object, th->th_advance); + + if (rc != 0) + return (rc); + + dn = dn_tmp; + } + + if (zb->zb_level == ZB_MAXLEVEL) + zb->zb_level = dn->dn_nlevels - 1; + + for (;;) { + rc = find_block(th, zseg, dn, ZB_DN_CACHE); + + if (rc == EAGAIN || rc == EINTR || rc == ERANGE) + break; + + if (rc == 0) { + bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level]; + ASSERT(bc->bc_dnode == dn); + ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth); + rc = traverse_callback(th, zseg, bc); + if (rc) { + ASSERT(rc == EINTR); + return (rc); + } + if (BP_IS_HOLE(&bc->bc_blkptr)) { + ASSERT(th->th_advance & ADVANCE_HOLES); + rc = ENOTBLK; + } + } + + rc = advance_block(zseg, dn, rc, th->th_advance); + + if (rc == ERANGE) + break; + + /* + * Give spa_sync() a chance to run. + */ + if (th->th_locked && spa_traverse_wanted(th->th_spa)) { + th->th_syncs++; + return (EAGAIN); + } + + if (--worklimit == 0) + return (EAGAIN); + } + + if (rc == ERANGE) + rc = advance_object(zseg, zb->zb_object + 1, th->th_advance); + + return (rc); +} + +/* + * It is the caller's responsibility to ensure that the dsl_dataset_t + * doesn't go away during traversal. + */ +int +traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance, + blkptr_cb_t func, void *arg) +{ + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + traverse_handle_t *th; + int err; + + th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED); + + traverse_add_objset(th, txg_start, -1ULL, ds->ds_object); + + while ((err = traverse_more(th)) == EAGAIN) + continue; + + traverse_fini(th); + return (err); +} + +int +traverse_more(traverse_handle_t *th) +{ + zseg_t *zseg = list_head(&th->th_seglist); + uint64_t save_txg; /* XXX won't be necessary with real itinerary */ + krwlock_t *rw = spa_traverse_rwlock(th->th_spa); + blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa); + int rc; + + if (zseg == NULL) + return (0); + + th->th_restarts++; + + save_txg = zseg->seg_mintxg; + + rw_enter(rw, RW_READER); + th->th_locked = 1; + + rc = traverse_segment(th, zseg, mosbp); + ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR); + + if (th->th_locked) + rw_exit(rw); + th->th_locked = 0; + + zseg->seg_mintxg = save_txg; + + if (rc == ERANGE) { + list_remove(&th->th_seglist, zseg); + kmem_free(zseg, sizeof (*zseg)); + return (EAGAIN); + } + + return (rc); +} + +/* + * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves + * are not included. The blocks covered by this segment will all have + * mintxg < birth < maxtxg. + */ +static void +traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, + uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid, + uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid) +{ + zseg_t *zseg; + + zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP); + + zseg->seg_mintxg = mintxg; + zseg->seg_maxtxg = maxtxg; + + zseg->seg_start.zb_objset = sobjset; + zseg->seg_start.zb_object = sobject; + zseg->seg_start.zb_level = slevel; + zseg->seg_start.zb_blkid = sblkid; + + zseg->seg_end.zb_objset = eobjset; + zseg->seg_end.zb_object = eobject; + zseg->seg_end.zb_level = elevel; + zseg->seg_end.zb_blkid = eblkid; + + list_insert_tail(&th->th_seglist, zseg); +} + +void +traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, + uint64_t objset, uint64_t object) +{ + if (th->th_advance & ADVANCE_PRE) + traverse_add_segment(th, mintxg, maxtxg, + objset, object, ZB_MAXLEVEL, 0, + objset, object, 0, ZB_MAXBLKID); + else + traverse_add_segment(th, mintxg, maxtxg, + objset, object, 0, 0, + objset, object, 0, ZB_MAXBLKID); +} + +void +traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, + uint64_t objset) +{ + if (th->th_advance & ADVANCE_PRE) + traverse_add_segment(th, mintxg, maxtxg, + objset, 0, -1, 0, + objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID); + else + traverse_add_segment(th, mintxg, maxtxg, + objset, 1, 0, 0, + objset, 0, -1, 0); +} + +void +traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg) +{ + if (th->th_advance & ADVANCE_PRE) + traverse_add_segment(th, mintxg, maxtxg, + 0, 0, -1, 0, + ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID); + else + traverse_add_segment(th, mintxg, maxtxg, + 1, 1, 0, 0, + 0, 0, -1, 0); +} + +traverse_handle_t * +traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance, + int zio_flags) +{ + traverse_handle_t *th; + int d, l; + + th = kmem_zalloc(sizeof (*th), KM_SLEEP); + + th->th_spa = spa; + th->th_func = func; + th->th_arg = arg; + th->th_advance = advance; + th->th_lastcb.zb_level = ZB_NO_LEVEL; + th->th_noread.zb_level = ZB_NO_LEVEL; + th->th_zio_flags = zio_flags; + + list_create(&th->th_seglist, sizeof (zseg_t), + offsetof(zseg_t, seg_node)); + + for (d = 0; d < ZB_DEPTH; d++) { + for (l = 0; l < ZB_MAXLEVEL; l++) { + if ((advance & ADVANCE_DATA) || + l != 0 || d != ZB_DN_CACHE) + th->th_cache[d][l].bc_data = + zio_buf_alloc(SPA_MAXBLOCKSIZE); + } + } + + return (th); +} + +void +traverse_fini(traverse_handle_t *th) +{ + int d, l; + zseg_t *zseg; + + for (d = 0; d < ZB_DEPTH; d++) + for (l = 0; l < ZB_MAXLEVEL; l++) + if (th->th_cache[d][l].bc_data != NULL) + zio_buf_free(th->th_cache[d][l].bc_data, + SPA_MAXBLOCKSIZE); + + while ((zseg = list_head(&th->th_seglist)) != NULL) { + list_remove(&th->th_seglist, zseg); + kmem_free(zseg, sizeof (*zseg)); + } + + list_destroy(&th->th_seglist); + + dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n", + th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks, + th->th_syncs, th->th_restarts); + + kmem_free(th, sizeof (*th)); +} |