diff options
Diffstat (limited to 'cddl/contrib/opensolaris/cmd/zdb/zdb.c')
-rw-r--r-- | cddl/contrib/opensolaris/cmd/zdb/zdb.c | 478 |
1 files changed, 321 insertions, 157 deletions
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c index c265c99..61e8071 100644 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include <stdio.h> @@ -89,6 +89,7 @@ extern void dump_intent_log(zilog_t *); uint64_t *zopt_object = NULL; int zopt_objects = 0; libzfs_handle_t *g_zfs; +uint64_t max_inflight = 200; /* * These libumem hooks provide a reasonable set of defaults for the allocator's @@ -110,16 +111,17 @@ static void usage(void) { (void) fprintf(stderr, - "Usage: %s [-CumdibcsDvhLXFPA] [-t txg] [-e [-p path...]]" - "poolname [object...]\n" - " %s [-divPA] [-e -p path...] dataset [object...]\n" - " %s -m [-LXFPA] [-t txg] [-e [-p path...]]" - "poolname [vdev [metaslab...]]\n" - " %s -R [-A] [-e [-p path...]] poolname " - "vdev:offset:size[:flags]\n" - " %s -S [-PA] [-e [-p path...]] poolname\n" - " %s -l [-uA] device\n" - " %s -C [-A] [-U config]\n\n", + "Usage: %s [-CumdibcsDvhLXFPA] [-t txg] [-e [-p path...]] " + "[-U config] [-M inflight I/Os] poolname [object...]\n" + " %s [-divPA] [-e -p path...] [-U config] dataset " + "[object...]\n" + " %s -m [-LXFPA] [-t txg] [-e [-p path...]] [-U config] " + "poolname [vdev [metaslab...]]\n" + " %s -R [-A] [-e [-p path...]] poolname " + "vdev:offset:size[:flags]\n" + " %s -S [-PA] [-e [-p path...]] [-U config] poolname\n" + " %s -l [-uA] device\n" + " %s -C [-A] [-U config]\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " @@ -164,6 +166,8 @@ usage(void) (void) fprintf(stderr, " -P print numbers in parseable form\n"); (void) fprintf(stderr, " -t <txg> -- highest txg to use when " "searching for uberblocks\n"); + (void) fprintf(stderr, " -M <number of inflight I/Os> -- " + "specify the maximum number of checksumming I/Os [default is 200]"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); @@ -242,7 +246,7 @@ const char histo_stars[] = "****************************************"; const int histo_width = sizeof (histo_stars) - 1; static void -dump_histogram(const uint64_t *histo, int size) +dump_histogram(const uint64_t *histo, int size, int offset) { int i; int minidx = size - 1; @@ -263,7 +267,7 @@ dump_histogram(const uint64_t *histo, int size) for (i = minidx; i <= maxidx; i++) { (void) printf("\t\t\t%3u: %6llu %s\n", - i, (u_longlong_t)histo[i], + i + offset, (u_longlong_t)histo[i], &histo_stars[(max - histo[i]) * histo_width / max]); } } @@ -316,19 +320,19 @@ dump_zap_stats(objset_t *os, uint64_t object) (u_longlong_t)zs.zs_salt); (void) printf("\t\tLeafs with 2^n pointers:\n"); - dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE); + dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks with n*5 entries:\n"); - dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE); + dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks n/10 full:\n"); - dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE); + dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tEntries with n chunks:\n"); - dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE); + dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBuckets with n entries:\n"); - dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE); + dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); } /*ARGSUSED*/ @@ -517,26 +521,89 @@ dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) zap_cursor_fini(&zc); } +int +get_dtl_refcount(vdev_t *vd) +{ + int refcount = 0; + + if (vd->vdev_ops->vdev_op_leaf) { + space_map_t *sm = vd->vdev_dtl_sm; + + if (sm != NULL && + sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) + return (1); + return (0); + } + + for (int c = 0; c < vd->vdev_children; c++) + refcount += get_dtl_refcount(vd->vdev_child[c]); + return (refcount); +} + +int +get_metaslab_refcount(vdev_t *vd) +{ + int refcount = 0; + + if (vd->vdev_top == vd) { + for (int m = 0; m < vd->vdev_ms_count; m++) { + space_map_t *sm = vd->vdev_ms[m]->ms_sm; + + if (sm != NULL && + sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) + refcount++; + } + } + for (int c = 0; c < vd->vdev_children; c++) + refcount += get_metaslab_refcount(vd->vdev_child[c]); + + return (refcount); +} + +static int +verify_spacemap_refcounts(spa_t *spa) +{ + uint64_t expected_refcount = 0; + uint64_t actual_refcount; + + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], + &expected_refcount); + actual_refcount = get_dtl_refcount(spa->spa_root_vdev); + actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); + + if (expected_refcount != actual_refcount) { + (void) printf("space map refcount mismatch: expected %lld != " + "actual %lld\n", + (longlong_t)expected_refcount, + (longlong_t)actual_refcount); + return (2); + } + return (0); +} + static void -dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm) +dump_spacemap(objset_t *os, space_map_t *sm) { uint64_t alloc, offset, entry; - uint8_t mapshift = sm->sm_shift; - uint64_t mapstart = sm->sm_start; char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", "INVALID", "INVALID", "INVALID", "INVALID" }; - if (smo->smo_object == 0) + if (sm == NULL) return; /* * Print out the freelist entries in both encoded and decoded form. */ alloc = 0; - for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) { - VERIFY3U(0, ==, dmu_read(os, smo->smo_object, offset, + for (offset = 0; offset < space_map_length(sm); + offset += sizeof (entry)) { + uint8_t mapshift = sm->sm_shift; + + VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (entry), &entry, DMU_READ_PREFETCH)); if (SM_DEBUG_DECODE(entry)) { + (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n", (u_longlong_t)(offset / sizeof (entry)), ddata[SM_DEBUG_ACTION_DECODE(entry)], @@ -548,10 +615,10 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm) (u_longlong_t)(offset / sizeof (entry)), SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F', (u_longlong_t)((SM_OFFSET_DECODE(entry) << - mapshift) + mapstart), + mapshift) + sm->sm_start), (u_longlong_t)((SM_OFFSET_DECODE(entry) << - mapshift) + mapstart + (SM_RUN_DECODE(entry) << - mapshift)), + mapshift) + sm->sm_start + + (SM_RUN_DECODE(entry) << mapshift)), (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift)); if (SM_TYPE_DECODE(entry) == SM_ALLOC) alloc += SM_RUN_DECODE(entry) << mapshift; @@ -559,10 +626,10 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm) alloc -= SM_RUN_DECODE(entry) << mapshift; } } - if (alloc != smo->smo_alloc) { + if (alloc != space_map_allocated(sm)) { (void) printf("space_map_object alloc (%llu) INCONSISTENT " "with space map summary (%llu)\n", - (u_longlong_t)smo->smo_alloc, (u_longlong_t)alloc); + (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc); } } @@ -570,15 +637,17 @@ static void dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; - space_map_t *sm = msp->ms_map; - avl_tree_t *t = sm->sm_pp_root; - int free_pct = sm->sm_space * 100 / sm->sm_size; + range_tree_t *rt = msp->ms_tree; + avl_tree_t *t = &msp->ms_size_tree; + int free_pct = range_tree_space(rt) * 100 / msp->ms_size; - zdb_nicenum(space_map_maxsize(sm), maxbuf); + zdb_nicenum(metaslab_block_maxsize(msp), maxbuf); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", "segments", avl_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); + (void) printf("\tIn-memory histogram:\n"); + dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void @@ -586,33 +655,44 @@ dump_metaslab(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; - space_map_t *sm = msp->ms_map; - space_map_obj_t *smo = &msp->ms_smo; + space_map_t *sm = msp->ms_sm; char freebuf[32]; - zdb_nicenum(sm->sm_size - smo->smo_alloc, freebuf); + zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf); (void) printf( "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", - (u_longlong_t)(sm->sm_start / sm->sm_size), - (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf); + (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, + (u_longlong_t)space_map_object(sm), freebuf); - if (dump_opt['m'] > 1 && !dump_opt['L']) { + if (dump_opt['m'] > 2 && !dump_opt['L']) { mutex_enter(&msp->ms_lock); - space_map_load_wait(sm); - if (!sm->sm_loaded) - VERIFY(space_map_load(sm, zfs_metaslab_ops, - SM_FREE, smo, spa->spa_meta_objset) == 0); + metaslab_load_wait(msp); + if (!msp->ms_loaded) { + VERIFY0(metaslab_load(msp)); + range_tree_stat_verify(msp->ms_tree); + } dump_metaslab_stats(msp); - space_map_unload(sm); + metaslab_unload(msp); mutex_exit(&msp->ms_lock); } - if (dump_opt['d'] > 5 || dump_opt['m'] > 2) { - ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift)); + if (dump_opt['m'] > 1 && sm != NULL && + spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { + /* + * The space map histogram represents free space in chunks + * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). + */ + (void) printf("\tOn-disk histogram:\n"); + dump_histogram(sm->sm_phys->smp_histogram, + SPACE_MAP_HISTOGRAM_SIZE(sm), sm->sm_shift); + } + + if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { + ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); mutex_enter(&msp->ms_lock); - dump_spacemap(spa->spa_meta_objset, smo, sm); + dump_spacemap(spa->spa_meta_objset, msp->ms_sm); mutex_exit(&msp->ms_lock); } } @@ -684,7 +764,7 @@ dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); - sprintf_blkptr(blkbuf, &blk); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) printf("index %llx refcnt %llu %s %s\n", (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, types[p], blkbuf); @@ -801,9 +881,9 @@ dump_all_ddts(spa_t *spa) } static void -dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size) +dump_dtl_seg(void *arg, uint64_t start, uint64_t size) { - char *prefix = (void *)sm; + char *prefix = arg; (void) printf("%s [%llu,%llu) length %llu\n", prefix, @@ -833,28 +913,32 @@ dump_dtl(vdev_t *vd, int indent) required ? "DTL-required" : "DTL-expendable"); for (int t = 0; t < DTL_TYPES; t++) { - space_map_t *sm = &vd->vdev_dtl[t]; - if (sm->sm_space == 0) + range_tree_t *rt = vd->vdev_dtl[t]; + if (range_tree_space(rt) == 0) continue; (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", indent + 2, "", name[t]); - mutex_enter(sm->sm_lock); - space_map_walk(sm, dump_dtl_seg, (void *)prefix); - mutex_exit(sm->sm_lock); + mutex_enter(rt->rt_lock); + range_tree_walk(rt, dump_dtl_seg, prefix); + mutex_exit(rt->rt_lock); if (dump_opt['d'] > 5 && vd->vdev_children == 0) - dump_spacemap(spa->spa_meta_objset, - &vd->vdev_dtl_smo, sm); + dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); } for (int c = 0; c < vd->vdev_children; c++) dump_dtl(vd->vdev_child[c], indent + 4); } +/* from spa_history.c: spa_history_create_obj() */ +#define HIS_BUF_LEN_DEF (128 << 10) +#define HIS_BUF_LEN_MAX (1 << 30) + static void dump_history(spa_t *spa) { nvlist_t **events = NULL; - char buf[SPA_MAXBLOCKSIZE]; + char *buf = NULL; + uint64_t bufsize = HIS_BUF_LEN_DEF; uint64_t resid, len, off = 0; uint_t num = 0; int error; @@ -863,8 +947,11 @@ dump_history(spa_t *spa) char tbuf[30]; char internalstr[MAXPATHLEN]; + if ((buf = malloc(bufsize)) == NULL) + (void) fprintf(stderr, "Unable to read history: " + "out of memory\n"); do { - len = sizeof (buf); + len = bufsize; if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { (void) fprintf(stderr, "Unable to read history: " @@ -874,9 +961,26 @@ dump_history(spa_t *spa) if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) break; - off -= resid; + + /* + * If the history block is too big, double the buffer + * size and try again. + */ + if (resid == len) { + free(buf); + buf = NULL; + + bufsize <<= 1; + if ((bufsize >= HIS_BUF_LEN_MAX) || + ((buf = malloc(bufsize)) == NULL)) { + (void) fprintf(stderr, "Unable to read history: " + "out of memory\n"); + return; + } + } } while (len != 0); + free(buf); (void) printf("\nHistory:\n"); for (int i = 0; i < num; i++) { @@ -945,31 +1049,39 @@ blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb) } static void -sprintf_blkptr_compact(char *blkbuf, const blkptr_t *bp) +snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) { const dva_t *dva = bp->blk_dva; int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; if (dump_opt['b'] >= 6) { - sprintf_blkptr(blkbuf, bp); + snprintf_blkptr(blkbuf, buflen, bp); return; } blkbuf[0] = '\0'; for (int i = 0; i < ndvas; i++) - (void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ", + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), "%llu:%llx:%llx ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), (u_longlong_t)DVA_GET_ASIZE(&dva[i])); - (void) sprintf(blkbuf + strlen(blkbuf), - "%llxL/%llxP F=%llu B=%llu/%llu", - (u_longlong_t)BP_GET_LSIZE(bp), - (u_longlong_t)BP_GET_PSIZE(bp), - (u_longlong_t)bp->blk_fill, - (u_longlong_t)bp->blk_birth, - (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); + if (BP_IS_HOLE(bp)) { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), "B=%llu", + (u_longlong_t)bp->blk_birth); + } else { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + "%llxL/%llxP F=%llu B=%llu/%llu", + (u_longlong_t)BP_GET_LSIZE(bp), + (u_longlong_t)BP_GET_PSIZE(bp), + (u_longlong_t)bp->blk_fill, + (u_longlong_t)bp->blk_birth, + (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); + } } static void @@ -994,7 +1106,7 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb, } } - sprintf_blkptr_compact(blkbuf, bp); + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } @@ -1009,7 +1121,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, print_indirect(bp, zb, dnp); - if (BP_GET_LEVEL(bp) > 0) { + if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { uint32_t flags = ARC_WAIT; int i; blkptr_t *cbp; @@ -1134,7 +1246,7 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) zdb_nicenum(ds->ds_compressed_bytes, compressed); zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed); zdb_nicenum(ds->ds_unique_bytes, unique); - sprintf_blkptr(blkbuf, &ds->ds_bp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", (u_longlong_t)ds->ds_dir_obj); @@ -1179,7 +1291,7 @@ dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) char blkbuf[BP_SPRINTF_LEN]; if (bp->blk_birth != 0) { - sprintf_blkptr(blkbuf, bp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); } return (0); @@ -1217,7 +1329,7 @@ dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) char blkbuf[BP_SPRINTF_LEN]; ASSERT(bp->blk_birth != 0); - sprintf_blkptr_compact(blkbuf, bp); + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); return (0); } @@ -1716,8 +1828,9 @@ dump_dir(objset_t *os) zdb_nicenum(refdbytes, numbuf); if (verbosity >= 4) { - (void) sprintf(blkbuf, ", rootbp "); - (void) sprintf_blkptr(blkbuf + strlen(blkbuf), os->os_rootbp); + (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); + (void) snprintf_blkptr(blkbuf + strlen(blkbuf), + sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); } else { blkbuf[0] = '\0'; } @@ -1747,7 +1860,7 @@ dump_dir(objset_t *os) if (verbosity < 2) return; - if (os->os_rootbp->blk_birth == 0) + if (BP_IS_HOLE(os->os_rootbp)) return; dump_object(os, 0, verbosity, &print_header); @@ -1788,7 +1901,7 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer) (u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp))); if (dump_opt['u'] >= 3) { char blkbuf[BP_SPRINTF_LEN]; - sprintf_blkptr(blkbuf, &ub->ub_rootbp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } (void) printf(footer ? footer : ""); @@ -2079,16 +2192,68 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } +/* ARGSUSED */ +static void +zdb_blkptr_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + int ioerr = zio->io_error; + zdb_cb_t *zcb = zio->io_private; + zbookmark_t *zb = &zio->io_bookmark; + + zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + + if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + char blkbuf[BP_SPRINTF_LEN]; + + zcb->zcb_haderrors = 1; + zcb->zcb_errors[ioerr]++; + + if (dump_opt['b'] >= 2) + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + else + blkbuf[0] = '\0'; + + (void) printf("zdb_blkptr_cb: " + "Got error %d reading " + "<%llu, %llu, %lld, %llx> %s -- skipping\n", + ioerr, + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, + (u_longlong_t)zb->zb_blkid, + blkbuf); + } + mutex_exit(&spa->spa_scrub_lock); +} + +/* ARGSUSED */ static int zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; - char blkbuf[BP_SPRINTF_LEN]; dmu_object_type_t type; boolean_t is_metadata; - if (bp == NULL) + if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("objset %llu object %llu " + "level %lld offset 0x%llx %s\n", + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (u_longlong_t)blkid2offset(dnp, bp, zb), + blkbuf); + } + + if (BP_IS_HOLE(bp)) return (0); type = BP_GET_TYPE(bp); @@ -2099,53 +2264,26 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) { - int ioerr; size_t size = BP_GET_PSIZE(bp); - void *data = malloc(size); + void *data = zio_data_buf_alloc(size); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) flags |= ZIO_FLAG_SPECULATIVE; - ioerr = zio_wait(zio_read(NULL, spa, bp, data, size, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, flags, zb)); + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > max_inflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); - free(data); - if (ioerr && !(flags & ZIO_FLAG_SPECULATIVE)) { - zcb->zcb_haderrors = 1; - zcb->zcb_errors[ioerr]++; - - if (dump_opt['b'] >= 2) - sprintf_blkptr(blkbuf, bp); - else - blkbuf[0] = '\0'; - - (void) printf("zdb_blkptr_cb: " - "Got error %d reading " - "<%llu, %llu, %lld, %llx> %s -- skipping\n", - ioerr, - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (u_longlong_t)zb->zb_level, - (u_longlong_t)zb->zb_blkid, - blkbuf); - } + zio_nowait(zio_read(NULL, spa, bp, data, size, + zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } zcb->zcb_readfails = 0; - if (dump_opt['b'] >= 5) { - sprintf_blkptr(blkbuf, bp); - (void) printf("objset %llu object %llu " - "level %lld offset 0x%llx %s\n", - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (u_longlong_t)blkid2offset(dnp, bp, zb), - blkbuf); - } - if (dump_opt['b'] < 5 && isatty(STDERR_FILENO) && gethrtime() > zcb->zcb_lastprint + NANOSEC) { uint64_t now = gethrtime(); @@ -2172,39 +2310,17 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } static void -zdb_leak(space_map_t *sm, uint64_t start, uint64_t size) +zdb_leak(void *arg, uint64_t start, uint64_t size) { - vdev_t *vd = sm->sm_ppd; + vdev_t *vd = arg; (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); } -/* ARGSUSED */ -static void -zdb_space_map_load(space_map_t *sm) -{ -} - -static void -zdb_space_map_unload(space_map_t *sm) -{ - space_map_vacate(sm, zdb_leak, sm); -} - -/* ARGSUSED */ -static void -zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) -{ -} - -static space_map_ops_t zdb_space_map_ops = { - zdb_space_map_load, - zdb_space_map_unload, +static metaslab_ops_t zdb_metaslab_ops = { NULL, /* alloc */ - zdb_space_map_claim, - NULL, /* free */ - NULL /* maxsize */ + NULL /* fragmented */ }; static void @@ -2259,11 +2375,21 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; mutex_enter(&msp->ms_lock); - space_map_unload(msp->ms_map); - VERIFY(space_map_load(msp->ms_map, - &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo, - spa->spa_meta_objset) == 0); - msp->ms_map->sm_ppd = vd; + metaslab_unload(msp); + + /* + * For leak detection, we overload the metaslab + * ms_tree to contain allocated segments + * instead of free segments. As a result, + * we can't use the normal metaslab_load/unload + * interfaces. + */ + if (msp->ms_sm != NULL) { + msp->ms_ops = &zdb_metaslab_ops; + VERIFY0(space_map_load(msp->ms_sm, + msp->ms_tree, SM_ALLOC)); + msp->ms_loaded = B_TRUE; + } mutex_exit(&msp->ms_lock); } } @@ -2286,7 +2412,20 @@ zdb_leak_fini(spa_t *spa) for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; mutex_enter(&msp->ms_lock); - space_map_unload(msp->ms_map); + + /* + * The ms_tree has been overloaded to + * contain allocated segments. Now that we + * finished traversing all blocks, any + * block that remains in the ms_tree + * represents an allocated block that we + * did not claim during the traversal. + * Claimed blocks would have been removed + * from the ms_tree. + */ + range_tree_vacate(msp->ms_tree, zdb_leak, vd); + msp->ms_loaded = B_FALSE; + mutex_exit(&msp->ms_lock); } } @@ -2301,7 +2440,7 @@ count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) if (dump_opt['b'] >= 5) { char blkbuf[BP_SPRINTF_LEN]; - sprintf_blkptr(blkbuf, bp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("[%s] %s\n", "deferred free", blkbuf); } @@ -2344,8 +2483,7 @@ dump_block_stats(spa_t *spa) (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, count_block_cb, &zcb, NULL); } - if (spa_feature_is_active(spa, - &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, &zcb, NULL)); @@ -2358,6 +2496,18 @@ dump_block_stats(spa_t *spa) zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); + /* + * If we've traversed the data blocks then we need to wait for those + * I/Os to complete. We leverage "The Godfather" zio to wait on + * all async I/Os to complete. + */ + if (dump_opt['c']) { + (void) zio_wait(spa->spa_async_zio_root); + spa->spa_async_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + } + if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); @@ -2489,7 +2639,7 @@ dump_block_stats(spa_t *spa) "(in 512-byte sectors): " "number of blocks\n"); dump_histogram(zb->zb_psize_histogram, - PSIZE_HISTO_SIZE); + PSIZE_HISTO_SIZE, 0); } } } @@ -2524,7 +2674,7 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; - if (bp == NULL) + if (BP_IS_HOLE(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { @@ -2591,7 +2741,8 @@ dump_simulated_ddt(spa_t *spa) dds.dds_ref_psize = zdde->zdde_ref_psize; dds.dds_ref_dsize = zdde->zdde_ref_dsize; - ddt_stat_add(&ddh_total.ddh_stat[highbit(refcnt) - 1], &dds, 0); + ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], + &dds, 0); umem_free(zdde, sizeof (*zdde)); } @@ -2646,7 +2797,7 @@ dump_zpool(spa_t *spa) } if (spa_feature_is_active(spa, - &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + SPA_FEATURE_ASYNC_DESTROY)) { dump_bptree(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, "Pool dataset frees"); @@ -2659,6 +2810,9 @@ dump_zpool(spa_t *spa) if (dump_opt['b'] || dump_opt['c']) rc = dump_block_stats(spa); + if (rc == 0) + rc = verify_spacemap_refcounts(spa); + if (dump_opt['s']) show_pool_stats(spa); @@ -2688,7 +2842,7 @@ zdb_print_blkptr(blkptr_t *bp, int flags) if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); - sprintf_blkptr(blkbuf, bp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } @@ -2884,6 +3038,7 @@ zdb_read_block(char *thing, spa_t *spa) free(dup); return; } + i += p - &flagstr[i + 1]; /* skip over the number */ } } @@ -3124,7 +3279,7 @@ main(int argc, char **argv) dprintf_setup(&argc, argv); - while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:P")) != -1) { + while ((c = getopt(argc, argv, "bcdhilmM:suCDRSAFLXevp:t:U:P")) != -1) { switch (c) { case 'b': case 'c': @@ -3153,6 +3308,15 @@ main(int argc, char **argv) case 'v': verbose++; break; + case 'M': + max_inflight = strtoull(optarg, NULL, 0); + if (max_inflight == 0) { + (void) fprintf(stderr, "maximum number " + "of inflight I/Os must be greater " + "than 0\n"); + usage(); + } + break; case 'p': if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), |