diff options
author | mm <mm@FreeBSD.org> | 2010-09-27 09:05:51 +0000 |
---|---|---|
committer | mm <mm@FreeBSD.org> | 2010-09-27 09:05:51 +0000 |
commit | 06d7ad088338844d37a5a4328b1c4bb5de1a68a2 (patch) | |
tree | 51bc74c1fcdcb6583ce0325e215ee1938ced28f3 /sys | |
parent | 776968f31350da9df36c96599691f3e375ccfba4 (diff) | |
download | FreeBSD-src-06d7ad088338844d37a5a4328b1c4bb5de1a68a2.zip FreeBSD-src-06d7ad088338844d37a5a4328b1c4bb5de1a68a2.tar.gz |
Enable offlining of log devices.
OpenSolaris revision and Bug IDs:
9701:cc5b64682e64
6803605 should be able to offline log devices
6726045 vdev_deflate_ratio is not set when offlining a log device
6599442 zpool import has faults in the display
Approved by: delphij (mentor)
Obtained from: OpenSolaris (Bug ID 6803605, 6726045, 6599442)
MFC after: 3 weeks
Diffstat (limited to 'sys')
6 files changed, 147 insertions, 75 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c index d11f106..50cc069 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c @@ -351,7 +351,7 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) return; /* - * One block ("stumpy") can be allocated a long time ago; we + * One block ("stubby") can be allocated a long time ago; we * want to visit that one because it has been allocated * (on-disk) even if it hasn't been claimed (even though for * plain scrub there's nothing to do to it). diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index da030f1..32caec6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -1110,6 +1110,33 @@ spa_check_removed(vdev_t *vd) } /* + * Load the slog device state from the config object since it's possible + * that the label does not contain the most up-to-date information. + */ +void +spa_load_log_state(spa_t *spa) +{ + nvlist_t *nv, *nvroot, **child; + uint64_t is_log; + uint_t children, c; + vdev_t *rvd = spa->spa_root_vdev; + + VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0); + VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0); + + for (c = 0; c < children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log) == 0 && is_log) + vdev_load_log_state(tvd, child[c]); + } + nvlist_free(nv); +} + +/* * Check for missing log devices */ int @@ -1125,13 +1152,7 @@ spa_check_logs(spa_t *spa) return (1); } break; - - case SPA_LOG_CLEAR: - (void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL, - DS_FIND_CHILDREN); - break; } - spa->spa_log_state = SPA_LOG_GOOD; return (0); } @@ -1455,6 +1476,8 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) spa_config_exit(spa, SCL_ALL, FTAG); } + spa_load_log_state(spa); + if (spa_check_logs(spa)) { vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LOG); @@ -1542,6 +1565,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); + spa->spa_log_state = SPA_LOG_GOOD; spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index 1406d15..93e4102 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -259,6 +259,7 @@ extern void vdev_remove_parent(vdev_t *cvd); /* * vdev sync load and sync */ +extern void vdev_load_log_state(vdev_t *vd, nvlist_t *nv); extern void vdev_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h index e992f6a..efbf65e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h @@ -366,9 +366,9 @@ extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid); +extern int zil_vdev_offline(char *osname, void *txarg); extern int zil_claim(char *osname, void *txarg); extern int zil_check_log_chain(char *osname, void *txarg); -extern int zil_clear_log_chain(char *osname, void *txarg); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_clean(zilog_t *zilog); extern int zil_is_committed(zilog_t *zilog); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index 140deed..82006fc 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -39,6 +39,7 @@ #include <sys/zap.h> #include <sys/fs/zfs.h> #include <sys/arc.h> +#include <sys/zil.h> SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); @@ -765,6 +766,15 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ return (0); + /* + * Compute the raidz-deflation ratio. Note, we hard-code + * in 128k (1 << 17) because it is the current "typical" blocksize. + * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change, + * or we will inconsistently account for existing bp's. + */ + vd->vdev_deflate_ratio = (1 << 17) / + (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + ASSERT(oldc <= newc); if (vd->vdev_islog) @@ -998,6 +1008,8 @@ vdev_open(vdev_t *vd) vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); @@ -1113,18 +1125,6 @@ vdev_open(vdev_t *vd) } /* - * If this is a top-level vdev, compute the raidz-deflation - * ratio. Note, we hard-code in 128k (1<<17) because it is the - * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE - * changes, this algorithm must never change, or we will - * inconsistently account for existing bp's. - */ - if (vd->vdev_top == vd) { - vd->vdev_deflate_ratio = (1<<17) / - (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT); - } - - /* * If a leaf vdev has a DTL, and seems healthy, then kick off a * resilver. But don't do this if we are doing a reopen for a scrub, * since this would just restart the scrub we are already doing. @@ -1937,7 +1937,8 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { - vdev_t *vd; + vdev_t *vd, *tvd; + int error; spa_vdev_state_enter(spa); @@ -1947,34 +1948,58 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; + /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, - * don't allow it to be offlined. + * don't allow it to be offlined. Log devices are always + * expendable. */ - if (vd->vdev_aux == NULL && vdev_dtl_required(vd)) + if (!tvd->vdev_islog && vd->vdev_aux == NULL && + vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* * Offline this device and reopen its top-level vdev. - * If this action results in the top-level vdev becoming - * unusable, undo it and fail the request. + * If the top-level vdev is a log device then just offline + * it. Otherwise, if this action results in the top-level + * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; - vdev_reopen(vd->vdev_top); - if (vd->vdev_aux == NULL && vdev_is_dead(vd->vdev_top)) { + vdev_reopen(tvd); + + if (!tvd->vdev_islog && vd->vdev_aux == NULL && + vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; - vdev_reopen(vd->vdev_top); + vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); - return (spa_vdev_state_exit(spa, vd, 0)); + if (!tvd->vdev_islog || !vdev_is_dead(tvd)) + return (spa_vdev_state_exit(spa, vd, 0)); + + (void) spa_vdev_state_exit(spa, vd, 0); + + error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + NULL, DS_FIND_CHILDREN); + if (error) { + (void) vdev_online(spa, guid, 0, NULL); + return (error); + } + /* + * If we successfully offlined the log device then we need to + * sync out the current txg so that the "stubby" block can be + * removed by zil_sync(). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + return (0); } /* @@ -2279,6 +2304,7 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, * childrens', thus not accurate enough for us. */ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); + ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; @@ -2631,11 +2657,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) boolean_t vdev_is_bootable(vdev_t *vd) { -#ifdef __FreeBSD_version - return (B_TRUE); -#else - int c; - +#ifdef sun if (!vd->vdev_ops->vdev_op_leaf) { char *vdev_type = vd->vdev_ops->vdev_op_type; @@ -2654,6 +2676,35 @@ vdev_is_bootable(vdev_t *vd) if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } +#endif /* sun */ return (B_TRUE); -#endif +} + +void +vdev_load_log_state(vdev_t *vd, nvlist_t *nv) +{ + uint_t c, children; + nvlist_t **child; + uint64_t val; + spa_t *spa = vd->vdev_spa; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + vdev_load_log_state(vd->vdev_child[c], child[c]); + } + + if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) { + + /* + * It would be nice to call vdev_offline() + * directly but the pool isn't fully loaded and + * the txg threads have not been started yet. + */ + spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER); + vd->vdev_offline = val; + vdev_reopen(vd->vdev_top); + spa_config_exit(spa, SCL_STATE_ALL, FTAG); + } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c index 7839713..1dd8ea0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -25,6 +25,7 @@ #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/dmu.h> #include <sys/zap.h> #include <sys/arc.h> @@ -515,6 +516,13 @@ zil_claim(char *osname, void *txarg) zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); + if (zilog->zl_spa->spa_log_state == SPA_LOG_CLEAR) { + if (!BP_IS_HOLE(&zh->zh_log)) + zio_free_blk(zilog->zl_spa, &zh->zh_log, first_txg); + BP_ZERO(&zh->zh_log); + dsl_dataset_dirty(dmu_objset_ds(os), tx); + } + /* * Record here whether the zil has any records to replay. * If the header block pointer is null or the block points @@ -527,8 +535,10 @@ zil_claim(char *osname, void *txarg) * Note, the intent log can be empty but still need the * stubby to be claimed. */ - if (!zil_empty(zilog)) + if (!zil_empty(zilog)) { zh->zh_flags |= ZIL_REPLAY_NEEDED; + dsl_dataset_dirty(dmu_objset_ds(os), tx); + } /* * Claim all log blocks if we haven't already done so, and remember @@ -597,36 +607,6 @@ zil_check_log_chain(char *osname, void *txarg) return (error); } -/* - * Clear a log chain - */ -/* ARGSUSED */ -int -zil_clear_log_chain(char *osname, void *txarg) -{ - zilog_t *zilog; - zil_header_t *zh; - objset_t *os; - dmu_tx_t *tx; - int error; - - error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); - if (error) { - cmn_err(CE_WARN, "can't open objset for %s", osname); - return (0); - } - - zilog = dmu_objset_zil(os); - tx = dmu_tx_create(zilog->zl_os); - (void) dmu_tx_assign(tx, TXG_WAIT); - zh = zil_header_in_syncing_context(zilog); - BP_ZERO(&zh->zh_log); - dsl_dataset_dirty(dmu_objset_ds(os), tx); - dmu_tx_commit(tx); - dmu_objset_close(os); - return (0); -} - static int zil_vdev_compare(const void *x1, const void *x2) { @@ -771,9 +751,9 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) } if (lwb->lwb_zio == NULL) { lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - 0, &lwb->lwb_blk, lwb->lwb_buf, - lwb->lwb_sz, zil_lwb_write_done, lwb, - ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_CANFAIL, &zb); + 0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, + zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb); } } @@ -1270,12 +1250,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) } } - for (;;) { - lwb = list_head(&zilog->zl_lwb_list); - if (lwb == NULL) { - mutex_exit(&zilog->zl_lock); - return; - } + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) break; @@ -1692,3 +1667,24 @@ out: mutex_exit(&zilog->zl_lock); return (ret); } + +/* ARGSUSED */ +int +zil_vdev_offline(char *osname, void *arg) +{ + objset_t *os; + zilog_t *zilog; + int error; + + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) + return (error); + + zilog = dmu_objset_zil(os); + if (zil_suspend(zilog) != 0) + error = EEXIST; + else + zil_resume(zilog); + dmu_objset_close(os); + return (error); +} |