summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: delphij <delphij@FreeBSD.org> 2014-07-29 05:49:16 +0000
committer: delphij <delphij@FreeBSD.org> 2014-07-29 05:49:16 +0000
commit: 2c88e211d5f978575601a6aac6c1a63dd8e88d7e (patch)
tree: 3f08ef3186fb7ff8d249b75348f97e9844851563
parent: 12df8077a6be5397f6d806d09abb9004c3723694 (diff)
download: FreeBSD-src-2c88e211d5f978575601a6aac6c1a63dd8e88d7e.zip
FreeBSD-src-2c88e211d5f978575601a6aac6c1a63dd8e88d7e.tar.gz
MFC r268720: MFV r268714:
Improve extreme rewind import.

When doing an "extreme rewind" import ("zpool import -XF"), we attempt to verify all data in the pool, essentially scrubbing the entire pool. The problem is that spa_load_verify_cb() issues an unbounded number of concurrent scrub i/os. This can lead to all of memory being used for these zio's, wedging the system.

Like normal scrub, we need to put a cap on the number of outstanding i/os, and have the traverse thread block when we reach this cap. For this purpose the cap can be very large (10,000) to optimize the elevator algorithm.

Three kernel tunables have been added:
  vfs.zfs.spa_load_verify_maxinflight
  vfs.zfs.spa_load_verify_metadata
  vfs.zfs.spa_load_verify_data
The latter two tunables control whether metadata and/or user data are verified when doing extreme rewind.

Make 'zpool import -T' imply scrub.

Make zpool import -T <txg> accept hexadecimal values for the txg when prefixed with 0x.

Skip txg's for which there is no uberblock when doing extreme rewind.

Skip reading all user data twice by skipping prefetches when doing extreme rewinds, as we do not access the data via the ARC.

Illumos issues:
  4970 need controls on i/o issued by zpool import -XF
  4971 zpool import -T should accept hex values
  4972 zpool import -T implies extreme rewind, and thus a scrub
  4973 spa_load_retry retries the same txg
  4974 spa_load_verify() reads all data twice
-rw-r--r-- cddl/contrib/opensolaris/cmd/zpool/zpool_main.c 4
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c 74
2 files changed, 63 insertions, 15 deletions
diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
index 7f27bb0..f4855b5 100644
--- a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
+++ b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012 by Frederik Wessels. All rights reserved.
* Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved.
@@ -2033,7 +2033,7 @@ zpool_do_import(int argc, char **argv)
break;
case 'T':
errno = 0;
- txg = strtoull(optarg, &endptr, 10);
+ txg = strtoull(optarg, &endptr, 0);
if (errno != 0 || *endptr != '\0') {
(void) fprintf(stderr,
gettext("invalid txg value\n"));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index c65108e..3a20f9d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -1874,6 +1874,7 @@ spa_load_verify_done(zio_t *zio)
spa_load_error_t *sle = zio->io_private;
dmu_object_type_t type = BP_GET_TYPE(bp);
int error = zio->io_error;
+ spa_t *spa = zio->io_spa;
if (error) {
if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
@@ -1883,23 +1884,65 @@ spa_load_verify_done(zio_t *zio)
atomic_add_64(&sle->sle_data_count, 1);
}
zio_data_buf_free(zio->io_data, zio->io_size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_inflight--;
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
}
+/*
+ * Maximum number of concurrent scrub i/os to create while verifying
+ * a pool while importing it.
+ */
+int spa_load_verify_maxinflight = 10000;
+boolean_t spa_load_verify_metadata = B_TRUE;
+boolean_t spa_load_verify_data = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
+ &spa_load_verify_maxinflight, 0,
+ "Maximum number of concurrent scrub I/Os to create while verifying a "
+ "pool while importing it");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
+ &spa_load_verify_metadata, 0,
+ "Check metadata on import?");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
+ &spa_load_verify_data, 0,
+ "Check user data on import?");
+
/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
- if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
- zio_t *rio = arg;
- size_t size = BP_GET_PSIZE(bp);
- void *data = zio_data_buf_alloc(size);
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+ return (0);
+ /*
+ * Note: normally this routine will not be called if
+ * spa_load_verify_metadata is not set. However, it may be useful
+ * to manually set the flag after the traversal has begun.
+ */
+ if (!spa_load_verify_metadata)
+ return (0);
+ if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
+ return (0);
- zio_nowait(zio_read(rio, spa, bp, data, size,
- spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
- ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
- ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
- }
+ zio_t *rio = arg;
+ size_t size = BP_GET_PSIZE(bp);
+ void *data = zio_data_buf_alloc(size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_inflight++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ zio_nowait(zio_read(rio, spa, bp, data, size,
+ spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
return (0);
}
@@ -1910,7 +1953,7 @@ spa_load_verify(spa_t *spa)
spa_load_error_t sle = { 0 };
zpool_rewind_policy_t policy;
boolean_t verify_ok = B_FALSE;
- int error;
+ int error = 0;
zpool_get_rewind_policy(spa->spa_config, &policy);
@@ -1920,8 +1963,11 @@ spa_load_verify(spa_t *spa)
rio = zio_root(spa, NULL, &sle,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
- error = traverse_pool(spa, spa->spa_verify_min_txg,
- TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
+ if (spa_load_verify_metadata) {
+ error = traverse_pool(spa, spa->spa_verify_min_txg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
+ spa_load_verify_cb, rio);
+ }
(void) zio_wait(rio);
@@ -2796,7 +2842,7 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
spa_unload(spa);
spa_deactivate(spa);
- spa->spa_load_max_txg--;
+ spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
spa_activate(spa, mode);
spa_async_suspend(spa);
@@ -2826,6 +2872,8 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
spa_set_log_state(spa, SPA_LOG_CLEAR);
} else {
spa->spa_load_max_txg = max_request;
+ if (max_request != UINT64_MAX)
+ spa->spa_extreme_rewind = B_TRUE;
}
load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
OpenPOWER on IntegriCloud