summaryrefslogtreecommitdiffstats
path: root/cddl
diff options
context:
space:
mode:
authorgibbs <gibbs@FreeBSD.org>2013-08-21 04:10:24 +0000
committergibbs <gibbs@FreeBSD.org>2013-08-21 04:10:24 +0000
commitbd47afb2893407b47a1dc3c9165507d2aa77b5b7 (patch)
treefd4721b7f443de53ce7e74f87c485e2d9f3c7dc1 /cddl
parentd5eb41b48a84cd4efc8be15b34a5cedb8e60de26 (diff)
downloadFreeBSD-src-bd47afb2893407b47a1dc3c9165507d2aa77b5b7.zip
FreeBSD-src-bd47afb2893407b47a1dc3c9165507d2aa77b5b7.tar.gz
Enhance the ZFS vdev layer to maintain both a logical and a physical
minimum allocation size for devices. Use this information to automatically increase ZFS's minimum allocation size for new top-level vdevs to a value that more closely matches the optimum device allocation size. Use GEOM's stripesize attribute, if set, as the physical sector size of the GEOM. Calculate the minimum blocksize of each metaslab class. Use the calculated value instead of SPA_MINBLOCKSIZE (512b) when determining the likelyhood of compression yeilding a reduction in physical space usage. Report devices with sub-optimal block size configuration in "zpool status". Also properly fail attempts to attach devices with a logical block size greater than 8kB, since this will cause corruption to ZFS's label area. Sponsored by: Spectra Logic Corporaion MFC after: 2 weeks Background ========== Many modern devices use physical allocation units that are much larger than the minimum logical allocation size accessible by external commands. Two prevalent examples of this are 512e disk drives (512b logical sector, 4K physical sector) and flash devices (512b logical sector, 4K or larger allocation block size, and 128k or larger erase block size). Operations that modify less than the physical sector size result in a costly read-modify-write or garbage collection sequence on these devices. Simply exporting the true physical sector of the device to ZFS would yield optimal performance, but has two serious drawbacks: 1) Existing pools created with devices that have different logical and physical block sizes, but were configured to use the logical block size (e.g. because the OS version used for pool construction reported the logical block size instead of the physical block size) will suddenly find that the vdev allocation size has increased. This can be easily tolerated for active members of the array, but ZFS would prevent replacement of a vdev with another identical device because it now appears that the smaller allocation size required by the pool is not supported by the new device. 2) The device's physical block size may be too large to be supported by ZFS. The optimal allocation size for the vdev may be quite large. For example, a RAID controller may export a vdev that requires read-modify-write cycles unless accessed using 64k aligned/sized requests. ZFS currently has an 8k minimum block size limit. Reporting both the logical and physical allocation sizes for vdevs solves these problems. A device may be used so long as the logical block size is compatible with the configuration. By comparing the logical and physical block sizes, new configurations can be optimized and administrators can be notified of any existing pools that are sub-optimal. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h: Add the SPA_ASHIFT constant. ZFS currently has a hard upper limit of 13 (8k) for ashift and this constant is used to both document and enforce this limit. sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h: Add the VDEV_AUX_ASHIFT_TOO_BIG error code. Add fields for exporting the configured, logical, and physical ashift to the vdev_stat_t structure. Add VDEV_STAT_VALID() macro which can be used to verify the presence of required vdev_stat_t fields in nvlist data. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c: Provide a SYSCTL_PROC handler for "max_auto_ashift". Since the limit is only referenced long after boot when a create operation occurs, there's no compelling need for it to be a boot time configurable tunable. This also allows the validation code for the max_auto_ashift value to be contained within the sysctl handler. Populate the new fields in the vdev_stat_t structure. Fail vdev opens if the vdev reports an ashift larger than SPA_MAXASHIFT. Propogate vdev_logical_ashift and vdev_physical_ashift between child and parent vdevs as is done for vdev_ashift. In vdev_open(), restore code that fails opens for devices where vdev_ashift grows. This can only happen now if the device's logical ashift grows, which means it really isn't safe to use the device. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c: Update the vdev_open() API so that both logical (what was just ashift before) and physical ashift are reported. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h: Add two new fields, vdev_physical_ashift and vdev_logical_ashift, to vdev_t. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c: Add vdev_ashift_optimize(). Call it anytime a new top-level vdev is allocated. cddl/contrib/opensolaris/cmd/zpool/zpool_main.c: Add text for the VDEV_AUX_ASHIFT_TOO_BIG error. For each sub-optimally configured leaf vdev, report configured and native block sizes. cddl/contrib/opensolaris/cmd/zpool/zpool_main.c: cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h: cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c: Introduce a new zpool status: ZPOOL_STATUS_NON_NATIVE_ASHIFT. This status is reported on healthy pools containing vdevs configured to use a block size smaller than their reported physical block size. cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c: Update find_vdev_problem() and supporting functions to provide the full vdev_stat_t structure to problem checking routines, and to allow decent into replacing vdevs. Add a vdev_non_native_ashift() validator which is used on the full vdev tree to check for ZPOOL_STATUS_NON_NATIVE_ASHIFT. cddl/contrib/opensolaris/lib/libzpool/common/kernel.c: cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h: Enhance sysctl userland stubs now that a SYSCTL_PROC handler is used in vdev.c. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h: When the group membership of a metaslab class changes (i.e. when a vdev is added or removed from a pool), walk the group list to determine the smallest block size currently available and record this in the metaslab class. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c: Add the metaslab_class_get_minblocksize() accessor. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c: In zio_compress_data(), take the minimum blocksize as an input parameter instead of assuming SPA_MINBLOCKSIZE. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c: In l2arc_compress_buf(), pass SPA_MINBLOCKSIZE as the minimum blocksize of the device. The l2arc code performs has it's own code for deciding if compression is worth while, so this effectively disables zio_compress_data() from second guessing the original decision. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c: In zio_write_bp_init(), use the minimum blocksize of the normal metaslab class when compressing data.
Diffstat (limited to 'cddl')
-rw-r--r--cddl/contrib/opensolaris/cmd/zpool/zpool_main.c24
-rw-r--r--cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h1
-rw-r--r--cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c87
-rw-r--r--cddl/contrib/opensolaris/lib/libzpool/common/kernel.c6
-rw-r--r--cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h46
5 files changed, 127 insertions, 37 deletions
diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
index bfb7114..1686467 100644
--- a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
+++ b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
@@ -1295,12 +1295,13 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
int namewidth, int depth, boolean_t isspare)
{
nvlist_t **child;
- uint_t c, children;
+ uint_t c, vsc, children;
pool_scan_stat_t *ps = NULL;
vdev_stat_t *vs;
char rbuf[6], wbuf[6], cbuf[6];
char *vname;
uint64_t notpresent;
+ uint64_t ashift;
spare_cbdata_t cb;
const char *state;
@@ -1309,7 +1310,7 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
children = 0;
verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&vs, &c) == 0);
+ (uint64_t **)&vs, &vsc) == 0);
state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
if (isspare) {
@@ -1363,6 +1364,10 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
(void) printf(gettext("unsupported feature(s)"));
break;
+ case VDEV_AUX_ASHIFT_TOO_BIG:
+ (void) printf(gettext("unsupported minimum blocksize"));
+ break;
+
case VDEV_AUX_SPARED:
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
&cb.cb_guid) == 0);
@@ -1405,6 +1410,12 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
(void) printf(gettext("corrupted data"));
break;
}
+ } else if (children == 0 && !isspare &&
+ VDEV_STAT_VALID(vs_physical_ashift, vsc) &&
+ vs->vs_configured_ashift < vs->vs_physical_ashift) {
+ (void) printf(
+ gettext(" block size: %dB configured, %dB native"),
+ 1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift);
}
(void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
@@ -4268,6 +4279,15 @@ status_callback(zpool_handle_t *zhp, void *data)
"'zpool clear'.\n"));
break;
+ case ZPOOL_STATUS_NON_NATIVE_ASHIFT:
+ (void) printf(gettext("status: One or more devices are "
+ "configured to use a non-native block size.\n"
+ "\tExpect reduced performance.\n"));
+ (void) printf(gettext("action: Replace affected devices with "
+ "devices that support the\n\tconfigured block size, or "
+ "migrate data to a properly configured\n\tpool.\n"));
+ break;
+
default:
/*
* The remaining errors can't actually be generated, yet.
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
index 981e9bd..0a5ca82 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
@@ -326,6 +326,7 @@ typedef enum {
ZPOOL_STATUS_RESILVERING, /* device being resilvered */
ZPOOL_STATUS_OFFLINE_DEV, /* device online */
ZPOOL_STATUS_REMOVED_DEV, /* removed device */
+ ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */
/*
* Finally, the following indicates a healthy pool.
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c
index 71b27a1..906883c 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c
@@ -73,57 +73,66 @@ static char *zfs_msgid_table[] = {
/* ARGSUSED */
static int
-vdev_missing(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_missing(vdev_stat_t *vs, uint_t vsc)
{
- return (state == VDEV_STATE_CANT_OPEN &&
- aux == VDEV_AUX_OPEN_FAILED);
+ return (vs->vs_state == VDEV_STATE_CANT_OPEN &&
+ vs->vs_aux == VDEV_AUX_OPEN_FAILED);
}
/* ARGSUSED */
static int
-vdev_faulted(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_faulted(vdev_stat_t *vs, uint_t vsc)
{
- return (state == VDEV_STATE_FAULTED);
+ return (vs->vs_state == VDEV_STATE_FAULTED);
}
/* ARGSUSED */
static int
-vdev_errors(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_errors(vdev_stat_t *vs, uint_t vsc)
{
- return (state == VDEV_STATE_DEGRADED || errs != 0);
+ return (vs->vs_state == VDEV_STATE_DEGRADED ||
+ vs->vs_read_errors != 0 || vs->vs_write_errors != 0 ||
+ vs->vs_checksum_errors != 0);
}
/* ARGSUSED */
static int
-vdev_broken(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_broken(vdev_stat_t *vs, uint_t vsc)
{
- return (state == VDEV_STATE_CANT_OPEN);
+ return (vs->vs_state == VDEV_STATE_CANT_OPEN);
}
/* ARGSUSED */
static int
-vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_offlined(vdev_stat_t *vs, uint_t vsc)
{
- return (state == VDEV_STATE_OFFLINE);
+ return (vs->vs_state == VDEV_STATE_OFFLINE);
}
/* ARGSUSED */
static int
-vdev_removed(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_removed(vdev_stat_t *vs, uint_t vsc)
{
- return (state == VDEV_STATE_REMOVED);
+ return (vs->vs_state == VDEV_STATE_REMOVED);
+}
+
+static int
+vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc)
+{
+ return (VDEV_STAT_VALID(vs_physical_ashift, vsc) &&
+ vs->vs_configured_ashift < vs->vs_physical_ashift);
}
/*
* Detect if any leaf devices that have seen errors or could not be opened.
*/
static boolean_t
-find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
+find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t),
+ boolean_t ignore_replacing)
{
nvlist_t **child;
vdev_stat_t *vs;
- uint_t c, children;
- char *type;
+ uint_t c, vsc, children;
/*
* Ignore problems within a 'replacing' vdev, since we're presumably in
@@ -131,23 +140,25 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
* out again. We'll pick up the fact that a resilver is happening
* later.
*/
- verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0);
- if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
- return (B_FALSE);
+ if (ignore_replacing == B_TRUE) {
+ char *type;
+
+ verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE,
+ &type) == 0);
+ if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
+ return (B_FALSE);
+ }
if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
&children) == 0) {
for (c = 0; c < children; c++)
- if (find_vdev_problem(child[c], func))
+ if (find_vdev_problem(child[c], func, ignore_replacing))
return (B_TRUE);
} else {
verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&vs, &c) == 0);
+ (uint64_t **)&vs, &vsc) == 0);
- if (func(vs->vs_state, vs->vs_aux,
- vs->vs_read_errors +
- vs->vs_write_errors +
- vs->vs_checksum_errors))
+ if (func(vs, vsc) != 0)
return (B_TRUE);
}
@@ -157,7 +168,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child,
&children) == 0) {
for (c = 0; c < children; c++)
- if (find_vdev_problem(child[c], func))
+ if (find_vdev_problem(child[c], func, ignore_replacing))
return (B_TRUE);
}
@@ -270,15 +281,15 @@ check_status(nvlist_t *config, boolean_t isimport)
* Bad devices in non-replicated config.
*/
if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- find_vdev_problem(nvroot, vdev_faulted))
+ find_vdev_problem(nvroot, vdev_faulted, B_TRUE))
return (ZPOOL_STATUS_FAULTED_DEV_NR);
if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- find_vdev_problem(nvroot, vdev_missing))
+ find_vdev_problem(nvroot, vdev_missing, B_TRUE))
return (ZPOOL_STATUS_MISSING_DEV_NR);
if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- find_vdev_problem(nvroot, vdev_broken))
+ find_vdev_problem(nvroot, vdev_broken, B_TRUE))
return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
/*
@@ -300,32 +311,38 @@ check_status(nvlist_t *config, boolean_t isimport)
/*
* Missing devices in a replicated config.
*/
- if (find_vdev_problem(nvroot, vdev_faulted))
+ if (find_vdev_problem(nvroot, vdev_faulted, B_TRUE))
return (ZPOOL_STATUS_FAULTED_DEV_R);
- if (find_vdev_problem(nvroot, vdev_missing))
+ if (find_vdev_problem(nvroot, vdev_missing, B_TRUE))
return (ZPOOL_STATUS_MISSING_DEV_R);
- if (find_vdev_problem(nvroot, vdev_broken))
+ if (find_vdev_problem(nvroot, vdev_broken, B_TRUE))
return (ZPOOL_STATUS_CORRUPT_LABEL_R);
/*
* Devices with errors
*/
- if (!isimport && find_vdev_problem(nvroot, vdev_errors))
+ if (!isimport && find_vdev_problem(nvroot, vdev_errors, B_TRUE))
return (ZPOOL_STATUS_FAILING_DEV);
/*
* Offlined devices
*/
- if (find_vdev_problem(nvroot, vdev_offlined))
+ if (find_vdev_problem(nvroot, vdev_offlined, B_TRUE))
return (ZPOOL_STATUS_OFFLINE_DEV);
/*
* Removed device
*/
- if (find_vdev_problem(nvroot, vdev_removed))
+ if (find_vdev_problem(nvroot, vdev_removed, B_TRUE))
return (ZPOOL_STATUS_REMOVED_DEV);
/*
+ * Suboptimal, but usable, ashift configuration.
+ */
+ if (find_vdev_problem(nvroot, vdev_non_native_ashift, B_FALSE))
+ return (ZPOOL_STATUS_NON_NATIVE_ASHIFT);
+
+ /*
* Outdated, but usable, version
*/
if (SPA_VERSION_IS_SUPPORTED(version) && version != SPA_VERSION)
diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c b/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
index c5c7b66..71c3dce 100644
--- a/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
+++ b/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
@@ -591,6 +591,12 @@ dprintf_setup(int *argc, char **argv)
dprintf_print_all = 1;
}
+int
+sysctl_handle_64(SYSCTL_HANDLER_ARGS)
+{
+ return (0);
+}
+
/*
* =========================================================================
* debug printfs
diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
index 2107505..da153c7 100644
--- a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
+++ b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
@@ -659,11 +659,55 @@ typedef uint32_t idmap_rid_t;
#define SX_SYSINIT(name, lock, desc)
+#define SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1, \
+ intptr_t arg2, struct sysctl_req *req
+
+/*
+ * This describes the access space for a sysctl request. This is needed
+ * so that we can use the interface from the kernel or from user-space.
+ */
+struct sysctl_req {
+ struct thread *td; /* used for access checking */
+ int lock; /* wiring state */
+ void *oldptr;
+ size_t oldlen;
+ size_t oldidx;
+ int (*oldfunc)(struct sysctl_req *, const void *, size_t);
+ void *newptr;
+ size_t newlen;
+ size_t newidx;
+ int (*newfunc)(struct sysctl_req *, void *, size_t);
+ size_t validlen;
+ int flags;
+};
+
+SLIST_HEAD(sysctl_oid_list, sysctl_oid);
+
+/*
+ * This describes one "oid" in the MIB tree. Potentially more nodes can
+ * be hidden behind it, expanded by the handler.
+ */
+struct sysctl_oid {
+ struct sysctl_oid_list *oid_parent;
+ SLIST_ENTRY(sysctl_oid) oid_link;
+ int oid_number;
+ u_int oid_kind;
+ void *oid_arg1;
+ intptr_t oid_arg2;
+ const char *oid_name;
+ int (*oid_handler)(SYSCTL_HANDLER_ARGS);
+ const char *oid_fmt;
+ int oid_refcnt;
+ u_int oid_running;
+ const char *oid_descr;
+};
+
#define SYSCTL_DECL(...)
#define SYSCTL_NODE(...)
#define SYSCTL_INT(...)
#define SYSCTL_UINT(...)
#define SYSCTL_ULONG(...)
+#define SYSCTL_PROC(...)
#define SYSCTL_QUAD(...)
#define SYSCTL_UQUAD(...)
#ifdef TUNABLE_INT
@@ -675,6 +719,8 @@ typedef uint32_t idmap_rid_t;
#define TUNABLE_ULONG(...)
#define TUNABLE_QUAD(...)
+int sysctl_handle_64(SYSCTL_HANDLER_ARGS);
+
/* Errors */
#ifndef ERESTART
OpenPOWER on IntegriCloud