diff options
author | mm <mm@FreeBSD.org> | 2012-06-11 11:35:22 +0000 |
---|---|---|
committer | mm <mm@FreeBSD.org> | 2012-06-11 11:35:22 +0000 |
commit | cc61ab2f133566dca51970d44cc49a4355039b5d (patch) | |
tree | a0efb9ad759cc8c8f8ec6f2cbabe6b3305a334e9 /sys/cddl/contrib | |
parent | 2cecdd81d885af2868172221b5068c523eade78c (diff) | |
download | FreeBSD-src-cc61ab2f133566dca51970d44cc49a4355039b5d.zip FreeBSD-src-cc61ab2f133566dca51970d44cc49a4355039b5d.tar.gz |
Introduce "feature flags" for ZFS pools (bump SPA version to 5000).
Add first feature "com.delphix:async_destroy" (asynchronous destroy
of ZFS datasets).
Implement features support in ZFS boot code.
Illumos revisions merged:
13700:2889e2596bd6
13701:1949b688d5fb
2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
References:
https://www.illumos.org/issues/2619
https://www.illumos.org/issues/2747
Obtained from: illumos (issue #2619, #2747)
MFC after: 1 month
Diffstat (limited to 'sys/cddl/contrib')
46 files changed, 2794 insertions, 402 deletions
diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/fnvpair.c b/sys/cddl/contrib/opensolaris/common/nvpair/fnvpair.c new file mode 100644 index 0000000..acef4cb --- /dev/null +++ b/sys/cddl/contrib/opensolaris/common/nvpair/fnvpair.c @@ -0,0 +1,498 @@ + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include <sys/nvpair.h> +#ifndef _KERNEL +#include <sys/zfs_context.h> +#else +#include <sys/debug.h> +#include <sys/kmem.h> +#endif + +/* + * "Force" nvlist wrapper. + * + * These functions wrap the nvlist_* functions with assertions that assume + * the operation is successful. This allows the caller's code to be much + * more readable, especially for the fnvlist_lookup_* and fnvpair_value_* + * functions, which can return the requested value (rather than filling in + * a pointer). + * + * These functions use NV_UNIQUE_NAME, encoding NV_ENCODE_NATIVE, and allocate + * with KM_SLEEP. + * + * More wrappers should be added as needed -- for example + * nvlist_lookup_*_array and nvpair_value_*_array. + */ + +nvlist_t * +fnvlist_alloc(void) +{ + nvlist_t *nvl; + VERIFY3U(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP), ==, 0); + return (nvl); +} + +void +fnvlist_free(nvlist_t *nvl) +{ + nvlist_free(nvl); +} + +size_t +fnvlist_size(nvlist_t *nvl) +{ + size_t size; + VERIFY3U(nvlist_size(nvl, &size, NV_ENCODE_NATIVE), ==, 0); + return (size); +} + +/* + * Returns allocated buffer of size *sizep. Caller must free the buffer with + * fnvlist_pack_free(). + */ +char * +fnvlist_pack(nvlist_t *nvl, size_t *sizep) +{ + char *packed = 0; + VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE, + KM_SLEEP), ==, 0); + return (packed); +} + +/*ARGSUSED*/ +void +fnvlist_pack_free(char *pack, size_t size) +{ +#ifdef _KERNEL + kmem_free(pack, size); +#else + free(pack); +#endif +} + +nvlist_t * +fnvlist_unpack(char *buf, size_t buflen) +{ + nvlist_t *rv; + VERIFY3U(nvlist_unpack(buf, buflen, &rv, KM_SLEEP), ==, 0); + return (rv); +} + +nvlist_t * +fnvlist_dup(nvlist_t *nvl) +{ + nvlist_t *rv; + VERIFY3U(nvlist_dup(nvl, &rv, KM_SLEEP), ==, 0); + return (rv); +} + +void +fnvlist_merge(nvlist_t *dst, nvlist_t *src) +{ + VERIFY3U(nvlist_merge(dst, src, KM_SLEEP), ==, 0); +} + +void +fnvlist_add_boolean(nvlist_t *nvl, const char *name) +{ + VERIFY3U(nvlist_add_boolean(nvl, name), ==, 0); +} + +void +fnvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) +{ + VERIFY3U(nvlist_add_boolean_value(nvl, name, val), ==, 0); +} + +void +fnvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) +{ + VERIFY3U(nvlist_add_byte(nvl, name, val), ==, 0); +} + +void +fnvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) +{ + VERIFY3U(nvlist_add_int8(nvl, name, val), ==, 0); +} + +void +fnvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) +{ + VERIFY3U(nvlist_add_uint8(nvl, name, val), ==, 0); +} + +void +fnvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) +{ + VERIFY3U(nvlist_add_int16(nvl, name, val), ==, 0); +} + +void +fnvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) +{ + VERIFY3U(nvlist_add_uint16(nvl, name, val), ==, 0); +} + +void +fnvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) +{ + VERIFY3U(nvlist_add_int32(nvl, name, val), ==, 0); +} + +void +fnvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) +{ + VERIFY3U(nvlist_add_uint32(nvl, name, val), ==, 0); +} + +void +fnvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val) +{ + VERIFY3U(nvlist_add_int64(nvl, name, val), ==, 0); +} + +void +fnvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) +{ + VERIFY3U(nvlist_add_uint64(nvl, name, val), ==, 0); +} + +void +fnvlist_add_string(nvlist_t *nvl, const char *name, const char *val) +{ + VERIFY3U(nvlist_add_string(nvl, name, val), ==, 0); +} + +void +fnvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) +{ + VERIFY3U(nvlist_add_nvlist(nvl, name, val), ==, 0); +} + +void +fnvlist_add_nvpair(nvlist_t *nvl, nvpair_t *pair) +{ + VERIFY3U(nvlist_add_nvpair(nvl, pair), ==, 0); +} + +void +fnvlist_add_boolean_array(nvlist_t *nvl, const char *name, + boolean_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_boolean_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_byte_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int8_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint8_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int16_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint16_array(nvlist_t *nvl, const char *name, + uint16_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint16_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int32_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint32_array(nvlist_t *nvl, const char *name, + uint32_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint32_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int64_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint64_array(nvlist_t *nvl, const char *name, + uint64_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint64_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_string_array(nvlist_t *nvl, const char *name, + char * const *val, uint_t n) +{ + VERIFY3U(nvlist_add_string_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_nvlist_array(nvlist_t *nvl, const char *name, + nvlist_t **val, uint_t n) +{ + VERIFY3U(nvlist_add_nvlist_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_remove(nvlist_t *nvl, const char *name) +{ + VERIFY3U(nvlist_remove_all(nvl, name), ==, 0); +} + +void +fnvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *pair) +{ + VERIFY3U(nvlist_remove_nvpair(nvl, pair), ==, 0); +} + +nvpair_t * +fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name) +{ + nvpair_t *rv; + VERIFY3U(nvlist_lookup_nvpair(nvl, name, &rv), ==, 0); + return (rv); +} + +/* returns B_TRUE if the entry exists */ +boolean_t +fnvlist_lookup_boolean(nvlist_t *nvl, const char *name) +{ + return (nvlist_lookup_boolean(nvl, name) == 0); +} + +boolean_t +fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name) +{ + boolean_t rv; + VERIFY3U(nvlist_lookup_boolean_value(nvl, name, &rv), ==, 0); + return (rv); +} + +uchar_t +fnvlist_lookup_byte(nvlist_t *nvl, const char *name) +{ + uchar_t rv; + VERIFY3U(nvlist_lookup_byte(nvl, name, &rv), ==, 0); + return (rv); +} + +int8_t +fnvlist_lookup_int8(nvlist_t *nvl, const char *name) +{ + int8_t rv; + VERIFY3U(nvlist_lookup_int8(nvl, name, &rv), ==, 0); + return (rv); +} + +int16_t +fnvlist_lookup_int16(nvlist_t *nvl, const char *name) +{ + int16_t rv; + VERIFY3U(nvlist_lookup_int16(nvl, name, &rv), ==, 0); + return (rv); +} + +int32_t +fnvlist_lookup_int32(nvlist_t *nvl, const char *name) +{ + int32_t rv; + VERIFY3U(nvlist_lookup_int32(nvl, name, &rv), ==, 0); + return (rv); +} + +int64_t +fnvlist_lookup_int64(nvlist_t *nvl, const char *name) +{ + int64_t rv; + VERIFY3U(nvlist_lookup_int64(nvl, name, &rv), ==, 0); + return (rv); +} + +uint8_t +fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name) +{ + uint8_t rv; + VERIFY3U(nvlist_lookup_uint8(nvl, name, &rv), ==, 0); + return (rv); +} + +uint16_t +fnvlist_lookup_uint16(nvlist_t *nvl, const char *name) +{ + uint16_t rv; + VERIFY3U(nvlist_lookup_uint16(nvl, name, &rv), ==, 0); + return (rv); +} + +uint32_t +fnvlist_lookup_uint32(nvlist_t *nvl, const char *name) +{ + uint32_t rv; + VERIFY3U(nvlist_lookup_uint32(nvl, name, &rv), ==, 0); + return (rv); +} + +uint64_t +fnvlist_lookup_uint64(nvlist_t *nvl, const char *name) +{ + uint64_t rv; + VERIFY3U(nvlist_lookup_uint64(nvl, name, &rv), ==, 0); + return (rv); +} + +char * +fnvlist_lookup_string(nvlist_t *nvl, const char *name) +{ + char *rv; + VERIFY3U(nvlist_lookup_string(nvl, name, &rv), ==, 0); + return (rv); +} + +nvlist_t * +fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name) +{ + nvlist_t *rv; + VERIFY3U(nvlist_lookup_nvlist(nvl, name, &rv), ==, 0); + return (rv); +} + +boolean_t +fnvpair_value_boolean_value(nvpair_t *nvp) +{ + boolean_t rv; + VERIFY3U(nvpair_value_boolean_value(nvp, &rv), ==, 0); + return (rv); +} + +uchar_t +fnvpair_value_byte(nvpair_t *nvp) +{ + uchar_t rv; + VERIFY3U(nvpair_value_byte(nvp, &rv), ==, 0); + return (rv); +} + +int8_t +fnvpair_value_int8(nvpair_t *nvp) +{ + int8_t rv; + VERIFY3U(nvpair_value_int8(nvp, &rv), ==, 0); + return (rv); +} + +int16_t +fnvpair_value_int16(nvpair_t *nvp) +{ + int16_t rv; + VERIFY3U(nvpair_value_int16(nvp, &rv), ==, 0); + return (rv); +} + +int32_t +fnvpair_value_int32(nvpair_t *nvp) +{ + int32_t rv; + VERIFY3U(nvpair_value_int32(nvp, &rv), ==, 0); + return (rv); +} + +int64_t +fnvpair_value_int64(nvpair_t *nvp) +{ + int64_t rv; + VERIFY3U(nvpair_value_int64(nvp, &rv), ==, 0); + return (rv); +} + +uint8_t +fnvpair_value_uint8_t(nvpair_t *nvp) +{ + uint8_t rv; + VERIFY3U(nvpair_value_uint8(nvp, &rv), ==, 0); + return (rv); +} + +uint16_t +fnvpair_value_uint16(nvpair_t *nvp) +{ + uint16_t rv; + VERIFY3U(nvpair_value_uint16(nvp, &rv), ==, 0); + return (rv); +} + +uint32_t +fnvpair_value_uint32(nvpair_t *nvp) +{ + uint32_t rv; + VERIFY3U(nvpair_value_uint32(nvp, &rv), ==, 0); + return (rv); +} + +uint64_t +fnvpair_value_uint64(nvpair_t *nvp) +{ + uint64_t rv; + VERIFY3U(nvpair_value_uint64(nvp, &rv), ==, 0); + return (rv); +} + +char * +fnvpair_value_string(nvpair_t *nvp) +{ + char *rv; + VERIFY3U(nvpair_value_string(nvp, &rv), ==, 0); + return (rv); +} + +nvlist_t * +fnvpair_value_nvlist(nvpair_t *nvp) +{ + nvlist_t *rv; + VERIFY3U(nvpair_value_nvlist(nvp, &rv), ==, 0); + return (rv); +} diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c new file mode 100644 index 0000000..6dadd99 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c @@ -0,0 +1,155 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifdef _KERNEL +#include <sys/systm.h> +#else +#include <errno.h> +#include <string.h> +#endif +#include <sys/debug.h> +#include <sys/fs/zfs.h> +#include <sys/types.h> +#include "zfeature_common.h" + +/* + * Set to disable all feature checks while opening pools, allowing pools with + * unsupported features to be opened. Set for testing only. + */ +boolean_t zfeature_checks_disable = B_FALSE; + +zfeature_info_t spa_feature_table[SPA_FEATURES]; + +/* + * Valid characters for feature guids. This list is mainly for aesthetic + * purposes and could be expanded in the future. There are different allowed + * characters in the guids reverse dns portion (before the colon) and its + * short name (after the colon). + */ +static int +valid_char(char c, boolean_t after_colon) +{ + return ((c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || + c == (after_colon ? '_' : '.')); +} + +/* + * Every feature guid must contain exactly one colon which separates a reverse + * dns organization name from the feature's "short" name (e.g. + * "com.company:feature_name"). + */ +boolean_t +zfeature_is_valid_guid(const char *name) +{ + int i; + boolean_t has_colon = B_FALSE; + + i = 0; + while (name[i] != '\0') { + char c = name[i++]; + if (c == ':') { + if (has_colon) + return (B_FALSE); + has_colon = B_TRUE; + continue; + } + if (!valid_char(c, has_colon)) + return (B_FALSE); + } + + return (has_colon); +} + +boolean_t +zfeature_is_supported(const char *guid) +{ + if (zfeature_checks_disable) + return (B_TRUE); + + return (0 == zfeature_lookup_guid(guid, NULL)); +} + +int +zfeature_lookup_guid(const char *guid, zfeature_info_t **res) +{ + for (int i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *feature = &spa_feature_table[i]; + if (strcmp(guid, feature->fi_guid) == 0) { + if (res != NULL) + *res = feature; + return (0); + } + } + + return (ENOENT); +} + +int +zfeature_lookup_name(const char *name, zfeature_info_t **res) +{ + for (int i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *feature = &spa_feature_table[i]; + if (strcmp(name, feature->fi_uname) == 0) { + if (res != NULL) + *res = feature; + return (0); + } + } + + return (ENOENT); +} + +static void +zfeature_register(int fid, const char *guid, const char *name, const char *desc, + boolean_t readonly, boolean_t mos, zfeature_info_t **deps) +{ + zfeature_info_t *feature = &spa_feature_table[fid]; + static zfeature_info_t *nodeps[] = { NULL }; + + ASSERT(name != NULL); + ASSERT(desc != NULL); + ASSERT(!readonly || !mos); + ASSERT3U(fid, <, SPA_FEATURES); + ASSERT(zfeature_is_valid_guid(guid)); + + if (deps == NULL) + deps = nodeps; + + feature->fi_guid = guid; + feature->fi_uname = name; + feature->fi_desc = desc; + feature->fi_can_readonly = readonly; + feature->fi_mos = mos; + feature->fi_depends = deps; +} + +void +zpool_feature_init(void) +{ + zfeature_register(SPA_FEATURE_ASYNC_DESTROY, + "com.delphix:async_destroy", "async_destroy", + "Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL); +} diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h new file mode 100644 index 0000000..b57eb43 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _ZFEATURE_COMMON_H +#define _ZFEATURE_COMMON_H + +#include <sys/fs/zfs.h> +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct zfeature_info; + +typedef struct zfeature_info { + const char *fi_uname; /* User-facing feature name */ + const char *fi_guid; /* On-disk feature identifier */ + const char *fi_desc; /* Feature description */ + boolean_t fi_can_readonly; /* Can open pool readonly w/o support? */ + boolean_t fi_mos; /* Is the feature necessary to read the MOS? */ + struct zfeature_info **fi_depends; /* array; null terminated */ +} zfeature_info_t; + +typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg); + +#define ZFS_FEATURE_DEBUG + +static enum spa_feature { + SPA_FEATURE_ASYNC_DESTROY, + SPA_FEATURES +} spa_feature_t; + +extern zfeature_info_t spa_feature_table[SPA_FEATURES]; + +extern boolean_t zfeature_is_valid_guid(const char *); + +extern boolean_t zfeature_is_supported(const char *); +extern int zfeature_lookup_guid(const char *, zfeature_info_t **res); +extern int zfeature_lookup_name(const char *, zfeature_info_t **res); + +extern void zpool_feature_init(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFEATURE_COMMON_H */ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c index 512e067..72db879 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c @@ -79,6 +79,8 @@ zpool_prop_init(void) ZFS_TYPE_POOL, "<size>", "SIZE"); zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "FREE"); + zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<size>", "FREEING"); zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC"); zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, @@ -166,6 +168,26 @@ zpool_prop_default_numeric(zpool_prop_t prop) return (zpool_prop_table[prop].pd_numdefault); } +/* + * Returns true if this is a valid feature@ property. + */ +boolean_t +zpool_prop_feature(const char *name) +{ + static const char *prefix = "feature@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + +/* + * Returns true if this is a valid unsupported@ property. + */ +boolean_t +zpool_prop_unsupported(const char *name) +{ + static const char *prefix = "unsupported@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + int zpool_prop_string_to_index(zpool_prop_t prop, const char *string, uint64_t *index) diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files index 2ab1d7b..30d4f79 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files +++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files @@ -21,6 +21,7 @@ # # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2012 by Delphix. All rights reserved. # # # This Makefile defines all file modules for the directory uts/common @@ -31,6 +32,7 @@ ZFS_COMMON_OBJS += \ arc.o \ bplist.o \ bpobj.o \ + bptree.o \ dbuf.o \ ddt.o \ ddt_zap.o \ @@ -52,6 +54,7 @@ ZFS_COMMON_OBJS += \ dsl_deleg.o \ dsl_prop.o \ dsl_scan.o \ + zfeature.o \ gzip.o \ lzjb.o \ metaslab.o \ @@ -94,11 +97,12 @@ ZFS_COMMON_OBJS += \ zrlock.o ZFS_SHARED_OBJS += \ - zfs_namecheck.o \ - zfs_deleg.o \ - zfs_prop.o \ + zfeature_common.o \ zfs_comutil.o \ + zfs_deleg.o \ zfs_fletcher.o \ + zfs_namecheck.o \ + zfs_prop.o \ zpool_prop.o \ zprop_common.o diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index cb81392..9dd4e07 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -2794,9 +2794,11 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_acb; ASSERT(callback_list != NULL); if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? byteswap_uint64_array : - dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; + dmu_ot_byteswap[bswap].ob_func; func(buf->b_data, hdr->b_size); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c new file mode 100644 index 0000000..8c5a7d4 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c @@ -0,0 +1,224 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include <sys/arc.h> +#include <sys/bptree.h> +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dnode.h> +#include <sys/refcount.h> +#include <sys/spa.h> + +/* + * A bptree is a queue of root block pointers from destroyed datasets. When a + * dataset is destroyed its root block pointer is put on the end of the pool's + * bptree queue so the dataset's blocks can be freed asynchronously by + * dsl_scan_sync. This allows the delete operation to finish without traversing + * all the dataset's blocks. + * + * Note that while bt_begin and bt_end are only ever incremented in this code + * they are effectively reset to 0 every time the entire bptree is freed because + * the bptree's object is destroyed and re-created. + */ + +struct bptree_args { + bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ + boolean_t ba_free; /* true if freeing during traversal */ + + bptree_itor_t *ba_func; /* function to call for each blockpointer */ + void *ba_arg; /* caller supplied argument to ba_func */ + dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ +} bptree_args_t; + +uint64_t +bptree_alloc(objset_t *os, dmu_tx_t *tx) +{ + uint64_t obj; + dmu_buf_t *db; + bptree_phys_t *bt; + + obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, + SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, + sizeof (bptree_phys_t), tx); + + /* + * Bonus buffer contents are already initialized to 0, but for + * readability we make it explicit. + */ + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + bt = db->db_data; + bt->bt_begin = 0; + bt->bt_end = 0; + bt->bt_bytes = 0; + bt->bt_comp = 0; + bt->bt_uncomp = 0; + dmu_buf_rele(db, FTAG); + + return (obj); +} + +int +bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + ASSERT3U(bt->bt_begin, ==, bt->bt_end); + ASSERT3U(bt->bt_bytes, ==, 0); + ASSERT3U(bt->bt_comp, ==, 0); + ASSERT3U(bt->bt_uncomp, ==, 0); + dmu_buf_rele(db, FTAG); + + return (dmu_object_free(os, obj, tx)); +} + +void +bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, + uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + bptree_entry_phys_t bte; + + /* + * bptree objects are in the pool mos, therefore they can only be + * modified in syncing context. Furthermore, this is only modified + * by the sync thread, so no locking is necessary. + */ + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + + bte.be_birth_txg = birth_txg; + bte.be_bp = *bp; + bzero(&bte.be_zb, sizeof (bte.be_zb)); + dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); + + dmu_buf_will_dirty(db, tx); + bt->bt_end++; + bt->bt_bytes += bytes; + bt->bt_comp += comp; + bt->bt_uncomp += uncomp; + dmu_buf_rele(db, FTAG); +} + +/* ARGSUSED */ +static int +bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + int err; + struct bptree_args *ba = arg; + + if (bp == NULL) + return (0); + + err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); + if (err == 0 && ba->ba_free) { + ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); + ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); + ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); + } + return (err); +} + +int +bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, + void *arg, dmu_tx_t *tx) +{ + int err; + uint64_t i; + dmu_buf_t *db; + struct bptree_args ba; + + ASSERT(!free || dmu_tx_is_syncing(tx)); + + err = dmu_bonus_hold(os, obj, FTAG, &db); + if (err != 0) + return (err); + + if (free) + dmu_buf_will_dirty(db, tx); + + ba.ba_phys = db->db_data; + ba.ba_free = free; + ba.ba_func = func; + ba.ba_arg = arg; + ba.ba_tx = tx; + + err = 0; + for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { + bptree_entry_phys_t bte; + + ASSERT(!free || i == ba.ba_phys->bt_begin); + + err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), + &bte, DMU_READ_NO_PREFETCH); + if (err != 0) + break; + + err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, + bte.be_birth_txg, &bte.be_zb, TRAVERSE_POST, + bptree_visit_cb, &ba); + if (free) { + ASSERT(err == 0 || err == ERESTART); + if (err != 0) { + /* save bookmark for future resume */ + ASSERT3U(bte.be_zb.zb_objset, ==, + ZB_DESTROYED_OBJSET); + ASSERT3U(bte.be_zb.zb_level, ==, 0); + dmu_write(os, obj, i * sizeof (bte), + sizeof (bte), &bte, tx); + break; + } else { + ba.ba_phys->bt_begin++; + (void) dmu_free_range(os, obj, + i * sizeof (bte), sizeof (bte), tx); + } + } + } + + ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end); + + /* if all blocks are free there should be no used space */ + if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { + ASSERT3U(ba.ba_phys->bt_bytes, ==, 0); + ASSERT3U(ba.ba_phys->bt_comp, ==, 0); + ASSERT3U(ba.ba_phys->bt_uncomp, ==, 0); + } + + dmu_buf_rele(db, FTAG); + + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index dc9f482..b75ee44 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -227,7 +228,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db) boolean_t is_metadata; DB_DNODE_ENTER(db); - is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata; + is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); DB_DNODE_EXIT(db); return (is_metadata); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c index 0edf62e..ef3d0f4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -1067,11 +1068,9 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); if (spa->spa_ddt_stat_object == 0) { - spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, - DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, - &spa->spa_ddt_stat_object, tx) == 0); + spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, tx); } while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index f7d471f..2204e9d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/dmu.h> @@ -45,60 +46,73 @@ #endif const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { - { byteswap_uint8_array, TRUE, "unallocated" }, - { zap_byteswap, TRUE, "object directory" }, - { byteswap_uint64_array, TRUE, "object array" }, - { byteswap_uint8_array, TRUE, "packed nvlist" }, - { byteswap_uint64_array, TRUE, "packed nvlist size" }, - { byteswap_uint64_array, TRUE, "bpobj" }, - { byteswap_uint64_array, TRUE, "bpobj header" }, - { byteswap_uint64_array, TRUE, "SPA space map header" }, - { byteswap_uint64_array, TRUE, "SPA space map" }, - { byteswap_uint64_array, TRUE, "ZIL intent log" }, - { dnode_buf_byteswap, TRUE, "DMU dnode" }, - { dmu_objset_byteswap, TRUE, "DMU objset" }, - { byteswap_uint64_array, TRUE, "DSL directory" }, - { zap_byteswap, TRUE, "DSL directory child map"}, - { zap_byteswap, TRUE, "DSL dataset snap map" }, - { zap_byteswap, TRUE, "DSL props" }, - { byteswap_uint64_array, TRUE, "DSL dataset" }, - { zfs_znode_byteswap, TRUE, "ZFS znode" }, - { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, - { byteswap_uint8_array, FALSE, "ZFS plain file" }, - { zap_byteswap, TRUE, "ZFS directory" }, - { zap_byteswap, TRUE, "ZFS master node" }, - { zap_byteswap, TRUE, "ZFS delete queue" }, - { byteswap_uint8_array, FALSE, "zvol object" }, - { zap_byteswap, TRUE, "zvol prop" }, - { byteswap_uint8_array, FALSE, "other uint8[]" }, - { byteswap_uint64_array, FALSE, "other uint64[]" }, - { zap_byteswap, TRUE, "other ZAP" }, - { zap_byteswap, TRUE, "persistent error log" }, - { byteswap_uint8_array, TRUE, "SPA history" }, - { byteswap_uint64_array, TRUE, "SPA history offsets" }, - { zap_byteswap, TRUE, "Pool properties" }, - { zap_byteswap, TRUE, "DSL permissions" }, - { zfs_acl_byteswap, TRUE, "ZFS ACL" }, - { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, - { byteswap_uint8_array, TRUE, "FUID table" }, - { byteswap_uint64_array, TRUE, "FUID table size" }, - { zap_byteswap, TRUE, "DSL dataset next clones"}, - { zap_byteswap, TRUE, "scan work queue" }, - { zap_byteswap, TRUE, "ZFS user/group used" }, - { zap_byteswap, TRUE, "ZFS user/group quota" }, - { zap_byteswap, TRUE, "snapshot refcount tags"}, - { zap_byteswap, TRUE, "DDT ZAP algorithm" }, - { zap_byteswap, TRUE, "DDT statistics" }, - { byteswap_uint8_array, TRUE, "System attributes" }, - { zap_byteswap, TRUE, "SA master node" }, - { zap_byteswap, TRUE, "SA attr registration" }, - { zap_byteswap, TRUE, "SA attr layouts" }, - { zap_byteswap, TRUE, "scan translations" }, - { byteswap_uint8_array, FALSE, "deduplicated block" }, - { zap_byteswap, TRUE, "DSL deadlist map" }, - { byteswap_uint64_array, TRUE, "DSL deadlist map hdr" }, - { zap_byteswap, TRUE, "DSL dir clones" }, - { byteswap_uint64_array, TRUE, "bpobj subobj" }, + { DMU_BSWAP_UINT8, TRUE, "unallocated" }, + { DMU_BSWAP_ZAP, TRUE, "object directory" }, + { DMU_BSWAP_UINT64, TRUE, "object array" }, + { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, + { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, + { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, + { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, + { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, + { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, + { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, + { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, + { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, + { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, + { DMU_BSWAP_ZAP, TRUE, "DSL props" }, + { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, + { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, + { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, + { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, + { DMU_BSWAP_UINT8, FALSE, "zvol object" }, + { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, + { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, + { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, + { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, + { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, + { DMU_BSWAP_UINT8, TRUE, "SPA history" }, + { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, + { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, + { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, + { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, + { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, + { DMU_BSWAP_UINT8, TRUE, "FUID table" }, + { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, + { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, + { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, + { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, + { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, + { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, + { DMU_BSWAP_UINT8, TRUE, "System attributes" }, + { DMU_BSWAP_ZAP, TRUE, "SA master node" }, + { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, + { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, + { DMU_BSWAP_ZAP, TRUE, "scan translations" }, + { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, + { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, + { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, + { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } +}; + +const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { + { byteswap_uint8_array, "uint8" }, + { byteswap_uint16_array, "uint16" }, + { byteswap_uint32_array, "uint32" }, + { byteswap_uint64_array, "uint64" }, + { zap_byteswap, "zap" }, + { dnode_buf_byteswap, "dnode" }, + { dmu_objset_byteswap, "objset" }, + { zfs_znode_byteswap, "znode" }, + { zfs_oldacl_byteswap, "oldacl" }, + { zfs_acl_byteswap, "acl" } }; int @@ -175,7 +189,7 @@ dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) DB_DNODE_ENTER(db); dn = DB_DNODE(db); - if (type > DMU_OT_NUMTYPES) { + if (!DMU_OT_IS_VALID(type)) { error = EINVAL; } else if (dn->dn_bonus != db) { error = EINVAL; @@ -1513,7 +1527,7 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; - boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata || + boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index baed037..81fd9f2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -20,11 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. - */ -/* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved. */ @@ -1117,8 +1114,8 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) void *data = NULL; if (drro->drr_type == DMU_OT_NONE || - drro->drr_type >= DMU_OT_NUMTYPES || - drro->drr_bonustype >= DMU_OT_NUMTYPES || + !DMU_OT_IS_VALID(drro->drr_type) || + !DMU_OT_IS_VALID(drro->drr_bonustype) || drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || @@ -1183,7 +1180,9 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) ASSERT3U(db->db_size, >=, drro->drr_bonuslen); bcopy(data, db->db_data, drro->drr_bonuslen); if (ra->byteswap) { - dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drro->drr_bonustype); + dmu_ot_byteswap[byteswap].ob_func(db->db_data, drro->drr_bonuslen); } dmu_buf_rele(db, FTAG); @@ -1226,7 +1225,7 @@ restore_write(struct restorearg *ra, objset_t *os, int err; if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || - drrw->drr_type >= DMU_OT_NUMTYPES) + !DMU_OT_IS_VALID(drrw->drr_type)) return (EINVAL); data = restore_read(ra, drrw->drr_length); @@ -1245,8 +1244,11 @@ restore_write(struct restorearg *ra, objset_t *os, dmu_tx_abort(tx); return (err); } - if (ra->byteswap) - dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); + if (ra->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); + } dmu_write(os, drrw->drr_object, drrw->drr_offset, drrw->drr_length, data, tx); dmu_tx_commit(tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index 023f90e..bfe9e65 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -53,6 +54,7 @@ typedef struct traverse_data { uint64_t td_objset; blkptr_t *td_rootbp; uint64_t td_min_txg; + zbookmark_t *td_resume; int td_flags; prefetch_data_t *td_pfd; blkptr_cb_t *td_func; @@ -128,6 +130,54 @@ traverse_zil(traverse_data_t *td, zil_header_t *zh) zil_free(zilog); } +typedef enum resume_skip { + RESUME_SKIP_ALL, + RESUME_SKIP_NONE, + RESUME_SKIP_CHILDREN +} resume_skip_t; + +/* + * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and + * the block indicated by zb does not need to be visited at all. Returns + * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the + * resume point. This indicates that this block should be visited but not its + * children (since they must have been visited in a previous traversal). + * Otherwise returns RESUME_SKIP_NONE. + */ +static resume_skip_t +resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, + const zbookmark_t *zb) +{ + if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { + /* + * If we already visited this bp & everything below, + * don't bother doing it again. + */ + if (zbookmark_is_before(dnp, zb, td->td_resume)) + return (RESUME_SKIP_ALL); + + /* + * If we found the block we're trying to resume from, zero + * the bookmark out to indicate that we have resumed. + */ + ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object); + if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { + bzero(td->td_resume, sizeof (*zb)); + if (td->td_flags & TRAVERSE_POST) + return (RESUME_SKIP_CHILDREN); + } + } + return (RESUME_SKIP_NONE); +} + +static void +traverse_pause(traverse_data_t *td, const zbookmark_t *zb) +{ + ASSERT(td->td_resume != NULL); + ASSERT3U(zb->zb_level, ==, 0); + bcopy(zb, td->td_resume, sizeof (*td->td_resume)); +} + static int traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) @@ -137,8 +187,20 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *buf = NULL; prefetch_data_t *pd = td->td_pfd; boolean_t hard = td->td_flags & TRAVERSE_HARD; + boolean_t pause = B_FALSE; + + switch (resume_skip_check(td, dnp, zb)) { + case RESUME_SKIP_ALL: + return (0); + case RESUME_SKIP_CHILDREN: + goto post; + case RESUME_SKIP_NONE: + break; + default: + ASSERT(0); + } - if (bp->blk_birth == 0) { + if (BP_IS_HOLE(bp)) { err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp, td->td_arg); return (err); @@ -164,8 +226,10 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); - if (err) - return (err); + if (err == ERESTART) + pause = B_TRUE; /* handle pausing at a common point */ + if (err != 0) + goto post; } if (BP_GET_LEVEL(bp) > 0) { @@ -253,9 +317,18 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, if (buf) (void) arc_buf_remove_ref(buf, &buf); +post: if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) { err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, td->td_arg); + if (err == ERESTART) + pause = B_TRUE; + } + + if (pause && td->td_resume != NULL) { + ASSERT3U(err, ==, ERESTART); + ASSERT(!hard); + traverse_pause(td, zb); } return (err != 0 ? err : lasterr); @@ -353,18 +426,23 @@ traverse_prefetch_thread(void *arg) * in syncing context). */ static int -traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp, - uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) +traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, + uint64_t txg_start, zbookmark_t *resume, int flags, + blkptr_cb_t func, void *arg) { traverse_data_t td; prefetch_data_t pd = { 0 }; zbookmark_t czb; int err; + ASSERT(ds == NULL || objset == ds->ds_object); + ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); + td.td_spa = spa; - td.td_objset = ds ? ds->ds_object : 0; + td.td_objset = objset; td.td_rootbp = rootbp; td.td_min_txg = txg_start; + td.td_resume = resume; td.td_func = func; td.td_arg = arg; td.td_pfd = &pd; @@ -416,8 +494,17 @@ int traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { - return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, - &ds->ds_phys->ds_bp, txg_start, flags, func, arg)); + return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, + &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg)); +} + +int +traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, + uint64_t txg_start, zbookmark_t *resume, int flags, + blkptr_cb_t func, void *arg) +{ + return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, + blkptr, txg_start, resume, flags, func, arg)); } /* @@ -434,8 +521,8 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, boolean_t hard = (flags & TRAVERSE_HARD); /* visit the MOS */ - err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa), - txg_start, flags, func, arg); + err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), + txg_start, NULL, flags, func, arg); if (err) return (err); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index b4579e2..3dd5f2c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -20,9 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ -/* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/dmu.h> @@ -676,7 +675,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) return; } - ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); + ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); if (dn->dn_maxblkid == 0 && !add) { blkptr_t *bp; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index ca2b69a..f098e11 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -196,7 +197,7 @@ dnode_verify(dnode_t *dn) ASSERT(dn->dn_objset); ASSERT(dn->dn_handle->dnh_dnode == dn); - ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) return; @@ -215,7 +216,7 @@ dnode_verify(dnode_t *dn) ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz); } ASSERT3U(dn->dn_nlevels, <=, 30); - ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_type)); ASSERT3U(dn->dn_nblkptr, >=, 1); ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); @@ -281,8 +282,10 @@ dnode_byteswap(dnode_phys_t *dnp) */ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); size_t len = DN_MAX_BONUSLEN - off; - ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES); - dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len); + ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(dnp->dn_bonustype); + dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); } /* Swap SPILL block if we have one */ @@ -410,7 +413,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dmu_zfetch_init(&dn->dn_zfetch, dn); - ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); mutex_enter(&os->os_lock); list_insert_head(&os->os_dnodes, dn); @@ -499,11 +502,11 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); ASSERT(ot != DMU_OT_NONE); - ASSERT3U(ot, <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); - ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT3U(dn->dn_maxblkid, ==, 0); @@ -571,7 +574,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0) || (bonustype == DMU_OT_SA && bonuslen == 0)); - ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); /* clean up any unreferenced dbufs */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 32afe7d..fcabc74 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -597,7 +599,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) } if (dn->dn_next_bonustype[txgoff]) { - ASSERT(dn->dn_next_bonustype[txgoff] < DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff])); dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; dn->dn_next_bonustype[txgoff] = 0; } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index 648f2f3..c445278 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. * All rights reserved. @@ -38,6 +38,7 @@ #include <sys/arc.h> #include <sys/zio.h> #include <sys/zap.h> +#include <sys/zfeature.h> #include <sys/unique.h> #include <sys/zfs_context.h> #include <sys/zfs_ioctl.h> @@ -103,7 +104,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) if (BP_IS_HOLE(bp)) return; ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); - ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); if (ds == NULL) { /* * Account for the meta-objset space in its placeholder @@ -120,7 +121,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); - ds->ds_phys->ds_used_bytes += used; + ds->ds_phys->ds_referenced_bytes += used; ds->ds_phys->ds_compressed_bytes += compressed; ds->ds_phys->ds_uncompressed_bytes += uncompressed; ds->ds_phys->ds_unique_bytes += used; @@ -214,8 +215,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, } } mutex_enter(&ds->ds_lock); - ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); - ds->ds_phys->ds_used_bytes -= used; + ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used); + ds->ds_phys->ds_referenced_bytes -= used; ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); ds->ds_phys->ds_compressed_bytes -= compressed; ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); @@ -827,8 +828,8 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = origin->ds_phys->ds_creation_txg; - dsphys->ds_used_bytes = - origin->ds_phys->ds_used_bytes; + dsphys->ds_referenced_bytes = + origin->ds_phys->ds_referenced_bytes; dsphys->ds_compressed_bytes = origin->ds_phys->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = @@ -981,7 +982,6 @@ dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed) for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { dsl_dataset_t *ds; - int err; err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds); if (err == 0) { @@ -1130,19 +1130,23 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) goto out; /* - * remove the objects in open context, so that we won't - * have too much to do in syncing context. + * If async destruction is not enabled try to remove all objects + * while in the open context so that there is less work to do in + * the syncing context. */ - for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, - ds->ds_phys->ds_prev_snap_txg)) { - /* - * Ignore errors, if there is not enough disk space - * we will deal with it in dsl_dataset_destroy_sync(). - */ - (void) dmu_free_object(os, obj); + if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds), + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, + ds->ds_phys->ds_prev_snap_txg)) { + /* + * Ignore errors, if there is not enough disk space + * we will deal with it in dsl_dataset_destroy_sync(). + */ + (void) dmu_free_object(os, obj); + } + if (err != ESRCH) + goto out; } - if (err != ESRCH) - goto out; /* * Only the ZIL knows how to free log blocks. @@ -1288,7 +1292,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) ASSERT(!dsl_dataset_is_snapshot(ds)); if (ds->ds_phys->ds_prev_snap_obj != 0) - mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; + mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes; else mrs_used = 0; @@ -1296,7 +1300,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) ASSERT3U(dlused, <=, mrs_used); ds->ds_phys->ds_unique_bytes = - ds->ds_phys->ds_used_bytes - (mrs_used - dlused); + ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused); if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) @@ -1655,6 +1659,30 @@ process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, ds_next->ds_phys->ds_deadlist_obj); } +static int +old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + int err; + struct killarg ka; + + /* + * Free everything that we point to (that's born after + * the previous snapshot, if we are a clone) + * + * NB: this should be very quick, because we already + * freed all the objects in open context. + */ + ka.ds = ds; + ka.tx = tx; + err = traverse_dataset(ds, + ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, + kill_blkptr, &ka); + ASSERT3U(err, ==, 0); + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); + + return (err); +} + void dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) { @@ -1801,7 +1829,6 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) tx); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, used, comp, uncomp, tx); - dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx); /* Merge our deadlist into next's and free it. */ dsl_deadlist_merge(&ds_next->ds_deadlist, @@ -1877,32 +1904,54 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) } dsl_dataset_rele(ds_next, FTAG); } else { + zfeature_info_t *async_destroy = + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; + /* * There's no next snapshot, so this is a head dataset. * Destroy the deadlist. Unless it's a clone, the * deadlist should be empty. (If it's a clone, it's * safe to ignore the deadlist contents.) */ - struct killarg ka; - dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); ds->ds_phys->ds_deadlist_obj = 0; - /* - * Free everything that we point to (that's born after - * the previous snapshot, if we are a clone) - * - * NB: this should be very quick, because we already - * freed all the objects in open context. - */ - ka.ds = ds; - ka.tx = tx; - err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, - TRAVERSE_POST, kill_blkptr, &ka); - ASSERT3U(err, ==, 0); - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - ds->ds_phys->ds_unique_bytes == 0); + if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { + err = old_synchronous_dataset_destroy(ds, tx); + } else { + /* + * Move the bptree into the pool's list of trees to + * clean up and update space accounting information. + */ + uint64_t used, comp, uncomp; + + ASSERT(err == 0 || err == EBUSY); + if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { + spa_feature_incr(dp->dp_spa, async_destroy, tx); + dp->dp_bptree_obj = bptree_alloc( + dp->dp_meta_objset, tx); + VERIFY(zap_add(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj, tx) == 0); + } + + used = ds->ds_dir->dd_phys->dd_used_bytes; + comp = ds->ds_dir->dd_phys->dd_compressed_bytes; + uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; + + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + ds->ds_phys->ds_unique_bytes == used); + + bptree_add(dp->dp_meta_objset, dp->dp_bptree_obj, + &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, + used, comp, uncomp, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + -used, -comp, -uncomp, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); + } if (ds->ds_prev != NULL) { if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { @@ -2095,7 +2144,7 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = crtxg; dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; - dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; + dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes; dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; dsphys->ds_flags = ds->ds_phys->ds_flags; @@ -2219,10 +2268,22 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) zap_cursor_advance(&zc)) { dsl_dataset_t *clone; char buf[ZFS_MAXNAMELEN]; + /* + * Even though we hold the dp_config_rwlock, the dataset + * may fail to open, returning ENOENT. If there is a + * thread concurrently attempting to destroy this + * dataset, it will have the ds_rwlock held for + * RW_WRITER. Our call to dsl_dataset_hold_obj() -> + * dsl_dataset_hold_ref() will fail its + * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the + * dp_config_rwlock, and wait for the destroy progress + * and signal ds_exclusive_cv. If the destroy was + * successful, we will see that + * DSL_DATASET_IS_DESTROYED(), and return ENOENT. + */ if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, - za.za_first_integer, FTAG, &clone) != 0) { - goto fail; - } + za.za_first_integer, FTAG, &clone) != 0) + continue; dsl_dir_name(clone->ds_dir, buf); VERIFY(nvlist_add_boolean(val, buf) == 0); dsl_dataset_rele(clone, FTAG); @@ -2345,7 +2406,7 @@ dsl_dataset_space(dsl_dataset_t *ds, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { - *refdbytesp = ds->ds_phys->ds_used_bytes; + *refdbytesp = ds->ds_phys->ds_referenced_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; @@ -2688,7 +2749,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) * Note however, if we stop before we reach the ORIGIN we get: * uN + kN + kN-1 + ... + kM - uM-1 */ - pa->used = origin_ds->ds_phys->ds_used_bytes; + pa->used = origin_ds->ds_phys->ds_referenced_bytes; pa->comp = origin_ds->ds_phys->ds_compressed_bytes; pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; for (snap = list_head(&pa->shared_snaps); snap; @@ -2722,7 +2783,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) * so we need to subtract out the clone origin's used space. */ if (pa->origin_origin) { - pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; + pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes; pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; } @@ -3238,8 +3299,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) dsl_deadlist_space(&csa->ohds->ds_deadlist, &odl_used, &odl_comp, &odl_uncomp); - dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - - (csa->ohds->ds_phys->ds_used_bytes + odl_used); + dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used - + (csa->ohds->ds_phys->ds_referenced_bytes + odl_used); dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + @@ -3268,8 +3329,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) } /* swap ds_*_bytes */ - SWITCH64(csa->ohds->ds_phys->ds_used_bytes, - csa->cds->ds_phys->ds_used_bytes); + SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes, + csa->cds->ds_phys->ds_referenced_bytes); SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, csa->cds->ds_phys->ds_compressed_bytes); SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, @@ -3398,8 +3459,9 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, * on-disk is over quota and there are no pending changes (which * may free up space for us). */ - if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { - if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) + if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) { + if (inflight > 0 || + ds->ds_phys->ds_referenced_bytes < ds->ds_quota) error = ERESTART; else error = EDQUOT; @@ -3426,7 +3488,7 @@ dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) if (psa->psa_effective_value == 0) return (0); - if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes || + if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes || psa->psa_effective_value < ds->ds_reserved) return (ENOSPC); @@ -4180,8 +4242,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, dsl_pool_t *dp = new->ds_dir->dd_pool; *usedp = 0; - *usedp += new->ds_phys->ds_used_bytes; - *usedp -= oldsnap->ds_phys->ds_used_bytes; + *usedp += new->ds_phys->ds_referenced_bytes; + *usedp -= oldsnap->ds_phys->ds_referenced_bytes; *compp = 0; *compp += new->ds_phys->ds_compressed_bytes; @@ -4197,9 +4259,13 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, dsl_dataset_t *snap; uint64_t used, comp, uncomp; - err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); - if (err != 0) - break; + if (snapobj == new->ds_object) { + snap = new; + } else { + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); + if (err != 0) + break; + } if (snap->ds_phys->ds_prev_snap_txg == oldsnap->ds_phys->ds_creation_txg) { @@ -4228,7 +4294,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, * was not a snapshot of/before new. */ snapobj = snap->ds_phys->ds_prev_snap_obj; - dsl_dataset_rele(snap, FTAG); + if (snap != new) + dsl_dataset_rele(snap, FTAG); if (snapobj == 0) { err = EINVAL; break; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c index 0b5fa0b..8f9d2c5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -171,10 +171,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { - jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, - DMU_OT_NONE, 0, tx); - VERIFY(zap_update(mos, zapobj, - whokey, 8, 1, &jumpobj, tx) == 0); + jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS, + zapobj, whokey, tx); } while (permpair = nvlist_next_nvpair(perms, permpair)) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c index 7f7a3b9..d698e59 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/dsl_pool.h> @@ -40,6 +40,8 @@ #include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/bptree.h> +#include <sys/zfeature.h> int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ @@ -125,20 +127,32 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) } int -dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) +dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); + + err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, + &dp->dp_meta_objset); + if (err != 0) + dsl_pool_close(dp); + else + *dpp = dp; + + return (err); +} + +int +dsl_pool_open(dsl_pool_t *dp) +{ + int err; dsl_dir_t *dd; dsl_dataset_t *ds; uint64_t obj; - rw_enter(&dp->dp_config_rwlock, RW_WRITER); - err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, - &dp->dp_meta_objset); - if (err) - goto out; + ASSERT(!dmu_objset_is_dirty_anywhere(dp->dp_meta_objset)); + rw_enter(&dp->dp_config_rwlock, RW_WRITER); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &dp->dp_root_dir_obj); @@ -154,7 +168,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; - if (spa_version(spa) >= SPA_VERSION_ORIGIN) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); if (err) goto out; @@ -171,7 +185,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) goto out; } - if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir); if (err) @@ -185,6 +199,15 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) dp->dp_meta_objset, obj)); } + if (spa_feature_is_active(dp->dp_spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj); + if (err != 0) + goto out; + } + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj); @@ -193,15 +216,10 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; - err = dsl_scan_init(dp, txg); + err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); out: rw_exit(&dp->dp_config_rwlock); - if (err) - dsl_pool_close(dp); - else - *dpp = dp; - return (err); } @@ -495,7 +513,7 @@ int dsl_pool_sync_context(dsl_pool_t *dp) { return (curthread == dp->dp_tx.tx_sync_thread || - spa_get_dsl(dp->dp_spa) == NULL); + spa_is_initializing(dp->dp_spa)); } uint64_t @@ -813,11 +831,8 @@ dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT(dp->dp_tmp_userrefs_obj == 0); ASSERT(dmu_tx_is_syncing(tx)); - dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, - DMU_OT_NONE, 0, tx); - - VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, - sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0); + dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); } static int diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c index 475b494..164a73a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/dsl_scan.h> @@ -44,6 +45,7 @@ #include <sys/ddt.h> #include <sys/sa.h> #include <sys/sa_impl.h> +#include <sys/zfeature.h> #ifdef _KERNEL #include <sys/zfs_vfsops.h> #endif @@ -381,55 +383,6 @@ dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, priority, zio_flags, arc_flags, zb)); } -static boolean_t -bookmark_is_zero(const zbookmark_t *zb) -{ - return (zb->zb_objset == 0 && zb->zb_object == 0 && - zb->zb_level == 0 && zb->zb_blkid == 0); -} - -/* dnp is the dnode for zb1->zb_object */ -static boolean_t -bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, - const zbookmark_t *zb2) -{ - uint64_t zb1nextL0, zb2thisobj; - - ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb2->zb_level == 0); - - /* - * A bookmark in the deadlist is considered to be after - * everything else. - */ - if (zb2->zb_object == DMU_DEADLIST_OBJECT) - return (B_TRUE); - - /* The objset_phys_t isn't before anything. */ - if (dnp == NULL) - return (B_FALSE); - - zb1nextL0 = (zb1->zb_blkid + 1) << - ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); - - zb2thisobj = zb2->zb_object ? zb2->zb_object : - zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); - - if (zb1->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t nextobj = zb1nextL0 * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; - return (nextobj <= zb2thisobj); - } - - if (zb1->zb_object < zb2thisobj) - return (B_TRUE); - if (zb1->zb_object > zb2thisobj) - return (B_FALSE); - if (zb2->zb_object == DMU_META_DNODE_OBJECT) - return (B_FALSE); - return (zb1nextL0 <= zb2->zb_blkid); -} - static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { @@ -461,7 +414,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb) if (scn->scn_pausing) return (B_TRUE); /* we're already pausing */ - if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark)) + if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. */ @@ -616,13 +569,13 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, /* * We never skip over user/group accounting objects (obj<0) */ - if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) && + if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && (int64_t)zb->zb_object >= 0) { /* * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ - if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) + if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* @@ -815,22 +768,6 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return; - if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) { - /* - * For non-user-accounting blocks, we need to read the - * new bp (from a deleted snapshot, found in - * check_existing_xlation). If we used the old bp, - * pointers inside this block from before we resumed - * would be untranslated. - * - * For user-accounting blocks, we need to read the old - * bp, because we will apply the entire space delta to - * it (original untranslated -> translations from - * deleted snap -> now). - */ - bp_toread = *bp; - } - if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx, &buf) != 0) return; @@ -1395,19 +1332,28 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) zap_cursor_fini(&zc); } -static int -dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +static boolean_t +dsl_scan_free_should_pause(dsl_scan_t *scn) { - dsl_scan_t *scn = arg; uint64_t elapsed_nanosecs; elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; - - if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && txg_sync_waiting(scn->scn_dp)) || - spa_shutting_down(scn->scn_dp->dp_spa)) - return (ERESTART); + spa_shutting_down(scn->scn_dp->dp_spa)); +} + +static int +dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg; + + if (!scn->scn_is_bptree || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { + if (dsl_scan_free_should_pause(scn)) + return (ERESTART); + } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, dmu_tx_get_txg(tx), bp, 0)); @@ -1432,6 +1378,10 @@ dsl_scan_active(dsl_scan_t *scn) if (scn->scn_phys.scn_state == DSS_SCANNING) return (B_TRUE); + if (spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + return (B_TRUE); + } if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); @@ -1478,14 +1428,40 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) * traversing it. */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + scn->scn_is_bptree = B_FALSE; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, - dsl_scan_free_cb, scn, tx); + dsl_scan_free_block_cb, scn, tx); VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + + if (err == 0 && spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + scn->scn_is_bptree = B_TRUE; + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bptree_iterate(dp->dp_meta_objset, + dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, + scn, tx); + VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + if (err != 0) + return; + + /* disable async destroy feature */ + spa_feature_decr(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx); + ASSERT(!spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])); + VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, tx)); + VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset, + dp->dp_bptree_obj, tx)); + dp->dp_bptree_obj = 0; + } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " - "free_bpobj txg %llu", + "free_bpobj/bptree txg %llu", (longlong_t)scn->scn_visited_this_txg, (longlong_t) (gethrtime() - scn->scn_sync_start_time) / MICROSEC, @@ -1600,6 +1576,8 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; + if (t & DMU_OT_NEWTYPE) + t = DMU_OT_OTHER; zfs_blkstat_t *zb = &zab->zab_type[l][t]; int equal; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c index 69374fb..173198d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c @@ -18,9 +18,11 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 iXsystems, Inc + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -427,10 +429,9 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, char attr_name[8]; if (sa->sa_layout_attr_obj == 0) { - sa->sa_layout_attr_obj = zap_create(os, - DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1, - &sa->sa_layout_attr_obj, tx) == 0); + sa->sa_layout_attr_obj = zap_create_link(os, + DMU_OT_SA_ATTR_LAYOUTS, + sa->sa_master_obj, SA_LAYOUTS, tx); } (void) snprintf(attr_name, sizeof (attr_name), @@ -1552,10 +1553,9 @@ sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) } if (sa->sa_reg_attr_obj == 0) { - sa->sa_reg_attr_obj = zap_create(hdl->sa_os, - DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj, - SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0); + sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os, + DMU_OT_SA_ATTR_REGISTRATION, + sa->sa_master_obj, SA_REGISTRY, tx); } for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_registered) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 2060aa8..c796533 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -60,6 +60,7 @@ #include <sys/spa_boot.h> #include <sys/zfs_ioctl.h> #include <sys/dsl_scan.h> +#include <sys/zfeature.h> #include <sys/zvol.h> #ifdef _KERNEL @@ -117,6 +118,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, }; +static dsl_syncfunc_t spa_sync_version; static dsl_syncfunc_t spa_sync_props; static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, @@ -176,6 +178,7 @@ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { vdev_t *rvd = spa->spa_root_vdev; + dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size; uint64_t alloc; uint64_t space; @@ -222,6 +225,22 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); } + if (pool != NULL) { + dsl_dir_t *freedir = pool->dp_free_dir; + + /* + * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, + * when opening pools before this version freedir will be NULL. + */ + if (freedir != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, + freedir->dd_phys->dd_used_bytes, src); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, + NULL, 0, src); + } + } + spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { @@ -361,25 +380,55 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum; + boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { - zpool_prop_t prop; - char *propname, *strval; uint64_t intval; - objset_t *os; - char *slash, *check; + char *strval, *slash, *check, *fname; + const char *propname = nvpair_name(elem); + zpool_prop_t prop = zpool_name_to_prop(propname); + + switch (prop) { + case ZPROP_INVAL: + if (!zpool_prop_feature(propname)) { + error = EINVAL; + break; + } - propname = nvpair_name(elem); + /* + * Sanitize the input. + */ + if (nvpair_type(elem) != DATA_TYPE_UINT64) { + error = EINVAL; + break; + } - if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) - return (EINVAL); + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + + if (intval != 0) { + error = EINVAL; + break; + } + + fname = strchr(propname, '@') + 1; + if (zfeature_lookup_name(fname, NULL) != 0) { + error = EINVAL; + break; + } + + has_feature = B_TRUE; + break; - switch (prop) { case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && - (intval < spa_version(spa) || intval > SPA_VERSION)) + (intval < spa_version(spa) || + intval > SPA_VERSION_BEFORE_FEATURES || + has_feature)) error = EINVAL; break; @@ -416,6 +465,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) error = nvpair_value_string(elem, &strval); if (!error) { + objset_t *os; uint64_t compress; if (strval == NULL || strval[0] == '\0') { @@ -565,33 +615,58 @@ int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; - nvpair_t *elem; + nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; - zpool_prop_t prop; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); - elem = NULL; while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { - if ((prop = zpool_name_to_prop( - nvpair_name(elem))) == ZPROP_INVAL) - return (EINVAL); + zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; + if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { + uint64_t ver; + + if (prop == ZPOOL_PROP_VERSION) { + VERIFY(nvpair_value_uint64(elem, &ver) == 0); + } else { + ASSERT(zpool_prop_feature(nvpair_name(elem))); + ver = SPA_VERSION_FEATURES; + need_sync = B_TRUE; + } + + /* Save time if the version is already set. */ + if (ver == spa_version(spa)) + continue; + + /* + * In addition to the pool directory object, we might + * create the pool properties object, the features for + * read object, the features for write object, or the + * feature descriptions object. + */ + error = dsl_sync_task_do(spa_get_dsl(spa), NULL, + spa_sync_version, spa, &ver, 6); + if (error) + return (error); + continue; + } + need_sync = B_TRUE; break; } - if (need_sync) + if (need_sync) { return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, - spa, nvp, 3)); - else - return (0); + spa, nvp, 6)); + } + + return (0); } /* @@ -1630,7 +1705,7 @@ spa_load_verify_done(zio_t *zio) int error = zio->io_error; if (error) { - if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && + if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_add_64(&sle->sle_meta_count, 1); else @@ -1860,6 +1935,9 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, KM_SLEEP) == 0); } + nvlist_free(spa->spa_load_info); + spa->spa_load_info = fnvlist_alloc(); + gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, pool_guid, config, state, type, mosconfig, &ereport); @@ -1892,12 +1970,14 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, { int error = 0; nvlist_t *nvroot = NULL; + nvlist_t *label; vdev_t *rvd; uberblock_t *ub = &spa->spa_uberblock; uint64_t children, config_cache_txg = spa->spa_config_txg; int orig_mode = spa->spa_mode; int parse; uint64_t obj; + boolean_t missing_feat_write = B_FALSE; /* * If this is an untrusted config, access the pool in read-only mode. @@ -1977,19 +2057,78 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* * Find the best uberblock. */ - vdev_uberblock_load(NULL, rvd, ub); + vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. */ - if (ub->ub_txg == 0) + if (ub->ub_txg == 0) { + nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); + } /* - * If the pool is newer than the code, we can't open it. + * If the pool has an unsupported version we can't open it. */ - if (ub->ub_version > SPA_VERSION) + if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { + nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); + } + + if (ub->ub_version >= SPA_VERSION_FEATURES) { + nvlist_t *features; + + /* + * If we weren't able to find what's necessary for reading the + * MOS in the label, return failure. + */ + if (label == NULL || nvlist_lookup_nvlist(label, + ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { + nvlist_free(label); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + ENXIO)); + } + + /* + * Update our in-core representation with the definitive values + * from the label. + */ + nvlist_free(spa->spa_label_features); + VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); + } + + nvlist_free(label); + + /* + * Look through entries in the label nvlist's features_for_read. If + * there is a feature listed there which we don't understand then we + * cannot open a pool. + */ + if (ub->ub_version >= SPA_VERSION_FEATURES) { + nvlist_t *unsup_feat; + + VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == + 0); + + for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, + NULL); nvp != NULL; + nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { + if (!zfeature_is_supported(nvpair_name(nvp))) { + VERIFY(nvlist_add_string(unsup_feat, + nvpair_name(nvp), "") == 0); + } + } + + if (!nvlist_empty(unsup_feat)) { + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); + nvlist_free(unsup_feat); + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + + nvlist_free(unsup_feat); + } /* * If the vdev guid sum doesn't match the uberblock, we have an @@ -2023,7 +2162,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; - error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); + error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; @@ -2031,6 +2170,84 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (spa_version(spa) >= SPA_VERSION_FEATURES) { + boolean_t missing_feat_read = B_FALSE; + nvlist_t *unsup_feat; + + if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, + &spa->spa_feat_for_read_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, + &spa->spa_feat_for_write_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, + &spa->spa_feat_desc_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == + 0); + + if (!feature_is_supported(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, + unsup_feat)) + missing_feat_read = B_TRUE; + + if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { + if (!feature_is_supported(spa->spa_meta_objset, + spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj, + unsup_feat)) + missing_feat_write = B_TRUE; + } + + if (!nvlist_empty(unsup_feat)) { + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); + } + + nvlist_free(unsup_feat); + + if (!missing_feat_read) { + fnvlist_add_boolean(spa->spa_load_info, + ZPOOL_CONFIG_CAN_RDONLY); + } + + /* + * If the state is SPA_LOAD_TRYIMPORT, our objective is + * twofold: to determine whether the pool is available for + * import in read-write mode and (if it is not) whether the + * pool is available for import in read-only mode. If the pool + * is available for import in read-write mode, it is displayed + * as available in userland; if it is not available for import + * in read-only mode, it is displayed as unavailable in + * userland. If the pool is available for import in read-only + * mode but not read-write mode, it is displayed as unavailable + * in userland with a special note that the pool is actually + * available for open in read-only mode. + * + * As a result, if the state is SPA_LOAD_TRYIMPORT and we are + * missing a feature for write, we must first determine whether + * the pool can be opened read-only before returning to + * userland in order to know whether to display the + * abovementioned note. + */ + if (missing_feat_read || (missing_feat_write && + spa_writeable(spa))) { + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + } + + spa->spa_is_initializing = B_TRUE; + error = dsl_pool_open(spa->spa_dsl_pool); + spa->spa_is_initializing = B_FALSE; + if (error != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (!mosconfig) { uint64_t hostid; nvlist_t *policy = NULL, *nvconfig; @@ -2248,7 +2465,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, nvlist_free(nvconfig); /* - * Now that we've validate the config, check the state of the + * Now that we've validated the config, check the state of the * root vdev. If it can't be opened, it indicates one or * more toplevel vdevs are faulted. */ @@ -2261,6 +2478,17 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, } } + if (missing_feat_write) { + ASSERT(state == SPA_LOAD_TRYIMPORT); + + /* + * At this point, we know that we can open the pool in + * read-only mode but not read-write mode. We now have enough + * information and can return to userland. + */ + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); + } + /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. @@ -2370,10 +2598,18 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); } +/* + * If spa_load() fails this function will try loading prior txg's. If + * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool + * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this + * function will not rewind the pool and will return the same error as + * spa_load(). + */ static int spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, uint64_t max_request, int rewind_flags) { + nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; @@ -2402,9 +2638,18 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, return (load_error); } - /* Price of rolling back is discarding txgs, including log */ - if (state == SPA_LOAD_RECOVER) + if (state == SPA_LOAD_RECOVER) { + /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + /* + * If we aren't rolling back save the load info from our first + * import attempt so that we can restore it after attempting + * to rewind. + */ + loadinfo = spa->spa_load_info; + spa->spa_load_info = fnvlist_alloc(); + } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; @@ -2428,7 +2673,20 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); - return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); + if (state == SPA_LOAD_RECOVER) { + ASSERT3P(loadinfo, ==, NULL); + return (rewind_error); + } else { + /* Store the rewind info as part of the initial load info */ + fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, + spa->spa_load_info); + + /* Restore the initial load info */ + fnvlist_free(spa->spa_load_info); + spa->spa_load_info = loadinfo; + + return (load_error); + } } /* @@ -2707,8 +2965,50 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) } } +static void +spa_add_feature_stats(spa_t *spa, nvlist_t *config) +{ + nvlist_t *features; + zap_cursor_t zc; + zap_attribute_t za; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + if (spa->spa_feat_for_read_obj != 0) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_feat_for_read_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + za.za_first_integer)); + } + zap_cursor_fini(&zc); + } + + if (spa->spa_feat_for_write_obj != 0) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_feat_for_write_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + za.za_first_integer)); + } + zap_cursor_fini(&zc); + } + + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, + features) == 0); + nvlist_free(features); +} + int -spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) +spa_get_stats(const char *name, nvlist_t **config, + char *altroot, size_t buflen) { int error; spa_t *spa; @@ -2743,6 +3043,7 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); + spa_add_feature_stats(spa, *config); } } @@ -2963,6 +3264,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj; + boolean_t has_features; /* * If this pool already exists, return failure. @@ -2988,10 +3290,18 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, return (error); } - if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), - &version) != 0) + has_features = B_FALSE; + for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); + elem != NULL; elem = nvlist_next_nvpair(props, elem)) { + if (zpool_prop_feature(nvpair_name(elem))) + has_features = B_TRUE; + } + + if (has_features || nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; - ASSERT(version <= SPA_VERSION); + } + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; @@ -3067,8 +3377,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_l2cache.sav_sync = B_TRUE; } + spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; + spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). @@ -3092,6 +3404,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, cmn_err(CE_PANIC, "failed to add pool config"); } + if (spa_version(spa) >= SPA_VERSION_FEATURES) + spa_feature_create_zap_objects(spa, tx); + if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { @@ -3283,7 +3598,7 @@ spa_import_rootpool(char *devpath, char *devid) } #endif if (config == NULL) { - cmn_err(CE_NOTE, "Can not read the pool label from '%s'", + cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", devpath); return (EIO); } @@ -3603,6 +3918,8 @@ spa_tryimport(nvlist_t *tryconfig) state) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); /* * If the bootfs property exists on this pool then we @@ -5328,7 +5645,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) * information. This avoids the dbuf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ - bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); + bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = kmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, @@ -5413,6 +5730,24 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } +static void +spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + uint64_t version = *(uint64_t *)arg2; + + /* + * Setting the version is special cased when first creating the pool. + */ + ASSERT(tx->tx_txg != TXG_INITIAL); + + ASSERT(version <= SPA_VERSION); + ASSERT(version >= spa_version(spa)); + + spa->spa_uberblock.ub_version = version; + vdev_config_dirty(spa->spa_root_vdev); +} + /* * Set zpool properties. */ @@ -5422,32 +5757,38 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) spa_t *spa = arg1; objset_t *mos = spa->spa_meta_objset; nvlist_t *nvp = arg2; - nvpair_t *elem; - uint64_t intval; - char *strval; - zpool_prop_t prop; - const char *propname; - zprop_type_t proptype; + nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); - elem = NULL; while ((elem = nvlist_next_nvpair(nvp, elem))) { + uint64_t intval; + char *strval, *fname; + zpool_prop_t prop; + const char *propname; + zprop_type_t proptype; + zfeature_info_t *feature; + switch (prop = zpool_name_to_prop(nvpair_name(elem))) { + case ZPROP_INVAL: + /* + * We checked this earlier in spa_prop_validate(). + */ + ASSERT(zpool_prop_feature(nvpair_name(elem))); + + fname = strchr(nvpair_name(elem), '@') + 1; + VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature)); + + spa_feature_enable(spa, feature, tx); + break; + case ZPOOL_PROP_VERSION: + VERIFY(nvpair_value_uint64(elem, &intval) == 0); /* - * Only set version for non-zpool-creation cases - * (set/import). spa_create() needs special care - * for version setting. + * The version is synced seperatly before other + * properties and should be correct by now. */ - if (tx->tx_txg != TXG_INITIAL) { - VERIFY(nvpair_value_uint64(elem, - &intval) == 0); - ASSERT(intval <= SPA_VERSION); - ASSERT(intval >= spa_version(spa)); - spa->spa_uberblock.ub_version = intval; - vdev_config_dirty(spa->spa_root_vdev); - } + ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: @@ -5484,14 +5825,10 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { - VERIFY((spa->spa_pool_props_object = - zap_create(mos, DMU_OT_POOL_PROPS, - DMU_OT_NONE, 0, tx)) > 0); - - VERIFY(zap_update(mos, + spa->spa_pool_props_object = + zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, - 8, 1, &spa->spa_pool_props_object, tx) - == 0); + tx); } /* normalize the property name */ @@ -5590,6 +5927,11 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && + spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { + spa_feature_create_zap_objects(spa, tx); + } } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c index a072233..3dae0ba 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -35,6 +35,7 @@ #include <sys/zfs_ioctl.h> #include <sys/utsname.h> #include <sys/sunddi.h> +#include <sys/zfeature.h> #ifdef _KERNEL #include <sys/kobj.h> #include <sys/zone.h> @@ -407,6 +408,12 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); + /* + * Store what's necessary for reading the MOS in the label. + */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, + spa->spa_label_features) == 0); + if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { ddt_histogram_t *ddh; ddt_stat_t *dds; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index 6342452..83c3d31 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ @@ -48,6 +48,7 @@ #include <sys/arc.h> #include <sys/ddt.h> #include "zfs_prop.h" +#include "zfeature_common.h" /* * SPA locking @@ -216,7 +217,7 @@ * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. * - * spa_rename() is also implemented within this file since is requires + * spa_rename() is also implemented within this file since it requires * manipulation of the namespace. */ @@ -487,8 +488,22 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (config != NULL) + if (config != NULL) { + nvlist_t *features; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, + &features) == 0) { + VERIFY(nvlist_dup(features, &spa->spa_label_features, + 0) == 0); + } + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); + } + + if (spa->spa_label_features == NULL) { + VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + } return (spa); } @@ -525,6 +540,7 @@ spa_remove(spa_t *spa) list_destroy(&spa->spa_config_list); + nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); spa_config_set(spa, NULL); @@ -1033,6 +1049,20 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) * ========================================================================== */ +void +spa_activate_mos_feature(spa_t *spa, const char *feature) +{ + (void) nvlist_add_boolean(spa->spa_label_features, feature); + vdev_config_dirty(spa->spa_root_vdev); +} + +void +spa_deactivate_mos_feature(spa_t *spa, const char *feature) +{ + (void) nvlist_remove_all(spa->spa_label_features, feature); + vdev_config_dirty(spa->spa_root_vdev); +} + /* * Rename a spa_t. */ @@ -1183,12 +1213,22 @@ spa_generate_guid(spa_t *spa) void sprintf_blkptr(char *buf, const blkptr_t *bp) { - char *type = NULL; + char type[256]; char *checksum = NULL; char *compress = NULL; if (bp != NULL) { - type = dmu_ot[BP_GET_TYPE(bp)].ot_name; + if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); + (void) snprintf(type, sizeof (type), "bswap %s %s", + DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? + "metadata" : "data", + dmu_ot_byteswap[bswap].ob_name); + } else { + (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, + sizeof (type)); + } checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } @@ -1270,6 +1310,12 @@ spa_get_dsl(spa_t *spa) return (spa->spa_dsl_pool); } +boolean_t +spa_is_initializing(spa_t *spa) +{ + return (spa->spa_is_initializing); +} + blkptr_t * spa_get_rootblkptr(spa_t *spa) { @@ -1553,6 +1599,7 @@ spa_init(int mode) vdev_cache_stat_init(); zfs_prop_init(); zpool_prop_init(); + zpool_feature_init(); spa_config_load(); l2arc_start(); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h new file mode 100644 index 0000000..9715072 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h @@ -0,0 +1,64 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _SYS_BPTREE_H +#define _SYS_BPTREE_H + +#include <sys/spa.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct bptree_phys { + uint64_t bt_begin; + uint64_t bt_end; + uint64_t bt_bytes; + uint64_t bt_comp; + uint64_t bt_uncomp; +} bptree_phys_t; + +typedef struct bptree_entry_phys { + blkptr_t be_bp; + uint64_t be_birth_txg; /* only delete blocks born after this txg */ + zbookmark_t be_zb; /* holds traversal resume point if needed */ +} bptree_entry_phys_t; + +typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx); +int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); + +void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, + uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx); + +int bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, + bptree_itor_t func, void *arg, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPTREE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index 4d24f57..c6943e1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -18,11 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. - */ -/* + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ @@ -75,6 +74,53 @@ typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; +typedef enum dmu_object_byteswap { + DMU_BSWAP_UINT8, + DMU_BSWAP_UINT16, + DMU_BSWAP_UINT32, + DMU_BSWAP_UINT64, + DMU_BSWAP_ZAP, + DMU_BSWAP_DNODE, + DMU_BSWAP_OBJSET, + DMU_BSWAP_ZNODE, + DMU_BSWAP_OLDACL, + DMU_BSWAP_ACL, + /* + * Allocating a new byteswap type number makes the on-disk format + * incompatible with any other format that uses the same number. + * + * Data can usually be structured to work with one of the + * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. + */ + DMU_BSWAP_NUMFUNCS +} dmu_object_byteswap_t; + +#define DMU_OT_NEWTYPE 0x80 +#define DMU_OT_METADATA 0x40 +#define DMU_OT_BYTESWAP_MASK 0x3f + +/* + * Defines a uint8_t object type. Object types specify if the data + * in the object is metadata (boolean) and how to byteswap the data + * (dmu_object_byteswap_t). + */ +#define DMU_OT(byteswap, metadata) \ + (DMU_OT_NEWTYPE | \ + ((metadata) ? DMU_OT_METADATA : 0) | \ + ((byteswap) & DMU_OT_BYTESWAP_MASK)) + +#define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ + (ot) < DMU_OT_NUMTYPES) + +#define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_METADATA) : \ + dmu_ot[(ot)].ot_metadata) + +#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_BYTESWAP_MASK) : \ + dmu_ot[(ot)].ot_byteswap) + typedef enum dmu_object_type { DMU_OT_NONE, /* general: */ @@ -139,7 +185,35 @@ typedef enum dmu_object_type { DMU_OT_DEADLIST_HDR, /* UINT64 */ DMU_OT_DSL_CLONES, /* ZAP */ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ - DMU_OT_NUMTYPES + /* + * Do not allocate new object types here. Doing so makes the on-disk + * format incompatible with any other format that uses the same object + * type number. + * + * When creating an object which does not have one of the above types + * use the DMU_OTN_* type with the correct byteswap and metadata + * values. + * + * The DMU_OTN_* types do not have entries in the dmu_ot table, + * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead + * of indexing into dmu_ot directly (this works for both DMU_OT_* types + * and DMU_OTN_* types). + */ + DMU_OT_NUMTYPES, + + /* + * Names for valid types declared with DMU_OT(). + */ + DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE), + DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE), + DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE), + DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE), + DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE), + DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE), + DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE), + DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE), + DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE), + DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE), } dmu_object_type_t; typedef enum dmu_objset_type { @@ -221,6 +295,9 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); */ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" +#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" +#define DMU_POOL_FEATURES_FOR_READ "features_for_read" +#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" @@ -236,6 +313,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_FREE_BPOBJ "free_bpobj" +#define DMU_POOL_BPTREE_OBJ "bptree_obj" /* * Allocate an object from this objset. The range of object numbers @@ -496,7 +574,7 @@ void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, /* * Free up the data blocks for a defined range of a file. If size is - * zero, the range from offset to end-of-file is freed. + * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); @@ -566,12 +644,18 @@ typedef struct dmu_object_info { typedef void arc_byteswap_func_t(void *buf, size_t size); typedef struct dmu_object_type_info { - arc_byteswap_func_t *ot_byteswap; + dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; char *ot_name; } dmu_object_type_info_t; +typedef struct dmu_object_byteswap_info { + arc_byteswap_func_t *ob_func; + char *ob_name; +} dmu_object_byteswap_info_t; + extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; +extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h index 5b326cd..3cbf42f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_TRAVERSE_H @@ -54,6 +55,9 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, + uint64_t txg_start, zbookmark_t *resume, int flags, + blkptr_cb_t func, void *arg); int traverse_pool(spa_t *spa, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h index 56d2f6b..e3affeb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -22,7 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. * All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ @@ -88,7 +88,12 @@ typedef struct dsl_dataset_phys { uint64_t ds_creation_time; /* seconds since 1970 */ uint64_t ds_creation_txg; uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */ - uint64_t ds_used_bytes; + /* + * ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes + * include all blocks referenced by this dataset, including those + * shared with any other datasets. + */ + uint64_t ds_referenced_bytes; uint64_t ds_compressed_bytes; uint64_t ds_uncompressed_bytes; uint64_t ds_unique_bytes; /* only relevant to snapshots */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h index 57725b5..e2c12ab 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_DSL_POOL_H @@ -34,6 +35,7 @@ #include <sys/ddt.h> #include <sys/arc.h> #include <sys/bpobj.h> +#include <sys/bptree.h> #ifdef __cplusplus extern "C" { @@ -48,7 +50,8 @@ struct dsl_scan; /* These macros are for indexing into the zfs_all_blkstats_t. */ #define DMU_OT_DEFERRED DMU_OT_NONE -#define DMU_OT_TOTAL DMU_OT_NUMTYPES +#define DMU_OT_OTHER DMU_OT_NUMTYPES /* place holder for DMU_OT() types */ +#define DMU_OT_TOTAL (DMU_OT_NUMTYPES + 1) typedef struct zfs_blkstat { uint64_t zb_count; @@ -85,6 +88,7 @@ typedef struct dsl_pool { uint64_t dp_write_limit; uint64_t dp_tmp_userrefs_obj; bpobj_t dp_free_bpobj; + uint64_t dp_bptree_obj; struct dsl_scan *dp_scan; @@ -110,7 +114,8 @@ typedef struct dsl_pool { zfs_all_blkstats_t *dp_blkstats; } dsl_pool_t; -int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); +int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); +int dsl_pool_open(dsl_pool_t *dp); void dsl_pool_close(dsl_pool_t *dp); dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h index c79666e..5691f4d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_DSL_SCAN_H @@ -79,6 +80,9 @@ typedef struct dsl_scan { uint64_t scn_sync_start_time; zio_t *scn_zio_root; + /* for freeing blocks */ + boolean_t scn_is_bptree; + /* for debugging / information */ uint64_t scn_visited_this_txg; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index a2a76e3..ec6914e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ @@ -94,7 +94,7 @@ struct dsl_pool; /* * Size of block to hold the configuration data (a packed nvlist) */ -#define SPA_CONFIG_BLOCKSIZE (1 << 14) +#define SPA_CONFIG_BLOCKSIZE (1ULL << 14) /* * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. @@ -262,7 +262,7 @@ typedef struct blkptr { DVA_GET_ASIZE(&(bp)->blk_dva[2])) #define BP_GET_UCSIZE(bp) \ - ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \ + ((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ @@ -403,8 +403,8 @@ typedef struct blkptr { #include <sys/dmu.h> #define BP_GET_BUFC_TYPE(bp) \ - (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \ - ARC_BUFC_METADATA : ARC_BUFC_DATA); + (((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \ + ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, @@ -415,8 +415,8 @@ typedef enum spa_import_type { extern int spa_open(const char *pool, spa_t **, void *tag); extern int spa_open_rewind(const char *pool, spa_t **, void *tag, nvlist_t *policy, nvlist_t **config); -extern int spa_get_stats(const char *pool, nvlist_t **config, - char *altroot, size_t buflen); +extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, + size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, const char *history_str, nvlist_t *zplprops); extern int spa_import_rootpool(char *devpath, char *devid); @@ -573,6 +573,7 @@ extern void spa_claim_notify(zio_t *zio); /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); +extern boolean_t spa_is_initializing(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); extern void spa_altroot(spa_t *, char *, size_t); @@ -604,6 +605,8 @@ extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); /* Miscellaneous support routines */ +extern void spa_activate_mos_feature(spa_t *spa, const char *feature); +extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); extern int spa_rename(const char *oldname, const char *newname); extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index 88d1477..b427674 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ @@ -127,6 +127,7 @@ struct spa { uint64_t spa_import_flags; /* import specific flags */ taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; + boolean_t spa_is_initializing; /* true while opening pool */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ uint64_t spa_first_txg; /* first txg after spa_open() */ @@ -144,6 +145,7 @@ struct spa { list_t spa_state_dirty_list; /* vdevs with dirty state */ spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ + nvlist_t *spa_label_features; /* Features for reading MOS */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_config_generation; /* config generation number */ uint64_t spa_syncing_txg; /* txg currently syncing */ @@ -220,7 +222,10 @@ struct spa { boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ uint64_t spa_creation_version; /* version at pool creation */ - uint64_t spa_prev_software_version; + uint64_t spa_prev_software_version; /* See ub_software_version */ + uint64_t spa_feat_for_write_obj; /* required to write to pool */ + uint64_t spa_feat_for_read_obj; /* required to read from pool */ + uint64_t spa_feat_desc_obj; /* Feature descriptions */ /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h index aa6559c..2329d5b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. @@ -141,8 +142,8 @@ extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, struct uberblock; extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); extern int vdev_label_number(uint64_t psise, uint64_t offset); -extern nvlist_t *vdev_label_read_config(vdev_t *vd); -extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub); +extern nvlist_t *vdev_label_read_config(vdev_t *vd, int label); +extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index 019d2be..7eed4a1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -202,7 +202,7 @@ struct vdev { * For DTrace to work in userland (libzpool) context, these fields must * remain at the end of the structure. DTrace will use the kernel's * CTF definition for 'struct vdev', and since the size of a kmutex_t is - * larger in userland, the offsets for the rest fields would be + * larger in userland, the offsets for the rest of the fields would be * incorrect. */ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ @@ -257,6 +257,7 @@ typedef struct vdev_label { #define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) #define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) #define VDEV_LABELS 4 +#define VDEV_BEST_LABEL VDEV_LABELS #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h index a1130bb..4d7b315 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_ZAP_H @@ -132,6 +133,8 @@ uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, + uint64_t parent_obj, const char *name, dmu_tx_t *tx); /* * Create a new zapobj with no attributes from the given (unallocated) @@ -300,12 +303,6 @@ int zap_add_int_key(objset_t *os, uint64_t obj, int zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep); -/* - * They name is a stringified version of key; increment its value by - * delta. Zero values will be zap_remove()-ed. - */ -int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, - dmu_tx_t *tx); int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, dmu_tx_t *tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h new file mode 100644 index 0000000..9ff1c93 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZFEATURE_H +#define _SYS_ZFEATURE_H + +#include <sys/dmu.h> +#include <sys/nvpair.h> +#include "zfeature_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern boolean_t feature_is_supported(objset_t *os, uint64_t obj, + uint64_t desc_obj, nvlist_t *unsup_feat); + +struct spa; +extern void spa_feature_create_zap_objects(struct spa *, dmu_tx_t *); +extern void spa_feature_enable(struct spa *, zfeature_info_t *, dmu_tx_t *); +extern void spa_feature_incr(struct spa *, zfeature_info_t *, dmu_tx_t *); +extern void spa_feature_decr(struct spa *, zfeature_info_t *, dmu_tx_t *); +extern boolean_t spa_feature_is_enabled(struct spa *, zfeature_info_t *); +extern boolean_t spa_feature_is_active(struct spa *, zfeature_info_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFEATURE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index 4a4e843..80d9336 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _ZIO_H @@ -270,6 +271,14 @@ typedef struct zbookmark { #define ZB_ZIL_OBJECT (0ULL) #define ZB_ZIL_LEVEL (-2LL) +#define ZB_IS_ZERO(zb) \ + ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ + (zb)->zb_level == 0 && (zb)->zb_blkid == 0) +#define ZB_IS_ROOT(zb) \ + ((zb)->zb_object == ZB_ROOT_OBJECT && \ + (zb)->zb_level == ZB_ROOT_LEVEL && \ + (zb)->zb_blkid == ZB_ROOT_BLKID) + typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; @@ -287,6 +296,7 @@ typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, typedef void zio_cksum_free_f(void *cbdata, size_t size); struct zio_bad_cksum; /* defined in zio_checksum.h */ +struct dnode_phys; struct zio_cksum_report { struct zio_cksum_report *zcr_next; @@ -559,6 +569,10 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, /* Called from spa_sync(), but primarily an injection handler */ extern void spa_handle_ignored_writes(spa_t *spa); +/* zbookmark functions */ +boolean_t zbookmark_is_before(const struct dnode_phys *dnp, + const zbookmark_t *zb1, const zbookmark_t *zb2); + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index ffcefb9..950f2f1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -1329,7 +1329,8 @@ vdev_validate(vdev_t *vd, boolean_t strict) uint64_t aux_guid = 0; nvlist_t *nvl; - if ((label = vdev_label_read_config(vd)) == NULL) { + if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == + NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (0); @@ -1970,14 +1971,14 @@ vdev_validate_aux(vdev_t *vd) if (!vdev_readable(vd)) return (0); - if ((label = vdev_label_read_config(vd)) == NULL) { + if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || - version > SPA_VERSION || + !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c index c08ed8b..b943647 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -121,6 +123,8 @@ * txg Transaction group in which this label was written * pool_guid Unique identifier for this pool * vdev_tree An nvlist describing vdev tree. + * features_for_read + * An nvlist of the features necessary for reading the MOS. * * Each leaf device label also contains the following: * @@ -428,8 +432,13 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config) kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); } +/* + * Returns the configuration from the label of the given vdev. If 'label' is + * VDEV_BEST_LABEL, each label of the vdev will be read until a valid + * configuration is found; otherwise, only the specified label will be read. + */ nvlist_t * -vdev_label_read_config(vdev_t *vd) +vdev_label_read_config(vdev_t *vd, int label) { spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; @@ -447,6 +456,8 @@ vdev_label_read_config(vdev_t *vd) retry: for (int l = 0; l < VDEV_LABELS; l++) { + if (label >= 0 && label < VDEV_LABELS && label != l) + continue; zio = zio_root(spa, NULL, NULL, flags); @@ -496,7 +507,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, /* * Read the label, if any, and perform some basic sanity checks. */ - if ((label = vdev_label_read_config(vd)) == NULL) + if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL) return (B_FALSE); (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, @@ -833,7 +844,7 @@ retry: * come back up, we fail to see the uberblock for txg + 1 because, say, * it was on a mirrored device and the replica to which we wrote txg + 1 * is now offline. If we then make some changes and sync txg + 1, and then - * the missing replica comes back, then for a new seconds we'll have two + * the missing replica comes back, then for a few seconds we'll have two * conflicting uberblocks on disk with the same txg. The solution is simple: * among uberblocks with equal txg, choose the one with the latest timestamp. */ @@ -853,46 +864,50 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) return (0); } +struct ubl_cbdata { + uberblock_t *ubl_ubbest; /* Best uberblock */ + vdev_t *ubl_vd; /* vdev associated with the above */ + int ubl_label; /* Label associated with the above */ +}; + static void vdev_uberblock_load_done(zio_t *zio) { + vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; uberblock_t *ub = zio->io_data; - uberblock_t *ubbest = rio->io_private; + struct ubl_cbdata *cbp = rio->io_private; - ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd)); + ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); if (ub->ub_txg <= spa->spa_load_max_txg && - vdev_uberblock_compare(ub, ubbest) > 0) - *ubbest = *ub; + vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { + /* + * Keep track of the vdev and label in which this + * uberblock was found. We will use this information + * later to obtain the config nvlist associated with + * this uberblock. + */ + *cbp->ubl_ubbest = *ub; + cbp->ubl_vd = vd; + cbp->ubl_label = vdev_label_number(vd->vdev_psize, + zio->io_offset); + } mutex_exit(&rio->io_lock); } zio_buf_free(zio->io_data, zio->io_size); } -void -vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) +static void +vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, + struct ubl_cbdata *cbp) { - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; - - if (vd == rvd) { - ASSERT(zio == NULL); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - zio = zio_root(spa, NULL, ubbest, flags); - bzero(ubbest, sizeof (uberblock_t)); - } - - ASSERT(zio != NULL); - for (int c = 0; c < vd->vdev_children; c++) - vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); + vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { for (int l = 0; l < VDEV_LABELS; l++) { @@ -905,11 +920,45 @@ vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) } } } +} - if (vd == rvd) { - (void) zio_wait(zio); - spa_config_exit(spa, SCL_ALL, FTAG); +/* + * Reads the 'best' uberblock from disk along with its associated + * configuration. First, we read the uberblock array of each label of each + * vdev, keeping track of the uberblock with the highest txg in each array. + * Then, we read the configuration from the same label as the best uberblock. + */ +void +vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) +{ + int i; + zio_t *zio; + spa_t *spa = rvd->vdev_spa; + struct ubl_cbdata cb; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; + + ASSERT(ub); + ASSERT(config); + + bzero(ub, sizeof (uberblock_t)); + *config = NULL; + + cb.ubl_ubbest = ub; + cb.ubl_vd = NULL; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + zio = zio_root(spa, NULL, &cb, flags); + vdev_uberblock_load_impl(zio, rvd, flags, &cb); + (void) zio_wait(zio); + if (cb.ubl_vd != NULL) { + for (i = cb.ubl_label % 2; i < VDEV_LABELS; i += 2) { + *config = vdev_label_read_config(cb.ubl_vd, i); + if (*config != NULL) + break; + } } + spa_config_exit(spa, SCL_ALL, FTAG); } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c index 288a4d9..fa1d99f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -946,6 +947,19 @@ fzap_prefetch(zap_name_t *zn) * Helper functions for consumers. */ +uint64_t +zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, + const char *name, dmu_tx_t *tx) +{ + uint64_t new_obj; + + VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0); + VERIFY(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, + tx) == 0); + + return (new_obj); +} + int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c index 6e506a4..dfdce04 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zio.h> @@ -472,7 +472,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); - ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif @@ -596,7 +596,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); - ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c new file mode 100644 index 0000000..ba72208 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c @@ -0,0 +1,414 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/zfeature.h> +#include <sys/dmu.h> +#include <sys/nvpair.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> +#include "zfeature_common.h" +#include <sys/spa_impl.h> + +/* + * ZFS Feature Flags + * ----------------- + * + * ZFS feature flags are used to provide fine-grained versioning to the ZFS + * on-disk format. Once enabled on a pool feature flags replace the old + * spa_version() number. + * + * Each new on-disk format change will be given a uniquely identifying string + * guid rather than a version number. This avoids the problem of different + * organizations creating new on-disk formats with the same version number. To + * keep feature guids unique they should consist of the reverse dns name of the + * organization which implemented the feature and a short name for the feature, + * separated by a colon (e.g. com.delphix:async_destroy). + * + * Reference Counts + * ---------------- + * + * Within each pool features can be in one of three states: disabled, enabled, + * or active. These states are differentiated by a reference count stored on + * disk for each feature: + * + * 1) If there is no reference count stored on disk the feature is disabled. + * 2) If the reference count is 0 a system administrator has enabled the + * feature, but the feature has not been used yet, so no on-disk + * format changes have been made. + * 3) If the reference count is greater than 0 the feature is active. + * The format changes required by the feature are currently on disk. + * Note that if the feature's format changes are reversed the feature + * may choose to set its reference count back to 0. + * + * Feature flags makes no differentiation between non-zero reference counts + * for an active feature (e.g. a reference count of 1 means the same thing as a + * reference count of 27834721), but feature implementations may choose to use + * the reference count to store meaningful information. For example, a new RAID + * implementation might set the reference count to the number of vdevs using + * it. If all those disks are removed from the pool the feature goes back to + * having a reference count of 0. + * + * It is the responsibility of the individual features to maintain a non-zero + * reference count as long as the feature's format changes are present on disk. + * + * Dependencies + * ------------ + * + * Each feature may depend on other features. The only effect of this + * relationship is that when a feature is enabled all of its dependencies are + * automatically enabled as well. Any future work to support disabling of + * features would need to ensure that features cannot be disabled if other + * enabled features depend on them. + * + * On-disk Format + * -------------- + * + * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES + * (5000). In order for this to work the pool is automatically upgraded to + * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature flags on disk + * format changes will be in use. + * + * Information about features is stored in 3 ZAP objects in the pool's MOS. + * These objects are linked to by the following names in the pool directory + * object: + * + * 1) features_for_read: feature guid -> reference count + * Features needed to open the pool for reading. + * 2) features_for_write: feature guid -> reference count + * Features needed to open the pool for writing. + * 3) feature_descriptions: feature guid -> descriptive string + * A human readable string. + * + * All enabled features appear in either features_for_read or + * features_for_write, but not both. + * + * To open a pool in read-only mode only the features listed in + * features_for_read need to be supported. + * + * To open the pool in read-write mode features in both features_for_read and + * features_for_write need to be supported. + * + * Some features may be required to read the ZAP objects containing feature + * information. To allow software to check for compatibility with these features + * before the pool is opened their names must be stored in the label in a + * new "features_for_read" entry (note that features that are only required + * to write to a pool never need to be stored in the label since the + * features_for_write ZAP object can be read before the pool is written to). + * To save space in the label features must be explicitly marked as needing to + * be written to the label. Also, reference counts are not stored in the label, + * instead any feature whose reference count drops to 0 is removed from the + * label. + * + * Adding New Features + * ------------------- + * + * Features must be registered in zpool_feature_init() function in + * zfeature_common.c using the zfeature_register() function. This function + * has arguments to specify if the feature should be stored in the + * features_for_read or features_for_write ZAP object and if it needs to be + * written to the label when active. + * + * Once a feature is registered it will appear as a "feature@<feature name>" + * property which can be set by an administrator. Feature implementors should + * use the spa_feature_is_enabled() and spa_feature_is_active() functions to + * query the state of a feature and the spa_feature_incr() and + * spa_feature_decr() functions to change an enabled feature's reference count. + * Reference counts may only be updated in the syncing context. + * + * Features may not perform enable-time initialization. Instead, any such + * initialization should occur when the feature is first used. This design + * enforces that on-disk changes be made only when features are used. Code + * should only check if a feature is enabled using spa_feature_is_enabled(), + * not by relying on any feature specific metadata existing. If a feature is + * enabled, but the feature's metadata is not on disk yet then it should be + * created as needed. + * + * As an example, consider the com.delphix:async_destroy feature. This feature + * relies on the existence of a bptree in the MOS that store blocks for + * asynchronous freeing. This bptree is not created when async_destroy is + * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is + * called to check if async_destroy is enabled. If it is and the bptree object + * does not exist yet, the bptree object is created as part of the dataset + * destroy and async_destroy's reference count is incremented to indicate it + * has made an on-disk format change. Later, after the destroyed dataset's + * blocks have all been asynchronously freed there is no longer any use for the + * bptree object, so it is destroyed and async_destroy's reference count is + * decremented back to 0 to indicate that it has undone its on-disk format + * changes. + */ + +typedef enum { + FEATURE_ACTION_ENABLE, + FEATURE_ACTION_INCR, + FEATURE_ACTION_DECR, +} feature_action_t; + +/* + * Checks that the features active in the specified object are supported by + * this software. Adds each unsupported feature (name -> description) to + * the supplied nvlist. + */ +boolean_t +feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj, + nvlist_t *unsup_feat) +{ + boolean_t supported; + zap_cursor_t zc; + zap_attribute_t za; + + supported = B_TRUE; + for (zap_cursor_init(&zc, os, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + + if (za.za_first_integer != 0 && + !zfeature_is_supported(za.za_name)) { + supported = B_FALSE; + + if (unsup_feat != NULL) { + char *desc = ""; + char buf[MAXPATHLEN]; + + if (zap_lookup(os, desc_obj, za.za_name, + 1, sizeof (buf), buf) == 0) + desc = buf; + + VERIFY(nvlist_add_string(unsup_feat, za.za_name, + desc) == 0); + } + } + } + zap_cursor_fini(&zc); + + return (supported); +} + +static int +feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj, + zfeature_info_t *feature, uint64_t *res) +{ + int err; + uint64_t refcount; + uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj; + + ASSERT(0 != zapobj); + + err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1, + &refcount); + if (err != 0) { + if (err == ENOENT) + return (ENOTSUP); + else + return (err); + } + *res = refcount; + return (0); +} + +static int +feature_do_action(objset_t *os, uint64_t read_obj, uint64_t write_obj, + uint64_t desc_obj, zfeature_info_t *feature, feature_action_t action, + dmu_tx_t *tx) +{ + int error; + uint64_t refcount; + uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj; + + ASSERT(0 != zapobj); + ASSERT(zfeature_is_valid_guid(feature->fi_guid)); + + error = zap_lookup(os, zapobj, feature->fi_guid, + sizeof (uint64_t), 1, &refcount); + + /* + * If we can't ascertain the status of the specified feature, an I/O + * error occurred. + */ + if (error != 0 && error != ENOENT) + return (error); + + switch (action) { + case FEATURE_ACTION_ENABLE: + /* + * If the feature is already enabled, ignore the request. + */ + if (error == 0) + return (0); + refcount = 0; + break; + case FEATURE_ACTION_INCR: + if (error == ENOENT) + return (ENOTSUP); + if (refcount == UINT64_MAX) + return (EOVERFLOW); + refcount++; + break; + case FEATURE_ACTION_DECR: + if (error == ENOENT) + return (ENOTSUP); + if (refcount == 0) + return (EOVERFLOW); + refcount--; + break; + default: + ASSERT(0); + break; + } + + if (action == FEATURE_ACTION_ENABLE) { + int i; + + for (i = 0; feature->fi_depends[i] != NULL; i++) { + zfeature_info_t *dep = feature->fi_depends[i]; + + error = feature_do_action(os, read_obj, write_obj, + desc_obj, dep, FEATURE_ACTION_ENABLE, tx); + if (error != 0) + return (error); + } + } + + error = zap_update(os, zapobj, feature->fi_guid, + sizeof (uint64_t), 1, &refcount, tx); + if (error != 0) + return (error); + + if (action == FEATURE_ACTION_ENABLE) { + error = zap_update(os, desc_obj, + feature->fi_guid, 1, strlen(feature->fi_desc) + 1, + feature->fi_desc, tx); + if (error != 0) + return (error); + } + + if (action == FEATURE_ACTION_INCR && refcount == 1 && feature->fi_mos) { + spa_activate_mos_feature(dmu_objset_spa(os), feature->fi_guid); + } + + if (action == FEATURE_ACTION_DECR && refcount == 0) { + spa_deactivate_mos_feature(dmu_objset_spa(os), + feature->fi_guid); + } + + return (0); +} + +void +spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx) +{ + /* + * We create feature flags ZAP objects in two instances: during pool + * creation and during pool upgrade. + */ + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on && + tx->tx_txg == TXG_INITIAL)); + + spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURES_FOR_READ, tx); + spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURES_FOR_WRITE, tx); + spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURE_DESCRIPTIONS, tx); +} + +/* + * Enable any required dependencies, then enable the requested feature. + */ +void +spa_feature_enable(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + spa->spa_feat_desc_obj, feature, FEATURE_ACTION_ENABLE, tx)); +} + +/* + * If the specified feature has not yet been enabled, this function returns + * ENOTSUP; otherwise, this function increments the feature's refcount (or + * returns EOVERFLOW if the refcount cannot be incremented). This function must + * be called from syncing context. + */ +void +spa_feature_incr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + spa->spa_feat_desc_obj, feature, FEATURE_ACTION_INCR, tx)); +} + +/* + * If the specified feature has not yet been enabled, this function returns + * ENOTSUP; otherwise, this function decrements the feature's refcount (or + * returns EOVERFLOW if the refcount is already 0). This function must + * be called from syncing context. + */ +void +spa_feature_decr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + spa->spa_feat_desc_obj, feature, FEATURE_ACTION_DECR, tx)); +} + +boolean_t +spa_feature_is_enabled(spa_t *spa, zfeature_info_t *feature) +{ + int err; + uint64_t refcount; + + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_refcount(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + feature, &refcount); + ASSERT(err == 0 || err == ENOTSUP); + return (err == 0); +} + +boolean_t +spa_feature_is_active(spa_t *spa, zfeature_info_t *feature) +{ + int err; + uint64_t refcount; + + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_refcount(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + feature, &refcount); + ASSERT(err == 0 || err == ENOTSUP); + return (err == 0 && refcount > 0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 7c75aca..73a4f75 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011-2012 Pawel Jakub Dawidek <pawel@dawidek.net>. @@ -1157,6 +1158,8 @@ getzfsvfs(const char *dsname, zfsvfs_t **zfvp) /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_vfs will be NULL, and it will be opened as the owner. + * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, + * which prevents all vnode ops from running. */ static int zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) @@ -1220,7 +1223,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); - if (version < SPA_VERSION_INITIAL || version > SPA_VERSION) { + if (!SPA_VERSION_IS_SUPPORTED(version)) { error = EINVAL; goto pool_props_bad; } @@ -1344,6 +1347,15 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of the pool + * + * outputs: + * zc_cookie real errno + * zc_nvlist_dst config nvlist + * zc_nvlist_dst_size size of config nvlist + */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { @@ -1445,7 +1457,8 @@ zfs_ioc_pool_upgrade(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) { + if (zc->zc_cookie < spa_version(spa) || + !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (EINVAL); } @@ -5452,7 +5465,7 @@ zfs_modevent(module_t mod, int type, void *unused __unused) tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, NULL); - printf("ZFS storage pool version " SPA_VERSION_STRING "\n"); + printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n"); root_mount_rel(zfs_root_token); zfsdev_init(); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 467b6a6..01eddbd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -2278,7 +2278,7 @@ void zfs_init(void) { - printf("ZFS filesystem version " ZPL_VERSION_STRING "\n"); + printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); /* * Initialize .zfs directory structures diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index 694302e..ca402f6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -640,7 +640,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && - zp->zp_type < DMU_OT_NUMTYPES && + DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa) && @@ -924,7 +924,7 @@ zio_read_bp_init(zio_t *zio) zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } - if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) + if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) @@ -3015,3 +3015,45 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_checksum_verify, zio_done }; + +/* dnp is the dnode for zb1->zb_object */ +boolean_t +zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, + const zbookmark_t *zb2) +{ + uint64_t zb1nextL0, zb2thisobj; + + ASSERT(zb1->zb_objset == zb2->zb_objset); + ASSERT(zb2->zb_level == 0); + + /* + * A bookmark in the deadlist is considered to be after + * everything else. + */ + if (zb2->zb_object == DMU_DEADLIST_OBJECT) + return (B_TRUE); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) + return (B_FALSE); + + zb1nextL0 = (zb1->zb_blkid + 1) << + ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + + zb2thisobj = zb2->zb_object ? zb2->zb_object : + zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + + if (zb1->zb_object == DMU_META_DNODE_OBJECT) { + uint64_t nextobj = zb1nextL0 * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; + return (nextobj <= zb2thisobj); + } + + if (zb1->zb_object < zb2thisobj) + return (B_TRUE); + if (zb1->zb_object > zb2thisobj) + return (B_FALSE); + if (zb2->zb_object == DMU_META_DNODE_OBJECT) + return (B_FALSE); + return (zb1nextL0 <= zb2->zb_blkid); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h index 1b23dc2..f2d996a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h @@ -172,6 +172,7 @@ typedef enum { ZPOOL_PROP_READONLY, ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, + ZPOOL_PROP_FREEING, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -245,6 +246,8 @@ const char *zpool_prop_to_name(zpool_prop_t); const char *zpool_prop_default_string(zpool_prop_t); uint64_t zpool_prop_default_numeric(zpool_prop_t); boolean_t zpool_prop_readonly(zpool_prop_t); +boolean_t zpool_prop_feature(const char *); +boolean_t zpool_prop_unsupported(const char *name); int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); @@ -352,6 +355,7 @@ typedef enum { #define SPA_VERSION_26 26ULL #define SPA_VERSION_27 27ULL #define SPA_VERSION_28 28ULL +#define SPA_VERSION_5000 5000ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk @@ -359,8 +363,8 @@ typedef enum { * and do the appropriate changes. Also bump the version number in * usr/src/grub/capability. */ -#define SPA_VERSION SPA_VERSION_28 -#define SPA_VERSION_STRING "28" +#define SPA_VERSION SPA_VERSION_5000 +#define SPA_VERSION_STRING "5000" /* * Symbolic names for the changes that caused a SPA_VERSION switch. @@ -411,6 +415,12 @@ typedef enum { #define SPA_VERSION_DEADLISTS SPA_VERSION_26 #define SPA_VERSION_FAST_SNAP SPA_VERSION_27 #define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 +#define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 +#define SPA_VERSION_FEATURES SPA_VERSION_5000 + +#define SPA_VERSION_IS_SUPPORTED(v) \ + (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ + ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) /* * ZPL version - rev'd whenever an incompatible on-disk format change @@ -508,6 +518,11 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ #define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ +#define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ +#define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ +#define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ +#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" +#define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such @@ -586,6 +601,7 @@ typedef enum vdev_aux { VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ + VDEV_AUX_UNSUP_FEAT, /* unsupported features */ VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h index abf84cf..3062dd9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_NVPAIR_H @@ -274,6 +275,73 @@ int nvpair_value_hrtime(nvpair_t *, hrtime_t *); int nvpair_value_double(nvpair_t *, double *); #endif +nvlist_t *fnvlist_alloc(void); +void fnvlist_free(nvlist_t *); +size_t fnvlist_size(nvlist_t *); +char *fnvlist_pack(nvlist_t *, size_t *); +void fnvlist_pack_free(char *, size_t); +nvlist_t *fnvlist_unpack(char *, size_t); +nvlist_t *fnvlist_dup(nvlist_t *); +void fnvlist_merge(nvlist_t *, nvlist_t *); + +void fnvlist_add_boolean(nvlist_t *, const char *); +void fnvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); +void fnvlist_add_byte(nvlist_t *, const char *, uchar_t); +void fnvlist_add_int8(nvlist_t *, const char *, int8_t); +void fnvlist_add_uint8(nvlist_t *, const char *, uint8_t); +void fnvlist_add_int16(nvlist_t *, const char *, int16_t); +void fnvlist_add_uint16(nvlist_t *, const char *, uint16_t); +void fnvlist_add_int32(nvlist_t *, const char *, int32_t); +void fnvlist_add_uint32(nvlist_t *, const char *, uint32_t); +void fnvlist_add_int64(nvlist_t *, const char *, int64_t); +void fnvlist_add_uint64(nvlist_t *, const char *, uint64_t); +void fnvlist_add_string(nvlist_t *, const char *, const char *); +void fnvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); +void fnvlist_add_nvpair(nvlist_t *, nvpair_t *); +void fnvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t); +void fnvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t); +void fnvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); +void fnvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); +void fnvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); +void fnvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); +void fnvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); +void fnvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); +void fnvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); +void fnvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); +void fnvlist_add_string_array(nvlist_t *, const char *, char * const *, uint_t); +void fnvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); + +void fnvlist_remove(nvlist_t *, const char *); +void fnvlist_remove_nvpair(nvlist_t *, nvpair_t *); + +nvpair_t *fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name); +boolean_t fnvlist_lookup_boolean(nvlist_t *nvl, const char *name); +boolean_t fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name); +uchar_t fnvlist_lookup_byte(nvlist_t *nvl, const char *name); +int8_t fnvlist_lookup_int8(nvlist_t *nvl, const char *name); +int16_t fnvlist_lookup_int16(nvlist_t *nvl, const char *name); +int32_t fnvlist_lookup_int32(nvlist_t *nvl, const char *name); +int64_t fnvlist_lookup_int64(nvlist_t *nvl, const char *name); +uint8_t fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name); +uint16_t fnvlist_lookup_uint16(nvlist_t *nvl, const char *name); +uint32_t fnvlist_lookup_uint32(nvlist_t *nvl, const char *name); +uint64_t fnvlist_lookup_uint64(nvlist_t *nvl, const char *name); +char *fnvlist_lookup_string(nvlist_t *nvl, const char *name); +nvlist_t *fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name); + +boolean_t fnvpair_value_boolean_value(nvpair_t *nvp); +uchar_t fnvpair_value_byte(nvpair_t *nvp); +int8_t fnvpair_value_int8(nvpair_t *nvp); +int16_t fnvpair_value_int16(nvpair_t *nvp); +int32_t fnvpair_value_int32(nvpair_t *nvp); +int64_t fnvpair_value_int64(nvpair_t *nvp); +uint8_t fnvpair_value_uint8_t(nvpair_t *nvp); +uint16_t fnvpair_value_uint16(nvpair_t *nvp); +uint32_t fnvpair_value_uint32(nvpair_t *nvp); +uint64_t fnvpair_value_uint64(nvpair_t *nvp); +char *fnvpair_value_string(nvpair_t *nvp); +nvlist_t *fnvpair_value_nvlist(nvpair_t *nvp); + #ifdef __cplusplus } #endif |