author     mm <mm@FreeBSD.org>  2012-07-18 09:10:54 +0000
committer  mm <mm@FreeBSD.org>  2012-07-18 09:10:54 +0000
commit     863e8d4ffc44293bee25f7ee9dc16cc3a070f4b3 (patch)
tree       84a9ce8d96bc1919c16898d09e3fbea0fe11bc35
parent     39f5422299c27ce429b9d5f3aeecb746b7b600d7 (diff)
Update vendor-sys/illumos/dist to pre libzfs_core state
illumos-gate revision 13742:b6bbdd77139c
Obtained from: ssh://anonhg@hg.illumos.org/illumos-gate
 85 files changed, 5238 insertions(+), 1184 deletions(-)
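One highlight of this import is the new "force" nvlist wrapper layer (common/nvpair/fnvpair.c, added below), which asserts success instead of returning errors. A minimal sketch of the calling convention it enables, using only functions defined in this diff; the property name and value are illustrative:

    #include <sys/nvpair.h>

    /*
     * Raw nvlist API: every call returns an error code that the caller
     * must check and unwind from.
     */
    static int
    build_props_checked(nvlist_t **nvlp)
    {
            nvlist_t *nvl;
            int err;

            if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0)
                    return (err);
            if ((err = nvlist_add_uint64(nvl, "quota", 1ULL << 30)) != 0) {
                    nvlist_free(nvl);
                    return (err);
            }
            *nvlp = nvl;
            return (0);
    }

    /*
     * fnvlist API: failures VERIFY(), so the happy path reads straight
     * through and the function can simply return its result.
     */
    static nvlist_t *
    build_props(void)
    {
            nvlist_t *nvl = fnvlist_alloc();
            fnvlist_add_uint64(nvl, "quota", 1ULL << 30);
            return (nvl);
    }

Per the comment block in fnvpair.c, the wrappers always use NV_UNIQUE_NAME, NV_ENCODE_NATIVE, and KM_SLEEP, so they are only appropriate where allocation failure is fatal by design.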
diff --git a/common/acl/acl_common.c b/common/acl/acl_common.c index eafc47d..494c5f7 100644 --- a/common/acl/acl_common.c +++ b/common/acl/acl_common.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ #include <sys/types.h> @@ -372,7 +373,7 @@ access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow) * by nfsace, assuming aclent_t -> nfsace semantics. */ static uint32_t -mode_to_ace_access(mode_t mode, int isdir, int isowner, int isallow) +mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow) { uint32_t access = 0; int haswriteperm = 0; @@ -415,7 +416,7 @@ mode_to_ace_access(mode_t mode, int isdir, int isowner, int isallow) access |= ACE_DELETE_CHILD; } /* exec */ - if (mode & 01) { + if (mode & S_IXOTH) { access |= ACE_EXECUTE; } @@ -666,7 +667,7 @@ out: } static int -convert_aent_to_ace(aclent_t *aclentp, int aclcnt, int isdir, +convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir, ace_t **retacep, int *retacecnt) { ace_t *acep; @@ -692,7 +693,7 @@ convert_aent_to_ace(aclent_t *aclentp, int aclcnt, int isdir, dfaclcnt = aclcnt - i; } - if (dfaclcnt && isdir == 0) { + if (dfaclcnt && !isdir) { return (EINVAL); } @@ -730,7 +731,7 @@ convert_aent_to_ace(aclent_t *aclentp, int aclcnt, int isdir, } static int -ace_mask_to_mode(uint32_t mask, o_mode_t *modep, int isdir) +ace_mask_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) { int error = 0; o_mode_t mode = 0; @@ -1027,7 +1028,7 @@ out: } static int -ace_allow_to_mode(uint32_t mask, o_mode_t *modep, int isdir) +ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) { /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */ if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) != @@ -1040,7 +1041,7 @@ ace_allow_to_mode(uint32_t mask, o_mode_t *modep, int isdir) static int acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list, - uid_t owner, gid_t group, int isdir) + uid_t owner, gid_t group, boolean_t isdir) { int error; uint32_t flips = ACE_POSIX_SUPPORTED_BITS; @@ -1080,7 +1081,7 @@ out: static int ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt, - uid_t owner, gid_t group, int isdir) + uid_t owner, gid_t group, boolean_t isdir) { int error = 0; aclent_t *aent, *result = NULL; @@ -1260,7 +1261,7 @@ acevals_compare(const void *va, const void *vb) static int ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group, aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt, - int isdir) + boolean_t isdir) { int error = 0; ace_t *acep; @@ -1455,7 +1456,7 @@ out: } static int -convert_ace_to_aent(ace_t *acebufp, int acecnt, int isdir, +convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir, uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt) { int error = 0; @@ -1497,7 +1498,7 @@ convert_ace_to_aent(ace_t *acebufp, int acecnt, int isdir, int -acl_translate(acl_t *aclp, int target_flavor, int isdir, uid_t owner, +acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner, gid_t group) { int aclcnt; @@ -1568,101 +1569,105 @@ out: } void -acl_trivial_access_masks(mode_t mode, uint32_t *allow0, uint32_t *deny1, - uint32_t *deny2, uint32_t *owner, uint32_t *group, uint32_t *everyone) +acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) { - *deny1 = *deny2 = *allow0 = *group = 0; + uint32_t read_mask = ACE_READ_DATA; + uint32_t write_mask = 
ACE_WRITE_DATA|ACE_APPEND_DATA; + uint32_t execute_mask = ACE_EXECUTE; + (void) isdir; /* will need this later */ + + masks->deny1 = 0; if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) - *deny1 |= ACE_READ_DATA; + masks->deny1 |= read_mask; if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) - *deny1 |= ACE_WRITE_DATA; + masks->deny1 |= write_mask; if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) - *deny1 |= ACE_EXECUTE; + masks->deny1 |= execute_mask; + masks->deny2 = 0; if (!(mode & S_IRGRP) && (mode & S_IROTH)) - *deny2 = ACE_READ_DATA; + masks->deny2 |= read_mask; if (!(mode & S_IWGRP) && (mode & S_IWOTH)) - *deny2 |= ACE_WRITE_DATA; + masks->deny2 |= write_mask; if (!(mode & S_IXGRP) && (mode & S_IXOTH)) - *deny2 |= ACE_EXECUTE; + masks->deny2 |= execute_mask; + masks->allow0 = 0; if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) - *allow0 |= ACE_READ_DATA; + masks->allow0 |= read_mask; if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) - *allow0 |= ACE_WRITE_DATA; + masks->allow0 |= write_mask; if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) - *allow0 |= ACE_EXECUTE; + masks->allow0 |= execute_mask; - *owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| + masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; if (mode & S_IRUSR) - *owner |= ACE_READ_DATA; + masks->owner |= read_mask; if (mode & S_IWUSR) - *owner |= ACE_WRITE_DATA|ACE_APPEND_DATA; + masks->owner |= write_mask; if (mode & S_IXUSR) - *owner |= ACE_EXECUTE; + masks->owner |= execute_mask; - *group = ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS| + masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| ACE_SYNCHRONIZE; if (mode & S_IRGRP) - *group |= ACE_READ_DATA; + masks->group |= read_mask; if (mode & S_IWGRP) - *group |= ACE_WRITE_DATA|ACE_APPEND_DATA; + masks->group |= write_mask; if (mode & S_IXGRP) - *group |= ACE_EXECUTE; + masks->group |= execute_mask; - *everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS| + masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| ACE_SYNCHRONIZE; if (mode & S_IROTH) - *everyone |= ACE_READ_DATA; + masks->everyone |= read_mask; if (mode & S_IWOTH) - *everyone |= ACE_WRITE_DATA|ACE_APPEND_DATA; + masks->everyone |= write_mask; if (mode & S_IXOTH) - *everyone |= ACE_EXECUTE; + masks->everyone |= execute_mask; } int -acl_trivial_create(mode_t mode, ace_t **acl, int *count) +acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count) { - uint32_t deny1, deny2; - uint32_t allow0; - uint32_t owner, group, everyone; int index = 0; int error; + trivial_acl_t masks; *count = 3; - acl_trivial_access_masks(mode, &allow0, &deny1, &deny2, &owner, &group, - &everyone); + acl_trivial_access_masks(mode, isdir, &masks); - if (allow0) + if (masks.allow0) (*count)++; - if (deny1) + if (masks.deny1) (*count)++; - if (deny2) + if (masks.deny2) (*count)++; if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0) return (error); - if (allow0) { - SET_ACE(acl, index, -1, allow0, ACE_ACCESS_ALLOWED_ACE_TYPE, - ACE_OWNER); + if (masks.allow0) { + SET_ACE(acl, index, -1, masks.allow0, + ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER); } - if (deny1) { - SET_ACE(acl, index, -1, deny1, ACE_ACCESS_DENIED_ACE_TYPE, - ACE_OWNER); + if (masks.deny1) { + SET_ACE(acl, index, -1, masks.deny1, + ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER); } - if (deny2) { - SET_ACE(acl, index, -1, 
deny2, ACE_ACCESS_DENIED_ACE_TYPE, - ACE_GROUP|ACE_IDENTIFIER_GROUP); + if (masks.deny2) { + SET_ACE(acl, index, -1, masks.deny2, + ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP); } - SET_ACE(acl, index, -1, owner, ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER); - SET_ACE(acl, index, -1, group, ACE_ACCESS_ALLOWED_ACE_TYPE, + SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_OWNER); + SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_IDENTIFIER_GROUP|ACE_GROUP); - SET_ACE(acl, index, -1, everyone, ACE_ACCESS_ALLOWED_ACE_TYPE, + SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_EVERYONE); return (0); diff --git a/common/acl/acl_common.h b/common/acl/acl_common.h index f76cbd3..be4fd0c 100644 --- a/common/acl/acl_common.h +++ b/common/acl/acl_common.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ #ifndef _ACL_COMMON_H @@ -33,7 +34,14 @@ extern "C" { #endif -extern ace_t trivial_acl[6]; +typedef struct trivial_acl { + uint32_t allow0; /* allow mask for bits only in owner */ + uint32_t deny1; /* deny mask for bits not in owner */ + uint32_t deny2; /* deny mask for bits not in group */ + uint32_t owner; /* allow mask matching mode */ + uint32_t group; /* allow mask matching mode */ + uint32_t everyone; /* allow mask matching mode */ +} trivial_acl_t; extern int acltrivial(const char *); extern void adjust_ace_pair(ace_t *pair, mode_t mode); @@ -44,13 +52,13 @@ extern int ace_trivial_common(void *, int, uint32_t *mask)); extern acl_t *acl_alloc(acl_type_t); extern void acl_free(acl_t *aclp); -extern int acl_translate(acl_t *aclp, int target_flavor, - int isdir, uid_t owner, gid_t group); +extern int acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, + uid_t owner, gid_t group); void ksort(caddr_t v, int n, int s, int (*f)()); int cmp2acls(void *a, void *b); -int acl_trivial_create(mode_t mode, ace_t **acl, int *count); -void acl_trivial_access_masks(mode_t mode, uint32_t *allow0, uint32_t *deny1, - uint32_t *deny2, uint32_t *owner, uint32_t *group, uint32_t *everyone); +int acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count); +void acl_trivial_access_masks(mode_t mode, boolean_t isdir, + trivial_acl_t *masks); #ifdef __cplusplus } diff --git a/common/nvpair/fnvpair.c b/common/nvpair/fnvpair.c new file mode 100644 index 0000000..8d1bb98 --- /dev/null +++ b/common/nvpair/fnvpair.c @@ -0,0 +1,496 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. 
+ */ + +#include <sys/nvpair.h> +#include <sys/kmem.h> +#include <sys/debug.h> +#ifndef _KERNEL +#include <stdlib.h> +#endif + +/* + * "Force" nvlist wrapper. + * + * These functions wrap the nvlist_* functions with assertions that assume + * the operation is successful. This allows the caller's code to be much + * more readable, especially for the fnvlist_lookup_* and fnvpair_value_* + * functions, which can return the requested value (rather than filling in + * a pointer). + * + * These functions use NV_UNIQUE_NAME, encoding NV_ENCODE_NATIVE, and allocate + * with KM_SLEEP. + * + * More wrappers should be added as needed -- for example + * nvlist_lookup_*_array and nvpair_value_*_array. + */ + +nvlist_t * +fnvlist_alloc(void) +{ + nvlist_t *nvl; + VERIFY3U(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP), ==, 0); + return (nvl); +} + +void +fnvlist_free(nvlist_t *nvl) +{ + nvlist_free(nvl); +} + +size_t +fnvlist_size(nvlist_t *nvl) +{ + size_t size; + VERIFY3U(nvlist_size(nvl, &size, NV_ENCODE_NATIVE), ==, 0); + return (size); +} + +/* + * Returns allocated buffer of size *sizep. Caller must free the buffer with + * fnvlist_pack_free(). + */ +char * +fnvlist_pack(nvlist_t *nvl, size_t *sizep) +{ + char *packed = 0; + VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE, + KM_SLEEP), ==, 0); + return (packed); +} + +/*ARGSUSED*/ +void +fnvlist_pack_free(char *pack, size_t size) +{ +#ifdef _KERNEL + kmem_free(pack, size); +#else + free(pack); +#endif +} + +nvlist_t * +fnvlist_unpack(char *buf, size_t buflen) +{ + nvlist_t *rv; + VERIFY3U(nvlist_unpack(buf, buflen, &rv, KM_SLEEP), ==, 0); + return (rv); +} + +nvlist_t * +fnvlist_dup(nvlist_t *nvl) +{ + nvlist_t *rv; + VERIFY3U(nvlist_dup(nvl, &rv, KM_SLEEP), ==, 0); + return (rv); +} + +void +fnvlist_merge(nvlist_t *dst, nvlist_t *src) +{ + VERIFY3U(nvlist_merge(dst, src, KM_SLEEP), ==, 0); +} + +void +fnvlist_add_boolean(nvlist_t *nvl, const char *name) +{ + VERIFY3U(nvlist_add_boolean(nvl, name), ==, 0); +} + +void +fnvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) +{ + VERIFY3U(nvlist_add_boolean_value(nvl, name, val), ==, 0); +} + +void +fnvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) +{ + VERIFY3U(nvlist_add_byte(nvl, name, val), ==, 0); +} + +void +fnvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) +{ + VERIFY3U(nvlist_add_int8(nvl, name, val), ==, 0); +} + +void +fnvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) +{ + VERIFY3U(nvlist_add_uint8(nvl, name, val), ==, 0); +} + +void +fnvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) +{ + VERIFY3U(nvlist_add_int16(nvl, name, val), ==, 0); +} + +void +fnvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) +{ + VERIFY3U(nvlist_add_uint16(nvl, name, val), ==, 0); +} + +void +fnvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) +{ + VERIFY3U(nvlist_add_int32(nvl, name, val), ==, 0); +} + +void +fnvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) +{ + VERIFY3U(nvlist_add_uint32(nvl, name, val), ==, 0); +} + +void +fnvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val) +{ + VERIFY3U(nvlist_add_int64(nvl, name, val), ==, 0); +} + +void +fnvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) +{ + VERIFY3U(nvlist_add_uint64(nvl, name, val), ==, 0); +} + +void +fnvlist_add_string(nvlist_t *nvl, const char *name, const char *val) +{ + VERIFY3U(nvlist_add_string(nvl, name, val), ==, 0); +} + +void +fnvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) +{ 
+ VERIFY3U(nvlist_add_nvlist(nvl, name, val), ==, 0); +} + +void +fnvlist_add_nvpair(nvlist_t *nvl, nvpair_t *pair) +{ + VERIFY3U(nvlist_add_nvpair(nvl, pair), ==, 0); +} + +void +fnvlist_add_boolean_array(nvlist_t *nvl, const char *name, + boolean_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_boolean_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_byte_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int8_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint8_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int16_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint16_array(nvlist_t *nvl, const char *name, + uint16_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint16_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int32_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint32_array(nvlist_t *nvl, const char *name, + uint32_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint32_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int64_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint64_array(nvlist_t *nvl, const char *name, + uint64_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint64_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_string_array(nvlist_t *nvl, const char *name, + char * const *val, uint_t n) +{ + VERIFY3U(nvlist_add_string_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_nvlist_array(nvlist_t *nvl, const char *name, + nvlist_t **val, uint_t n) +{ + VERIFY3U(nvlist_add_nvlist_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_remove(nvlist_t *nvl, const char *name) +{ + VERIFY3U(nvlist_remove_all(nvl, name), ==, 0); +} + +void +fnvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *pair) +{ + VERIFY3U(nvlist_remove_nvpair(nvl, pair), ==, 0); +} + +nvpair_t * +fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name) +{ + nvpair_t *rv; + VERIFY3U(nvlist_lookup_nvpair(nvl, name, &rv), ==, 0); + return (rv); +} + +/* returns B_TRUE if the entry exists */ +boolean_t +fnvlist_lookup_boolean(nvlist_t *nvl, const char *name) +{ + return (nvlist_lookup_boolean(nvl, name) == 0); +} + +boolean_t +fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name) +{ + boolean_t rv; + VERIFY3U(nvlist_lookup_boolean_value(nvl, name, &rv), ==, 0); + return (rv); +} + +uchar_t +fnvlist_lookup_byte(nvlist_t *nvl, const char *name) +{ + uchar_t rv; + VERIFY3U(nvlist_lookup_byte(nvl, name, &rv), ==, 0); + return (rv); +} + +int8_t +fnvlist_lookup_int8(nvlist_t *nvl, const char *name) +{ + int8_t rv; + VERIFY3U(nvlist_lookup_int8(nvl, name, &rv), ==, 0); + return (rv); +} + +int16_t +fnvlist_lookup_int16(nvlist_t *nvl, const char *name) +{ + int16_t rv; + VERIFY3U(nvlist_lookup_int16(nvl, name, &rv), ==, 0); + return (rv); +} + +int32_t +fnvlist_lookup_int32(nvlist_t *nvl, const char *name) +{ + int32_t rv; + VERIFY3U(nvlist_lookup_int32(nvl, name, &rv), ==, 0); + return (rv); +} + +int64_t +fnvlist_lookup_int64(nvlist_t *nvl, const char *name) +{ + int64_t rv; + 
VERIFY3U(nvlist_lookup_int64(nvl, name, &rv), ==, 0); + return (rv); +} + +uint8_t +fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name) +{ + uint8_t rv; + VERIFY3U(nvlist_lookup_uint8(nvl, name, &rv), ==, 0); + return (rv); +} + +uint16_t +fnvlist_lookup_uint16(nvlist_t *nvl, const char *name) +{ + uint16_t rv; + VERIFY3U(nvlist_lookup_uint16(nvl, name, &rv), ==, 0); + return (rv); +} + +uint32_t +fnvlist_lookup_uint32(nvlist_t *nvl, const char *name) +{ + uint32_t rv; + VERIFY3U(nvlist_lookup_uint32(nvl, name, &rv), ==, 0); + return (rv); +} + +uint64_t +fnvlist_lookup_uint64(nvlist_t *nvl, const char *name) +{ + uint64_t rv; + VERIFY3U(nvlist_lookup_uint64(nvl, name, &rv), ==, 0); + return (rv); +} + +char * +fnvlist_lookup_string(nvlist_t *nvl, const char *name) +{ + char *rv; + VERIFY3U(nvlist_lookup_string(nvl, name, &rv), ==, 0); + return (rv); +} + +nvlist_t * +fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name) +{ + nvlist_t *rv; + VERIFY3U(nvlist_lookup_nvlist(nvl, name, &rv), ==, 0); + return (rv); +} + +boolean_t +fnvpair_value_boolean_value(nvpair_t *nvp) +{ + boolean_t rv; + VERIFY3U(nvpair_value_boolean_value(nvp, &rv), ==, 0); + return (rv); +} + +uchar_t +fnvpair_value_byte(nvpair_t *nvp) +{ + uchar_t rv; + VERIFY3U(nvpair_value_byte(nvp, &rv), ==, 0); + return (rv); +} + +int8_t +fnvpair_value_int8(nvpair_t *nvp) +{ + int8_t rv; + VERIFY3U(nvpair_value_int8(nvp, &rv), ==, 0); + return (rv); +} + +int16_t +fnvpair_value_int16(nvpair_t *nvp) +{ + int16_t rv; + VERIFY3U(nvpair_value_int16(nvp, &rv), ==, 0); + return (rv); +} + +int32_t +fnvpair_value_int32(nvpair_t *nvp) +{ + int32_t rv; + VERIFY3U(nvpair_value_int32(nvp, &rv), ==, 0); + return (rv); +} + +int64_t +fnvpair_value_int64(nvpair_t *nvp) +{ + int64_t rv; + VERIFY3U(nvpair_value_int64(nvp, &rv), ==, 0); + return (rv); +} + +uint8_t +fnvpair_value_uint8_t(nvpair_t *nvp) +{ + uint8_t rv; + VERIFY3U(nvpair_value_uint8(nvp, &rv), ==, 0); + return (rv); +} + +uint16_t +fnvpair_value_uint16(nvpair_t *nvp) +{ + uint16_t rv; + VERIFY3U(nvpair_value_uint16(nvp, &rv), ==, 0); + return (rv); +} + +uint32_t +fnvpair_value_uint32(nvpair_t *nvp) +{ + uint32_t rv; + VERIFY3U(nvpair_value_uint32(nvp, &rv), ==, 0); + return (rv); +} + +uint64_t +fnvpair_value_uint64(nvpair_t *nvp) +{ + uint64_t rv; + VERIFY3U(nvpair_value_uint64(nvp, &rv), ==, 0); + return (rv); +} + +char * +fnvpair_value_string(nvpair_t *nvp) +{ + char *rv; + VERIFY3U(nvpair_value_string(nvp, &rv), ==, 0); + return (rv); +} + +nvlist_t * +fnvpair_value_nvlist(nvpair_t *nvp) +{ + nvlist_t *rv; + VERIFY3U(nvpair_value_nvlist(nvp, &rv), ==, 0); + return (rv); +} diff --git a/common/zfs/zfeature_common.c b/common/zfs/zfeature_common.c new file mode 100644 index 0000000..9c0b67b --- /dev/null +++ b/common/zfs/zfeature_common.c @@ -0,0 +1,156 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifdef _KERNEL +#include <sys/systm.h> +#else +#include <errno.h> +#include <string.h> +#endif +#include <sys/debug.h> +#include <sys/fs/zfs.h> +#include <sys/inttypes.h> +#include <sys/types.h> +#include "zfeature_common.h" + +/* + * Set to disable all feature checks while opening pools, allowing pools with + * unsupported features to be opened. Set for testing only. + */ +boolean_t zfeature_checks_disable = B_FALSE; + +zfeature_info_t spa_feature_table[SPA_FEATURES]; + +/* + * Valid characters for feature guids. This list is mainly for aesthetic + * purposes and could be expanded in the future. There are different allowed + * characters in the guids reverse dns portion (before the colon) and its + * short name (after the colon). + */ +static int +valid_char(char c, boolean_t after_colon) +{ + return ((c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || + c == (after_colon ? '_' : '.')); +} + +/* + * Every feature guid must contain exactly one colon which separates a reverse + * dns organization name from the feature's "short" name (e.g. + * "com.company:feature_name"). + */ +boolean_t +zfeature_is_valid_guid(const char *name) +{ + int i; + boolean_t has_colon = B_FALSE; + + i = 0; + while (name[i] != '\0') { + char c = name[i++]; + if (c == ':') { + if (has_colon) + return (B_FALSE); + has_colon = B_TRUE; + continue; + } + if (!valid_char(c, has_colon)) + return (B_FALSE); + } + + return (has_colon); +} + +boolean_t +zfeature_is_supported(const char *guid) +{ + if (zfeature_checks_disable) + return (B_TRUE); + + return (0 == zfeature_lookup_guid(guid, NULL)); +} + +int +zfeature_lookup_guid(const char *guid, zfeature_info_t **res) +{ + for (int i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *feature = &spa_feature_table[i]; + if (strcmp(guid, feature->fi_guid) == 0) { + if (res != NULL) + *res = feature; + return (0); + } + } + + return (ENOENT); +} + +int +zfeature_lookup_name(const char *name, zfeature_info_t **res) +{ + for (int i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *feature = &spa_feature_table[i]; + if (strcmp(name, feature->fi_uname) == 0) { + if (res != NULL) + *res = feature; + return (0); + } + } + + return (ENOENT); +} + +static void +zfeature_register(int fid, const char *guid, const char *name, const char *desc, + boolean_t readonly, boolean_t mos, zfeature_info_t **deps) +{ + zfeature_info_t *feature = &spa_feature_table[fid]; + static zfeature_info_t *nodeps[] = { NULL }; + + ASSERT(name != NULL); + ASSERT(desc != NULL); + ASSERT(!readonly || !mos); + ASSERT3U(fid, <, SPA_FEATURES); + ASSERT(zfeature_is_valid_guid(guid)); + + if (deps == NULL) + deps = nodeps; + + feature->fi_guid = guid; + feature->fi_uname = name; + feature->fi_desc = desc; + feature->fi_can_readonly = readonly; + feature->fi_mos = mos; + feature->fi_depends = deps; +} + +void +zpool_feature_init(void) +{ + zfeature_register(SPA_FEATURE_ASYNC_DESTROY, + "com.delphix:async_destroy", "async_destroy", + "Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL); +} diff --git a/common/zfs/zfeature_common.h b/common/zfs/zfeature_common.h new file mode 100644 index 0000000..93ba2b7 --- /dev/null +++ b/common/zfs/zfeature_common.h @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The 
contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _ZFEATURE_COMMON_H +#define _ZFEATURE_COMMON_H + +#include <sys/fs/zfs.h> +#include <sys/inttypes.h> +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct zfeature_info; + +typedef struct zfeature_info { + const char *fi_uname; /* User-facing feature name */ + const char *fi_guid; /* On-disk feature identifier */ + const char *fi_desc; /* Feature description */ + boolean_t fi_can_readonly; /* Can open pool readonly w/o support? */ + boolean_t fi_mos; /* Is the feature necessary to read the MOS? */ + struct zfeature_info **fi_depends; /* array; null terminated */ +} zfeature_info_t; + +typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg); + +#define ZFS_FEATURE_DEBUG + +enum spa_feature { + SPA_FEATURE_ASYNC_DESTROY, + SPA_FEATURES +} spa_feature_t; + +extern zfeature_info_t spa_feature_table[SPA_FEATURES]; + +extern boolean_t zfeature_is_valid_guid(const char *); + +extern boolean_t zfeature_is_supported(const char *); +extern int zfeature_lookup_guid(const char *, zfeature_info_t **res); +extern int zfeature_lookup_name(const char *, zfeature_info_t **res); + +extern void zpool_feature_init(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFEATURE_COMMON_H */ diff --git a/common/zfs/zfs_deleg.c b/common/zfs/zfs_deleg.c index 83d9edb..1868103 100644 --- a/common/zfs/zfs_deleg.c +++ b/common/zfs/zfs_deleg.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. */ #if defined(_KERNEL) @@ -60,7 +61,7 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { {ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, {ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, {ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, - {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_NONE }, + {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, {ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, {ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, {ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, diff --git a/common/zfs/zfs_deleg.h b/common/zfs/zfs_deleg.h index b4cb8e2..9997dff 100644 --- a/common/zfs/zfs_deleg.h +++ b/common/zfs/zfs_deleg.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 
*/ #ifndef _ZFS_DELEG_H @@ -51,6 +52,7 @@ typedef enum { ZFS_DELEG_NOTE_CLONE, ZFS_DELEG_NOTE_PROMOTE, ZFS_DELEG_NOTE_RENAME, + ZFS_DELEG_NOTE_SEND, ZFS_DELEG_NOTE_RECEIVE, ZFS_DELEG_NOTE_ALLOW, ZFS_DELEG_NOTE_USERPROP, diff --git a/common/zfs/zfs_prop.c b/common/zfs/zfs_prop.c index f29bcf6..5d45361 100644 --- a/common/zfs/zfs_prop.c +++ b/common/zfs/zfs_prop.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -104,6 +105,13 @@ zfs_prop_init(void) { NULL } }; + static zprop_index_t acl_mode_table[] = { + { "discard", ZFS_ACL_DISCARD }, + { "groupmask", ZFS_ACL_GROUPMASK }, + { "passthrough", ZFS_ACL_PASSTHROUGH }, + { NULL } + }; + static zprop_index_t acl_inherit_table[] = { { "discard", ZFS_ACL_DISCARD }, { "noallow", ZFS_ACL_NOALLOW }, @@ -207,6 +215,9 @@ zfs_prop_init(void) zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "hidden | visible", "SNAPDIR", snapdir_table); + zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "discard | groupmask | passthrough", "ACLMODE", acl_mode_table); zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "discard | noallow | restricted | passthrough | passthrough-x", @@ -256,7 +267,7 @@ zfs_prop_init(void) /* default index properties */ zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "1 | 2 | 3 | 4 | current", "VERSION", version_table); + "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table); zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", "CANMOUNT", canmount_table); @@ -286,6 +297,8 @@ zfs_prop_init(void) /* string properties */ zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN"); + zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY, + ZFS_TYPE_SNAPSHOT, "<dataset>[,...]", "CLONES"); zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none", "MOUNTPOINT"); @@ -311,6 +324,9 @@ zfs_prop_init(void) zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, PROP_READONLY, ZFS_TYPE_DATASET, "<1.00x or higher if compressed>", "RATIO"); + zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0, + PROP_READONLY, ZFS_TYPE_DATASET, + "<1.00x or higher if compressed>", "REFRATIO"); zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME, ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); @@ -328,6 +344,8 @@ zfs_prop_init(void) ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV"); zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS"); + zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY, + ZFS_TYPE_DATASET, "<size>", "WRITTEN"); /* default number properties */ zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, @@ -370,13 +388,6 @@ zfs_prop_init(void) zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID"); - /* - * Property to be removed once libbe is integrated - */ - zprop_register_hidden(ZFS_PROP_PRIVATE, 
"priv_prop", - PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_FILESYSTEM, - "PRIV_PROP"); - /* oddball properties */ zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL, PROP_READONLY, ZFS_TYPE_DATASET, @@ -461,6 +472,18 @@ zfs_prop_userquota(const char *name) } /* + * Returns true if this is a valid written@ property. + * Note that after the @, any character is valid (eg, another @, for + * written@pool/fs@origin). + */ +boolean_t +zfs_prop_written(const char *name) +{ + static const char *prefix = "written@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + +/* * Tables of index types, plus functions to convert between the user view * (strings) and internal representation (uint64_t). */ diff --git a/common/zfs/zpool_prop.c b/common/zfs/zpool_prop.c index 988d05d..72db879 100644 --- a/common/zfs/zpool_prop.c +++ b/common/zfs/zpool_prop.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zio.h> @@ -69,14 +71,20 @@ zpool_prop_init(void) ZFS_TYPE_POOL, "<filesystem>", "BOOTFS"); zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, "<file> | none", "CACHEFILE"); + zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL, + PROP_DEFAULT, ZFS_TYPE_POOL, "<comment-string>", "COMMENT"); /* readonly number properties */ zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "SIZE"); zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "FREE"); + zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<size>", "FREEING"); zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC"); + zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, + PROP_READONLY, ZFS_TYPE_POOL, "<size>", "EXPANDSZ"); zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "CAP"); zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, @@ -160,6 +168,26 @@ zpool_prop_default_numeric(zpool_prop_t prop) return (zpool_prop_table[prop].pd_numdefault); } +/* + * Returns true if this is a valid feature@ property. + */ +boolean_t +zpool_prop_feature(const char *name) +{ + static const char *prefix = "feature@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + +/* + * Returns true if this is a valid unsupported@ property. + */ +boolean_t +zpool_prop_unsupported(const char *name) +{ + static const char *prefix = "unsupported@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + int zpool_prop_string_to_index(zpool_prop_t prop, const char *string, uint64_t *index) diff --git a/uts/common/Makefile.files b/uts/common/Makefile.files index ec08410..a2b4396 100644 --- a/uts/common/Makefile.files +++ b/uts/common/Makefile.files @@ -21,6 +21,8 @@ # # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright 2011 Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2012 by Delphix. All rights reserved. 
# # @@ -190,7 +192,6 @@ GENUNIX_OBJS += \ gid.o \ groups.o \ grow.o \ - hat.o \ hat_refmod.o \ id32.o \ id_space.o \ @@ -242,6 +243,7 @@ GENUNIX_OBJS += \ nvpair.o \ nvpair_alloc_system.o \ nvpair_alloc_fixed.o \ + fnvpair.o \ octet.o \ open.o \ p_online.o \ @@ -453,6 +455,8 @@ AUDIO810_OBJS += audio810.o AUDIOCMI_OBJS += audiocmi.o +AUDIOCMIHD_OBJS += audiocmihd.o + AUDIOHD_OBJS += audiohd.o AUDIOIXP_OBJS += audioixp.o @@ -498,9 +502,9 @@ MD4_OBJS += md4.o md4_mod.o MD5_OBJS += md5.o md5_mod.o -SHA1_OBJS += sha1.o sha1_mod.o fips_sha1_util.o +SHA1_OBJS += sha1.o sha1_mod.o -SHA2_OBJS += sha2.o sha2_mod.o fips_sha2_util.o +SHA2_OBJS += sha2.o sha2_mod.o IPGPC_OBJS += classifierddi.o classifier.o filters.o trie.o table.o \ ba_table.o @@ -935,7 +939,7 @@ ST_OBJS += st.o st_conf.o EMLXS_OBJS += emlxs_clock.o emlxs_dfc.o emlxs_dhchap.o emlxs_diag.o \ emlxs_download.o emlxs_dump.o emlxs_els.o emlxs_event.o \ - emlxs_fcp.o emlxs_fct.o emlxs_hba.o emlxs_ip.o \ + emlxs_fcf.o emlxs_fcp.o emlxs_fct.o emlxs_hba.o emlxs_ip.o \ emlxs_mbox.o emlxs_mem.o emlxs_msg.o emlxs_node.o \ emlxs_pkt.o emlxs_sli3.o emlxs_sli4.o emlxs_solaris.o \ emlxs_thread.o @@ -1083,7 +1087,7 @@ DRM_OBJS += drm_sunmod.o drm_kstat.o drm_agpsupport.o \ drm_auth.o drm_bufs.o drm_context.o drm_dma.o \ drm_drawable.o drm_drv.o drm_fops.o drm_ioctl.o drm_irq.o \ drm_lock.o drm_memory.o drm_msg.o drm_pci.o drm_scatter.o \ - drm_cache.o drm_gem.o drm_mm.o ati_pcigart.o + drm_cache.o drm_gem.o drm_mm.o ati_pcigart.o FM_OBJS += devfm.o devfm_machdep.o @@ -1325,6 +1329,7 @@ ZFS_COMMON_OBJS += \ arc.o \ bplist.o \ bpobj.o \ + bptree.o \ dbuf.o \ ddt.o \ ddt_zap.o \ @@ -1346,6 +1351,7 @@ ZFS_COMMON_OBJS += \ dsl_deleg.o \ dsl_prop.o \ dsl_scan.o \ + zfeature.o \ gzip.o \ lzjb.o \ metaslab.o \ @@ -1388,11 +1394,12 @@ ZFS_COMMON_OBJS += \ zrlock.o ZFS_SHARED_OBJS += \ - zfs_namecheck.o \ - zfs_deleg.o \ - zfs_prop.o \ + zfeature_common.o \ zfs_comutil.o \ + zfs_deleg.o \ zfs_fletcher.o \ + zfs_namecheck.o \ + zfs_prop.o \ zpool_prop.o \ zprop_common.o @@ -1519,7 +1526,7 @@ KCF_OBJS += kcf.o kcf_callprov.o kcf_cbufcall.o kcf_cipher.o kcf_crypto.o \ kcf_object.o kcf_policy.o kcf_prov_lib.o kcf_prov_tabs.o \ kcf_sched.o kcf_session.o kcf_sign.o kcf_spi.o kcf_verify.o \ kcf_random.o modes.o ecb.o cbc.o ctr.o ccm.o gcm.o \ - fips_random.o fips_checksum.o fips_test_vectors.o + fips_random.o CRYPTOADM_OBJS += cryptoadm.o @@ -1530,7 +1537,7 @@ DPROV_OBJS += dprov.o DCA_OBJS += dca.o dca_3des.o dca_debug.o dca_dsa.o dca_kstat.o dca_rng.o \ dca_rsa.o -AESPROV_OBJS += aes.o aes_impl.o aes_modes.o fips_aes_util.o +AESPROV_OBJS += aes.o aes_impl.o aes_modes.o ARCFOURPROV_OBJS += arcfour.o arcfour_crypt.o @@ -1541,16 +1548,16 @@ ECCPROV_OBJS += ecc.o ec.o ec2_163.o ec2_mont.o ecdecode.o ecl_mult.o \ ecp_jm.o ec2_233.o ecl_curve.o ecp_224.o ecp_aff.o \ ecp_mont.o ec2_aff.o ec_naf.o ecl_gf.o ecp_256.o mp_gf2m.o \ mpi.o mplogic.o mpmontg.o mpprime.o oid.o \ - secitem.o ec2_test.o ecp_test.o fips_ecc_util.o + secitem.o ec2_test.o ecp_test.o -RSAPROV_OBJS += rsa.o rsa_impl.o pkcs1.o fips_rsa_util.o +RSAPROV_OBJS += rsa.o rsa_impl.o pkcs1.o -SWRANDPROV_OBJS += swrand.o fips_random_util.o +SWRANDPROV_OBJS += swrand.o # # kernel SSL # -KSSL_OBJS += kssl.o ksslioctl.o +KSSL_OBJS += kssl.o ksslioctl.o KSSL_SOCKFIL_MOD_OBJS += ksslfilter.o ksslapi.o ksslrec.o @@ -1664,7 +1671,7 @@ KGSS_KRB5_OBJS += krb5mech.o \ $(CRYPTO_OLD) \ $(CRYPTO_RAW) $(K5_KRB) $(K5_OS) -DES_OBJS += des_crypt.o des_impl.o des_ks.o des_soft.o fips_des_util.o +DES_OBJS += des_crypt.o des_impl.o 
des_ks.o des_soft.o DLBOOT_OBJS += bootparam_xdr.o nfs_dlinet.o scan.o @@ -1763,6 +1770,8 @@ BGE_OBJS += bge_main2.o bge_chip2.o bge_kstats.o bge_log.o bge_ndd.o \ DMFE_OBJS += dmfe_log.o dmfe_main.o dmfe_mii.o +EFE_OBJS += efe.o + ELXL_OBJS += elxl.o HME_OBJS += hme.o @@ -1773,6 +1782,8 @@ IXGB_OBJS += ixgb.o ixgb_atomic.o ixgb_chip.o ixgb_gld.o ixgb_kstats.o \ NGE_OBJS += nge_main.o nge_atomic.o nge_chip.o nge_ndd.o nge_kstats.o \ nge_log.o nge_rx.o nge_tx.o nge_xmii.o +PCN_OBJS += pcn.o + RGE_OBJS += rge_main.o rge_chip.o rge_ndd.o rge_kstats.o rge_log.o rge_rxtx.o URTW_OBJS += urtw.o @@ -1898,6 +1909,11 @@ IGB_OBJS = igb_82575.o igb_api.o igb_mac.o igb_manage.o \ igb_rx.o igb_stat.o igb_tx.o # +# Intel Pro/100 NIC driver module +# +IPRB_OBJS = iprb.o + +# # Intel 10GbE PCIE NIC driver module # IXGBE_OBJS = ixgbe_82598.o ixgbe_82599.o ixgbe_api.o \ @@ -1932,11 +1948,6 @@ NXGE_HCALL_OBJS = \ # KICONV_EMEA_OBJS += kiconv_emea.o -# -# blk2scsa -# -BLK2SCSA_OBJS = blk2scsa.o - KICONV_JA_OBJS += kiconv_ja.o KICONV_KO_OBJS += kiconv_cck_common.o kiconv_ko.o diff --git a/uts/common/dtrace/dtrace.c b/uts/common/dtrace/dtrace.c index 2a9df6d..0c5e4b3 100644 --- a/uts/common/dtrace/dtrace.c +++ b/uts/common/dtrace/dtrace.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ /* @@ -118,7 +119,7 @@ dtrace_optval_t dtrace_dof_maxsize = (256 * 1024); size_t dtrace_global_maxsize = (16 * 1024); size_t dtrace_actions_max = (16 * 1024); size_t dtrace_retain_max = 1024; -dtrace_optval_t dtrace_helper_actions_max = 32; +dtrace_optval_t dtrace_helper_actions_max = 1024; dtrace_optval_t dtrace_helper_providers_max = 32; dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024); size_t dtrace_strsize_default = 256; @@ -143,6 +144,7 @@ int dtrace_err_verbose; hrtime_t dtrace_deadman_interval = NANOSEC; hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC; hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC; +hrtime_t dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC; /* * DTrace External Variables @@ -459,11 +461,13 @@ static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id); static void dtrace_enabling_provide(dtrace_provider_t *); static int dtrace_enabling_match(dtrace_enabling_t *, int *); static void dtrace_enabling_matchall(void); +static void dtrace_enabling_reap(void); static dtrace_state_t *dtrace_anon_grab(void); static uint64_t dtrace_helper(int, dtrace_mstate_t *, dtrace_state_t *, uint64_t, uint64_t); static dtrace_helpers_t *dtrace_helpers_create(proc_t *); static void dtrace_buffer_drop(dtrace_buffer_t *); +static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when); static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t, dtrace_state_t *, dtrace_mstate_t *); static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t, @@ -1104,10 +1108,13 @@ dtrace_priv_proc_common_nocd() } static int -dtrace_priv_proc_destructive(dtrace_state_t *state) +dtrace_priv_proc_destructive(dtrace_state_t *state, dtrace_mstate_t *mstate) { int action = state->dts_cred.dcr_action; + if (!(mstate->dtms_access & DTRACE_ACCESS_PROC)) + goto bad; + if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) && dtrace_priv_proc_common_zone(state) == 0) goto bad; @@ -1129,15 +1136,17 @@ bad: } static int -dtrace_priv_proc_control(dtrace_state_t *state) +dtrace_priv_proc_control(dtrace_state_t *state, dtrace_mstate_t *mstate) { - if (state->dts_cred.dcr_action & 
DTRACE_CRA_PROC_CONTROL) - return (1); + if (mstate->dtms_access & DTRACE_ACCESS_PROC) { + if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL) + return (1); - if (dtrace_priv_proc_common_zone(state) && - dtrace_priv_proc_common_user(state) && - dtrace_priv_proc_common_nocd()) - return (1); + if (dtrace_priv_proc_common_zone(state) && + dtrace_priv_proc_common_user(state) && + dtrace_priv_proc_common_nocd()) + return (1); + } cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV; @@ -1145,9 +1154,10 @@ dtrace_priv_proc_control(dtrace_state_t *state) } static int -dtrace_priv_proc(dtrace_state_t *state) +dtrace_priv_proc(dtrace_state_t *state, dtrace_mstate_t *mstate) { - if (state->dts_cred.dcr_action & DTRACE_CRA_PROC) + if ((mstate->dtms_access & DTRACE_ACCESS_PROC) && + (state->dts_cred.dcr_action & DTRACE_CRA_PROC)) return (1); cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV; @@ -1178,6 +1188,109 @@ dtrace_priv_kernel_destructive(dtrace_state_t *state) } /* + * Determine if the dte_cond of the specified ECB allows for processing of + * the current probe to continue. Note that this routine may allow continued + * processing, but with access(es) stripped from the mstate's dtms_access + * field. + */ +static int +dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate, + dtrace_ecb_t *ecb) +{ + dtrace_probe_t *probe = ecb->dte_probe; + dtrace_provider_t *prov = probe->dtpr_provider; + dtrace_pops_t *pops = &prov->dtpv_pops; + int mode = DTRACE_MODE_NOPRIV_DROP; + + ASSERT(ecb->dte_cond); + + if (pops->dtps_mode != NULL) { + mode = pops->dtps_mode(prov->dtpv_arg, + probe->dtpr_id, probe->dtpr_arg); + + ASSERT((mode & DTRACE_MODE_USER) || + (mode & DTRACE_MODE_KERNEL)); + ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) || + (mode & DTRACE_MODE_NOPRIV_DROP)); + } + + /* + * If the dte_cond bits indicate that this consumer is only allowed to + * see user-mode firings of this probe, call the provider's dtps_mode() + * entry point to check that the probe was fired while in a user + * context. If that's not the case, use the policy specified by the + * provider to determine if we drop the probe or merely restrict + * operation. + */ + if (ecb->dte_cond & DTRACE_COND_USERMODE) { + ASSERT(mode != DTRACE_MODE_NOPRIV_DROP); + + if (!(mode & DTRACE_MODE_USER)) { + if (mode & DTRACE_MODE_NOPRIV_DROP) + return (0); + + mstate->dtms_access &= ~DTRACE_ACCESS_ARGS; + } + } + + /* + * This is more subtle than it looks. We have to be absolutely certain + * that CRED() isn't going to change out from under us so it's only + * legit to examine that structure if we're in constrained situations. + * Currently, the only times we'll this check is if a non-super-user + * has enabled the profile or syscall providers -- providers that + * allow visibility of all processes. For the profile case, the check + * above will ensure that we're examining a user context. 
+ */ + if (ecb->dte_cond & DTRACE_COND_OWNER) { + cred_t *cr; + cred_t *s_cr = state->dts_cred.dcr_cred; + proc_t *proc; + + ASSERT(s_cr != NULL); + + if ((cr = CRED()) == NULL || + s_cr->cr_uid != cr->cr_uid || + s_cr->cr_uid != cr->cr_ruid || + s_cr->cr_uid != cr->cr_suid || + s_cr->cr_gid != cr->cr_gid || + s_cr->cr_gid != cr->cr_rgid || + s_cr->cr_gid != cr->cr_sgid || + (proc = ttoproc(curthread)) == NULL || + (proc->p_flag & SNOCD)) { + if (mode & DTRACE_MODE_NOPRIV_DROP) + return (0); + + mstate->dtms_access &= ~DTRACE_ACCESS_PROC; + } + } + + /* + * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not + * in our zone, check to see if our mode policy is to restrict rather + * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC + * and DTRACE_ACCESS_ARGS + */ + if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) { + cred_t *cr; + cred_t *s_cr = state->dts_cred.dcr_cred; + + ASSERT(s_cr != NULL); + + if ((cr = CRED()) == NULL || + s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) { + if (mode & DTRACE_MODE_NOPRIV_DROP) + return (0); + + mstate->dtms_access &= + ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS); + } + } + + return (1); +} + +/* * Note: not called from probe context. This function is called * asynchronously (and at a regular interval) from outside of probe context to * clean the dirty dynamic variable lists on all CPUs. Dynamic variable @@ -1859,6 +1972,75 @@ dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr) lquanta[levels + 1] += incr; } +static int +dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low, + uint16_t high, uint16_t nsteps, int64_t value) +{ + int64_t this = 1, last, next; + int base = 1, order; + + ASSERT(factor <= nsteps); + ASSERT(nsteps % factor == 0); + + for (order = 0; order < low; order++) + this *= factor; + + /* + * If our value is less than our factor taken to the power of the + * low order of magnitude, it goes into the zeroth bucket. + */ + if (value < (last = this)) + return (0); + + for (this *= factor; order <= high; order++) { + int nbuckets = this > nsteps ? nsteps : this; + + if ((next = this * factor) < this) { + /* + * We should not generally get log/linear quantizations + * with a high magnitude that allows 64-bits to + * overflow, but we nonetheless protect against this + * by explicitly checking for overflow, and clamping + * our value accordingly. + */ + value = this - 1; + } + + if (value < this) { + /* + * If our value lies within this order of magnitude, + * determine its position by taking the offset within + * the order of magnitude, dividing by the bucket + * width, and adding to our (accumulated) base. + */ + return (base + (value - last) / (this / nbuckets)); + } + + base += nbuckets - (nbuckets / factor); + last = this; + this = next; + } + + /* + * Our value is greater than or equal to our factor taken to the + * power of one plus the high magnitude -- return the top bucket. 
+ */ + return (base); +} + +static void +dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr) +{ + uint64_t arg = *llquanta++; + uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg); + uint16_t low = DTRACE_LLQUANTIZE_LOW(arg); + uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg); + uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg); + + llquanta[dtrace_aggregate_llquantize_bucket(factor, + low, high, nsteps, nval)] += incr; +} + /*ARGSUSED*/ static void dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg) @@ -2640,6 +2822,12 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, switch (v) { case DIF_VAR_ARGS: + if (!(mstate->dtms_access & DTRACE_ACCESS_ARGS)) { + cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= + CPU_DTRACE_KPRIV; + return (0); + } + ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS); if (ndx >= sizeof (mstate->dtms_arg) / sizeof (mstate->dtms_arg[0])) { @@ -2675,7 +2863,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, case DIF_VAR_UREGS: { klwp_t *lwp; - if (!dtrace_priv_proc(state)) + if (!dtrace_priv_proc(state, mstate)) return (0); if ((lwp = curthread->t_lwp) == NULL) { @@ -2687,6 +2875,22 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, return (dtrace_getreg(lwp->lwp_regs, ndx)); } + case DIF_VAR_VMREGS: { + uint64_t rval; + + if (!dtrace_priv_kernel(state)) + return (0); + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + + rval = dtrace_getvmreg(ndx, + &cpu_core[CPU->cpu_id].cpuc_dtrace_flags); + + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + + return (rval); + } + case DIF_VAR_CURTHREAD: if (!dtrace_priv_kernel(state)) return (0); @@ -2739,7 +2943,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, return (mstate->dtms_stackdepth); case DIF_VAR_USTACKDEPTH: - if (!dtrace_priv_proc(state)) + if (!dtrace_priv_proc(state, mstate)) return (0); if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) { /* @@ -2794,7 +2998,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, return (mstate->dtms_caller); case DIF_VAR_UCALLER: - if (!dtrace_priv_proc(state)) + if (!dtrace_priv_proc(state, mstate)) return (0); if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) { @@ -2842,7 +3046,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, state, mstate)); case DIF_VAR_PID: - if (!dtrace_priv_proc(state)) + if (!dtrace_priv_proc(state, mstate)) return (0); /* @@ -2864,7 +3068,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, return ((uint64_t)curthread->t_procp->p_pidp->pid_id); case DIF_VAR_PPID: - if (!dtrace_priv_proc(state)) + if (!dtrace_priv_proc(state, mstate)) return (0); /* @@ -2891,7 +3095,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, return ((uint64_t)curthread->t_tid); case DIF_VAR_EXECNAME: - if (!dtrace_priv_proc(state)) + if (!dtrace_priv_proc(state, mstate)) return (0); /* @@ -2911,7 +3115,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, state, mstate)); case DIF_VAR_ZONENAME: - if (!dtrace_priv_proc(state)) + if (!dtrace_priv_proc(state, mstate)) return (0); /* @@ -2931,7 +3135,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, state, mstate)); case DIF_VAR_UID: - if (!dtrace_priv_proc(state)) + if (!dtrace_priv_proc(state, mstate)) return (0); /* @@ -2952,7 +3156,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, 
dtrace_state_t *state, uint64_t v, return ((uint64_t)curthread->t_procp->p_cred->cr_uid); case DIF_VAR_GID: - if (!dtrace_priv_proc(state)) + if (!dtrace_priv_proc(state, mstate)) return (0); /* @@ -2974,7 +3178,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, case DIF_VAR_ERRNO: { klwp_t *lwp; - if (!dtrace_priv_proc(state)) + if (!dtrace_priv_proc(state, mstate)) return (0); /* @@ -3314,7 +3518,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, uint64_t size = tupregs[2].dttk_value; if (!dtrace_destructive_disallow && - dtrace_priv_proc_control(state) && + dtrace_priv_proc_control(state, mstate) && !dtrace_istoxic(kaddr, size)) { DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); dtrace_copyout(kaddr, uaddr, size, flags); @@ -3329,7 +3533,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, uint64_t size = tupregs[2].dttk_value; if (!dtrace_destructive_disallow && - dtrace_priv_proc_control(state) && + dtrace_priv_proc_control(state, mstate) && !dtrace_istoxic(kaddr, size)) { DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); dtrace_copyoutstr(kaddr, uaddr, size, flags); @@ -3700,7 +3904,54 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, break; } - case DIF_SUBR_GETMAJOR: + case DIF_SUBR_TOUPPER: + case DIF_SUBR_TOLOWER: { + uintptr_t s = tupregs[0].dttk_value; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + char *dest = (char *)mstate->dtms_scratch_ptr, c; + size_t len = dtrace_strlen((char *)s, size); + char lower, upper, convert; + int64_t i; + + if (subr == DIF_SUBR_TOUPPER) { + lower = 'a'; + upper = 'z'; + convert = 'A'; + } else { + lower = 'A'; + upper = 'Z'; + convert = 'a'; + } + + if (!dtrace_canload(s, len + 1, mstate, vstate)) { + regs[rd] = NULL; + break; + } + + if (!DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = NULL; + break; + } + + for (i = 0; i < size - 1; i++) { + if ((c = dtrace_load8(s + i)) == '\0') + break; + + if (c >= lower && c <= upper) + c = convert + (c - lower); + + dest[i] = c; + } + + ASSERT(i < size); + dest[i] = '\0'; + regs[rd] = (uintptr_t)dest; + mstate->dtms_scratch_ptr += size; + break; + } + +case DIF_SUBR_GETMAJOR: #ifdef _LP64 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64; #else @@ -3962,9 +4213,20 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, case DIF_SUBR_LLTOSTR: { int64_t i = (int64_t)tupregs[0].dttk_value; - int64_t val = i < 0 ? i * -1 : i; - uint64_t size = 22; /* enough room for 2^64 in decimal */ + uint64_t val, digit; + uint64_t size = 65; /* enough room for 2^64 in binary */ char *end = (char *)mstate->dtms_scratch_ptr + size - 1; + int base = 10; + + if (nargs > 1) { + if ((base = tupregs[1].dttk_value) <= 1 || + base > ('z' - 'a' + 1) + ('9' - '0' + 1)) { + *flags |= CPU_DTRACE_ILLOP; + break; + } + } + + val = (base == 10 && i < 0) ? 
i * -1 : i; if (!DTRACE_INSCRATCH(mstate, size)) { DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); @@ -3972,13 +4234,24 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, break; } - for (*end-- = '\0'; val; val /= 10) - *end-- = '0' + (val % 10); + for (*end-- = '\0'; val; val /= base) { + if ((digit = val % base) <= '9' - '0') { + *end-- = '0' + digit; + } else { + *end-- = 'a' + (digit - ('9' - '0') - 1); + } + } + + if (i == 0 && base == 16) + *end-- = '0'; + + if (base == 16) + *end-- = 'x'; - if (i == 0) + if (i == 0 || base == 8 || base == 16) *end-- = '0'; - if (i < 0) + if (i < 0 && base == 10) *end-- = '-'; regs[rd] = (uintptr_t)end + 1; @@ -5613,6 +5886,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid]; dtrace_vstate_t *vstate = &state->dts_vstate; dtrace_provider_t *prov = probe->dtpr_provider; + uint64_t tracememsize = 0; int committed = 0; caddr_t tomax; @@ -5633,6 +5907,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, #endif mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE; + mstate.dtms_access = DTRACE_ACCESS_ARGS | DTRACE_ACCESS_PROC; *flags &= ~CPU_DTRACE_ERROR; if (prov == dtrace_provider) { @@ -5670,65 +5945,8 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, } } - if (ecb->dte_cond) { - /* - * If the dte_cond bits indicate that this - * consumer is only allowed to see user-mode firings - * of this probe, call the provider's dtps_usermode() - * entry point to check that the probe was fired - * while in a user context. Skip this ECB if that's - * not the case. - */ - if ((ecb->dte_cond & DTRACE_COND_USERMODE) && - prov->dtpv_pops.dtps_usermode(prov->dtpv_arg, - probe->dtpr_id, probe->dtpr_arg) == 0) - continue; - - /* - * This is more subtle than it looks. We have to be - * absolutely certain that CRED() isn't going to - * change out from under us so it's only legit to - * examine that structure if we're in constrained - * situations. Currently, the only times we'll this - * check is if a non-super-user has enabled the - * profile or syscall providers -- providers that - * allow visibility of all processes. For the - * profile case, the check above will ensure that - * we're examining a user context. 
- */ - if (ecb->dte_cond & DTRACE_COND_OWNER) { - cred_t *cr; - cred_t *s_cr = - ecb->dte_state->dts_cred.dcr_cred; - proc_t *proc; - - ASSERT(s_cr != NULL); - - if ((cr = CRED()) == NULL || - s_cr->cr_uid != cr->cr_uid || - s_cr->cr_uid != cr->cr_ruid || - s_cr->cr_uid != cr->cr_suid || - s_cr->cr_gid != cr->cr_gid || - s_cr->cr_gid != cr->cr_rgid || - s_cr->cr_gid != cr->cr_sgid || - (proc = ttoproc(curthread)) == NULL || - (proc->p_flag & SNOCD)) - continue; - } - - if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) { - cred_t *cr; - cred_t *s_cr = - ecb->dte_state->dts_cred.dcr_cred; - - ASSERT(s_cr != NULL); - - if ((cr = CRED()) == NULL || - s_cr->cr_zone->zone_id != - cr->cr_zone->zone_id) - continue; - } - } + if (ecb->dte_cond && !dtrace_priv_probe(state, &mstate, ecb)) + continue; if (now - state->dts_alive > dtrace_deadman_timeout) { /* @@ -5768,9 +5986,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, mstate.dtms_present |= DTRACE_MSTATE_EPID; if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) - mstate.dtms_access = DTRACE_ACCESS_KERNEL; - else - mstate.dtms_access = 0; + mstate.dtms_access |= DTRACE_ACCESS_KERNEL; if (pred != NULL) { dtrace_difo_t *dp = pred->dtp_difo; @@ -5830,7 +6046,8 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, switch (act->dta_kind) { case DTRACEACT_STOP: - if (dtrace_priv_proc_destructive(state)) + if (dtrace_priv_proc_destructive(state, + &mstate)) dtrace_action_stop(); continue; @@ -5857,7 +6074,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, case DTRACEACT_JSTACK: case DTRACEACT_USTACK: - if (!dtrace_priv_proc(state)) + if (!dtrace_priv_proc(state, &mstate)) continue; /* @@ -5890,6 +6107,23 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, continue; } + /* + * Clear the string space, since there's no + * helper to do it for us. 
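The two conditionals removed above (the DTRACE_COND_OWNER and DTRACE_COND_ZONEOWNER tests) are folded into the new dtrace_priv_probe() entry point. A condensed user-space sketch of the predicates, assuming an illustrative xcred structure rather than the kernel's cred_t API:

#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>

/* Illustrative stand-in for the cred_t fields consulted here. */
struct xcred {
        uid_t cr_uid, cr_ruid, cr_suid;
        gid_t cr_gid, cr_rgid, cr_sgid;
        int cr_zoneid;
};

/*
 * Condensed form of the DTRACE_COND_OWNER test removed above: the
 * consumer may see the firing only if all three uids and all three
 * gids of the current process match the consumer's credential.
 */
static bool
owner_matches(const struct xcred *consumer, const struct xcred *cur)
{
        return (consumer->cr_uid == cur->cr_uid &&
            consumer->cr_uid == cur->cr_ruid &&
            consumer->cr_uid == cur->cr_suid &&
            consumer->cr_gid == cur->cr_gid &&
            consumer->cr_gid == cur->cr_rgid &&
            consumer->cr_gid == cur->cr_sgid);
}

/* The DTRACE_COND_ZONEOWNER test: both creds in the same zone. */
static bool
zone_matches(const struct xcred *consumer, const struct xcred *cur)
{
        return (consumer->cr_zoneid == cur->cr_zoneid);
}

int
main(void)
{
        struct xcred c = { 100, 100, 100, 10, 10, 10, 0 };
        struct xcred setuid_proc = { 0, 100, 0, 10, 10, 10, 0 };

        printf("%d %d\n", owner_matches(&c, &c), zone_matches(&c, &c));
        printf("%d\n", owner_matches(&c, &setuid_proc)); /* 0: euid differs */
        return (0);
}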
+ */ + if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0) { + int depth = DTRACE_USTACK_NFRAMES( + rec->dtrd_arg); + size_t strsize = DTRACE_USTACK_STRSIZE( + rec->dtrd_arg); + uint64_t *buf = (uint64_t *)(tomax + + valoffs); + void *strspace = &buf[depth + 1]; + + dtrace_bzero(strspace, + MIN(depth, strsize)); + } + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); dtrace_getupcstack((uint64_t *) (tomax + valoffs), @@ -5943,7 +6177,8 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, continue; case DTRACEACT_RAISE: - if (dtrace_priv_proc_destructive(state)) + if (dtrace_priv_proc_destructive(state, + &mstate)) dtrace_action_raise(val); continue; @@ -5970,6 +6205,11 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, case DTRACEACT_PRINTA: case DTRACEACT_SYSTEM: case DTRACEACT_FREOPEN: + case DTRACEACT_TRACEMEM: + break; + + case DTRACEACT_TRACEMEM_DYNSIZE: + tracememsize = val; break; case DTRACEACT_SYM: @@ -5983,7 +6223,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, case DTRACEACT_UADDR: { struct pid *pid = curthread->t_procp->p_pidp; - if (!dtrace_priv_proc(state)) + if (!dtrace_priv_proc(state, &mstate)) continue; DTRACE_STORE(uint64_t, tomax, @@ -6035,6 +6275,12 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) { uintptr_t end = valoffs + size; + if (tracememsize != 0 && + valoffs + tracememsize < end) { + end = valoffs + tracememsize; + tracememsize = 0; + } + if (!dtrace_vcanload((void *)(uintptr_t)val, &dp->dtdo_rtype, &mstate, vstate)) continue; @@ -6915,9 +7161,9 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv, if ((priv & DTRACE_PRIV_KERNEL) && (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) && - pops->dtps_usermode == NULL) { + pops->dtps_mode == NULL) { cmn_err(CE_WARN, "failed to register provider '%s': need " - "dtps_usermode() op for given privilege attributes", name); + "dtps_mode() op for given privilege attributes", name); return (EINVAL); } @@ -7014,7 +7260,7 @@ dtrace_unregister(dtrace_provider_id_t id) { dtrace_provider_t *old = (dtrace_provider_t *)id; dtrace_provider_t *prev = NULL; - int i, self = 0; + int i, self = 0, noreap = 0; dtrace_probe_t *probe, *first = NULL; if (old->dtpv_pops.dtps_enable == @@ -7071,14 +7317,31 @@ dtrace_unregister(dtrace_provider_id_t id) continue; /* - * We have at least one ECB; we can't remove this provider. + * If we are trying to unregister a defunct provider, and the + * provider was made defunct within the interval dictated by + * dtrace_unregister_defunct_reap, we'll (asynchronously) + * attempt to reap our enablings. To denote that the provider + * should reattempt to unregister itself at some point in the + * future, we will return a differentiable error code (EAGAIN + * instead of EBUSY) in this case. 
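From a provider's perspective, dtrace_unregister() now has two distinguishable failure modes. A sketch of the caller-side pattern under that assumption (return value stubbed; the structure is illustrative, though it mirrors the ftp_marked handling fasttrap adopts later in this diff):

#include <errno.h>
#include <stdio.h>

/* Illustrative provider state; not the kernel's fasttrap structures. */
struct provider {
        int marked;             /* retry unregistration on a later pass */
};

/* Stub standing in for dtrace_unregister() on a defunct provider. */
static int
dtrace_unregister_stub(void)
{
        return (EAGAIN);        /* pretend a reap was just dispatched */
}

/*
 * Sketch of the caller-side contract: EBUSY still means "live
 * enablings exist, give up", while the new EAGAIN return means "the
 * provider is defunct and a reap has been dispatched -- mark it and
 * retry on the next cleanup pass".
 */
int
main(void)
{
        struct provider p = { 0 };

        switch (dtrace_unregister_stub()) {
        case 0:
                printf("unregistered\n");
                break;
        case EAGAIN:
                p.marked = 1;
                printf("deferred; marked=%d\n", p.marked);
                break;
        default:        /* EBUSY: consumers still attached */
                printf("busy\n");
                break;
        }
        return (0);
}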
*/ + if (dtrace_gethrtime() - old->dtpv_defunct > + dtrace_unregister_defunct_reap) + noreap = 1; + if (!self) { mutex_exit(&dtrace_lock); mutex_exit(&mod_lock); mutex_exit(&dtrace_provider_lock); } - return (EBUSY); + + if (noreap) + return (EBUSY); + + (void) taskq_dispatch(dtrace_taskq, + (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP); + + return (EAGAIN); } /* @@ -7169,7 +7432,7 @@ dtrace_invalidate(dtrace_provider_id_t id) mutex_enter(&dtrace_provider_lock); mutex_enter(&dtrace_lock); - pvp->dtpv_defunct = 1; + pvp->dtpv_defunct = dtrace_gethrtime(); mutex_exit(&dtrace_lock); mutex_exit(&dtrace_provider_lock); @@ -9376,6 +9639,35 @@ dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc) break; } + case DTRACEAGG_LLQUANTIZE: { + uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg); + uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg); + uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg); + uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg); + int64_t v; + + agg->dtag_initial = desc->dtad_arg; + agg->dtag_aggregate = dtrace_aggregate_llquantize; + + if (factor < 2 || low >= high || nsteps < factor) + goto err; + + /* + * Now check that the number of steps evenly divides a power + * of the factor. (This assures both integer bucket size and + * linearity within each magnitude.) + */ + for (v = factor; v < nsteps; v *= factor) + continue; + + if ((v % nsteps) || (nsteps % factor)) + goto err; + + size = (dtrace_aggregate_llquantize_bucket(factor, + low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t); + break; + } + case DTRACEAGG_AVG: agg->dtag_aggregate = dtrace_aggregate_avg; size = sizeof (uint64_t) * 2; @@ -9545,12 +9837,14 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc) case DTRACEACT_PRINTA: case DTRACEACT_SYSTEM: case DTRACEACT_FREOPEN: + case DTRACEACT_DIFEXPR: /* * We know that our arg is a string -- turn it into a * format. 
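The llquantize() parameter validation just added is terse; restated as a standalone user-space helper (hypothetical name), the constraints are easier to see and test:

#include <stdint.h>
#include <stdio.h>

/*
 * Restatement of the DTRACEAGG_LLQUANTIZE checks: the factor must be
 * at least 2, the magnitude range must be non-empty, and the number of
 * steps must be >= factor, divide a power of the factor, and be
 * divisible by the factor -- guaranteeing integer bucket sizes and
 * linear spacing within each magnitude.
 */
static int
llquantize_args_ok(uint16_t factor, uint16_t low, uint16_t high,
    uint16_t nsteps)
{
        int64_t v;

        if (factor < 2 || low >= high || nsteps < factor)
                return (0);

        for (v = factor; v < nsteps; v *= factor)
                continue;

        if ((v % nsteps) || (nsteps % factor))
                return (0);

        return (1);
}

int
main(void)
{
        /* llquantize(x, 10, 0, 6, 20): factor 10, magnitudes 0..6, 20 steps */
        printf("%d\n", llquantize_args_ok(10, 0, 6, 20));  /* 1: valid */
        printf("%d\n", llquantize_args_ok(10, 0, 6, 15));  /* 0: 100 %% 15 */
        return (0);
}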
*/ if (arg == NULL) { - ASSERT(desc->dtad_kind == DTRACEACT_PRINTA); + ASSERT(desc->dtad_kind == DTRACEACT_PRINTA || + desc->dtad_kind == DTRACEACT_DIFEXPR); format = 0; } else { ASSERT(arg != NULL); @@ -9561,7 +9855,8 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc) /*FALLTHROUGH*/ case DTRACEACT_LIBACT: - case DTRACEACT_DIFEXPR: + case DTRACEACT_TRACEMEM: + case DTRACEACT_TRACEMEM_DYNSIZE: if (dp == NULL) return (EINVAL); @@ -10044,6 +10339,7 @@ dtrace_buffer_switch(dtrace_buffer_t *buf) caddr_t tomax = buf->dtb_tomax; caddr_t xamot = buf->dtb_xamot; dtrace_icookie_t cookie; + hrtime_t now = dtrace_gethrtime(); ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH)); ASSERT(!(buf->dtb_flags & DTRACEBUF_RING)); @@ -10059,6 +10355,8 @@ dtrace_buffer_switch(dtrace_buffer_t *buf) buf->dtb_drops = 0; buf->dtb_errors = 0; buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED); + buf->dtb_interval = now - buf->dtb_switched; + buf->dtb_switched = now; dtrace_interrupt_enable(cookie); } @@ -10091,14 +10389,17 @@ dtrace_buffer_activate(dtrace_state_t *state) static int dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags, - processorid_t cpu) + processorid_t cpu, int *factor) { cpu_t *cp; dtrace_buffer_t *buf; + int allocated = 0, desired = 0; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(MUTEX_HELD(&dtrace_lock)); + *factor = 1; + if (size > dtrace_nonroot_maxsize && !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE)) return (EFBIG); @@ -10123,7 +10424,8 @@ dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags, ASSERT(buf->dtb_xamot == NULL); - if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL) + if ((buf->dtb_tomax = kmem_zalloc(size, + KM_NOSLEEP | KM_NORMALPRI)) == NULL) goto err; buf->dtb_size = size; @@ -10134,7 +10436,8 @@ dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags, if (flags & DTRACEBUF_NOSWITCH) continue; - if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL) + if ((buf->dtb_xamot = kmem_zalloc(size, + KM_NOSLEEP | KM_NORMALPRI)) == NULL) goto err; } while ((cp = cp->cpu_next) != cpu_list); @@ -10148,16 +10451,19 @@ err: continue; buf = &bufs[cp->cpu_id]; + desired += 2; if (buf->dtb_xamot != NULL) { ASSERT(buf->dtb_tomax != NULL); ASSERT(buf->dtb_size == size); kmem_free(buf->dtb_xamot, size); + allocated++; } if (buf->dtb_tomax != NULL) { ASSERT(buf->dtb_size == size); kmem_free(buf->dtb_tomax, size); + allocated++; } buf->dtb_tomax = NULL; @@ -10165,6 +10471,8 @@ err: buf->dtb_size = 0; } while ((cp = cp->cpu_next) != cpu_list); + *factor = desired / (allocated > 0 ? allocated : 1); + return (ENOMEM); } @@ -10466,6 +10774,36 @@ dtrace_buffer_polish(dtrace_buffer_t *buf) } } +/* + * This routine determines if data generated at the specified time has likely + * been entirely consumed at user-level. This routine is called to determine + * if an ECB on a defunct probe (but for an active enabling) can be safely + * disabled and destroyed. + */ +static int +dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when) +{ + int i; + + for (i = 0; i < NCPU; i++) { + dtrace_buffer_t *buf = &bufs[i]; + + if (buf->dtb_size == 0) + continue; + + if (buf->dtb_flags & DTRACEBUF_RING) + return (0); + + if (!buf->dtb_switched && buf->dtb_offset != 0) + return (0); + + if (buf->dtb_switched - buf->dtb_interval < when) + return (0); + } + + return (1); +} + static void dtrace_buffer_free(dtrace_buffer_t *bufs) { @@ -10851,10 +11189,12 @@ dtrace_enabling_matchall(void) * block pending our completion. 
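The dtrace_buffer_consumed() predicate above leans on the dtb_switched and dtb_interval timestamps that dtrace_buffer_switch() now records. A toy single-buffer illustration of the same test (user-space; the per-CPU loop and the zero-size skip are omitted):

#include <stdint.h>
#include <stdio.h>

typedef int64_t hrtime_t;

/* Toy per-CPU buffer mirroring the fields the predicate consults. */
struct buf {
        hrtime_t switched;      /* when the last switch happened */
        hrtime_t interval;      /* time between the last two switches */
        uint64_t offset;        /* bytes written since the last switch */
        int ring;               /* ring buffers are never "consumed" */
};

/*
 * A principal buffer filled at time `when` has definitely been handed
 * to user-level once a switch has both started after `when` and
 * completed -- i.e. the previous switch (switched - interval) is also
 * past `when`.  This mirrors dtrace_buffer_consumed() above.
 */
static int
buffer_consumed(const struct buf *b, hrtime_t when)
{
        if (b->ring)
                return (0);
        if (b->switched == 0 && b->offset != 0)
                return (0);
        if (b->switched - b->interval < when)
                return (0);
        return (1);
}

int
main(void)
{
        struct buf b = { 300, 100, 0, 0 };

        printf("%d\n", buffer_consumed(&b, 150)); /* 1: two switches later */
        printf("%d\n", buffer_consumed(&b, 250)); /* 0: only one switch */
        return (0);
}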
*/ for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) { - cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred; + dtrace_cred_t *dcr = &enab->dten_vstate->dtvs_state->dts_cred; + cred_t *cr = dcr->dcr_cred; + zoneid_t zone = cr != NULL ? crgetzoneid(cr) : 0; - if (INGLOBALZONE(curproc) || - cr != NULL && getzoneid() == crgetzoneid(cr)) + if ((dcr->dcr_visible & DTRACE_CRV_ALLZONE) || (cr != NULL && + (zone == GLOBAL_ZONEID || getzoneid() == zone))) (void) dtrace_enabling_match(enab, NULL); } @@ -10954,6 +11294,85 @@ retry: } /* + * Called to reap ECBs that are attached to probes from defunct providers. + */ +static void +dtrace_enabling_reap(void) +{ + dtrace_provider_t *prov; + dtrace_probe_t *probe; + dtrace_ecb_t *ecb; + hrtime_t when; + int i; + + mutex_enter(&cpu_lock); + mutex_enter(&dtrace_lock); + + for (i = 0; i < dtrace_nprobes; i++) { + if ((probe = dtrace_probes[i]) == NULL) + continue; + + if (probe->dtpr_ecb == NULL) + continue; + + prov = probe->dtpr_provider; + + if ((when = prov->dtpv_defunct) == 0) + continue; + + /* + * We have ECBs on a defunct provider: we want to reap these + * ECBs to allow the provider to unregister. The destruction + * of these ECBs must be done carefully: if we destroy the ECB + * and the consumer later wishes to consume an EPID that + * corresponds to the destroyed ECB (and if the EPID metadata + * has not been previously consumed), the consumer will abort + * processing on the unknown EPID. To reduce (but not, sadly, + * eliminate) the possibility of this, we will only destroy an + * ECB for a defunct provider if, for the state that + * corresponds to the ECB: + * + * (a) There is no speculative tracing (which can effectively + * cache an EPID for an arbitrary amount of time). + * + * (b) The principal buffers have been switched twice since the + * provider became defunct. + * + * (c) The aggregation buffers are of zero size or have been + * switched twice since the provider became defunct. + * + * We use dts_speculates to determine (a) and call a function + * (dtrace_buffer_consumed()) to determine (b) and (c). Note + * that as soon as we've been unable to destroy one of the ECBs + * associated with the probe, we quit trying -- reaping is only + * fruitful in as much as we can destroy all ECBs associated + * with the defunct provider's probes. + */ + while ((ecb = probe->dtpr_ecb) != NULL) { + dtrace_state_t *state = ecb->dte_state; + dtrace_buffer_t *buf = state->dts_buffer; + dtrace_buffer_t *aggbuf = state->dts_aggbuffer; + + if (state->dts_speculates) + break; + + if (!dtrace_buffer_consumed(buf, when)) + break; + + if (!dtrace_buffer_consumed(aggbuf, when)) + break; + + dtrace_ecb_disable(ecb); + ASSERT(probe->dtpr_ecb != ecb); + dtrace_ecb_destroy(ecb); + } + } + + mutex_exit(&dtrace_lock); + mutex_exit(&cpu_lock); +} + +/* * DTrace DOF Functions */ /*ARGSUSED*/ @@ -11458,15 +11877,20 @@ dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate, (uintptr_t)sec->dofs_offset + offs); kind = (dtrace_actkind_t)desc->dofa_kind; - if (DTRACEACT_ISPRINTFLIKE(kind) && + if ((DTRACEACT_ISPRINTFLIKE(kind) && (kind != DTRACEACT_PRINTA || + desc->dofa_strtab != DOF_SECIDX_NONE)) || + (kind == DTRACEACT_DIFEXPR && desc->dofa_strtab != DOF_SECIDX_NONE)) { dof_sec_t *strtab; char *str, *fmt; uint64_t i; /* - * printf()-like actions must have a format string. + * The argument to these actions is an index into the + * DOF string table. For printf()-like actions, this + * is the format string. 
For print(), this is the + * CTF type of the expression result. */ if ((strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL) @@ -11952,7 +12376,7 @@ dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size) if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t))) size = min; - if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL) + if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL) return (ENOMEM); dstate->dtds_size = size; @@ -12314,7 +12738,7 @@ dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which) { dtrace_optval_t *opt = state->dts_options, size; processorid_t cpu; - int flags = 0, rval; + int flags = 0, rval, factor, divisor = 1; ASSERT(MUTEX_HELD(&dtrace_lock)); ASSERT(MUTEX_HELD(&cpu_lock)); @@ -12344,7 +12768,7 @@ dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which) flags |= DTRACEBUF_INACTIVE; } - for (size = opt[which]; size >= sizeof (uint64_t); size >>= 1) { + for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) { /* * The size must be 8-byte aligned. If the size is not 8-byte * aligned, drop it down by the difference. @@ -12362,7 +12786,7 @@ dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which) return (E2BIG); } - rval = dtrace_buffer_alloc(buf, size, flags, cpu); + rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor); if (rval != ENOMEM) { opt[which] = size; @@ -12371,6 +12795,9 @@ dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which) if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL) return (rval); + + for (divisor = 2; divisor < factor; divisor <<= 1) + continue; } return (ENOMEM); @@ -12470,7 +12897,8 @@ dtrace_state_go(dtrace_state_t *state, processorid_t *cpu) goto out; } - spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP); + spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), + KM_NOSLEEP | KM_NORMALPRI); if (spec == NULL) { rval = ENOMEM; @@ -12481,7 +12909,8 @@ dtrace_state_go(dtrace_state_t *state, processorid_t *cpu) state->dts_nspeculations = (int)nspec; for (i = 0; i < nspec; i++) { - if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) { + if ((buf = kmem_zalloc(bufsize, + KM_NOSLEEP | KM_NORMALPRI)) == NULL) { rval = ENOMEM; goto err; } diff --git a/uts/common/dtrace/fasttrap.c b/uts/common/dtrace/fasttrap.c index 42263e4..8cfe4cd 100644 --- a/uts/common/dtrace/fasttrap.c +++ b/uts/common/dtrace/fasttrap.c @@ -24,6 +24,9 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ #include <sys/atomic.h> #include <sys/errno.h> @@ -273,7 +276,7 @@ fasttrap_pid_cleanup_cb(void *data) fasttrap_provider_t **fpp, *fp; fasttrap_bucket_t *bucket; dtrace_provider_id_t provid; - int i, later; + int i, later, rval; static volatile int in = 0; ASSERT(in == 0); @@ -335,9 +338,13 @@ fasttrap_pid_cleanup_cb(void *data) * clean out the unenabled probes. */ provid = fp->ftp_provid; - if (dtrace_unregister(provid) != 0) { + if ((rval = dtrace_unregister(provid)) != 0) { if (fasttrap_total > fasttrap_max / 2) (void) dtrace_condense(provid); + + if (rval == EAGAIN) + fp->ftp_marked = 1; + later += fp->ftp_marked; fpp = &fp->ftp_next; } else { @@ -363,12 +370,16 @@ fasttrap_pid_cleanup_cb(void *data) * get a chance to do that work if and when the timeout is reenabled * (if detach fails). 
*/ - if (later > 0 && fasttrap_timeout != (timeout_id_t)1) - fasttrap_timeout = timeout(&fasttrap_pid_cleanup_cb, NULL, hz); - else if (later > 0) + if (later > 0) { + if (fasttrap_timeout != (timeout_id_t)1) { + fasttrap_timeout = + timeout(&fasttrap_pid_cleanup_cb, NULL, hz); + } + fasttrap_cleanup_work = 1; - else + } else { fasttrap_timeout = 0; + } mutex_exit(&fasttrap_cleanup_mtx); in = 0; diff --git a/uts/common/dtrace/profile.c b/uts/common/dtrace/profile.c index c1a2d1f..fc809d3 100644 --- a/uts/common/dtrace/profile.c +++ b/uts/common/dtrace/profile.c @@ -23,6 +23,9 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ #include <sys/errno.h> #include <sys/stat.h> @@ -408,9 +411,25 @@ profile_disable(void *arg, dtrace_id_t id, void *parg) /*ARGSUSED*/ static int -profile_usermode(void *arg, dtrace_id_t id, void *parg) +profile_mode(void *arg, dtrace_id_t id, void *parg) { - return (CPU->cpu_profile_pc == 0); + profile_probe_t *prof = parg; + int mode; + + if (CPU->cpu_profile_pc != 0) { + mode = DTRACE_MODE_KERNEL; + } else { + mode = DTRACE_MODE_USER; + } + + if (prof->prof_kind == PROF_TICK) { + mode |= DTRACE_MODE_NOPRIV_RESTRICT; + } else { + ASSERT(prof->prof_kind == PROF_PROFILE); + mode |= DTRACE_MODE_NOPRIV_DROP; + } + + return (mode); } static dtrace_pattr_t profile_attr = { @@ -430,7 +449,7 @@ static dtrace_pops_t profile_pops = { NULL, NULL, NULL, - profile_usermode, + profile_mode, profile_destroy }; diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c index a82718e..bd6bda5 100644 --- a/uts/common/fs/zfs/arc.c +++ b/uts/common/fs/zfs/arc.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -78,9 +80,9 @@ * types of locks: 1) the hash table lock array, and 2) the * arc list locks. * - * Buffers do not have their own mutexs, rather they rely on the - * hash table mutexs for the bulk of their protection (i.e. most - * fields in the arc_buf_hdr_t are protected by these mutexs). + * Buffers do not have their own mutexes, rather they rely on the + * hash table mutexes for the bulk of their protection (i.e. most + * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns @@ -1217,7 +1219,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; - hdr->b_spa = spa_guid(spa); + hdr->b_spa = spa_load_guid(spa); hdr->b_state = arc_anon; hdr->b_arc_access = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); @@ -1919,7 +1921,7 @@ arc_flush(spa_t *spa) uint64_t guid = 0; if (spa) - guid = spa_guid(spa); + guid = spa_load_guid(spa); while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); @@ -1980,6 +1982,11 @@ arc_shrink(void) arc_adjust(); } +/* + * Determine if the system is under memory pressure and is asking + * to reclaim memory. A return value of 1 indicates that the system + * is under memory pressure and that the arc should adjust accordingly. + */ static int arc_reclaim_needed(void) { @@ -2027,11 +2034,24 @@ arc_reclaim_needed(void) * heap is allocated. 
(Or, in the calculation, if less than 1/4th is * free) */
- if (btop(vmem_size(heap_arena, VMEM_FREE)) < - (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
+ if (vmem_size(heap_arena, VMEM_FREE) < + (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
return (1); #endif
+ /* + * If zio data pages are being allocated out of a separate heap segment,
+ * then enforce that the size of available vmem for this arena remains
+ * above about 1/16th free. + * + * Note: The 1/16th arena free requirement was put in place
+ * to aggressively evict memory from the arc in order to avoid + * memory fragmentation issues. + */
+ if (zio_arena != NULL && + vmem_size(zio_arena, VMEM_FREE) < + (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
+ return (1); #else if (spa_get_random(100) == 0) return (1);
@@ -2083,6 +2103,13 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) }
kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_cache);
+ + /* + * Ask the vmem arena to reclaim unused memory from its + * quantum caches. + */
+ if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR) + vmem_qcache_reap(zio_arena); }
static void @@ -2216,18 +2243,6 @@ arc_evict_needed(arc_buf_contents_t type)
if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) return (1);
-#ifdef _KERNEL - /* - * If zio data pages are being allocated out of a separate heap segment,
- * then enforce that the size of available vmem for this area remains - * above about 1/32nd free.
- */ - if (type == ARC_BUFC_DATA && zio_arena != NULL && - vmem_size(zio_arena, VMEM_FREE) <
- (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) - return (1); -#endif -
if (arc_reclaim_needed()) return (1);
@@ -2532,9 +2547,11 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_acb; ASSERT(callback_list != NULL);
if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
+ dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? byteswap_uint64_array :
- dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; + dmu_ot_byteswap[bswap].ob_func;
func(buf->b_data, hdr->b_size); }
@@ -2619,7 +2636,7 @@ arc_read_done(zio_t *zio) } /*
- * "Read" the block block at the specified DVA (in bp) via the
+ * "Read" the block at the specified DVA (in bp) via the
* cache. If the block is found in the cache, invoke the provided
* callback immediately and return. Note that the `zio' parameter
* in the callback will be NULL in this case, since no IO was
@@ -2676,7 +2693,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_buf_t *buf; kmutex_t *hash_lock; zio_t *rzio;
- uint64_t guid = spa_guid(spa); + uint64_t guid = spa_load_guid(spa);
top: hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
@@ -4234,7 +4251,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
boolean_t have_lock, full; l2arc_write_callback_t *cb; zio_t *pio, *wzio;
- uint64_t guid = spa_guid(spa); + uint64_t guid = spa_load_guid(spa);
ASSERT(dev->l2ad_vdev != NULL);
diff --git a/uts/common/fs/zfs/bpobj.c b/uts/common/fs/zfs/bpobj.c index 72be312..022921c 100644
--- a/uts/common/fs/zfs/bpobj.c +++ b/uts/common/fs/zfs/bpobj.c @@ -20,11 +20,13 @@ */ /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved. 
*/ #include <sys/bpobj.h> #include <sys/zfs_context.h> #include <sys/refcount.h> +#include <sys/dsl_pool.h> uint64_t bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) @@ -440,7 +442,10 @@ space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) struct space_range_arg *sra = arg; if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { - sra->used += bp_get_dsize_sync(sra->spa, bp); + if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) + sra->used += bp_get_dsize_sync(sra->spa, bp); + else + sra->used += bp_get_dsize(sra->spa, bp); sra->comp += BP_GET_PSIZE(bp); sra->uncomp += BP_GET_UCSIZE(bp); } diff --git a/uts/common/fs/zfs/bptree.c b/uts/common/fs/zfs/bptree.c new file mode 100644 index 0000000..8c5a7d4 --- /dev/null +++ b/uts/common/fs/zfs/bptree.c @@ -0,0 +1,224 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include <sys/arc.h> +#include <sys/bptree.h> +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dnode.h> +#include <sys/refcount.h> +#include <sys/spa.h> + +/* + * A bptree is a queue of root block pointers from destroyed datasets. When a + * dataset is destroyed its root block pointer is put on the end of the pool's + * bptree queue so the dataset's blocks can be freed asynchronously by + * dsl_scan_sync. This allows the delete operation to finish without traversing + * all the dataset's blocks. + * + * Note that while bt_begin and bt_end are only ever incremented in this code + * they are effectively reset to 0 every time the entire bptree is freed because + * the bptree's object is destroyed and re-created. + */ + +struct bptree_args { + bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ + boolean_t ba_free; /* true if freeing during traversal */ + + bptree_itor_t *ba_func; /* function to call for each blockpointer */ + void *ba_arg; /* caller supplied argument to ba_func */ + dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ +} bptree_args_t; + +uint64_t +bptree_alloc(objset_t *os, dmu_tx_t *tx) +{ + uint64_t obj; + dmu_buf_t *db; + bptree_phys_t *bt; + + obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, + SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, + sizeof (bptree_phys_t), tx); + + /* + * Bonus buffer contents are already initialized to 0, but for + * readability we make it explicit. 
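Conceptually the new bptree is a persistent FIFO: bptree_add() appends at bt_end and bptree_iterate() consumes from bt_begin, so a partially completed asynchronous destroy can resume where it left off. An in-memory analogue with the DMU machinery stubbed away (illustrative names throughout):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* In-memory analogue of bptree_phys_t's cursor fields. */
struct bptree {
        uint64_t begin;         /* next entry to process (bt_begin) */
        uint64_t end;           /* next free slot (bt_end) */
        uint64_t entries[64];   /* stand-in for the DMU object's array */
};

/* bptree_add(): enqueue one destroyed dataset's root "block pointer". */
static void
bptree_add_sketch(struct bptree *bt, uint64_t bp)
{
        bt->entries[bt->end++] = bp;
}

/*
 * bptree_iterate() with free=TRUE: process entries from the front,
 * advancing begin past each fully freed entry.  If the visitor bails
 * out (e.g. ERESTART when the txg fills up), begin is left pointing at
 * the unfinished entry so the next sync pass can resume there.
 */
static int
bptree_iterate_sketch(struct bptree *bt, int (*func)(uint64_t))
{
        while (bt->begin < bt->end) {
                int err = func(bt->entries[bt->begin]);

                if (err != 0)
                        return (err);   /* resume from bt->begin later */
                bt->begin++;
        }
        return (0);
}

static int
visit(uint64_t bp)
{
        printf("freeing blocks under bp %llu\n", (unsigned long long)bp);
        return (0);
}

int
main(void)
{
        struct bptree bt;

        memset(&bt, 0, sizeof (bt));
        bptree_add_sketch(&bt, 100);
        bptree_add_sketch(&bt, 200);
        return (bptree_iterate_sketch(&bt, visit));
}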
+ */ + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + bt = db->db_data; + bt->bt_begin = 0; + bt->bt_end = 0; + bt->bt_bytes = 0; + bt->bt_comp = 0; + bt->bt_uncomp = 0; + dmu_buf_rele(db, FTAG); + + return (obj); +} + +int +bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + ASSERT3U(bt->bt_begin, ==, bt->bt_end); + ASSERT3U(bt->bt_bytes, ==, 0); + ASSERT3U(bt->bt_comp, ==, 0); + ASSERT3U(bt->bt_uncomp, ==, 0); + dmu_buf_rele(db, FTAG); + + return (dmu_object_free(os, obj, tx)); +} + +void +bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, + uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + bptree_entry_phys_t bte; + + /* + * bptree objects are in the pool mos, therefore they can only be + * modified in syncing context. Furthermore, this is only modified + * by the sync thread, so no locking is necessary. + */ + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + + bte.be_birth_txg = birth_txg; + bte.be_bp = *bp; + bzero(&bte.be_zb, sizeof (bte.be_zb)); + dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); + + dmu_buf_will_dirty(db, tx); + bt->bt_end++; + bt->bt_bytes += bytes; + bt->bt_comp += comp; + bt->bt_uncomp += uncomp; + dmu_buf_rele(db, FTAG); +} + +/* ARGSUSED */ +static int +bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + int err; + struct bptree_args *ba = arg; + + if (bp == NULL) + return (0); + + err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); + if (err == 0 && ba->ba_free) { + ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); + ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); + ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); + } + return (err); +} + +int +bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, + void *arg, dmu_tx_t *tx) +{ + int err; + uint64_t i; + dmu_buf_t *db; + struct bptree_args ba; + + ASSERT(!free || dmu_tx_is_syncing(tx)); + + err = dmu_bonus_hold(os, obj, FTAG, &db); + if (err != 0) + return (err); + + if (free) + dmu_buf_will_dirty(db, tx); + + ba.ba_phys = db->db_data; + ba.ba_free = free; + ba.ba_func = func; + ba.ba_arg = arg; + ba.ba_tx = tx; + + err = 0; + for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { + bptree_entry_phys_t bte; + + ASSERT(!free || i == ba.ba_phys->bt_begin); + + err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), + &bte, DMU_READ_NO_PREFETCH); + if (err != 0) + break; + + err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, + bte.be_birth_txg, &bte.be_zb, TRAVERSE_POST, + bptree_visit_cb, &ba); + if (free) { + ASSERT(err == 0 || err == ERESTART); + if (err != 0) { + /* save bookmark for future resume */ + ASSERT3U(bte.be_zb.zb_objset, ==, + ZB_DESTROYED_OBJSET); + ASSERT3U(bte.be_zb.zb_level, ==, 0); + dmu_write(os, obj, i * sizeof (bte), + sizeof (bte), &bte, tx); + break; + } else { + ba.ba_phys->bt_begin++; + (void) dmu_free_range(os, obj, + i * sizeof (bte), sizeof (bte), tx); + } + } + } + + ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end); + + /* if all blocks are free there should be no used space */ + if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { + ASSERT3U(ba.ba_phys->bt_bytes, ==, 0); + ASSERT3U(ba.ba_phys->bt_comp, ==, 0); + 
ASSERT3U(ba.ba_phys->bt_uncomp, ==, 0); + } + + dmu_buf_rele(db, FTAG); + + return (err); +} diff --git a/uts/common/fs/zfs/dbuf.c b/uts/common/fs/zfs/dbuf.c index 9c4e029..145cc01 100644 --- a/uts/common/fs/zfs/dbuf.c +++ b/uts/common/fs/zfs/dbuf.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -226,7 +228,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db) boolean_t is_metadata; DB_DNODE_ENTER(db); - is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata; + is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); DB_DNODE_EXIT(db); return (is_metadata); @@ -1300,13 +1302,17 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * it, since one of the current holders may be in the * middle of an update. Note that users of dbuf_undirty() * should not place a hold on the dbuf before the call. + * Also note: we can get here with a spill block, so + * test for that similar to how dbuf_dirty does. */ if (refcount_count(&db->db_holds) > db->db_dirtycnt) { mutex_exit(&db->db_mtx); /* Make sure we don't toss this buffer at sync phase */ - mutex_enter(&dn->dn_mtx); - dnode_clear_range(dn, db->db_blkid, 1, tx); - mutex_exit(&dn->dn_mtx); + if (db->db_blkid != DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, db->db_blkid, 1, tx); + mutex_exit(&dn->dn_mtx); + } DB_DNODE_EXIT(db); return (0); } @@ -1319,11 +1325,18 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) *drp = dr->dr_next; + /* + * Note that there are three places in dbuf_dirty() + * where this dirty record may be put on a list. + * Make sure to do a list_remove corresponding to + * every one of those list_insert calls. + */ if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); - } else if (db->db_level+1 == dn->dn_nlevels) { + } else if (db->db_blkid == DMU_SPILL_BLKID || + db->db_level+1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); diff --git a/uts/common/fs/zfs/ddt.c b/uts/common/fs/zfs/ddt.c index 7183314..b3ec3cc 100644 --- a/uts/common/fs/zfs/ddt.c +++ b/uts/common/fs/zfs/ddt.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -1061,11 +1062,9 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); if (spa->spa_ddt_stat_object == 0) { - spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, - DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, - &spa->spa_ddt_stat_object, tx) == 0); + spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, tx); } while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c index 39234eb..94fa52f 100644 --- a/uts/common/fs/zfs/dmu.c +++ b/uts/common/fs/zfs/dmu.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include <sys/dmu.h> @@ -46,60 +47,73 @@ #endif const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { - { byteswap_uint8_array, TRUE, "unallocated" }, - { zap_byteswap, TRUE, "object directory" }, - { byteswap_uint64_array, TRUE, "object array" }, - { byteswap_uint8_array, TRUE, "packed nvlist" }, - { byteswap_uint64_array, TRUE, "packed nvlist size" }, - { byteswap_uint64_array, TRUE, "bpobj" }, - { byteswap_uint64_array, TRUE, "bpobj header" }, - { byteswap_uint64_array, TRUE, "SPA space map header" }, - { byteswap_uint64_array, TRUE, "SPA space map" }, - { byteswap_uint64_array, TRUE, "ZIL intent log" }, - { dnode_buf_byteswap, TRUE, "DMU dnode" }, - { dmu_objset_byteswap, TRUE, "DMU objset" }, - { byteswap_uint64_array, TRUE, "DSL directory" }, - { zap_byteswap, TRUE, "DSL directory child map"}, - { zap_byteswap, TRUE, "DSL dataset snap map" }, - { zap_byteswap, TRUE, "DSL props" }, - { byteswap_uint64_array, TRUE, "DSL dataset" }, - { zfs_znode_byteswap, TRUE, "ZFS znode" }, - { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, - { byteswap_uint8_array, FALSE, "ZFS plain file" }, - { zap_byteswap, TRUE, "ZFS directory" }, - { zap_byteswap, TRUE, "ZFS master node" }, - { zap_byteswap, TRUE, "ZFS delete queue" }, - { byteswap_uint8_array, FALSE, "zvol object" }, - { zap_byteswap, TRUE, "zvol prop" }, - { byteswap_uint8_array, FALSE, "other uint8[]" }, - { byteswap_uint64_array, FALSE, "other uint64[]" }, - { zap_byteswap, TRUE, "other ZAP" }, - { zap_byteswap, TRUE, "persistent error log" }, - { byteswap_uint8_array, TRUE, "SPA history" }, - { byteswap_uint64_array, TRUE, "SPA history offsets" }, - { zap_byteswap, TRUE, "Pool properties" }, - { zap_byteswap, TRUE, "DSL permissions" }, - { zfs_acl_byteswap, TRUE, "ZFS ACL" }, - { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, - { byteswap_uint8_array, TRUE, "FUID table" }, - { byteswap_uint64_array, TRUE, "FUID table size" }, - { zap_byteswap, TRUE, "DSL dataset next clones"}, - { zap_byteswap, TRUE, "scan work queue" }, - { zap_byteswap, TRUE, "ZFS user/group used" }, - { zap_byteswap, TRUE, "ZFS user/group quota" }, - { zap_byteswap, TRUE, "snapshot refcount tags"}, - { zap_byteswap, TRUE, "DDT ZAP algorithm" }, - { zap_byteswap, TRUE, "DDT statistics" }, - { byteswap_uint8_array, TRUE, "System attributes" }, - { zap_byteswap, TRUE, "SA master node" }, - { zap_byteswap, TRUE, "SA attr registration" }, - { zap_byteswap, TRUE, "SA attr layouts" }, - { zap_byteswap, TRUE, "scan translations" }, - { byteswap_uint8_array, FALSE, "deduplicated block" }, - { zap_byteswap, TRUE, "DSL deadlist map" }, - { byteswap_uint64_array, TRUE, "DSL deadlist map hdr" }, - { zap_byteswap, TRUE, "DSL dir clones" }, - { byteswap_uint64_array, TRUE, "bpobj subobj" }, + { DMU_BSWAP_UINT8, TRUE, "unallocated" }, + { DMU_BSWAP_ZAP, TRUE, "object directory" }, + { DMU_BSWAP_UINT64, TRUE, "object array" }, + { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, + { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, + { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, + { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, + { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, + { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, + { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, + { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, + { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, + { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, + { DMU_BSWAP_ZAP, TRUE, "DSL props" }, + { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, + { DMU_BSWAP_ZNODE, 
TRUE, "ZFS znode" }, + { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, + { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, + { DMU_BSWAP_UINT8, FALSE, "zvol object" }, + { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, + { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, + { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, + { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, + { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, + { DMU_BSWAP_UINT8, TRUE, "SPA history" }, + { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, + { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, + { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, + { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, + { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, + { DMU_BSWAP_UINT8, TRUE, "FUID table" }, + { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, + { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, + { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, + { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, + { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, + { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, + { DMU_BSWAP_UINT8, TRUE, "System attributes" }, + { DMU_BSWAP_ZAP, TRUE, "SA master node" }, + { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, + { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, + { DMU_BSWAP_ZAP, TRUE, "scan translations" }, + { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, + { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, + { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, + { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } +}; + +const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { + { byteswap_uint8_array, "uint8" }, + { byteswap_uint16_array, "uint16" }, + { byteswap_uint32_array, "uint32" }, + { byteswap_uint64_array, "uint64" }, + { zap_byteswap, "zap" }, + { dnode_buf_byteswap, "dnode" }, + { dmu_objset_byteswap, "objset" }, + { zfs_znode_byteswap, "znode" }, + { zfs_oldacl_byteswap, "oldacl" }, + { zfs_acl_byteswap, "acl" } }; int @@ -176,7 +190,7 @@ dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) DB_DNODE_ENTER(db); dn = DB_DNODE(db); - if (type > DMU_OT_NUMTYPES) { + if (!DMU_OT_IS_VALID(type)) { error = EINVAL; } else if (dn->dn_bonus != db) { error = EINVAL; @@ -1503,7 +1517,7 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; - boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata || + boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; diff --git a/uts/common/fs/zfs/dmu_send.c b/uts/common/fs/zfs/dmu_send.c index e47d533..96fb0c0 100644 --- a/uts/common/fs/zfs/dmu_send.c +++ b/uts/common/fs/zfs/dmu_send.c @@ -20,6 +20,9 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/dmu.h> @@ -44,50 +47,38 @@ #include <sys/ddt.h> #include <sys/zfs_onexit.h> -static char *dmu_recv_tag = "dmu_recv_tag"; - -/* - * The list of data whose inclusion in a send stream can be pending from - * one call to backup_cb to another. 
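The table rewrite above splits object types from byteswap routines: dmu_ot[] now records a small DMU_BSWAP_* class and the new dmu_ot_byteswap[] maps each class to its function. A reduced model of the two-level dispatch (toy enum and no-op swap functions, not the ZFS definitions):

#include <stdint.h>
#include <stdio.h>

/* Reduced model of the new two-level lookup. */
typedef enum {
        BSWAP_UINT8,
        BSWAP_UINT64,
        BSWAP_ZAP,
        BSWAP_NUMFUNCS
} obj_byteswap_t;

typedef void (*byteswap_func_t)(void *buf, size_t size);

static void swap_uint8(void *buf, size_t size)  { (void)buf; (void)size; }
static void swap_uint64(void *buf, size_t size) { (void)buf; (void)size; }
static void swap_zap(void *buf, size_t size)    { (void)buf; (void)size; }

/* Analogue of dmu_ot_byteswap[]: one entry per byteswap class. */
static const struct {
        byteswap_func_t ob_func;
        const char *ob_name;
} ot_byteswap[BSWAP_NUMFUNCS] = {
        { swap_uint8,   "uint8" },
        { swap_uint64,  "uint64" },
        { swap_zap,     "zap" },
};

/* Analogue of dmu_ot[]: object types carry a class, not a function. */
static const struct {
        obj_byteswap_t ot_byteswap;
        const char *ot_name;
} ot_table[] = {
        { BSWAP_UINT8,  "unallocated" },
        { BSWAP_ZAP,    "object directory" },
        { BSWAP_UINT64, "object array" },
};

int
main(void)
{
        /* Two-step dispatch, as in dmu_ot_byteswap[DMU_OT_BYTESWAP(type)] */
        int type = 1;   /* "object directory" */

        ot_byteswap[ot_table[type].ot_byteswap].ob_func(NULL, 0);
        printf("%s swaps as %s\n", ot_table[type].ot_name,
            ot_byteswap[ot_table[type].ot_byteswap].ob_name);
        return (0);
}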
Multiple calls to dump_free() and - * dump_freeobjects() can be aggregated into a single DRR_FREE or - * DRR_FREEOBJECTS replay record. - */ -typedef enum { - PENDING_NONE, - PENDING_FREE, - PENDING_FREEOBJECTS -} pendop_t; +/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ +int zfs_send_corrupt_data = B_FALSE; -struct backuparg { - dmu_replay_record_t *drr; - vnode_t *vp; - offset_t *off; - objset_t *os; - zio_cksum_t zc; - uint64_t toguid; - int err; - pendop_t pending_op; -}; +static char *dmu_recv_tag = "dmu_recv_tag"; static int -dump_bytes(struct backuparg *ba, void *buf, int len) +dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) { + dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; ssize_t resid; /* have to get resid to get detailed errno */ ASSERT3U(len % 8, ==, 0); - fletcher_4_incremental_native(buf, len, &ba->zc); - ba->err = vn_rdwr(UIO_WRITE, ba->vp, + fletcher_4_incremental_native(buf, len, &dsp->dsa_zc); + dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp, (caddr_t)buf, len, 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); - *ba->off += len; - return (ba->err); + + mutex_enter(&ds->ds_sendstream_lock); + *dsp->dsa_off += len; + mutex_exit(&ds->ds_sendstream_lock); + + return (dsp->dsa_err); } static int -dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, +dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, uint64_t length) { - struct drr_free *drrf = &(ba->drr->drr_u.drr_free); + struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); + + if (length != -1ULL && offset + length < offset) + length = -1ULL; /* * If there is a pending op, but it's not PENDING_FREE, push it out, @@ -96,13 +87,15 @@ dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, * other DRR_FREE records. DRR_FREEOBJECTS records can only be * aggregated with other DRR_FREEOBJECTS records. */ - if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) { - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + if (dsp->dsa_pending_op != PENDING_NONE && + dsp->dsa_pending_op != PENDING_FREE) { + if (dump_bytes(dsp, dsp->dsa_drr, + sizeof (dmu_replay_record_t)) != 0) return (EINTR); - ba->pending_op = PENDING_NONE; + dsp->dsa_pending_op = PENDING_NONE; } - if (ba->pending_op == PENDING_FREE) { + if (dsp->dsa_pending_op == PENDING_FREE) { /* * There should never be a PENDING_FREE if length is -1 * (because dump_dnode is the only place where this @@ -120,34 +113,35 @@ dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, return (0); } else { /* not a continuation. 
Push out pending record */ - if (dump_bytes(ba, ba->drr, + if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); - ba->pending_op = PENDING_NONE; + dsp->dsa_pending_op = PENDING_NONE; } } /* create a FREE record and make it pending */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_FREE; + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_FREE; drrf->drr_object = object; drrf->drr_offset = offset; drrf->drr_length = length; - drrf->drr_toguid = ba->toguid; + drrf->drr_toguid = dsp->dsa_toguid; if (length == -1ULL) { - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + if (dump_bytes(dsp, dsp->dsa_drr, + sizeof (dmu_replay_record_t)) != 0) return (EINTR); } else { - ba->pending_op = PENDING_FREE; + dsp->dsa_pending_op = PENDING_FREE; } return (0); } static int -dump_data(struct backuparg *ba, dmu_object_type_t type, +dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) { - struct drr_write *drrw = &(ba->drr->drr_u.drr_write); + struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); /* @@ -156,19 +150,20 @@ dump_data(struct backuparg *ba, dmu_object_type_t type, * the stream, since aggregation can't be done across operations * of different types. */ - if (ba->pending_op != PENDING_NONE) { - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + if (dsp->dsa_pending_op != PENDING_NONE) { + if (dump_bytes(dsp, dsp->dsa_drr, + sizeof (dmu_replay_record_t)) != 0) return (EINTR); - ba->pending_op = PENDING_NONE; + dsp->dsa_pending_op = PENDING_NONE; } /* write a DATA record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_WRITE; + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_WRITE; drrw->drr_object = object; drrw->drr_type = type; drrw->drr_offset = offset; drrw->drr_length = blksz; - drrw->drr_toguid = ba->toguid; + drrw->drr_toguid = dsp->dsa_toguid; drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; @@ -177,42 +172,43 @@ dump_data(struct backuparg *ba, dmu_object_type_t type, DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); drrw->drr_key.ddk_cksum = bp->blk_cksum; - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); - if (dump_bytes(ba, data, blksz) != 0) + if (dump_bytes(dsp, data, blksz) != 0) return (EINTR); return (0); } static int -dump_spill(struct backuparg *ba, uint64_t object, int blksz, void *data) +dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) { - struct drr_spill *drrs = &(ba->drr->drr_u.drr_spill); + struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); - if (ba->pending_op != PENDING_NONE) { - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + if (dsp->dsa_pending_op != PENDING_NONE) { + if (dump_bytes(dsp, dsp->dsa_drr, + sizeof (dmu_replay_record_t)) != 0) return (EINTR); - ba->pending_op = PENDING_NONE; + dsp->dsa_pending_op = PENDING_NONE; } /* write a SPILL record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_SPILL; + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_SPILL; drrs->drr_object = object; drrs->drr_length = blksz; - drrs->drr_toguid = ba->toguid; + drrs->drr_toguid = dsp->dsa_toguid; - if 
(dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t))) return (EINTR); - if (dump_bytes(ba, data, blksz)) + if (dump_bytes(dsp, data, blksz)) return (EINTR); return (0); } static int -dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) +dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) { - struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects); + struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); /* * If there is a pending op, but it's not PENDING_FREEOBJECTS, @@ -221,13 +217,14 @@ dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records * can only be aggregated with other DRR_FREEOBJECTS records. */ - if (ba->pending_op != PENDING_NONE && - ba->pending_op != PENDING_FREEOBJECTS) { - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + if (dsp->dsa_pending_op != PENDING_NONE && + dsp->dsa_pending_op != PENDING_FREEOBJECTS) { + if (dump_bytes(dsp, dsp->dsa_drr, + sizeof (dmu_replay_record_t)) != 0) return (EINTR); - ba->pending_op = PENDING_NONE; + dsp->dsa_pending_op = PENDING_NONE; } - if (ba->pending_op == PENDING_FREEOBJECTS) { + if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { /* * See whether this free object array can be aggregated * with pending one @@ -237,42 +234,43 @@ dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) return (0); } else { /* can't be aggregated. Push out pending record */ - if (dump_bytes(ba, ba->drr, + if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); - ba->pending_op = PENDING_NONE; + dsp->dsa_pending_op = PENDING_NONE; } } /* write a FREEOBJECTS record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_FREEOBJECTS; + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; drrfo->drr_firstobj = firstobj; drrfo->drr_numobjs = numobjs; - drrfo->drr_toguid = ba->toguid; + drrfo->drr_toguid = dsp->dsa_toguid; - ba->pending_op = PENDING_FREEOBJECTS; + dsp->dsa_pending_op = PENDING_FREEOBJECTS; return (0); } static int -dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) +dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) { - struct drr_object *drro = &(ba->drr->drr_u.drr_object); + struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) - return (dump_freeobjects(ba, object, 1)); + return (dump_freeobjects(dsp, object, 1)); - if (ba->pending_op != PENDING_NONE) { - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + if (dsp->dsa_pending_op != PENDING_NONE) { + if (dump_bytes(dsp, dsp->dsa_drr, + sizeof (dmu_replay_record_t)) != 0) return (EINTR); - ba->pending_op = PENDING_NONE; + dsp->dsa_pending_op = PENDING_NONE; } /* write an OBJECT record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_OBJECT; + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_OBJECT; drro->drr_object = object; drro->drr_type = dnp->dn_type; drro->drr_bonustype = dnp->dn_bonustype; @@ -280,19 +278,19 @@ dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) drro->drr_bonuslen = dnp->dn_bonuslen; drro->drr_checksumtype = dnp->dn_checksum; drro->drr_compress = dnp->dn_compress; - drro->drr_toguid = ba->toguid; + drro->drr_toguid = dsp->dsa_toguid; - if 
(dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); - if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) + if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) return (EINTR); /* free anything past the end of the file */ - if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * + if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) return (EINTR); - if (ba->err) + if (dsp->dsa_err) return (EINTR); return (0); } @@ -306,7 +304,7 @@ static int backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { - struct backuparg *ba = arg; + dmu_sendarg_t *dsp = arg; dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; int err = 0; @@ -319,10 +317,10 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) { uint64_t span = BP_SPAN(dnp, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; - err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); + err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); } else if (bp == NULL) { uint64_t span = BP_SPAN(dnp, zb->zb_level); - err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span); + err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { return (0); } else if (type == DMU_OT_DNODE) { @@ -341,7 +339,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, for (i = 0; i < blksz >> DNODE_SHIFT; i++) { uint64_t dnobj = (zb->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; - err = dump_dnode(ba, dnobj, blk+i); + err = dump_dnode(dsp, dnobj, blk+i); if (err) break; } @@ -356,7 +354,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (EIO); - err = dump_spill(ba, zb->zb_object, blksz, abuf->b_data); + err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); } else { /* it's a level-0 block of a regular object */ uint32_t aflags = ARC_WAIT; @@ -365,10 +363,22 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, if (dsl_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL, &aflags, zb) != 0) - return (EIO); + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { + if (zfs_send_corrupt_data) { + /* Send a block filled with 0x"zfs badd bloc" */ + abuf = arc_buf_alloc(spa, blksz, &abuf, + ARC_BUFC_DATA); + uint64_t *ptr; + for (ptr = abuf->b_data; + (char *)ptr < (char *)abuf->b_data + blksz; + ptr++) + *ptr = 0x2f5baddb10c; + } else { + return (EIO); + } + } - err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz, + err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz, blksz, bp, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); } @@ -378,13 +388,13 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, } int -dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - vnode_t *vp, offset_t *off) +dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, + int outfd, vnode_t *vp, offset_t *off) { dsl_dataset_t *ds = tosnap->os_dsl_dataset; dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os_dsl_dataset : NULL; dmu_replay_record_t *drr; - struct backuparg ba; + dmu_sendarg_t *dsp; int err; uint64_t fromtxg = 0; @@ -425,8 +435,10 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, #ifdef _KERNEL if (dmu_objset_type(tosnap) == DMU_OST_ZFS) { uint64_t version; - if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) + if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) { + kmem_free(drr, sizeof (dmu_replay_record_t)); return (EINVAL); + } if (version == ZPL_VERSION_SA) { DMU_SET_FEATUREFLAGS( drr->drr_u.drr_begin.drr_versioninfo, @@ -453,44 +465,137 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, if (fromorigin) dsl_dataset_rele(fromds, FTAG); - ba.drr = drr; - ba.vp = vp; - ba.os = tosnap; - ba.off = off; - ba.toguid = ds->ds_phys->ds_guid; - ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); - ba.pending_op = PENDING_NONE; - - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { - kmem_free(drr, sizeof (dmu_replay_record_t)); - return (ba.err); + dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); + + dsp->dsa_drr = drr; + dsp->dsa_vp = vp; + dsp->dsa_outfd = outfd; + dsp->dsa_proc = curproc; + dsp->dsa_os = tosnap; + dsp->dsa_off = off; + dsp->dsa_toguid = ds->ds_phys->ds_guid; + ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); + dsp->dsa_pending_op = PENDING_NONE; + + mutex_enter(&ds->ds_sendstream_lock); + list_insert_head(&ds->ds_sendstreams, dsp); + mutex_exit(&ds->ds_sendstream_lock); + + if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { + err = dsp->dsa_err; + goto out; } err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, - backup_cb, &ba); + backup_cb, dsp); - if (ba.pending_op != PENDING_NONE) - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) + if (dsp->dsa_pending_op != PENDING_NONE) + if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) err = EINTR; if (err) { - if (err == EINTR && ba.err) - err = ba.err; - kmem_free(drr, sizeof (dmu_replay_record_t)); - return (err); + if (err == EINTR && dsp->dsa_err) + err = dsp->dsa_err; + goto out; } bzero(drr, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; - drr->drr_u.drr_end.drr_checksum = ba.zc; - drr->drr_u.drr_end.drr_toguid = ba.toguid; + drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; + drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { - kmem_free(drr, sizeof (dmu_replay_record_t)); - return (ba.err); + if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { + err = dsp->dsa_err; + goto out; } +out: + mutex_enter(&ds->ds_sendstream_lock); + list_remove(&ds->ds_sendstreams, dsp); + mutex_exit(&ds->ds_sendstream_lock); + kmem_free(drr, sizeof (dmu_replay_record_t)); + kmem_free(dsp, sizeof (dmu_sendarg_t)); + + return (err); +} + +int +dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, + uint64_t *sizep) +{ + dsl_dataset_t *ds = tosnap->os_dsl_dataset; + dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os_dsl_dataset : NULL; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + int err; + uint64_t size; + + /* tosnap must be a snapshot */ + if (ds->ds_phys->ds_next_snap_obj == 0) + return (EINVAL); + + /* fromsnap must be an earlier snapshot from the same fs as tosnap */ + if (fromds && (ds->ds_dir != fromds->ds_dir || + fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) + return (EXDEV); + + if (fromorigin) { + if (fromsnap) + return (EINVAL); + + if (dsl_dir_is_clone(ds->ds_dir)) { + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); + rw_exit(&dp->dp_config_rwlock); + if (err) + return (err); + } else { + fromorigin = B_FALSE; + } + } + + /* Get uncompressed size estimate of changed data. */ + if (fromds == NULL) { + size = ds->ds_phys->ds_uncompressed_bytes; + } else { + uint64_t used, comp; + err = dsl_dataset_space_written(fromds, ds, + &used, &comp, &size); + if (fromorigin) + dsl_dataset_rele(fromds, FTAG); + if (err) + return (err); + } + + /* + * Assume that space (both on-disk and in-stream) is dominated by + * data. We will adjust for indirect blocks and the copies property, + * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). + */ + + /* + * Subtract out approximate space used by indirect blocks. + * Assume most space is used by data blocks (non-indirect, non-dnode). + * Assume all blocks are recordsize. Assume ditto blocks and + * internal fragmentation counter out compression. + * + * Therefore, space used by indirect blocks is sizeof(blkptr_t) per + * block, which we observe in practice. + */ + uint64_t recordsize; + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_prop_get_ds(ds, "recordsize", + sizeof (recordsize), 1, &recordsize, NULL); + rw_exit(&dp->dp_config_rwlock); + if (err) + return (err); + size -= size / recordsize * sizeof (blkptr_t); + + /* Add in the space for the record associated with each block. */ + size += size / recordsize * sizeof (dmu_replay_record_t); + + *sizep = size; return (0); } @@ -833,61 +938,6 @@ guid_compare(const void *arg1, const void *arg2) return (0); } -/* - * This function is a callback used by dmu_objset_find() (which - * enumerates the object sets) to build an avl tree that maps guids - * to datasets. The resulting table is used when processing DRR_WRITE_BYREF - * send stream records. These records, which are used in dedup'ed - * streams, do not contain data themselves, but refer to a copy - * of the data block that has already been written because it was - * earlier in the stream. That previous copy is identified by the - * guid of the dataset with the referenced data. - */ -int -find_ds_by_guid(const char *name, void *arg) -{ - avl_tree_t *guid_map = arg; - dsl_dataset_t *ds, *snapds; - guid_map_entry_t *gmep; - dsl_pool_t *dp; - int err; - uint64_t lastobj, firstobj; - - if (dsl_dataset_hold(name, FTAG, &ds) != 0) - return (0); - - dp = ds->ds_dir->dd_pool; - rw_enter(&dp->dp_config_rwlock, RW_READER); - firstobj = ds->ds_dir->dd_phys->dd_origin_obj; - lastobj = ds->ds_phys->ds_prev_snap_obj; - - while (lastobj != firstobj) { - err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds); - if (err) { - /* - * Skip this snapshot and move on. It's not - * clear why this would ever happen, but the - * remainder of the snapshot streadm can be - * processed. 
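A user-space rendering of the dmu_send_estimate() arithmetic above (128 and 312 bytes stand in for sizeof (blkptr_t) and sizeof (dmu_replay_record_t); both values are assumptions for illustration):

#include <stdint.h>
#include <stdio.h>

#define	BLKPTR_SIZE	128	/* assumed sizeof (blkptr_t) */
#define	DRR_SIZE	312	/* assumed sizeof (dmu_replay_record_t) */

int
main(void)
{
	uint64_t size = 10ULL << 30;		/* 10 GiB of changed data */
	uint64_t recordsize = 128ULL << 10;	/* 128 KiB records */

	/* Subtract the estimated indirect-block overhead ... */
	size -= size / recordsize * BLKPTR_SIZE;
	/* ... then add one replay record header per data block. */
	size += size / recordsize * DRR_SIZE;

	(void) printf("estimated stream size: %llu bytes\n",
	    (unsigned long long)size);
	return (0);
}

For this input the estimate subtracts 10 MiB of indirect-block overhead and adds back roughly 24 MiB of record headers, so it tracks the uncompressed data size closely, as the comments above intend.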
- */ - rw_exit(&dp->dp_config_rwlock); - dsl_dataset_rele(ds, FTAG); - return (0); - } - - gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); - gmep->guid = snapds->ds_phys->ds_guid; - gmep->gme_ds = snapds; - avl_add(guid_map, gmep); - lastobj = snapds->ds_phys->ds_prev_snap_obj; - } - - rw_exit(&dp->dp_config_rwlock); - dsl_dataset_rele(ds, FTAG); - - return (0); -} - static void free_guid_map_onexit(void *arg) { @@ -1025,8 +1075,8 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) void *data = NULL; if (drro->drr_type == DMU_OT_NONE || - drro->drr_type >= DMU_OT_NUMTYPES || - drro->drr_bonustype >= DMU_OT_NUMTYPES || + !DMU_OT_IS_VALID(drro->drr_type) || + !DMU_OT_IS_VALID(drro->drr_bonustype) || drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || @@ -1091,7 +1141,9 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) ASSERT3U(db->db_size, >=, drro->drr_bonuslen); bcopy(data, db->db_data, drro->drr_bonuslen); if (ra->byteswap) { - dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drro->drr_bonustype); + dmu_ot_byteswap[byteswap].ob_func(db->db_data, drro->drr_bonuslen); } dmu_buf_rele(db, FTAG); @@ -1134,7 +1186,7 @@ restore_write(struct restorearg *ra, objset_t *os, int err; if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || - drrw->drr_type >= DMU_OT_NUMTYPES) + !DMU_OT_IS_VALID(drrw->drr_type)) return (EINVAL); data = restore_read(ra, drrw->drr_length); @@ -1153,8 +1205,11 @@ restore_write(struct restorearg *ra, objset_t *os, dmu_tx_abort(tx); return (err); } - if (ra->byteswap) - dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); + if (ra->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); + } dmu_write(os, drrw->drr_object, drrw->drr_offset, drrw->drr_length, data, tx); dmu_tx_commit(tx); @@ -1370,9 +1425,6 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, avl_create(ra.guid_to_ds_map, guid_compare, sizeof (guid_map_entry_t), offsetof(guid_map_entry_t, avlnode)); - (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid, - (void *)ra.guid_to_ds_map, - DS_FIND_CHILDREN); ra.err = zfs_onexit_add_cb(minor, free_guid_map_onexit, ra.guid_to_ds_map, action_handlep); @@ -1384,6 +1436,8 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, if (ra.err) goto out; } + + drc->drc_guid_to_ds_map = ra.guid_to_ds_map; } /* @@ -1522,11 +1576,35 @@ recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) } static int +add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj; + dsl_dataset_t *snapds; + guid_map_entry_t *gmep; + int err; + + ASSERT(guid_map != NULL); + + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds); + if (err == 0) { + gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); + gmep->guid = snapds->ds_phys->ds_guid; + gmep->gme_ds = snapds; + avl_add(guid_map, gmep); + } + + rw_exit(&dp->dp_config_rwlock); + return (err); +} + +static int dmu_recv_existing_end(dmu_recv_cookie_t *drc) { struct recvendsyncarg resa; dsl_dataset_t *ds = drc->drc_logical_ds; - int err; + int err, myerr; /* * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() @@ -1561,8 +1639,11 @@ 
dmu_recv_existing_end(dmu_recv_cookie_t *drc) out: mutex_exit(&ds->ds_recvlock); + if (err == 0 && drc->drc_guid_to_ds_map != NULL) + (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); dsl_dataset_disown(ds, dmu_recv_tag); - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); + myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); + ASSERT3U(myerr, ==, 0); return (err); } @@ -1590,6 +1671,8 @@ dmu_recv_new_end(dmu_recv_cookie_t *drc) /* clean up the fs we just recv'd into */ (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); } else { + if (drc->drc_guid_to_ds_map != NULL) + (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); /* release the hold from dmu_recv_begin */ dsl_dataset_disown(ds, dmu_recv_tag); } diff --git a/uts/common/fs/zfs/dmu_traverse.c b/uts/common/fs/zfs/dmu_traverse.c index 023f90e..bfe9e65 100644 --- a/uts/common/fs/zfs/dmu_traverse.c +++ b/uts/common/fs/zfs/dmu_traverse.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -53,6 +54,7 @@ typedef struct traverse_data { uint64_t td_objset; blkptr_t *td_rootbp; uint64_t td_min_txg; + zbookmark_t *td_resume; int td_flags; prefetch_data_t *td_pfd; blkptr_cb_t *td_func; @@ -128,6 +130,54 @@ traverse_zil(traverse_data_t *td, zil_header_t *zh) zil_free(zilog); } +typedef enum resume_skip { + RESUME_SKIP_ALL, + RESUME_SKIP_NONE, + RESUME_SKIP_CHILDREN +} resume_skip_t; + +/* + * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and + * the block indicated by zb does not need to be visited at all. Returns + * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the + * resume point. This indicates that this block should be visited but not its + * children (since they must have been visited in a previous traversal). + * Otherwise returns RESUME_SKIP_NONE. + */ +static resume_skip_t +resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, + const zbookmark_t *zb) +{ + if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { + /* + * If we already visited this bp & everything below, + * don't bother doing it again. + */ + if (zbookmark_is_before(dnp, zb, td->td_resume)) + return (RESUME_SKIP_ALL); + + /* + * If we found the block we're trying to resume from, zero + * the bookmark out to indicate that we have resumed. 
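The resume machinery above amounts to a small contract: an all-zero bookmark means a fresh traversal, ERESTART from the callback makes traverse_pause() record the level-0 position, and reaching that saved position on a later pass zeroes the bookmark again. A self-contained toy of that contract; traverse_chunk() and its bookmark type are invented stand-ins, not the kernel interfaces:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct {
	uint64_t zb_objset, zb_object, zb_blkid;
	int64_t zb_level;
} toy_bookmark_t;

#define	TOY_ERESTART	(-2)
#define	TOY_NBLOCKS	10	/* pretend dataset: ten level-0 blocks */

/* Visit at most four blocks per call, then pause by recording the
 * next block to visit in *resume, mimicking traverse_pause(). */
static int
traverse_chunk(toy_bookmark_t *resume)
{
	uint64_t start = resume->zb_blkid, blkid;

	for (blkid = start; blkid < TOY_NBLOCKS; blkid++) {
		if (blkid - start == 4) {
			resume->zb_blkid = blkid;
			return (TOY_ERESTART);
		}
		(void) printf("visit L0 block %llu\n",
		    (unsigned long long)blkid);
	}
	(void) memset(resume, 0, sizeof (*resume));	/* zeroed = done */
	return (0);
}

int
main(void)
{
	toy_bookmark_t resume = { 0 };

	/* Each loop iteration models one txg of background work. */
	while (traverse_chunk(&resume) == TOY_ERESTART)
		(void) printf("paused at block %llu\n",
		    (unsigned long long)resume.zb_blkid);
	return (0);
}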
+ */ + ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object); + if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { + bzero(td->td_resume, sizeof (*zb)); + if (td->td_flags & TRAVERSE_POST) + return (RESUME_SKIP_CHILDREN); + } + } + return (RESUME_SKIP_NONE); +} + +static void +traverse_pause(traverse_data_t *td, const zbookmark_t *zb) +{ + ASSERT(td->td_resume != NULL); + ASSERT3U(zb->zb_level, ==, 0); + bcopy(zb, td->td_resume, sizeof (*td->td_resume)); +} + static int traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) @@ -137,8 +187,20 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *buf = NULL; prefetch_data_t *pd = td->td_pfd; boolean_t hard = td->td_flags & TRAVERSE_HARD; + boolean_t pause = B_FALSE; + + switch (resume_skip_check(td, dnp, zb)) { + case RESUME_SKIP_ALL: + return (0); + case RESUME_SKIP_CHILDREN: + goto post; + case RESUME_SKIP_NONE: + break; + default: + ASSERT(0); + } - if (bp->blk_birth == 0) { + if (BP_IS_HOLE(bp)) { err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp, td->td_arg); return (err); @@ -164,8 +226,10 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); - if (err) - return (err); + if (err == ERESTART) + pause = B_TRUE; /* handle pausing at a common point */ + if (err != 0) + goto post; } if (BP_GET_LEVEL(bp) > 0) { @@ -253,9 +317,18 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, if (buf) (void) arc_buf_remove_ref(buf, &buf); +post: if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) { err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, td->td_arg); + if (err == ERESTART) + pause = B_TRUE; + } + + if (pause && td->td_resume != NULL) { + ASSERT3U(err, ==, ERESTART); + ASSERT(!hard); + traverse_pause(td, zb); } return (err != 0 ? err : lasterr); @@ -353,18 +426,23 @@ traverse_prefetch_thread(void *arg) * in syncing context). */ static int -traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp, - uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) +traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, + uint64_t txg_start, zbookmark_t *resume, int flags, + blkptr_cb_t func, void *arg) { traverse_data_t td; prefetch_data_t pd = { 0 }; zbookmark_t czb; int err; + ASSERT(ds == NULL || objset == ds->ds_object); + ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); + td.td_spa = spa; - td.td_objset = ds ? 
ds->ds_object : 0; + td.td_objset = objset; td.td_rootbp = rootbp; td.td_min_txg = txg_start; + td.td_resume = resume; td.td_func = func; td.td_arg = arg; td.td_pfd = &pd; @@ -416,8 +494,17 @@ int traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { - return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, - &ds->ds_phys->ds_bp, txg_start, flags, func, arg)); + return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, + &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg)); +} + +int +traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, + uint64_t txg_start, zbookmark_t *resume, int flags, + blkptr_cb_t func, void *arg) +{ + return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, + blkptr, txg_start, resume, flags, func, arg)); } /* @@ -434,8 +521,8 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, boolean_t hard = (flags & TRAVERSE_HARD); /* visit the MOS */ - err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa), - txg_start, flags, func, arg); + err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), + txg_start, NULL, flags, func, arg); if (err) return (err); diff --git a/uts/common/fs/zfs/dmu_tx.c b/uts/common/fs/zfs/dmu_tx.c index bd5c71a..3dd5f2c 100644 --- a/uts/common/fs/zfs/dmu_tx.c +++ b/uts/common/fs/zfs/dmu_tx.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/dmu.h> @@ -673,9 +675,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) return; } - ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); + ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); if (dn->dn_maxblkid == 0 && !add) { + blkptr_t *bp; + /* * If there is only one block (i.e. this is a micro-zap) * and we are not adding anything, the accounting is simple. @@ -690,14 +694,13 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) * Use max block size here, since we don't know how much * the size will change between now and the dbuf dirty call. 
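The micro-zap branch that follows makes one binary choice: with a single block and no insertion, charge a full SPA_MAXBLOCKSIZE as an in-place overwrite when the existing block is freeable (born after the most recent snapshot), otherwise as a new write, and count it against tounref only when the block pointer is not a hole. A compact model of just that decision; the struct and field names are illustrative, not the real dmu_tx_hold_t:

#include <stdint.h>

#define	TOY_MAXBLOCKSIZE	(128 << 10)

struct toy_hold {
	uint64_t towrite;	/* new space this tx may write */
	uint64_t tooverwrite;	/* space it may rewrite in place */
	uint64_t tounref;	/* space that may become unreferenced */
};

void
hold_microzap(struct toy_hold *txh, int freeable, int is_hole)
{
	if (freeable)
		txh->tooverwrite += TOY_MAXBLOCKSIZE;
	else
		txh->towrite += TOY_MAXBLOCKSIZE;
	if (!is_hole)
		txh->tounref += TOY_MAXBLOCKSIZE;
}

Charging the maximum block size is deliberate pessimism: the zap block may grow before the dbuf is dirtied, so the hold reserves for the worst case.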
*/ + bp = &dn->dn_phys->dn_blkptr[0]; if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - &dn->dn_phys->dn_blkptr[0], - dn->dn_phys->dn_blkptr[0].blk_birth)) { + bp, bp->blk_birth)) txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; - } else { + else txh->txh_space_towrite += SPA_MAXBLOCKSIZE; - } - if (dn->dn_phys->dn_blkptr[0].blk_birth) + if (!BP_IS_HOLE(bp)) txh->txh_space_tounref += SPA_MAXBLOCKSIZE; return; } @@ -1273,7 +1276,6 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) { dnode_t *dn; dmu_tx_hold_t *txh; - blkptr_t *bp; txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_SPILL, 0, 0); @@ -1284,17 +1286,18 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) return; /* If blkptr doesn't exist then add space to towrite */ - bp = &dn->dn_phys->dn_spill; - if (BP_IS_HOLE(bp)) { + if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { txh->txh_space_towrite += SPA_MAXBLOCKSIZE; - txh->txh_space_tounref = 0; } else { + blkptr_t *bp; + + bp = &dn->dn_phys->dn_spill; if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, bp, bp->blk_birth)) txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; else txh->txh_space_towrite += SPA_MAXBLOCKSIZE; - if (bp->blk_birth) + if (!BP_IS_HOLE(bp)) txh->txh_space_tounref += SPA_MAXBLOCKSIZE; } } diff --git a/uts/common/fs/zfs/dnode.c b/uts/common/fs/zfs/dnode.c index 850dd58..05ccf9f 100644 --- a/uts/common/fs/zfs/dnode.c +++ b/uts/common/fs/zfs/dnode.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -193,7 +194,7 @@ dnode_verify(dnode_t *dn) ASSERT(dn->dn_objset); ASSERT(dn->dn_handle->dnh_dnode == dn); - ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) return; @@ -212,7 +213,7 @@ dnode_verify(dnode_t *dn) ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz); } ASSERT3U(dn->dn_nlevels, <=, 30); - ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_type)); ASSERT3U(dn->dn_nblkptr, >=, 1); ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); @@ -278,8 +279,10 @@ dnode_byteswap(dnode_phys_t *dnp) */ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); size_t len = DN_MAX_BONUSLEN - off; - ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES); - dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len); + ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(dnp->dn_bonustype); + dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); } /* Swap SPILL block if we have one */ @@ -407,7 +410,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dmu_zfetch_init(&dn->dn_zfetch, dn); - ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); mutex_enter(&os->os_lock); list_insert_head(&os->os_dnodes, dn); @@ -496,11 +499,11 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); ASSERT(ot != DMU_OT_NONE); - ASSERT3U(ot, <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); - ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(bonustype)); 
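The DMU_OT_IS_VALID()/DMU_OT_BYTESWAP() pair replacing the old `< DMU_OT_NUMTYPES` comparisons throughout these dnode.c hunks implies that new-style object types describe themselves. One plausible encoding, with flag values that are assumptions for illustration rather than quotes from dmu.h:

#include <stdint.h>

#define	OT_NEWTYPE	0x80	/* assumed: marks a self-describing type */
#define	OT_METADATA	0x40	/* assumed: type holds metadata */
#define	OT_BSWAP_MASK	0x3f	/* assumed: embedded byteswap index */

#define	OT_NUMTYPES	54	/* legacy table size, illustrative */
#define	OT_NUMSWAPS	4	/* byteswap function table size */

/* Legacy types index a fixed table; new-style types are valid iff
 * their embedded byteswap index is in range. */
#define	OT_IS_VALID(ot) (((ot) & OT_NEWTYPE) ? \
	(((ot) & OT_BSWAP_MASK) < OT_NUMSWAPS) : ((ot) < OT_NUMTYPES))

Under an encoding like this, every valid type maps to one entry of a byteswap function table, which is what the dmu_ot_byteswap[DMU_OT_BYTESWAP(type)] callers above rely on; it also explains why count_block() in the dsl_scan.c hunk further down folds any type with the new bit set into DMU_OT_OTHER, since the legacy per-type stats array cannot index self-describing types.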
ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT3U(dn->dn_maxblkid, ==, 0); @@ -568,7 +571,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0) || (bonustype == DMU_OT_SA && bonuslen == 0)); - ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); /* clean up any unreferenced dbufs */ diff --git a/uts/common/fs/zfs/dnode_sync.c b/uts/common/fs/zfs/dnode_sync.c index 2ee990a..8d81791 100644 --- a/uts/common/fs/zfs/dnode_sync.c +++ b/uts/common/fs/zfs/dnode_sync.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -594,7 +596,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) } if (dn->dn_next_bonustype[txgoff]) { - ASSERT(dn->dn_next_bonustype[txgoff] < DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff])); dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; dn->dn_next_bonustype[txgoff] = 0; } diff --git a/uts/common/fs/zfs/dsl_dataset.c b/uts/common/fs/zfs/dsl_dataset.c index 59ac4a6..ccfa71c 100644 --- a/uts/common/fs/zfs/dsl_dataset.c +++ b/uts/common/fs/zfs/dsl_dataset.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/dmu_objset.h> @@ -28,10 +30,12 @@ #include <sys/dsl_prop.h> #include <sys/dsl_synctask.h> #include <sys/dmu_traverse.h> +#include <sys/dmu_impl.h> #include <sys/dmu_tx.h> #include <sys/arc.h> #include <sys/zio.h> #include <sys/zap.h> +#include <sys/zfeature.h> #include <sys/unique.h> #include <sys/zfs_context.h> #include <sys/zfs_ioctl.h> @@ -97,7 +101,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) if (BP_IS_HOLE(bp)) return; ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); - ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); if (ds == NULL) { /* * Account for the meta-objset space in its placeholder @@ -114,7 +118,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); - ds->ds_phys->ds_used_bytes += used; + ds->ds_phys->ds_referenced_bytes += used; ds->ds_phys->ds_compressed_bytes += compressed; ds->ds_phys->ds_uncompressed_bytes += uncompressed; ds->ds_phys->ds_unique_bytes += used; @@ -208,8 +212,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, } } mutex_enter(&ds->ds_lock); - ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); - ds->ds_phys->ds_used_bytes -= used; + ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used); + ds->ds_phys->ds_referenced_bytes -= used; ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); ds->ds_phys->ds_compressed_bytes -= compressed; ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); @@ -393,6 +397,8 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&ds->ds_rwlock, 0, 0, 0); 
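The ds_sendstream_lock and ds_sendstreams list initialized in this hunk back the registration that dmu_send() performs above: every in-flight send links its dmu_sendarg_t in at the head and unlinks it on the way out, so other code can enumerate active streams. A user-space toy of the same pattern (pthreads instead of kernel mutexes, a hand-rolled list instead of list_t):

#include <pthread.h>
#include <stddef.h>

struct sendarg {
	struct sendarg *sa_next;
};

struct dataset {
	pthread_mutex_t ds_lock;
	struct sendarg *ds_streams;	/* head of active-send list */
};

void
sendstream_register(struct dataset *ds, struct sendarg *sa)
{
	(void) pthread_mutex_lock(&ds->ds_lock);
	sa->sa_next = ds->ds_streams;
	ds->ds_streams = sa;
	(void) pthread_mutex_unlock(&ds->ds_lock);
}

void
sendstream_unregister(struct dataset *ds, struct sendarg *sa)
{
	struct sendarg **pp;

	(void) pthread_mutex_lock(&ds->ds_lock);
	for (pp = &ds->ds_streams; *pp != NULL; pp = &(*pp)->sa_next) {
		if (*pp == sa) {
			*pp = sa->sa_next;
			break;
		}
	}
	(void) pthread_mutex_unlock(&ds->ds_lock);
}

Note that dmu_send() above registers before its first dump_bytes() call and removes the entry in the common out: path, so the list stays accurate even on error returns.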
cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); @@ -400,6 +406,9 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), + offsetof(dmu_sendarg_t, dsa_link)); + if (err == 0) { err = dsl_dir_open_obj(dp, ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); @@ -810,8 +819,8 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = origin->ds_phys->ds_creation_txg; - dsphys->ds_used_bytes = - origin->ds_phys->ds_used_bytes; + dsphys->ds_referenced_bytes = + origin->ds_phys->ds_referenced_bytes; dsphys->ds_compressed_bytes = origin->ds_phys->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = @@ -901,69 +910,55 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, return (dsobj); } -struct destroyarg { - dsl_sync_task_group_t *dstg; - char *snapname; - char *failed; - boolean_t defer; -}; - -static int -dsl_snapshot_destroy_one(const char *name, void *arg) -{ - struct destroyarg *da = arg; - dsl_dataset_t *ds; - int err; - char *dsname; - - dsname = kmem_asprintf("%s@%s", name, da->snapname); - err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds); - strfree(dsname); - if (err == 0) { - struct dsl_ds_destroyarg *dsda; - - dsl_dataset_make_exclusive(ds, da->dstg); - dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); - dsda->ds = ds; - dsda->defer = da->defer; - dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, dsda, da->dstg, 0); - } else if (err == ENOENT) { - err = 0; - } else { - (void) strcpy(da->failed, name); - } - return (err); -} - /* - * Destroy 'snapname' in all descendants of 'fsname'. + * The snapshots must all be in the same pool. 
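dmu_snapshots_destroy_nvl() below replaces the dmu_objset_find() callback walk with an explicit list: each nvpair name is one snapshot, and all must live in one pool. A sketch of a caller, assuming the in-kernel nvpair interfaces already used in this file and a caller-owned buffer that receives the name of the snapshot that failed:

#include <sys/zfs_context.h>
#include <sys/nvpair.h>
#include <sys/dmu.h>	/* assumed home of the prototype */

int
destroy_two_snaps(void)
{
	nvlist_t *snaps;
	char failed[MAXNAMELEN];
	int err;

	VERIFY(nvlist_alloc(&snaps, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_boolean(snaps, "tank/fs@monday") == 0);
	VERIFY(nvlist_add_boolean(snaps, "tank/fs@tuesday") == 0);

	err = dmu_snapshots_destroy_nvl(snaps, B_FALSE, failed);
	/* On error, failed names the snapshot whose sync task failed. */
	nvlist_free(snaps);
	return (err);
}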
*/ -#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy int -dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) +dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed) { int err; - struct destroyarg da; dsl_sync_task_t *dst; spa_t *spa; + nvpair_t *pair; + dsl_sync_task_group_t *dstg; - err = spa_open(fsname, &spa, FTAG); + pair = nvlist_next_nvpair(snaps, NULL); + if (pair == NULL) + return (0); + + err = spa_open(nvpair_name(pair), &spa, FTAG); if (err) return (err); - da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - da.snapname = snapname; - da.failed = fsname; - da.defer = defer; + dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - err = dmu_objset_find(fsname, - dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + dsl_dataset_t *ds; + + err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds); + if (err == 0) { + struct dsl_ds_destroyarg *dsda; + + dsl_dataset_make_exclusive(ds, dstg); + dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), + KM_SLEEP); + dsda->ds = ds; + dsda->defer = defer; + dsl_sync_task_create(dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, dsda, dstg, 0); + } else if (err == ENOENT) { + err = 0; + } else { + (void) strcpy(failed, nvpair_name(pair)); + break; + } + } if (err == 0) - err = dsl_sync_task_group_wait(da.dstg); + err = dsl_sync_task_group_wait(dstg); - for (dst = list_head(&da.dstg->dstg_tasks); dst; - dst = list_next(&da.dstg->dstg_tasks, dst)) { + for (dst = list_head(&dstg->dstg_tasks); dst; + dst = list_next(&dstg->dstg_tasks, dst)) { struct dsl_ds_destroyarg *dsda = dst->dst_arg1; dsl_dataset_t *ds = dsda->ds; @@ -971,17 +966,17 @@ dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) * Return the file system name that triggered the error */ if (dst->dst_err) { - dsl_dataset_name(ds, fsname); - *strchr(fsname, '@') = '\0'; + dsl_dataset_name(ds, failed); } ASSERT3P(dsda->rm_origin, ==, NULL); - dsl_dataset_disown(ds, da.dstg); + dsl_dataset_disown(ds, dstg); kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); } - dsl_sync_task_group_destroy(da.dstg); + dsl_sync_task_group_destroy(dstg); spa_close(spa, FTAG); return (err); + } static boolean_t @@ -1087,19 +1082,23 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) goto out; /* - * remove the objects in open context, so that we won't - * have too much to do in syncing context. + * If async destruction is not enabled try to remove all objects + * while in the open context so that there is less work to do in + * the syncing context. */ - for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, - ds->ds_phys->ds_prev_snap_txg)) { - /* - * Ignore errors, if there is not enough disk space - * we will deal with it in dsl_dataset_destroy_sync(). - */ - (void) dmu_free_object(os, obj); + if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds), + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, + ds->ds_phys->ds_prev_snap_txg)) { + /* + * Ignore errors, if there is not enough disk space + * we will deal with it in dsl_dataset_destroy_sync(). + */ + (void) dmu_free_object(os, obj); + } + if (err != ESRCH) + goto out; } - if (err != ESRCH) - goto out; /* * Only the ZIL knows how to free log blocks. 
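The destroy path above is also where the on-disk feature model starts to matter: a feature can be enabled (the pool is allowed to use it) without being active (on-disk state currently depends on it, tracked by a refcount). A sketch of the gating pattern, reusing only the zfeature interfaces visible in this change; spa and tx are assumed to be in scope:

	zfeature_info_t *async_destroy =
	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];

	if (!spa_feature_is_enabled(spa, async_destroy)) {
		/* Legacy pool: free objects now, in open context. */
	} else if (!spa_feature_is_active(spa, async_destroy)) {
		/* First queued destroy: take a reference so the pool
		 * is marked as depending on the feature until the
		 * scan thread drains the bptree. */
		spa_feature_incr(spa, async_destroy, tx);
	}

The matching spa_feature_decr() appears in the dsl_scan.c hunk further down, once the last queued tree has been freed.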
@@ -1245,7 +1244,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) ASSERT(!dsl_dataset_is_snapshot(ds)); if (ds->ds_phys->ds_prev_snap_obj != 0) - mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; + mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes; else mrs_used = 0; @@ -1253,7 +1252,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) ASSERT3U(dlused, <=, mrs_used); ds->ds_phys->ds_unique_bytes = - ds->ds_phys->ds_used_bytes - (mrs_used - dlused); + ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused); if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) @@ -1611,6 +1610,30 @@ process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, ds_next->ds_phys->ds_deadlist_obj); } +static int +old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + int err; + struct killarg ka; + + /* + * Free everything that we point to (that's born after + * the previous snapshot, if we are a clone) + * + * NB: this should be very quick, because we already + * freed all the objects in open context. + */ + ka.ds = ds; + ka.tx = tx; + err = traverse_dataset(ds, + ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, + kill_blkptr, &ka); + ASSERT3U(err, ==, 0); + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); + + return (err); +} + void dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) { @@ -1757,7 +1780,6 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) tx); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, used, comp, uncomp, tx); - dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx); /* Merge our deadlist into next's and free it. */ dsl_deadlist_merge(&ds_next->ds_deadlist, @@ -1833,32 +1855,54 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) } dsl_dataset_rele(ds_next, FTAG); } else { + zfeature_info_t *async_destroy = + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; + /* * There's no next snapshot, so this is a head dataset. * Destroy the deadlist. Unless it's a clone, the * deadlist should be empty. (If it's a clone, it's * safe to ignore the deadlist contents.) */ - struct killarg ka; - dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); ds->ds_phys->ds_deadlist_obj = 0; - /* - * Free everything that we point to (that's born after - * the previous snapshot, if we are a clone) - * - * NB: this should be very quick, because we already - * freed all the objects in open context. - */ - ka.ds = ds; - ka.tx = tx; - err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, - TRAVERSE_POST, kill_blkptr, &ka); - ASSERT3U(err, ==, 0); - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - ds->ds_phys->ds_unique_bytes == 0); + if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { + err = old_synchronous_dataset_destroy(ds, tx); + } else { + /* + * Move the bptree into the pool's list of trees to + * clean up and update space accounting information. 
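The code following this comment moves the dataset's accounting wholesale, as shown below: for a head with used/comp/uncomp of, say, 50/30/50 GiB, dsl_dir_diduse_space() charges -50/-30/-50 against the dataset's own dir and +50/+30/+50 against the pool's free dir, so the space reads as freeing immediately while dsl_scan_sync() reclaims the actual blocks over many later txgs.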
+ */ + uint64_t used, comp, uncomp; + + ASSERT(err == 0 || err == EBUSY); + if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { + spa_feature_incr(dp->dp_spa, async_destroy, tx); + dp->dp_bptree_obj = bptree_alloc( + dp->dp_meta_objset, tx); + VERIFY(zap_add(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj, tx) == 0); + } + + used = ds->ds_dir->dd_phys->dd_used_bytes; + comp = ds->ds_dir->dd_phys->dd_compressed_bytes; + uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; + + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + ds->ds_phys->ds_unique_bytes == used); + + bptree_add(dp->dp_meta_objset, dp->dp_bptree_obj, + &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, + used, comp, uncomp, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + -used, -comp, -uncomp, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); + } if (ds->ds_prev != NULL) { if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { @@ -2049,7 +2093,7 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = crtxg; dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; - dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; + dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes; dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; dsphys->ds_flags = ds->ds_phys->ds_flags; @@ -2142,10 +2186,71 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) dmu_objset_sync(ds->ds_objset, zio, tx); } +static void +get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) +{ + uint64_t count = 0; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + nvlist_t *propval; + nvlist_t *val; + + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + /* + * There may me missing entries in ds_next_clones_obj + * due to a bug in a previous version of the code. + * Only trust it if it has the right number of entries. + */ + if (ds->ds_phys->ds_next_clones_obj != 0) { + ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, + &count)); + } + if (count != ds->ds_phys->ds_num_children - 1) { + goto fail; + } + for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *clone; + char buf[ZFS_MAXNAMELEN]; + /* + * Even though we hold the dp_config_rwlock, the dataset + * may fail to open, returning ENOENT. If there is a + * thread concurrently attempting to destroy this + * dataset, it will have the ds_rwlock held for + * RW_WRITER. Our call to dsl_dataset_hold_obj() -> + * dsl_dataset_hold_ref() will fail its + * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the + * dp_config_rwlock, and wait for the destroy progress + * and signal ds_exclusive_cv. If the destroy was + * successful, we will see that + * DSL_DATASET_IS_DESTROYED(), and return ENOENT. 
+ */ + if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + za.za_first_integer, FTAG, &clone) != 0) + continue; + dsl_dir_name(clone->ds_dir, buf); + VERIFY(nvlist_add_boolean(val, buf) == 0); + dsl_dataset_rele(clone, FTAG); + } + zap_cursor_fini(&zc); + VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); + VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), + propval) == 0); +fail: + nvlist_free(val); + nvlist_free(propval); + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); +} + void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { - uint64_t refd, avail, uobjs, aobjs; + uint64_t refd, avail, uobjs, aobjs, ratio; dsl_dir_stats(ds->ds_dir, nv); @@ -2172,6 +2277,31 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, DS_IS_DEFER_DESTROY(ds) ? 1 : 0); + if (ds->ds_phys->ds_prev_snap_obj != 0) { + uint64_t written, comp, uncomp; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_dataset_t *prev; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + int err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + rw_exit(&dp->dp_config_rwlock); + if (err == 0) { + err = dsl_dataset_space_written(prev, ds, &written, + &comp, &uncomp); + dsl_dataset_rele(prev, FTAG); + if (err == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, + written); + } + } + } + + ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : + (ds->ds_phys->ds_uncompressed_bytes * 100 / + ds->ds_phys->ds_compressed_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); + if (ds->ds_phys->ds_next_snap_obj) { /* * This is a snapshot; override the dd's space used with @@ -2179,10 +2309,9 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) */ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, ds->ds_phys->ds_unique_bytes); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, - ds->ds_phys->ds_compressed_bytes == 0 ? 100 : - (ds->ds_phys->ds_uncompressed_bytes * 100 / - ds->ds_phys->ds_compressed_bytes)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); + + get_clones_stat(ds, nv); } } @@ -2226,7 +2355,7 @@ dsl_dataset_space(dsl_dataset_t *ds, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { - *refdbytesp = ds->ds_phys->ds_used_bytes; + *refdbytesp = ds->ds_phys->ds_referenced_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; @@ -2563,7 +2692,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) * Note however, if we stop before we reach the ORIGIN we get: * uN + kN + kN-1 + ... + kM - uM-1 */ - pa->used = origin_ds->ds_phys->ds_used_bytes; + pa->used = origin_ds->ds_phys->ds_referenced_bytes; pa->comp = origin_ds->ds_phys->ds_compressed_bytes; pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; for (snap = list_head(&pa->shared_snaps); snap; @@ -2597,7 +2726,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) * so we need to subtract out the clone origin's used space. 
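The refratio computation in the dsl_dataset_stats() hunk above is plain integer arithmetic on the dataset counters: 300 GiB of uncompressed referenced data stored as 120 GiB compressed gives 300 * 100 / 120 = 250, which userland renders as 2.50x, and ds_compressed_bytes == 0 yields the neutral 100 (1.00x). The same value also backs compressratio for snapshots elsewhere in that hunk.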
*/ if (pa->origin_origin) { - pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; + pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes; pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; } @@ -3113,8 +3242,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) dsl_deadlist_space(&csa->ohds->ds_deadlist, &odl_used, &odl_comp, &odl_uncomp); - dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - - (csa->ohds->ds_phys->ds_used_bytes + odl_used); + dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used - + (csa->ohds->ds_phys->ds_referenced_bytes + odl_used); dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + @@ -3143,8 +3272,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) } /* swap ds_*_bytes */ - SWITCH64(csa->ohds->ds_phys->ds_used_bytes, - csa->cds->ds_phys->ds_used_bytes); + SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes, + csa->cds->ds_phys->ds_referenced_bytes); SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, csa->cds->ds_phys->ds_compressed_bytes); SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, @@ -3273,8 +3402,9 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, * on-disk is over quota and there are no pending changes (which * may free up space for us). */ - if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { - if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) + if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) { + if (inflight > 0 || + ds->ds_phys->ds_referenced_bytes < ds->ds_quota) error = ERESTART; else error = EDQUOT; @@ -3301,7 +3431,7 @@ dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) if (psa->psa_effective_value == 0) return (0); - if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes || + if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes || psa->psa_effective_value < ds->ds_reserved) return (ENOSPC); @@ -4009,7 +4139,7 @@ dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) } /* - * Note, this fuction is used as the callback for dmu_objset_find(). We + * Note, this function is used as the callback for dmu_objset_find(). We * always return 0 so that we will continue to find and process * inconsistent datasets, even if we encounter an error trying to * process one of them. @@ -4028,3 +4158,156 @@ dsl_destroy_inconsistent(const char *dsname, void *arg) } return (0); } + +/* + * Return (in *usedp) the amount of space written in new that is not + * present in oldsnap. New may be a snapshot or the head. Old must be + * a snapshot before new, in new's filesystem (or its origin). If not then + * fail and return EINVAL. + * + * The written space is calculated by considering two components: First, we + * ignore any freed space, and calculate the written as new's used space + * minus old's used space. Next, we add in the amount of space that was freed + * between the two snapshots, thus reducing new's used space relative to old's. + * Specifically, this is the space that was born before old->ds_creation_txg, + * and freed before new (ie. on new's deadlist or a previous deadlist). 
+ * + * space freed [---------------------] + * snapshots ---O-------O--------O-------O------ + * oldsnap new + */ +int +dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + int err = 0; + uint64_t snapobj; + dsl_pool_t *dp = new->ds_dir->dd_pool; + + *usedp = 0; + *usedp += new->ds_phys->ds_referenced_bytes; + *usedp -= oldsnap->ds_phys->ds_referenced_bytes; + + *compp = 0; + *compp += new->ds_phys->ds_compressed_bytes; + *compp -= oldsnap->ds_phys->ds_compressed_bytes; + + *uncompp = 0; + *uncompp += new->ds_phys->ds_uncompressed_bytes; + *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + snapobj = new->ds_object; + while (snapobj != oldsnap->ds_object) { + dsl_dataset_t *snap; + uint64_t used, comp, uncomp; + + if (snapobj == new->ds_object) { + snap = new; + } else { + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); + if (err != 0) + break; + } + + if (snap->ds_phys->ds_prev_snap_txg == + oldsnap->ds_phys->ds_creation_txg) { + /* + * The blocks in the deadlist can not be born after + * ds_prev_snap_txg, so get the whole deadlist space, + * which is more efficient (especially for old-format + * deadlists). Unfortunately the deadlist code + * doesn't have enough information to make this + * optimization itself. + */ + dsl_deadlist_space(&snap->ds_deadlist, + &used, &comp, &uncomp); + } else { + dsl_deadlist_space_range(&snap->ds_deadlist, + 0, oldsnap->ds_phys->ds_creation_txg, + &used, &comp, &uncomp); + } + *usedp += used; + *compp += comp; + *uncompp += uncomp; + + /* + * If we get to the beginning of the chain of snapshots + * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap + * was not a snapshot of/before new. + */ + snapobj = snap->ds_phys->ds_prev_snap_obj; + if (snap != new) + dsl_dataset_rele(snap, FTAG); + if (snapobj == 0) { + err = EINVAL; + break; + } + + } + rw_exit(&dp->dp_config_rwlock); + return (err); +} + +/* + * Return (in *usedp) the amount of space that will be reclaimed if firstsnap, + * lastsnap, and all snapshots in between are deleted. + * + * blocks that would be freed [---------------------------] + * snapshots ---O-------O--------O-------O--------O + * firstsnap lastsnap + * + * This is the set of blocks that were born after the snap before firstsnap, + * (birth > firstsnap->prev_snap_txg) and died before the snap after the + * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist). + * We calculate this by iterating over the relevant deadlists (from the snap + * after lastsnap, backward to the snap after firstsnap), summing up the + * space on the deadlist that was born after the snap before firstsnap. + */ +int +dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, + dsl_dataset_t *lastsnap, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + int err = 0; + uint64_t snapobj; + dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; + + ASSERT(dsl_dataset_is_snapshot(firstsnap)); + ASSERT(dsl_dataset_is_snapshot(lastsnap)); + + /* + * Check that the snapshots are in the same dsl_dir, and firstsnap + * is before lastsnap. 
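To put numbers on the two diagrams: if oldsnap references 10 GiB, new references 12 GiB, and the deadlists walked between them hold 3 GiB that was born no later than oldsnap's creation txg, then written = 12 - 10 + 3 = 5 GiB; the naive delta of referenced bytes would miss exactly the old data that was overwritten or deleted since oldsnap. dsl_dataset_space_wouldfree() below does the mirror-image walk, summing only deadlist space born after the snapshot preceding firstsnap, because anything older survives the proposed deletions.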
+ */ + if (firstsnap->ds_dir != lastsnap->ds_dir || + firstsnap->ds_phys->ds_creation_txg > + lastsnap->ds_phys->ds_creation_txg) + return (EINVAL); + + *usedp = *compp = *uncompp = 0; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + snapobj = lastsnap->ds_phys->ds_next_snap_obj; + while (snapobj != firstsnap->ds_object) { + dsl_dataset_t *ds; + uint64_t used, comp, uncomp; + + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds); + if (err != 0) + break; + + dsl_deadlist_space_range(&ds->ds_deadlist, + firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &used, &comp, &uncomp); + *usedp += used; + *compp += comp; + *uncompp += uncomp; + + snapobj = ds->ds_phys->ds_prev_snap_obj; + ASSERT3U(snapobj, !=, 0); + dsl_dataset_rele(ds, FTAG); + } + rw_exit(&dp->dp_config_rwlock); + return (err); +} diff --git a/uts/common/fs/zfs/dsl_deadlist.c b/uts/common/fs/zfs/dsl_deadlist.c index 064f8ac..dd6db21 100644 --- a/uts/common/fs/zfs/dsl_deadlist.c +++ b/uts/common/fs/zfs/dsl_deadlist.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include <sys/dsl_dataset.h> @@ -29,6 +30,26 @@ #include <sys/zfs_context.h> #include <sys/dsl_pool.h> +/* + * Deadlist concurrency: + * + * Deadlists can only be modified from the syncing thread. + * + * Except for dsl_deadlist_insert(), it can only be modified with the + * dp_config_rwlock held with RW_WRITER. + * + * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can + * be called concurrently, from open context, with the dl_config_rwlock held + * with RW_READER. + * + * Therefore, we only need to provide locking between dsl_deadlist_insert() and + * the accessors, protecting: + * dl_phys->dl_used,comp,uncomp + * and protecting the dl_tree from being loaded. + * The locking is provided by dl_lock. Note that locking on the bpobj_t + * provides its own locking, and dl_oldfmt is immutable. + */ + static int dsl_deadlist_compare(const void *arg1, const void *arg2) { @@ -309,14 +330,14 @@ dsl_deadlist_space(dsl_deadlist_t *dl, * return space used in the range (mintxg, maxtxg]. * Includes maxtxg, does not include mintxg. * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is - * UINT64_MAX). + * larger than any bp in the deadlist (eg. UINT64_MAX)). 
*/ void dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { - dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; + dsl_deadlist_entry_t dle_tofind; avl_index_t where; if (dl->dl_oldfmt) { @@ -325,9 +346,10 @@ dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, return; } - dsl_deadlist_load_tree(dl); *usedp = *compp = *uncompp = 0; + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); dle_tofind.dle_mintxg = mintxg; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); /* @@ -336,6 +358,7 @@ dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, */ ASSERT(dle != NULL || avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); + for (; dle && dle->dle_mintxg < maxtxg; dle = AVL_NEXT(&dl->dl_tree, dle)) { uint64_t used, comp, uncomp; @@ -347,6 +370,7 @@ dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, *compp += comp; *uncompp += uncomp; } + mutex_exit(&dl->dl_lock); } static void diff --git a/uts/common/fs/zfs/dsl_deleg.c b/uts/common/fs/zfs/dsl_deleg.c index 529fb05..c13ddd4 100644 --- a/uts/common/fs/zfs/dsl_deleg.c +++ b/uts/common/fs/zfs/dsl_deleg.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -170,10 +171,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { - jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, - DMU_OT_NONE, 0, tx); - VERIFY(zap_update(mos, zapobj, - whokey, 8, 1, &jumpobj, tx) == 0); + jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS, + zapobj, whokey, tx); } while (permpair = nvlist_next_nvpair(perms, permpair)) { @@ -525,10 +524,12 @@ dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, } /* - * Check if user has requested permission. + * Check if user has requested permission. If descendent is set, must have + * descendent perms. */ int -dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) +dsl_deleg_access_impl(dsl_dataset_t *ds, boolean_t descendent, const char *perm, + cred_t *cr) { dsl_dir_t *dd; dsl_pool_t *dp; @@ -549,7 +550,7 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) SPA_VERSION_DELEGATED_PERMS) return (EPERM); - if (dsl_dataset_is_snapshot(ds)) { + if (dsl_dataset_is_snapshot(ds) || descendent) { /* * Snapshots are treated as descendents only, * local permissions do not apply. @@ -642,7 +643,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) if (error) return (error); - error = dsl_deleg_access_impl(ds, perm, cr); + error = dsl_deleg_access_impl(ds, B_FALSE, perm, cr); dsl_dataset_rele(ds, FTAG); return (error); diff --git a/uts/common/fs/zfs/dsl_pool.c b/uts/common/fs/zfs/dsl_pool.c index 700cc96..e922394 100644 --- a/uts/common/fs/zfs/dsl_pool.c +++ b/uts/common/fs/zfs/dsl_pool.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include <sys/dsl_pool.h> @@ -39,6 +40,8 @@ #include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/bptree.h> +#include <sys/zfeature.h> int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ @@ -99,20 +102,32 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) } int -dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) +dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); + + err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, + &dp->dp_meta_objset); + if (err != 0) + dsl_pool_close(dp); + else + *dpp = dp; + + return (err); +} + +int +dsl_pool_open(dsl_pool_t *dp) +{ + int err; dsl_dir_t *dd; dsl_dataset_t *ds; uint64_t obj; - rw_enter(&dp->dp_config_rwlock, RW_WRITER); - err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, - &dp->dp_meta_objset); - if (err) - goto out; + ASSERT(!dmu_objset_is_dirty_anywhere(dp->dp_meta_objset)); + rw_enter(&dp->dp_config_rwlock, RW_WRITER); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &dp->dp_root_dir_obj); @@ -128,7 +143,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; - if (spa_version(spa) >= SPA_VERSION_ORIGIN) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); if (err) goto out; @@ -145,7 +160,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) goto out; } - if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir); if (err) @@ -159,6 +174,15 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) dp->dp_meta_objset, obj)); } + if (spa_feature_is_active(dp->dp_spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj); + if (err != 0) + goto out; + } + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj); @@ -167,15 +191,10 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; - err = dsl_scan_init(dp, txg); + err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); out: rw_exit(&dp->dp_config_rwlock); - if (err) - dsl_pool_close(dp); - else - *dpp = dp; - return (err); } @@ -291,7 +310,10 @@ static int deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_deadlist_t *dl = arg; + dsl_pool_t *dp = dmu_objset_pool(dl->dl_os); + rw_enter(&dp->dp_config_rwlock, RW_READER); dsl_deadlist_insert(dl, bp, tx); + rw_exit(&dp->dp_config_rwlock); return (0); } @@ -466,7 +488,7 @@ int dsl_pool_sync_context(dsl_pool_t *dp) { return (curthread == dp->dp_tx.tx_sync_thread || - spa_get_dsl(dp->dp_spa) == NULL); + spa_is_initializing(dp->dp_spa)); } uint64_t @@ -784,11 +806,8 @@ dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT(dp->dp_tmp_userrefs_obj == 0); ASSERT(dmu_tx_is_syncing(tx)); - dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, - DMU_OT_NONE, 0, tx); - - VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, - sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0); + dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); } static int diff --git 
a/uts/common/fs/zfs/dsl_scan.c b/uts/common/fs/zfs/dsl_scan.c index 56d4108..328f4d0 100644 --- a/uts/common/fs/zfs/dsl_scan.c +++ b/uts/common/fs/zfs/dsl_scan.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/dsl_scan.h> @@ -44,6 +45,7 @@ #include <sys/ddt.h> #include <sys/sa.h> #include <sys/sa_impl.h> +#include <sys/zfeature.h> #ifdef _KERNEL #include <sys/zfs_vfsops.h> #endif @@ -382,55 +384,6 @@ dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, priority, zio_flags, arc_flags, zb)); } -static boolean_t -bookmark_is_zero(const zbookmark_t *zb) -{ - return (zb->zb_objset == 0 && zb->zb_object == 0 && - zb->zb_level == 0 && zb->zb_blkid == 0); -} - -/* dnp is the dnode for zb1->zb_object */ -static boolean_t -bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, - const zbookmark_t *zb2) -{ - uint64_t zb1nextL0, zb2thisobj; - - ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb2->zb_level == 0); - - /* - * A bookmark in the deadlist is considered to be after - * everything else. - */ - if (zb2->zb_object == DMU_DEADLIST_OBJECT) - return (B_TRUE); - - /* The objset_phys_t isn't before anything. */ - if (dnp == NULL) - return (B_FALSE); - - zb1nextL0 = (zb1->zb_blkid + 1) << - ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); - - zb2thisobj = zb2->zb_object ? zb2->zb_object : - zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); - - if (zb1->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t nextobj = zb1nextL0 * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; - return (nextobj <= zb2thisobj); - } - - if (zb1->zb_object < zb2thisobj) - return (B_TRUE); - if (zb1->zb_object > zb2thisobj) - return (B_FALSE); - if (zb2->zb_object == DMU_META_DNODE_OBJECT) - return (B_FALSE); - return (zb1nextL0 <= zb2->zb_blkid); -} - static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { @@ -462,7 +415,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb) if (scn->scn_pausing) return (B_TRUE); /* we're already pausing */ - if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark)) + if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. */ @@ -617,13 +570,13 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, /* * We never skip over user/group accounting objects (obj<0) */ - if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) && + if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && (int64_t)zb->zb_object >= 0) { /* * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ - if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) + if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* @@ -816,22 +769,6 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return; - if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) { - /* - * For non-user-accounting blocks, we need to read the - * new bp (from a deleted snapshot, found in - * check_existing_xlation). If we used the old bp, - * pointers inside this block from before we resumed - * would be untranslated. - * - * For user-accounting blocks, we need to read the old - * bp, because we will apply the entire space delta to - * it (original untranslated -> translations from - * deleted snap -> now). 
- */ - bp_toread = *bp; - } - if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx, &buf) != 0) return; @@ -1396,19 +1333,28 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) zap_cursor_fini(&zc); } -static int -dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +static boolean_t +dsl_scan_free_should_pause(dsl_scan_t *scn) { - dsl_scan_t *scn = arg; uint64_t elapsed_nanosecs; elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; - - if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && txg_sync_waiting(scn->scn_dp)) || - spa_shutting_down(scn->scn_dp->dp_spa)) - return (ERESTART); + spa_shutting_down(scn->scn_dp->dp_spa)); +} + +static int +dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg; + + if (!scn->scn_is_bptree || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { + if (dsl_scan_free_should_pause(scn)) + return (ERESTART); + } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, dmu_tx_get_txg(tx), bp, 0)); @@ -1433,6 +1379,10 @@ dsl_scan_active(dsl_scan_t *scn) if (scn->scn_phys.scn_state == DSS_SCANNING) return (B_TRUE); + if (spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + return (B_TRUE); + } if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); @@ -1479,14 +1429,40 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) * traversing it. */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + scn->scn_is_bptree = B_FALSE; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, - dsl_scan_free_cb, scn, tx); + dsl_scan_free_block_cb, scn, tx); VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + + if (err == 0 && spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + scn->scn_is_bptree = B_TRUE; + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bptree_iterate(dp->dp_meta_objset, + dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, + scn, tx); + VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + if (err != 0) + return; + + /* disable async destroy feature */ + spa_feature_decr(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx); + ASSERT(!spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])); + VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, tx)); + VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset, + dp->dp_bptree_obj, tx)); + dp->dp_bptree_obj = 0; + } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " - "free_bpobj txg %llu", + "free_bpobj/bptree txg %llu", (longlong_t)scn->scn_visited_this_txg, (longlong_t) (gethrtime() - scn->scn_sync_start_time) / MICROSEC, @@ -1601,6 +1577,8 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; + if (t & DMU_OT_NEWTYPE) + t = DMU_OT_OTHER; zfs_blkstat_t *zb = &zab->zab_type[l][t]; int equal; diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c index 17b4b12..2f7c882 100644 --- a/uts/common/fs/zfs/metaslab.c +++ b/uts/common/fs/zfs/metaslab.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
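dsl_scan_free_block_cb() in the hunk above polls the pause test only where stopping is safe: on every bpobj block, but for a bptree only on level-0 non-objset blocks, since resumption is recorded as a level-0 bookmark. The test itself, modeled standalone (parameter names shadow the zfs_txg_timeout and zfs_free_min_time_ms tunables):

#include <stdint.h>

int
free_should_pause(int64_t elapsed_ns, int txg_timeout_s,
    int free_min_time_ms, int sync_waiting, int shutting_down)
{
	return (elapsed_ns / 1000000000LL > txg_timeout_s ||
	    (elapsed_ns / 1000000LL > free_min_time_ms && sync_waiting) ||
	    shutting_down);
}

When the test fires, ERESTART propagates up through the iteration, which records the bookmark so the next txg's dsl_scan_sync() can pick up where this one stopped.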
+ * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -30,10 +31,29 @@ #include <sys/vdev_impl.h> #include <sys/zio.h> +/* + * Allow allocations to switch to gang blocks quickly. We do this to + * avoid having to load lots of space_maps in a given txg. There are, + * however, some cases where we want to avoid "fast" ganging and instead + * we want to do an exhaustive search of all metaslabs on this device. + * Currently we don't allow any gang, zil, or dump device related allocations + * to "fast" gang. + */ +#define CAN_FASTGANG(flags) \ + (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ + METASLAB_GANG_AVOID))) + uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* + * This value defines the number of allowed allocation failures per vdev. + * If a device reaches this threshold in a given txg then we consider skipping + * allocations on that device. + */ +int zfs_mg_alloc_failures; + +/* * Metaslab debugging: when set, keeps all space maps in core to verify frees. */ static int metaslab_debug = 0; @@ -671,7 +691,7 @@ static space_map_ops_t metaslab_ndf_ops = { metaslab_ndf_fragmented }; -space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; +space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; /* * ========================================================================== @@ -844,7 +864,7 @@ metaslab_prefetch(metaslab_group_t *mg) } static int -metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) +metaslab_activate(metaslab_t *msp, uint64_t activation_weight) { metaslab_group_t *mg = msp->ms_group; space_map_t *sm = &msp->ms_map; @@ -877,13 +897,6 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) mutex_exit(&mg->mg_lock); } - /* - * If we were able to load the map then make sure - * that this map is still able to satisfy our request. 
- */ - if (msp->ms_weight < size) - return (ENOSPC); - metaslab_group_sort(msp->ms_group, msp, msp->ms_weight | activation_weight); } @@ -1099,6 +1112,7 @@ void metaslab_sync_reassess(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; + int64_t failures = mg->mg_alloc_failures; /* * Re-evaluate all metaslabs which have lower offsets than the @@ -1115,6 +1129,8 @@ metaslab_sync_reassess(metaslab_group_t *mg) mutex_exit(&msp->ms_lock); } + atomic_add_64(&mg->mg_alloc_failures, -failures); + /* * Prefetch the next potential metaslabs */ @@ -1139,9 +1155,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva) } static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, - uint64_t min_distance, dva_t *dva, int d) +metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, + uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags) { + spa_t *spa = mg->mg_vd->vdev_spa; metaslab_t *msp = NULL; uint64_t offset = -1ULL; avl_tree_t *t = &mg->mg_metaslab_tree; @@ -1162,11 +1179,17 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, mutex_enter(&mg->mg_lock); for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { - if (msp->ms_weight < size) { + if (msp->ms_weight < asize) { + spa_dbgmsg(spa, "%s: failed to meet weight " + "requirement: vdev %llu, txg %llu, mg %p, " + "msp %p, psize %llu, asize %llu, " + "failures %llu, weight %llu", + spa_name(spa), mg->mg_vd->vdev_id, txg, + mg, msp, psize, asize, + mg->mg_alloc_failures, msp->ms_weight); mutex_exit(&mg->mg_lock); return (-1ULL); } - was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; if (activation_weight == METASLAB_WEIGHT_PRIMARY) break; @@ -1185,6 +1208,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, if (msp == NULL) return (-1ULL); + /* + * If we've already reached the allowable number of failed + * allocation attempts on this metaslab group then we + * consider skipping it. We skip it only if we're allowed + * to "fast" gang, the physical size is larger than + * a gang block, and we're attempting to allocate from + * the primary metaslab. + */ + if (mg->mg_alloc_failures > zfs_mg_alloc_failures && + CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE && + activation_weight == METASLAB_WEIGHT_PRIMARY) { + spa_dbgmsg(spa, "%s: skipping metaslab group: " + "vdev %llu, txg %llu, mg %p, psize %llu, " + "asize %llu, failures %llu", spa_name(spa), + mg->mg_vd->vdev_id, txg, mg, psize, asize, + mg->mg_alloc_failures); + return (-1ULL); + } + mutex_enter(&msp->ms_lock); /* @@ -1193,7 +1235,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, * another thread may have changed the weight while we * were blocked on the metaslab lock. 
*/ - if (msp->ms_weight < size || (was_active && + if (msp->ms_weight < asize || (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK) && activation_weight == METASLAB_WEIGHT_PRIMARY)) { mutex_exit(&msp->ms_lock); @@ -1208,14 +1250,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, continue; } - if (metaslab_activate(msp, activation_weight, size) != 0) { + if (metaslab_activate(msp, activation_weight) != 0) { mutex_exit(&msp->ms_lock); continue; } - if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) + if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL) break; + atomic_inc_64(&mg->mg_alloc_failures); + metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); mutex_exit(&msp->ms_lock); @@ -1224,7 +1268,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize); mutex_exit(&msp->ms_lock); @@ -1351,7 +1395,8 @@ top: asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); + offset = metaslab_group_alloc(mg, psize, asize, txg, distance, + dva, d, flags); if (offset != -1ULL) { /* * If we've just selected this metaslab group, @@ -1363,18 +1408,24 @@ top: vdev_stat_t *vs = &vd->vdev_stat; int64_t vu, cu; - /* - * Determine percent used in units of 0..1024. - * (This is just to avoid floating point.) - */ - vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); - cu = (mc->mc_alloc << 10) / (mc->mc_space + 1); + vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); + cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); /* - * Bias by at most +/- 25% of the aliquot. + * Calculate how much more or less we should + * try to allocate from this device during + * this iteration around the rotor. + * For example, if a device is 80% full + * and the pool is 20% full then we should + * reduce allocations by 60% on this device. + * + * mg_bias = (20 - 80) * 512K / 100 = -307K + * + * This reduces allocations by 307K for this + * iteration. */ mg->mg_bias = ((cu - vu) * - (int64_t)mg->mg_aliquot) / (1024 * 4); + (int64_t)mg->mg_aliquot) / 100; } if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= @@ -1488,7 +1539,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0); + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) error = ENOENT; diff --git a/uts/common/fs/zfs/sa.c b/uts/common/fs/zfs/sa.c index 4cb4546..06607d7 100644 --- a/uts/common/fs/zfs/sa.c +++ b/uts/common/fs/zfs/sa.c @@ -18,8 +18,11 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2011 iXsystems, Inc + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -426,10 +429,9 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, char attr_name[8]; if (sa->sa_layout_attr_obj == 0) { - sa->sa_layout_attr_obj = zap_create(os, - DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1, - &sa->sa_layout_attr_obj, tx) == 0); + sa->sa_layout_attr_obj = zap_create_link(os, + DMU_OT_SA_ATTR_LAYOUTS, + sa->sa_master_obj, SA_LAYOUTS, tx); } (void) snprintf(attr_name, sizeof (attr_name), @@ -605,14 +607,14 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, * and spill buffer. */ if (buftype == SA_BONUS && *index == -1 && - P2ROUNDUP(*total + hdrsize, 8) > + *total + P2ROUNDUP(hdrsize, 8) > (full_space - sizeof (blkptr_t))) { *index = i; done = B_TRUE; } next: - if (P2ROUNDUP(*total + hdrsize, 8) > full_space && + if (*total + P2ROUNDUP(hdrsize, 8) > full_space && buftype == SA_BONUS) *will_spill = B_TRUE; } @@ -1551,10 +1553,9 @@ sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) } if (sa->sa_reg_attr_obj == NULL) { - sa->sa_reg_attr_obj = zap_create(hdl->sa_os, - DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj, - SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0); + sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os, + DMU_OT_SA_ATTR_REGISTRATION, + sa->sa_master_obj, SA_REGISTRY, tx); } for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_registered) diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index b6190e4..c5f11ed 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -21,6 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ /* @@ -60,6 +62,7 @@ #include <sys/spa_boot.h> #include <sys/zfs_ioctl.h> #include <sys/dsl_scan.h> +#include <sys/zfeature.h> #ifdef _KERNEL #include <sys/bootprops.h> @@ -111,6 +114,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, }; +static dsl_syncfunc_t spa_sync_version; static dsl_syncfunc_t spa_sync_props; static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, @@ -165,15 +169,18 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { + vdev_t *rvd = spa->spa_root_vdev; + dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size; uint64_t alloc; + uint64_t space; uint64_t cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa->spa_props_lock)); - if (spa->spa_root_vdev != NULL) { + if (rvd != NULL) { alloc = metaslab_class_get_alloc(spa_normal_class(spa)); size = metaslab_class_get_space(spa_normal_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); @@ -181,6 +188,15 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); + + space = 0; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + space += tvd->vdev_max_asize - tvd->vdev_asize; + } + spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space, + src); + spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == FREAD), src); @@ -191,7 +207,7 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, - spa->spa_root_vdev->vdev_state, src); + rvd->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) @@ -201,8 +217,29 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); } + if (pool != NULL) { + dsl_dir_t *freedir = pool->dp_free_dir; + + /* + * The $FREE directory was introduced in SPA_VERSION_DEADLISTS; + * when opening pools from before this version, freedir will + * be NULL. + */ + if (freedir != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, + freedir->dd_phys->dd_used_bytes, src); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, + NULL, 0, src); + } + } + spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); + if (spa->spa_comment != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, + 0, ZPROP_SRC_LOCAL); + } + if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); @@ -335,25 +372,55 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum; + boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { - zpool_prop_t prop; - char *propname, *strval; uint64_t intval; - objset_t *os; - char *slash; + char *strval, *slash, *check, *fname; + const char *propname = nvpair_name(elem); + zpool_prop_t prop = zpool_name_to_prop(propname); - propname = nvpair_name(elem); + switch (prop) { + case ZPROP_INVAL: + if (!zpool_prop_feature(propname)) { + error = EINVAL; + break; + } - if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) - return (EINVAL); + /* + * Sanitize the input.
+ */ + if (nvpair_type(elem) != DATA_TYPE_UINT64) { + error = EINVAL; + break; + } + + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + + if (intval != 0) { + error = EINVAL; + break; + } + + fname = strchr(propname, '@') + 1; + if (zfeature_lookup_name(fname, NULL) != 0) { + error = EINVAL; + break; + } + + has_feature = B_TRUE; + break; - switch (prop) { case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && - (intval < spa_version(spa) || intval > SPA_VERSION) + (intval < spa_version(spa) || + intval > SPA_VERSION_BEFORE_FEATURES || + has_feature)) error = EINVAL; break; @@ -390,6 +457,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) error = nvpair_value_string(elem, &strval); if (!error) { + objset_t *os; uint64_t compress; if (strval == NULL || strval[0] == '\0') { @@ -462,6 +530,25 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) error = EINVAL; break; + case ZPOOL_PROP_COMMENT: + if ((error = nvpair_value_string(elem, &strval)) != 0) + break; + for (check = strval; *check != '\0'; check++) { + /* + * The kernel doesn't have an easy isprint() + * check. For this kernel check, we merely + * check ASCII apart from DEL. Fix this if + * there is an easy-to-use kernel isprint(). + */ + if (*check >= 0x7f) { + error = EINVAL; + break; + } + } + if (strlen(strval) > ZPROP_MAX_COMMENT) + error = E2BIG; + break; + case ZPOOL_PROP_DEDUPDITTO: if (spa_version(spa) < SPA_VERSION_DEDUP) error = ENOTSUP; @@ -519,33 +607,58 @@ int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; - nvpair_t *elem; + nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; - zpool_prop_t prop; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); - elem = NULL; while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { - if ((prop = zpool_name_to_prop( - nvpair_name(elem))) == ZPROP_INVAL) - return (EINVAL); + zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; + if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { + uint64_t ver; + + if (prop == ZPOOL_PROP_VERSION) { + VERIFY(nvpair_value_uint64(elem, &ver) == 0); + } else { + ASSERT(zpool_prop_feature(nvpair_name(elem))); + ver = SPA_VERSION_FEATURES; + need_sync = B_TRUE; + } + + /* Save time if the version is already set. */ + if (ver == spa_version(spa)) + continue; + + /* + * In addition to the pool directory object, we might + * create the pool properties object, the features for + * read object, the features for write object, or the + * feature descriptions object. + */ + error = dsl_sync_task_do(spa_get_dsl(spa), NULL, + spa_sync_version, spa, &ver, 6); + if (error) + return (error); + continue; + } + need_sync = B_TRUE; break; } - if (need_sync) + if (need_sync) { return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, - spa, nvp, 3)); - else - return (0); + spa, nvp, 6)); + } + + return (0); } /* @@ -563,6 +676,43 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) } /* + * Change the GUID for the pool. This is done so that we can later + * re-import a pool built from a clone of our own vdevs. We will modify + * the root vdev's guid, our own pool guid, and then mark all of our + * vdevs dirty. Note that we must make sure that all our vdevs are + * online when we do this, or else any vdevs that weren't present + * would be orphaned from our pool. We are also going to issue a + * sysevent to update any watchers.
+ */ +int +spa_change_guid(spa_t *spa) +{ + uint64_t oldguid, newguid; + uint64_t txg; + + if (!(spa_mode_global & FWRITE)) + return (EROFS); + + txg = spa_vdev_enter(spa); + + if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY) + return (spa_vdev_exit(spa, NULL, txg, ENXIO)); + + oldguid = spa_guid(spa); + newguid = spa_generate_guid(NULL); + ASSERT3U(oldguid, !=, newguid); + + spa->spa_root_vdev->vdev_guid = newguid; + spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid); + + vdev_config_dirty(spa->spa_root_vdev); + + spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); + + return (spa_vdev_exit(spa, NULL, txg, 0)); +} + +/* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) * ========================================================================== @@ -610,7 +760,7 @@ static taskq_t * spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, uint_t value) { - uint_t flags = TASKQ_PREPOPULATE; + uint_t flags = 0; boolean_t batch = B_FALSE; switch (mode) { @@ -988,8 +1138,10 @@ spa_unload(spa_t *spa) } spa->spa_spares.sav_count = 0; - for (i = 0; i < spa->spa_l2cache.sav_count; i++) + for (i = 0; i < spa->spa_l2cache.sav_count; i++) { + vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); + } if (spa->spa_l2cache.sav_vdevs) { kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); @@ -1003,6 +1155,11 @@ spa_unload(spa_t *spa) spa->spa_async_suspended = 0; + if (spa->spa_comment != NULL) { + spa_strfree(spa->spa_comment); + spa->spa_comment = NULL; + } + spa_config_exit(spa, SCL_ALL, FTAG); } @@ -1212,11 +1369,13 @@ spa_load_l2cache(spa_t *spa) vd = oldvdevs[i]; if (vd != NULL) { + ASSERT(vd->vdev_isl2cache); + if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); - (void) vdev_close(vd); - spa_l2cache_remove(vd); + vdev_clear_stats(vd); + vdev_free(vd); } } @@ -1523,7 +1682,7 @@ spa_load_verify_done(zio_t *zio) int error = zio->io_error; if (error) { - if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && + if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_add_64(&sle->sle_meta_count, 1); else @@ -1718,6 +1877,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, { nvlist_t *config = spa->spa_config; char *ereport = FM_EREPORT_ZFS_POOL; + char *comment; int error; uint64_t pool_guid; nvlist_t *nvl; @@ -1725,6 +1885,10 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) return (EINVAL); + ASSERT(spa->spa_comment == NULL); + if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) + spa->spa_comment = spa_strdup(comment); + /* * Versioning wasn't explicitly added to the label until later, so if * it's not present treat it as the initial version. 
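A note to go with the spa_change_guid() hunk above: the line vdev_guid_sum += (newguid - oldguid) is the subtle step, since the label's guid sum must stay equal to the sum of the guids in the vdev tree. The helper below is hypothetical (not part of this patch) and only illustrates the invariant being preserved, assuming the vdev_t fields already used in the hunk:

/*
 * Hypothetical helper, for illustration only: recompute the guid sum
 * the way the labels define it (a vdev's own guid plus the guid sums
 * of its children). After spa_change_guid() adjusts the root's
 * vdev_guid_sum by (newguid - oldguid), this recomputation should
 * still match the stored sum.
 */
static uint64_t
vdev_guid_sum_recompute(vdev_t *vd)
{
	uint64_t sum = vd->vdev_guid;

	for (int c = 0; c < vd->vdev_children; c++)
		sum += vdev_guid_sum_recompute(vd->vdev_child[c]);

	return (sum);
}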
@@ -1740,7 +1904,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, spa_guid_exists(pool_guid, 0)) { error = EEXIST; } else { - spa->spa_load_guid = pool_guid; + spa->spa_config_guid = pool_guid; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) { @@ -1748,6 +1912,9 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, KM_SLEEP) == 0); } + nvlist_free(spa->spa_load_info); + spa->spa_load_info = fnvlist_alloc(); + gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, pool_guid, config, state, type, mosconfig, &ereport); @@ -1780,12 +1947,14 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, { int error = 0; nvlist_t *nvroot = NULL; + nvlist_t *label; vdev_t *rvd; uberblock_t *ub = &spa->spa_uberblock; uint64_t children, config_cache_txg = spa->spa_config_txg; int orig_mode = spa->spa_mode; int parse; uint64_t obj; + boolean_t missing_feat_write = B_FALSE; /* * If this is an untrusted config, access the pool in read-only mode. @@ -1852,7 +2021,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, */ if (type != SPA_IMPORT_ASSEMBLE) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd); + error = vdev_validate(rvd, mosconfig); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) @@ -1865,19 +2034,78 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* * Find the best uberblock. */ - vdev_uberblock_load(NULL, rvd, ub); + vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. */ - if (ub->ub_txg == 0) + if (ub->ub_txg == 0) { + nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); + } /* - * If the pool is newer than the code, we can't open it. + * If the pool has an unsupported version we can't open it. */ - if (ub->ub_version > SPA_VERSION) + if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { + nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); + } + + if (ub->ub_version >= SPA_VERSION_FEATURES) { + nvlist_t *features; + + /* + * If we weren't able to find what's necessary for reading the + * MOS in the label, return failure. + */ + if (label == NULL || nvlist_lookup_nvlist(label, + ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { + nvlist_free(label); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + ENXIO)); + } + + /* + * Update our in-core representation with the definitive values + * from the label. + */ + nvlist_free(spa->spa_label_features); + VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); + } + + nvlist_free(label); + + /* + * Look through entries in the label nvlist's features_for_read. If + * there is a feature listed there which we don't understand then we + * cannot open a pool. 
+ */ + if (ub->ub_version >= SPA_VERSION_FEATURES) { + nvlist_t *unsup_feat; + + VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == + 0); + + for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, + NULL); nvp != NULL; + nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { + if (!zfeature_is_supported(nvpair_name(nvp))) { + VERIFY(nvlist_add_string(unsup_feat, + nvpair_name(nvp), "") == 0); + } + } + + if (!nvlist_empty(unsup_feat)) { + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); + nvlist_free(unsup_feat); + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + + nvlist_free(unsup_feat); + } /* * If the vdev guid sum doesn't match the uberblock, we have an @@ -1911,7 +2139,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; - error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); + error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; @@ -1919,6 +2147,84 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (spa_version(spa) >= SPA_VERSION_FEATURES) { + boolean_t missing_feat_read = B_FALSE; + nvlist_t *unsup_feat; + + if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, + &spa->spa_feat_for_read_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, + &spa->spa_feat_for_write_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, + &spa->spa_feat_desc_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == + 0); + + if (!feature_is_supported(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, + unsup_feat)) + missing_feat_read = B_TRUE; + + if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { + if (!feature_is_supported(spa->spa_meta_objset, + spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj, + unsup_feat)) + missing_feat_write = B_TRUE; + } + + if (!nvlist_empty(unsup_feat)) { + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); + } + + nvlist_free(unsup_feat); + + if (!missing_feat_read) { + fnvlist_add_boolean(spa->spa_load_info, + ZPOOL_CONFIG_CAN_RDONLY); + } + + /* + * If the state is SPA_LOAD_TRYIMPORT, our objective is + * twofold: to determine whether the pool is available for + * import in read-write mode and (if it is not) whether the + * pool is available for import in read-only mode. If the pool + * is available for import in read-write mode, it is displayed + * as available in userland; if it is not available for import + * in read-only mode, it is displayed as unavailable in + * userland. If the pool is available for import in read-only + * mode but not read-write mode, it is displayed as unavailable + * in userland with a special note that the pool is actually + * available for open in read-only mode. 
+ * + * As a result, if the state is SPA_LOAD_TRYIMPORT and we are + * missing a feature for write, we must first determine whether + * the pool can be opened read-only before returning to + * userland in order to know whether to display the + * abovementioned note. + */ + if (missing_feat_read || (missing_feat_write && + spa_writeable(spa))) { + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + } + + spa->spa_is_initializing = B_TRUE; + error = dsl_pool_open(spa->spa_dsl_pool); + spa->spa_is_initializing = B_FALSE; + if (error != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (!mosconfig) { uint64_t hostid; nvlist_t *policy = NULL, *nvconfig; @@ -1949,7 +2255,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%lx). " - "See: http://www.sun.com/msg/ZFS-8000-EY", + "See: http://illumos.org/msg/ZFS-8000-EY", spa_name(spa), hostname, (unsigned long)hostid); return (EBADF); @@ -2136,7 +2442,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, nvlist_free(nvconfig); /* - * Now that we've validate the config, check the state of the + * Now that we've validated the config, check the state of the * root vdev. If it can't be opened, it indicates one or * more toplevel vdevs are faulted. */ @@ -2149,6 +2455,17 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, } } + if (missing_feat_write) { + ASSERT(state == SPA_LOAD_TRYIMPORT); + + /* + * At this point, we know that we can open the pool in + * read-only mode but not read-write mode. We now have enough + * information and can return to userland. + */ + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); + } + /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. @@ -2258,10 +2575,18 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); } +/* + * If spa_load() fails this function will try loading prior txg's. If + * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool + * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this + * function will not rewind the pool and will return the same error as + * spa_load(). + */ static int spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, uint64_t max_request, int rewind_flags) { + nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; @@ -2290,9 +2615,18 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, return (load_error); } - /* Price of rolling back is discarding txgs, including log */ - if (state == SPA_LOAD_RECOVER) + if (state == SPA_LOAD_RECOVER) { + /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + /* + * If we aren't rolling back save the load info from our first + * import attempt so that we can restore it after attempting + * to rewind. + */ + loadinfo = spa->spa_load_info; + spa->spa_load_info = fnvlist_alloc(); + } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; @@ -2316,7 +2650,20 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); - return (state == SPA_LOAD_RECOVER ? 
rewind_error : load_error); + if (state == SPA_LOAD_RECOVER) { + ASSERT3P(loadinfo, ==, NULL); + return (rewind_error); + } else { + /* Store the rewind info as part of the initial load info */ + fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, + spa->spa_load_info); + + /* Restore the initial load info */ + fnvlist_free(spa->spa_load_info); + spa->spa_load_info = loadinfo; + + return (load_error); + } } /* @@ -2586,8 +2933,50 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) } } +static void +spa_add_feature_stats(spa_t *spa, nvlist_t *config) +{ + nvlist_t *features; + zap_cursor_t zc; + zap_attribute_t za; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + if (spa->spa_feat_for_read_obj != 0) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_feat_for_read_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + za.za_first_integer)); + } + zap_cursor_fini(&zc); + } + + if (spa->spa_feat_for_write_obj != 0) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_feat_for_write_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + za.za_first_integer)); + } + zap_cursor_fini(&zc); + } + + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, + features) == 0); + nvlist_free(features); +} + int -spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) +spa_get_stats(const char *name, nvlist_t **config, + char *altroot, size_t buflen) { int error; spa_t *spa; @@ -2622,6 +3011,7 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); + spa_add_feature_stats(spa, *config); } } @@ -2712,6 +3102,7 @@ spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { error = ENOTBLK; + vdev_free(vd); goto out; } #endif @@ -2821,10 +3212,6 @@ spa_l2cache_drop(spa_t *spa) if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); - if (vd->vdev_isl2cache) - spa_l2cache_remove(vd); - vdev_clear_stats(vd); - (void) vdev_close(vd); } } @@ -2845,6 +3232,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj; + boolean_t has_features; /* * If this pool already exists, return failure. 
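The spa_add_feature_stats() hunk above mirrors the pool's two feature ZAPs into a single name-to-refcount nvlist attached to the config under ZPOOL_CONFIG_FEATURE_STATS. As a sketch of the consumer side, using only the stock nvlist/nvpair iteration calls (the helper itself is hypothetical, not part of this patch):

/*
 * Hypothetical consumer: walk the ZPOOL_CONFIG_FEATURE_STATS nvlist
 * attached by spa_add_feature_stats(). Each pair maps a feature name
 * to its current refcount.
 */
static void
dump_feature_stats(nvlist_t *config)
{
	nvlist_t *features;
	nvpair_t *nvp = NULL;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
	    &features) != 0)
		return;

	while ((nvp = nvlist_next_nvpair(features, nvp)) != NULL) {
		uint64_t refcount;

		if (nvpair_value_uint64(nvp, &refcount) == 0)
			(void) printf("%s: %llu\n", nvpair_name(nvp),
			    (u_longlong_t)refcount);
	}
}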
@@ -2870,10 +3258,18 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, return (error); } - if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), - &version) != 0) + has_features = B_FALSE; + for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); + elem != NULL; elem = nvlist_next_nvpair(props, elem)) { + if (zpool_prop_feature(nvpair_name(elem))) + has_features = B_TRUE; + } + + if (has_features || nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; - ASSERT(version <= SPA_VERSION); + } + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; @@ -2949,8 +3345,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_l2cache.sav_sync = B_TRUE; } + spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; + spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). @@ -2974,6 +3372,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, cmn_err(CE_PANIC, "failed to add pool config"); } + if (spa_version(spa) >= SPA_VERSION_FEATURES) + spa_feature_create_zap_objects(spa, tx); + if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { @@ -3164,7 +3565,7 @@ spa_import_rootpool(char *devpath, char *devid) } #endif if (config == NULL) { - cmn_err(CE_NOTE, "Can not read the pool label from '%s'", + cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", devpath); return (EIO); } @@ -3478,6 +3879,8 @@ spa_tryimport(nvlist_t *tryconfig) state) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); /* * If the bootfs property exists on this pool then we @@ -3816,7 +4219,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, - VDEV_ALLOC_ADD)) != 0) + VDEV_ALLOC_ATTACH)) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) @@ -5195,7 +5598,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) * information. This avoids the dbuf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ - bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); + bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = kmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, @@ -5280,6 +5683,24 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } +static void +spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + uint64_t version = *(uint64_t *)arg2; + + /* + * Setting the version is special cased when first creating the pool. + */ + ASSERT(tx->tx_txg != TXG_INITIAL); + + ASSERT(version <= SPA_VERSION); + ASSERT(version >= spa_version(spa)); + + spa->spa_uberblock.ub_version = version; + vdev_config_dirty(spa->spa_root_vdev); +} + /* * Set zpool properties. 
*/ @@ -5289,32 +5710,38 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) spa_t *spa = arg1; objset_t *mos = spa->spa_meta_objset; nvlist_t *nvp = arg2; - nvpair_t *elem; - uint64_t intval; - char *strval; - zpool_prop_t prop; - const char *propname; - zprop_type_t proptype; + nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); - elem = NULL; while ((elem = nvlist_next_nvpair(nvp, elem))) { + uint64_t intval; + char *strval, *fname; + zpool_prop_t prop; + const char *propname; + zprop_type_t proptype; + zfeature_info_t *feature; + switch (prop = zpool_name_to_prop(nvpair_name(elem))) { + case ZPROP_INVAL: + /* + * We checked this earlier in spa_prop_validate(). + */ + ASSERT(zpool_prop_feature(nvpair_name(elem))); + + fname = strchr(nvpair_name(elem), '@') + 1; + VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature)); + + spa_feature_enable(spa, feature, tx); + break; + case ZPOOL_PROP_VERSION: + VERIFY(nvpair_value_uint64(elem, &intval) == 0); /* - * Only set version for non-zpool-creation cases - * (set/import). spa_create() needs special care - * for version setting. + * The version is synced separately before other + * properties and should be correct by now. */ - if (tx->tx_txg != TXG_INITIAL) { - VERIFY(nvpair_value_uint64(elem, - &intval) == 0); - ASSERT(intval <= SPA_VERSION); - ASSERT(intval >= spa_version(spa)); - spa->spa_uberblock.ub_version = intval; - vdev_config_dirty(spa->spa_root_vdev); - } + ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: @@ -5332,19 +5759,29 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) * properties. */ break; + case ZPOOL_PROP_COMMENT: + VERIFY(nvpair_value_string(elem, &strval) == 0); + if (spa->spa_comment != NULL) + spa_strfree(spa->spa_comment); + spa->spa_comment = spa_strdup(strval); + /* + * We need to dirty the configuration on all the vdevs + * so that their labels get updated. It's unnecessary + * to do this for pool creation since the vdev's + * configuration has already been dirtied. + */ + if (tx->tx_txg != TXG_INITIAL) + vdev_config_dirty(spa->spa_root_vdev); + break; default: /* * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { - VERIFY((spa->spa_pool_props_object = - zap_create(mos, DMU_OT_POOL_PROPS, - DMU_OT_NONE, 0, tx)) > 0); - - VERIFY(zap_update(mos, + spa->spa_pool_props_object = + zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, - 8, 1, &spa->spa_pool_props_object, tx) - == 0); + tx); } /* normalize the property name */ @@ -5443,6 +5880,11 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && + spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { + spa_feature_create_zap_objects(spa, tx); + } } /* diff --git a/uts/common/fs/zfs/spa_config.c b/uts/common/fs/zfs/spa_config.c index 69d57f6..3665450 100644 --- a/uts/common/fs/zfs/spa_config.c +++ b/uts/common/fs/zfs/spa_config.c @@ -21,6 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved.
*/ #include <sys/spa.h> @@ -33,6 +35,7 @@ #include <sys/utsname.h> #include <sys/systeminfo.h> #include <sys/sunddi.h> +#include <sys/zfeature.h> #ifdef _KERNEL #include <sys/kobj.h> #include <sys/zone.h> @@ -345,6 +348,10 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)) == 0); + VERIFY(spa->spa_comment == NULL || nvlist_add_string(config, + ZPOOL_CONFIG_COMMENT, spa->spa_comment) == 0); + + #ifdef _KERNEL hostid = zone_get_hostid(NULL); #else /* _KERNEL */ @@ -403,6 +410,12 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); + /* + * Store what's necessary for reading the MOS in the label. + */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, + spa->spa_label_features) == 0); + if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { ddt_histogram_t *ddh; ddt_stat_t *dds; diff --git a/uts/common/fs/zfs/spa_history.c b/uts/common/fs/zfs/spa_history.c index 212abae..6df8411 100644 --- a/uts/common/fs/zfs/spa_history.c +++ b/uts/common/fs/zfs/spa_history.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include <sys/spa.h> @@ -101,11 +102,11 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) /* * Figure out maximum size of history log. We set it at - * 1% of pool size, with a max of 32MB and min of 128KB. + * 0.1% of pool size, with a max of 1G and min of 128KB. */ shpp->sh_phys_max_off = - metaslab_class_get_dspace(spa_normal_class(spa)) / 100; - shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20); + metaslab_class_get_dspace(spa_normal_class(spa)) / 1000; + shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30); shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); dmu_buf_rele(dbp, FTAG); diff --git a/uts/common/fs/zfs/spa_misc.c b/uts/common/fs/zfs/spa_misc.c index 1b54afb..9400194 100644 --- a/uts/common/fs/zfs/spa_misc.c +++ b/uts/common/fs/zfs/spa_misc.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ #include <sys/zfs_context.h> @@ -46,6 +48,7 @@ #include <sys/arc.h> #include <sys/ddt.h> #include "zfs_prop.h" +#include "zfeature_common.h" /* * SPA locking @@ -214,7 +217,7 @@ * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. * - * spa_rename() is also implemented within this file since is requires + * spa_rename() is also implemented within this file since it requires * manipulation of the namespace. 
*/ @@ -481,8 +484,22 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (config != NULL) + if (config != NULL) { + nvlist_t *features; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, + &features) == 0) { + VERIFY(nvlist_dup(features, &spa->spa_label_features, + 0) == 0); + } + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); + } + + if (spa->spa_label_features == NULL) { + VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + } return (spa); } @@ -519,6 +536,7 @@ spa_remove(spa_t *spa) list_destroy(&spa->spa_config_list); + nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); spa_config_set(spa, NULL); @@ -1027,6 +1045,20 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) * ========================================================================== */ +void +spa_activate_mos_feature(spa_t *spa, const char *feature) +{ + (void) nvlist_add_boolean(spa->spa_label_features, feature); + vdev_config_dirty(spa->spa_root_vdev); +} + +void +spa_deactivate_mos_feature(spa_t *spa, const char *feature) +{ + (void) nvlist_remove_all(spa->spa_label_features, feature); + vdev_config_dirty(spa->spa_root_vdev); +} + /* * Rename a spa_t. */ @@ -1177,12 +1209,22 @@ spa_generate_guid(spa_t *spa) void sprintf_blkptr(char *buf, const blkptr_t *bp) { - char *type = NULL; + char type[256]; char *checksum = NULL; char *compress = NULL; if (bp != NULL) { - type = dmu_ot[BP_GET_TYPE(bp)].ot_name; + if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); + (void) snprintf(type, sizeof (type), "bswap %s %s", + DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? + "metadata" : "data", + dmu_ot_byteswap[bswap].ob_name); + } else { + (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, + sizeof (type)); + } checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } @@ -1264,6 +1306,12 @@ spa_get_dsl(spa_t *spa) return (spa->spa_dsl_pool); } +boolean_t +spa_is_initializing(spa_t *spa) +{ + return (spa->spa_is_initializing); +} + blkptr_t * spa_get_rootblkptr(spa_t *spa) { @@ -1303,13 +1351,24 @@ spa_guid(spa_t *spa) /* * If we fail to parse the config during spa_load(), we can go through * the error path (which posts an ereport) and end up here with no root - * vdev. We stash the original pool guid in 'spa_load_guid' to handle + * vdev. We stash the original pool guid in 'spa_config_guid' to handle * this case. */ if (spa->spa_root_vdev != NULL) return (spa->spa_root_vdev->vdev_guid); else - return (spa->spa_load_guid); + return (spa->spa_config_guid); +} + +uint64_t +spa_load_guid(spa_t *spa) +{ + /* + * This is a GUID that exists solely as a reference for the + * purposes of the arc. It is generated at load time, and + * is never written to persistent storage. 
+ */ + return (spa->spa_load_guid); } uint64_t @@ -1536,6 +1595,7 @@ spa_init(int mode) vdev_cache_stat_init(); zfs_prop_init(); zpool_prop_init(); + zpool_feature_init(); spa_config_load(); l2arc_start(); } @@ -1670,3 +1730,9 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) return (0); } + +boolean_t +spa_debug_enabled(spa_t *spa) +{ + return (spa->spa_debug); +} diff --git a/uts/common/fs/zfs/sys/bptree.h b/uts/common/fs/zfs/sys/bptree.h new file mode 100644 index 0000000..9715072 --- /dev/null +++ b/uts/common/fs/zfs/sys/bptree.h @@ -0,0 +1,64 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _SYS_BPTREE_H +#define _SYS_BPTREE_H + +#include <sys/spa.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct bptree_phys { + uint64_t bt_begin; + uint64_t bt_end; + uint64_t bt_bytes; + uint64_t bt_comp; + uint64_t bt_uncomp; +} bptree_phys_t; + +typedef struct bptree_entry_phys { + blkptr_t be_bp; + uint64_t be_birth_txg; /* only delete blocks born after this txg */ + zbookmark_t be_zb; /* holds traversal resume point if needed */ +} bptree_entry_phys_t; + +typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx); +int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); + +void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, + uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx); + +int bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, + bptree_itor_t func, void *arg, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPTREE_H */ diff --git a/uts/common/fs/zfs/sys/dmu.h b/uts/common/fs/zfs/sys/dmu.h index 07f5949..2dac732 100644 --- a/uts/common/fs/zfs/sys/dmu.h +++ b/uts/common/fs/zfs/sys/dmu.h @@ -18,8 +18,12 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -70,6 +74,53 @@ typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; +typedef enum dmu_object_byteswap { + DMU_BSWAP_UINT8, + DMU_BSWAP_UINT16, + DMU_BSWAP_UINT32, + DMU_BSWAP_UINT64, + DMU_BSWAP_ZAP, + DMU_BSWAP_DNODE, + DMU_BSWAP_OBJSET, + DMU_BSWAP_ZNODE, + DMU_BSWAP_OLDACL, + DMU_BSWAP_ACL, + /* + * Allocating a new byteswap type number makes the on-disk format + * incompatible with any other format that uses the same number. 
+ * + * Data can usually be structured to work with one of the + * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. + */ + DMU_BSWAP_NUMFUNCS +} dmu_object_byteswap_t; + +#define DMU_OT_NEWTYPE 0x80 +#define DMU_OT_METADATA 0x40 +#define DMU_OT_BYTESWAP_MASK 0x3f + +/* + * Defines a uint8_t object type. Object types specify if the data + * in the object is metadata (boolean) and how to byteswap the data + * (dmu_object_byteswap_t). + */ +#define DMU_OT(byteswap, metadata) \ + (DMU_OT_NEWTYPE | \ + ((metadata) ? DMU_OT_METADATA : 0) | \ + ((byteswap) & DMU_OT_BYTESWAP_MASK)) + +#define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ + (ot) < DMU_OT_NUMTYPES) + +#define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_METADATA) : \ + dmu_ot[(ot)].ot_metadata) + +#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_BYTESWAP_MASK) : \ + dmu_ot[(ot)].ot_byteswap) + typedef enum dmu_object_type { DMU_OT_NONE, /* general: */ @@ -134,7 +185,35 @@ typedef enum dmu_object_type { DMU_OT_DEADLIST_HDR, /* UINT64 */ DMU_OT_DSL_CLONES, /* ZAP */ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ - DMU_OT_NUMTYPES + /* + * Do not allocate new object types here. Doing so makes the on-disk + * format incompatible with any other format that uses the same object + * type number. + * + * When creating an object which does not have one of the above types, + * use the DMU_OTN_* type with the correct byteswap and metadata + * values. + * + * The DMU_OTN_* types do not have entries in the dmu_ot table; + * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead + * of indexing into dmu_ot directly (this works for both DMU_OT_* types + * and DMU_OTN_* types). + */ + DMU_OT_NUMTYPES, + + /* + * Names for valid types declared with DMU_OT().
+ */ + DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE), + DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE), + DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE), + DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE), + DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE), + DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE), + DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE), + DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE), + DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE), + DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE), } dmu_object_type_t; typedef enum dmu_objset_type { @@ -191,7 +270,7 @@ int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, uint64_t flags); int dmu_objset_destroy(const char *name, boolean_t defer); -int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); +int dmu_snapshots_destroy_nvl(struct nvlist *snaps, boolean_t defer, char *); int dmu_objset_snapshot(char *fsname, char *snapname, char *tag, struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd); int dmu_objset_rename(const char *name, const char *newname, @@ -214,6 +293,9 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); */ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" +#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" +#define DMU_POOL_FEATURES_FOR_READ "features_for_read" +#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" @@ -229,6 +311,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_FREE_BPOBJ "free_bpobj" +#define DMU_POOL_BPTREE_OBJ "bptree_obj" /* * Allocate an object from this objset. The range of object numbers @@ -489,7 +572,7 @@ void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, /* * Free up the data blocks for a defined range of a file. If size is - * zero, the range from offset to end-of-file is freed. + * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); @@ -559,12 +642,18 @@ typedef struct dmu_object_info { typedef void arc_byteswap_func_t(void *buf, size_t size); typedef struct dmu_object_type_info { - arc_byteswap_func_t *ot_byteswap; + dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; char *ot_name; } dmu_object_type_info_t; +typedef struct dmu_object_byteswap_info { + arc_byteswap_func_t *ob_func; + char *ob_name; +} dmu_object_byteswap_info_t; + extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; +extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. 
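To make the DMU_OT() encoding introduced above concrete, here is an illustrative fragment (assuming only the macros and enums defined in this header) contrasting a legacy table-driven type with a flag-driven DMU_OTN_* type:

/*
 * Illustration only: a legacy type answers DMU_OT_IS_METADATA() and
 * DMU_OT_BYTESWAP() through the dmu_ot[] table, while a DMU_OTN_*
 * type carries both answers in its own bits and needs no table entry.
 */
static void
dmu_ot_example(void)
{
	dmu_object_type_t legacy = DMU_OT_PLAIN_FILE_CONTENTS;
	dmu_object_type_t flagged = DMU_OTN_ZAP_METADATA;

	ASSERT(DMU_OT_IS_VALID(legacy));
	ASSERT(DMU_OT_IS_VALID(flagged));

	/* flag-driven: DMU_OT_METADATA bit, low six bits for byteswap */
	ASSERT(DMU_OT_IS_METADATA(flagged));
	ASSERT3U(DMU_OT_BYTESWAP(flagged), ==, DMU_BSWAP_ZAP);

	/* table-driven: the same queries index dmu_ot[legacy] */
	ASSERT(!DMU_OT_IS_METADATA(legacy));
	ASSERT3U(DMU_OT_BYTESWAP(legacy), ==, DMU_BSWAP_UINT8);
}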
@@ -700,8 +789,10 @@ typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); -int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - struct vnode *vp, offset_t *off); +int dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, + int outfd, struct vnode *vp, offset_t *off); +int dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, + uint64_t *sizep); typedef struct dmu_recv_cookie { /* @@ -718,6 +809,7 @@ typedef struct dmu_recv_cookie { char *drc_top_ds; boolean_t drc_newfs; boolean_t drc_force; + struct avl_tree *drc_guid_to_ds_map; } dmu_recv_cookie_t; int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *, diff --git a/uts/common/fs/zfs/sys/dmu_impl.h b/uts/common/fs/zfs/sys/dmu_impl.h index 22f9f5f..defcdb2 100644 --- a/uts/common/fs/zfs/sys/dmu_impl.h +++ b/uts/common/fs/zfs/sys/dmu_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_DMU_IMPL_H @@ -30,6 +31,7 @@ #include <sys/zio.h> #include <sys/dnode.h> #include <sys/zfs_context.h> +#include <sys/zfs_ioctl.h> #ifdef __cplusplus extern "C" { @@ -264,6 +266,32 @@ static xuio_stats_t xuio_stats = { atomic_add_64(&xuio_stats.stat.value.ui64, (val)) #define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) +/* + * The list of data whose inclusion in a send stream can be pending from + * one call to backup_cb to another. Multiple calls to dump_free() and + * dump_freeobjects() can be aggregated into a single DRR_FREE or + * DRR_FREEOBJECTS replay record. + */ +typedef enum { + PENDING_NONE, + PENDING_FREE, + PENDING_FREEOBJECTS +} dmu_pendop_t; + +typedef struct dmu_sendarg { + list_node_t dsa_link; + dmu_replay_record_t *dsa_drr; + vnode_t *dsa_vp; + int dsa_outfd; + struct proc *dsa_proc; + offset_t *dsa_off; + objset_t *dsa_os; + zio_cksum_t dsa_zc; + uint64_t dsa_toguid; + int dsa_err; + dmu_pendop_t dsa_pending_op; +} dmu_sendarg_t; + #ifdef __cplusplus } diff --git a/uts/common/fs/zfs/sys/dmu_traverse.h b/uts/common/fs/zfs/sys/dmu_traverse.h index 5b326cd..3cbf42f 100644 --- a/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/uts/common/fs/zfs/sys/dmu_traverse.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_TRAVERSE_H @@ -54,6 +55,9 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, + uint64_t txg_start, zbookmark_t *resume, int flags, + blkptr_cb_t func, void *arg); int traverse_pool(spa_t *spa, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); diff --git a/uts/common/fs/zfs/sys/dsl_dataset.h b/uts/common/fs/zfs/sys/dsl_dataset.h index 22733d0..77b7d70 100644 --- a/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/uts/common/fs/zfs/sys/dsl_dataset.h @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/ #ifndef _SYS_DSL_DATASET_H @@ -84,7 +86,12 @@ typedef struct dsl_dataset_phys { uint64_t ds_creation_time; /* seconds since 1970 */ uint64_t ds_creation_txg; uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */ - uint64_t ds_used_bytes; + /* + * ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes + * include all blocks referenced by this dataset, including those + * shared with any other datasets. + */ + uint64_t ds_referenced_bytes; uint64_t ds_compressed_bytes; uint64_t ds_uncompressed_bytes; uint64_t ds_unique_bytes; /* only relevant to snapshots */ @@ -149,6 +156,9 @@ typedef struct dsl_dataset { uint64_t ds_reserved; /* cached refreservation */ uint64_t ds_quota; /* cached refquota */ + kmutex_t ds_sendstream_lock; + list_t ds_sendstreams; + /* Protected by ds_lock; keep at end of struct for better locality */ char ds_snapname[MAXNAMELEN]; } dsl_dataset_t; @@ -249,6 +259,10 @@ void dsl_dataset_space(dsl_dataset_t *ds, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); +int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); diff --git a/uts/common/fs/zfs/sys/dsl_deleg.h b/uts/common/fs/zfs/sys/dsl_deleg.h index 73c43bd..9db6d07 100644 --- a/uts/common/fs/zfs/sys/dsl_deleg.h +++ b/uts/common/fs/zfs/sys/dsl_deleg.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #ifndef _SYS_DSL_DELEG_H @@ -64,7 +65,8 @@ extern "C" { int dsl_deleg_get(const char *ddname, nvlist_t **nvp); int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset); int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr); -int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr); +int dsl_deleg_access_impl(struct dsl_dataset *ds, boolean_t descendent, + const char *perm, cred_t *cr); void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr); int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr); int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr); diff --git a/uts/common/fs/zfs/sys/dsl_pool.h b/uts/common/fs/zfs/sys/dsl_pool.h index 7d25bd7..9ff4148 100644 --- a/uts/common/fs/zfs/sys/dsl_pool.h +++ b/uts/common/fs/zfs/sys/dsl_pool.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_DSL_POOL_H @@ -34,6 +35,7 @@ #include <sys/ddt.h> #include <sys/arc.h> #include <sys/bpobj.h> +#include <sys/bptree.h> #ifdef __cplusplus extern "C" { @@ -48,7 +50,8 @@ struct dsl_scan; /* These macros are for indexing into the zfs_all_blkstats_t. 
*/ #define DMU_OT_DEFERRED DMU_OT_NONE -#define DMU_OT_TOTAL DMU_OT_NUMTYPES +#define DMU_OT_OTHER DMU_OT_NUMTYPES /* place holder for DMU_OT() types */ +#define DMU_OT_TOTAL (DMU_OT_NUMTYPES + 1) typedef struct zfs_blkstat { uint64_t zb_count; @@ -85,6 +88,7 @@ typedef struct dsl_pool { uint64_t dp_write_limit; uint64_t dp_tmp_userrefs_obj; bpobj_t dp_free_bpobj; + uint64_t dp_bptree_obj; struct dsl_scan *dp_scan; @@ -110,7 +114,8 @@ typedef struct dsl_pool { zfs_all_blkstats_t *dp_blkstats; } dsl_pool_t; -int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); +int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); +int dsl_pool_open(dsl_pool_t *dp); void dsl_pool_close(dsl_pool_t *dp); dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); diff --git a/uts/common/fs/zfs/sys/dsl_scan.h b/uts/common/fs/zfs/sys/dsl_scan.h index c79666e..5691f4d 100644 --- a/uts/common/fs/zfs/sys/dsl_scan.h +++ b/uts/common/fs/zfs/sys/dsl_scan.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_DSL_SCAN_H @@ -79,6 +80,9 @@ typedef struct dsl_scan { uint64_t scn_sync_start_time; zio_t *scn_zio_root; + /* for freeing blocks */ + boolean_t scn_is_bptree; + /* for debugging / information */ uint64_t scn_visited_this_txg; diff --git a/uts/common/fs/zfs/sys/metaslab.h b/uts/common/fs/zfs/sys/metaslab.h index 583d630..2cf4d2b 100644 --- a/uts/common/fs/zfs/sys/metaslab.h +++ b/uts/common/fs/zfs/sys/metaslab.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_H @@ -47,6 +48,8 @@ extern void metaslab_sync_reassess(metaslab_group_t *mg); #define METASLAB_HINTBP_FAVOR 0x0 #define METASLAB_HINTBP_AVOID 0x1 #define METASLAB_GANG_HEADER 0x2 +#define METASLAB_GANG_CHILD 0x4 +#define METASLAB_GANG_AVOID 0x8 extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags); diff --git a/uts/common/fs/zfs/sys/metaslab_impl.h b/uts/common/fs/zfs/sys/metaslab_impl.h index 07988dd..6c670a1 100644 --- a/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/uts/common/fs/zfs/sys/metaslab_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H @@ -52,6 +53,7 @@ struct metaslab_group { avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; uint64_t mg_bonus_area; + uint64_t mg_alloc_failures; int64_t mg_bias; int64_t mg_activation_count; metaslab_class_t *mg_class; diff --git a/uts/common/fs/zfs/sys/spa.h b/uts/common/fs/zfs/sys/spa.h index 456ec06..c790f37 100644 --- a/uts/common/fs/zfs/sys/spa.h +++ b/uts/common/fs/zfs/sys/spa.h @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ #ifndef _SYS_SPA_H @@ -92,7 +94,7 @@ struct dsl_pool; /* * Size of block to hold the configuration data (a packed nvlist) */ -#define SPA_CONFIG_BLOCKSIZE (1 << 14) +#define SPA_CONFIG_BLOCKSIZE (1ULL << 14) /* * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. 
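/*
 * Illustrative aside, not from this change: the SPA_CONFIG_BLOCKSIZE
 * edit above (1 << 14 becoming 1ULL << 14) pins the constant to 64-bit
 * arithmetic. The value itself fits in an int; the habit matters once
 * the shift count grows:
 */
uint64_t ok = 1ULL << 33;	/* 8589934592, as intended */
uint64_t bad = 1 << 33;		/* undefined: shifts a 32-bit int too far */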
@@ -260,7 +262,7 @@ typedef struct blkptr { DVA_GET_ASIZE(&(bp)->blk_dva[2])) #define BP_GET_UCSIZE(bp) \ - ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \ + ((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ @@ -401,8 +403,8 @@ typedef struct blkptr { #include <sys/dmu.h> #define BP_GET_BUFC_TYPE(bp) \ - (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \ - ARC_BUFC_METADATA : ARC_BUFC_DATA); + (((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \ + ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, @@ -413,8 +415,8 @@ typedef enum spa_import_type { extern int spa_open(const char *pool, spa_t **, void *tag); extern int spa_open_rewind(const char *pool, spa_t **, void *tag, nvlist_t *policy, nvlist_t **config); -extern int spa_get_stats(const char *pool, nvlist_t **config, - char *altroot, size_t buflen); +extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, + size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, const char *history_str, nvlist_t *zplprops); extern int spa_import_rootpool(char *devpath, char *devid); @@ -571,12 +573,14 @@ extern void spa_claim_notify(zio_t *zio); /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); +extern boolean_t spa_is_initializing(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); extern void spa_altroot(spa_t *, char *, size_t); extern int spa_sync_pass(spa_t *spa); extern char *spa_name(spa_t *spa); extern uint64_t spa_guid(spa_t *spa); +extern uint64_t spa_load_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_syncing_txg(spa_t *spa); @@ -601,6 +605,8 @@ extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); /* Miscellaneous support routines */ +extern void spa_activate_mos_feature(spa_t *spa, const char *feature); +extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); extern int spa_rename(const char *oldname, const char *newname); extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); @@ -610,6 +616,7 @@ extern uint64_t spa_get_random(uint64_t range); extern uint64_t spa_generate_guid(spa_t *spa); extern void sprintf_blkptr(char *buf, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); +extern int spa_change_guid(spa_t *spa); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, @@ -697,6 +704,13 @@ _NOTE(CONSTCOND) } while (0) #define dprintf_bp(bp, fmt, ...) #endif +extern boolean_t spa_debug_enabled(spa_t *spa); +#define spa_dbgmsg(spa, ...) \ +{ \ + if (spa_debug_enabled(spa)) \ + zfs_dbgmsg(__VA_ARGS__); \ +} + extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ #ifdef __cplusplus diff --git a/uts/common/fs/zfs/sys/spa_impl.h b/uts/common/fs/zfs/sys/spa_impl.h index c965ffb..5118954 100644 --- a/uts/common/fs/zfs/sys/spa_impl.h +++ b/uts/common/fs/zfs/sys/spa_impl.h @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ #ifndef _SYS_SPA_IMPL_H @@ -110,6 +112,7 @@ struct spa { * Fields protected by spa_namespace_lock. */ char spa_name[MAXNAMELEN]; /* pool name */ + char *spa_comment; /* comment */ avl_node_t spa_avl; /* node in spa_namespace_avl */ nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ @@ -124,6 +127,7 @@ struct spa { uint64_t spa_import_flags; /* import specific flags */ taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; + boolean_t spa_is_initializing; /* true while opening pool */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ uint64_t spa_first_txg; /* first txg after spa_open() */ @@ -135,11 +139,13 @@ struct spa { objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ - uint64_t spa_load_guid; /* initial guid for spa_load */ + uint64_t spa_config_guid; /* config pool guid */ + uint64_t spa_load_guid; /* spa_load initialized guid */ list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ + nvlist_t *spa_label_features; /* Features for reading MOS */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_config_generation; /* config generation number */ uint64_t spa_syncing_txg; /* txg currently syncing */ @@ -196,6 +202,7 @@ struct spa { kcondvar_t spa_suspend_cv; /* notification of resume */ uint8_t spa_suspended; /* pool is suspended */ uint8_t spa_claiming; /* pool is doing zil_claim() */ + boolean_t spa_debug; /* debug enabled? */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ int spa_mode; /* FREAD | FWRITE */ @@ -215,7 +222,10 @@ struct spa { boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ uint64_t spa_creation_version; /* version at pool creation */ - uint64_t spa_prev_software_version; + uint64_t spa_prev_software_version; /* See ub_software_version */ + uint64_t spa_feat_for_write_obj; /* required to write to pool */ + uint64_t spa_feat_for_read_obj; /* required to read from pool */ + uint64_t spa_feat_desc_obj; /* Feature descriptions */ /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/uts/common/fs/zfs/sys/vdev.h b/uts/common/fs/zfs/sys/vdev.h index 941f234..2329d5b 100644 --- a/uts/common/fs/zfs/sys/vdev.h +++ b/uts/common/fs/zfs/sys/vdev.h @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #ifndef _SYS_VDEV_H @@ -48,7 +50,7 @@ extern boolean_t zfs_nocacheflush; extern int vdev_open(vdev_t *); extern void vdev_open_children(vdev_t *); extern boolean_t vdev_uses_zvols(vdev_t *); -extern int vdev_validate(vdev_t *); +extern int vdev_validate(vdev_t *, boolean_t); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); extern void vdev_reopen(vdev_t *); @@ -140,8 +142,8 @@ extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, struct uberblock; extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); extern int vdev_label_number(uint64_t psise, uint64_t offset); -extern nvlist_t *vdev_label_read_config(vdev_t *vd); -extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub); +extern nvlist_t *vdev_label_read_config(vdev_t *vd, int label); +extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ diff --git a/uts/common/fs/zfs/sys/vdev_impl.h b/uts/common/fs/zfs/sys/vdev_impl.h index 161bd21..6d2e962 100644 --- a/uts/common/fs/zfs/sys/vdev_impl.h +++ b/uts/common/fs/zfs/sys/vdev_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_VDEV_IMPL_H @@ -55,7 +56,8 @@ typedef struct vdev_cache_entry vdev_cache_entry_t; /* * Virtual device operations */ -typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift); +typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, + uint64_t *ashift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); typedef int vdev_io_start_func_t(zio_t *zio); @@ -118,6 +120,7 @@ struct vdev { uint64_t vdev_orig_guid; /* orig. guid prior to remove */ uint64_t vdev_asize; /* allocatable device capacity */ uint64_t vdev_min_asize; /* min acceptable asize */ + uint64_t vdev_max_asize; /* max acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_prevstate; /* used when reopening a vdev */ @@ -199,7 +202,7 @@ struct vdev { * For DTrace to work in userland (libzpool) context, these fields must * remain at the end of the structure. DTrace will use the kernel's * CTF definition for 'struct vdev', and since the size of a kmutex_t is - * larger in userland, the offsets for the rest fields would be + * larger in userland, the offsets for the rest of the fields would be * incorrect. */ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ @@ -254,6 +257,7 @@ typedef struct vdev_label { #define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) #define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) #define VDEV_LABELS 4 +#define VDEV_BEST_LABEL VDEV_LABELS #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 @@ -261,6 +265,7 @@ typedef struct vdev_label { #define VDEV_ALLOC_L2CACHE 3 #define VDEV_ALLOC_ROOTPOOL 4 #define VDEV_ALLOC_SPLIT 5 +#define VDEV_ALLOC_ATTACH 6 /* * Allocate or free a vdev diff --git a/uts/common/fs/zfs/sys/zap.h b/uts/common/fs/zfs/sys/zap.h index a1130bb..4d7b315 100644 --- a/uts/common/fs/zfs/sys/zap.h +++ b/uts/common/fs/zfs/sys/zap.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #ifndef _SYS_ZAP_H @@ -132,6 +133,8 @@ uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, + uint64_t parent_obj, const char *name, dmu_tx_t *tx); /* * Create a new zapobj with no attributes from the given (unallocated) @@ -300,12 +303,6 @@ int zap_add_int_key(objset_t *os, uint64_t obj, int zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep); -/* - * They name is a stringified version of key; increment its value by - * delta. Zero values will be zap_remove()-ed. - */ -int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, - dmu_tx_t *tx); int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, dmu_tx_t *tx); diff --git a/uts/common/fs/zfs/sys/zfeature.h b/uts/common/fs/zfs/sys/zfeature.h new file mode 100644 index 0000000..9ff1c93 --- /dev/null +++ b/uts/common/fs/zfs/sys/zfeature.h @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_ZFEATURE_H +#define _SYS_ZFEATURE_H + +#include <sys/dmu.h> +#include <sys/nvpair.h> +#include "zfeature_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern boolean_t feature_is_supported(objset_t *os, uint64_t obj, + uint64_t desc_obj, nvlist_t *unsup_feat); + +struct spa; +extern void spa_feature_create_zap_objects(struct spa *, dmu_tx_t *); +extern void spa_feature_enable(struct spa *, zfeature_info_t *, dmu_tx_t *); +extern void spa_feature_incr(struct spa *, zfeature_info_t *, dmu_tx_t *); +extern void spa_feature_decr(struct spa *, zfeature_info_t *, dmu_tx_t *); +extern boolean_t spa_feature_is_enabled(struct spa *, zfeature_info_t *); +extern boolean_t spa_feature_is_active(struct spa *, zfeature_info_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFEATURE_H */ diff --git a/uts/common/fs/zfs/sys/zfs_acl.h b/uts/common/fs/zfs/sys/zfs_acl.h index c1a0aee..d1a6418 100644 --- a/uts/common/fs/zfs/sys/zfs_acl.h +++ b/uts/common/fs/zfs/sys/zfs_acl.h @@ -218,7 +218,7 @@ int zfs_fastaccesschk_execute(struct znode *, cred_t *); extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); extern int zfs_acl_access(struct znode *, int, cred_t *); -void zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); +int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); int zfs_zaccess_rename(struct znode *, struct znode *, struct znode *, struct znode *, cred_t *cr); diff --git a/uts/common/fs/zfs/sys/zfs_context.h b/uts/common/fs/zfs/sys/zfs_context.h index 558e9e1..fdd0412 100644 --- a/uts/common/fs/zfs/sys/zfs_context.h +++ b/uts/common/fs/zfs/sys/zfs_context.h @@ -22,6 +22,9 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ #ifndef _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H @@ -39,6 +42,7 @@ extern "C" { #include <sys/cmn_err.h> #include <sys/kmem.h> #include <sys/taskq.h> +#include <sys/taskq_impl.h> #include <sys/buf.h> #include <sys/param.h> #include <sys/systm.h> diff --git a/uts/common/fs/zfs/sys/zfs_vfsops.h b/uts/common/fs/zfs/sys/zfs_vfsops.h index 38c87df..9af5cef 100644 --- a/uts/common/fs/zfs/sys/zfs_vfsops.h +++ b/uts/common/fs/zfs/sys/zfs_vfsops.h @@ -57,6 +57,7 @@ struct zfsvfs { boolean_t z_fuid_dirty; /* need to sync fuid table ? */ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ + uint_t z_acl_mode; /* acl chmod/mode behavior */ uint_t z_acl_inherit; /* acl inheritance behavior */ zfs_case_t z_case; /* case-sense */ boolean_t z_utf8; /* utf8-only */ diff --git a/uts/common/fs/zfs/sys/zio.h b/uts/common/fs/zfs/sys/zio.h index 97d8ec7..9e475b4 100644 --- a/uts/common/fs/zfs/sys/zio.h +++ b/uts/common/fs/zfs/sys/zio.h @@ -22,6 +22,10 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
+ */ #ifndef _ZIO_H #define _ZIO_H @@ -269,6 +273,14 @@ typedef struct zbookmark { #define ZB_ZIL_OBJECT (0ULL) #define ZB_ZIL_LEVEL (-2LL) +#define ZB_IS_ZERO(zb) \ + ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ + (zb)->zb_level == 0 && (zb)->zb_blkid == 0) +#define ZB_IS_ROOT(zb) \ + ((zb)->zb_object == ZB_ROOT_OBJECT && \ + (zb)->zb_level == ZB_ROOT_LEVEL && \ + (zb)->zb_blkid == ZB_ROOT_BLKID) + typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; @@ -286,6 +298,7 @@ typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, typedef void zio_cksum_free_f(void *cbdata, size_t size); struct zio_bad_cksum; /* defined in zio_checksum.h */ +struct dnode_phys; struct zio_cksum_report { struct zio_cksum_report *zcr_next; @@ -417,6 +430,9 @@ struct zio { /* FMA state */ zio_cksum_report_t *io_cksum_report; uint64_t io_ena; + + /* Taskq dispatching state */ + taskq_ent_t io_tqent; }; extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, @@ -552,6 +568,10 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, /* Called from spa_sync(), but primarily an injection handler */ extern void spa_handle_ignored_writes(spa_t *spa); +/* zbookmark functions */ +boolean_t zbookmark_is_before(const struct dnode_phys *dnp, + const zbookmark_t *zb1, const zbookmark_t *zb2); + #ifdef __cplusplus } #endif diff --git a/uts/common/fs/zfs/txg.c b/uts/common/fs/zfs/txg.c index 9b308ca..55b1f38 100644 --- a/uts/common/fs/zfs/txg.c +++ b/uts/common/fs/zfs/txg.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2011 Martin Matuska */ #include <sys/zfs_context.h> @@ -479,7 +480,7 @@ void txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) { tx_state_t *tx = &dp->dp_tx; - int timeout = ddi_get_lbolt() + ticks; + clock_t timeout = ddi_get_lbolt() + ticks; /* don't delay if this txg could transition to quiescing immediately */ if (tx->tx_open_txg > txg || diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c index bac3e86..6fbaf7b 100644 --- a/uts/common/fs/zfs/vdev.c +++ b/uts/common/fs/zfs/vdev.c @@ -21,6 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -106,7 +108,7 @@ vdev_get_min_asize(vdev_t *vd) vdev_t *pvd = vd->vdev_parent; /* - * The our parent is NULL (inactive spare or cache) or is the root, + * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize.
*/ if (pvd == NULL) @@ -286,6 +288,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; + spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { @@ -485,7 +488,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_removing); } - if (parent && !parent->vdev_parent) { + if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || @@ -661,6 +664,8 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; + if (tvd->vdev_mg) + ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_ms = svd->vdev_ms; @@ -732,6 +737,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; + mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; @@ -1103,7 +1109,8 @@ vdev_open(vdev_t *vd) spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; - uint64_t asize, psize; + uint64_t max_osize = 0; + uint64_t asize, max_asize, psize; uint64_t ashift = 0; ASSERT(vd->vdev_open_thread == curthread || @@ -1134,7 +1141,7 @@ vdev_open(vdev_t *vd) return (ENXIO); } - error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); + error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift); /* * Reset the vdev_reopening flag so that we actually close @@ -1192,6 +1199,7 @@ vdev_open(vdev_t *vd) } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); + max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { @@ -1201,6 +1209,8 @@ vdev_open(vdev_t *vd) } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); + max_asize = max_osize - (VDEV_LABEL_START_SIZE + + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { @@ -1210,6 +1220,7 @@ vdev_open(vdev_t *vd) } psize = 0; asize = osize; + max_asize = max_osize; } vd->vdev_psize = psize; @@ -1229,16 +1240,22 @@ vdev_open(vdev_t *vd) * For testing purposes, a higher ashift can be requested. */ vd->vdev_asize = asize; + vd->vdev_max_asize = max_asize; vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); } else { /* - * Make sure the alignment requirement hasn't increased. + * Detect if the alignment requirement has increased. + * We don't want to make the pool unavailable, just + * issue a warning instead. */ - if (ashift > vd->vdev_top->vdev_ashift) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (EINVAL); + if (ashift > vd->vdev_top->vdev_ashift && + vd->vdev_ops->vdev_op_leaf) { + cmn_err(CE_WARN, + "Disk, '%s', has a block alignment that is " + "larger than the pool's alignment\n", + vd->vdev_path); } + vd->vdev_max_asize = max_asize; } /* @@ -1280,13 +1297,18 @@ vdev_open(vdev_t *vd) * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * + * If 'strict' is false ignore the spa guid check. This is necessary because + * if the machine crashed during a re-guid the new guid might have been written + * to all of the vdev labels, but not the cached config. 
The strict check + * will be performed when the pool is opened again using the mos config. + * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int -vdev_validate(vdev_t *vd) +vdev_validate(vdev_t *vd, boolean_t strict) { spa_t *spa = vd->vdev_spa; nvlist_t *label; @@ -1294,7 +1316,7 @@ vdev_validate(vdev_t *vd) uint64_t state; for (int c = 0; c < vd->vdev_children; c++) - if (vdev_validate(vd->vdev_child[c]) != 0) + if (vdev_validate(vd->vdev_child[c], strict) != 0) return (EBADF); /* @@ -1306,7 +1328,8 @@ vdev_validate(vdev_t *vd) uint64_t aux_guid = 0; nvlist_t *nvl; - if ((label = vdev_label_read_config(vd)) == NULL) { + if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == + NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (0); @@ -1324,8 +1347,9 @@ vdev_validate(vdev_t *vd) return (0); } - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, - &guid) != 0 || guid != spa_guid(spa)) { + if (strict && (nvlist_lookup_uint64(label, + ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || + guid != spa_guid(spa))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); @@ -1487,7 +1511,7 @@ vdev_reopen(vdev_t *vd) !l2arc_vdev_present(vd)) l2arc_add_vdev(spa, vd); } else { - (void) vdev_validate(vd); + (void) vdev_validate(vd, B_TRUE); } /* @@ -1946,14 +1970,14 @@ vdev_validate_aux(vdev_t *vd) if (!vdev_readable(vd)) return (0); - if ((label = vdev_label_read_config(vd)) == NULL) { + if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || - version > SPA_VERSION || + !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { @@ -2456,6 +2480,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; mutex_exit(&vd->vdev_stat_lock); /* diff --git a/uts/common/fs/zfs/vdev_cache.c b/uts/common/fs/zfs/vdev_cache.c index 688d541..77f8116 100644 --- a/uts/common/fs/zfs/vdev_cache.c +++ b/uts/common/fs/zfs/vdev_cache.c @@ -71,9 +71,16 @@ * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software * track buffer). At most zfs_vdev_cache_size bytes will be kept in each * vdev's vdev_cache. + * + * TODO: Note that with the current ZFS code, it turns out that the + * vdev cache is not helpful, and in some cases actually harmful. It + * is better if we disable this. Once some time has passed, we should + * actually remove this to simplify the code. For now we just disable + * it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11 + * has made these same changes. 
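 *
 * Illustrative aside, not from this change: with the default below now
 * zero, an administrator who still wants the old 10MB cache could set
 * it back from /etc/system (Solaris tunable syntax):
 *
 *	set zfs:zfs_vdev_cache_size = 10485760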
*/ int zfs_vdev_cache_max = 1<<14; /* 16KB */ -int zfs_vdev_cache_size = 10ULL << 20; /* 10MB */ +int zfs_vdev_cache_size = 0; int zfs_vdev_cache_bshift = 16; #define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */ diff --git a/uts/common/fs/zfs/vdev_disk.c b/uts/common/fs/zfs/vdev_disk.c index d741773..759f0f8 100644 --- a/uts/common/fs/zfs/vdev_disk.c +++ b/uts/common/fs/zfs/vdev_disk.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -30,6 +31,7 @@ #include <sys/fs/zfs.h> #include <sys/zio.h> #include <sys/sunldi.h> +#include <sys/efi_partition.h> #include <sys/fm/fs/zfs.h> /* @@ -102,8 +104,39 @@ vdev_disk_rele(vdev_t *vd) } } +static uint64_t +vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz) +{ + ASSERT(vd->vdev_wholedisk); + + vdev_disk_t *dvd = vd->vdev_tsd; + dk_efi_t dk_ioc; + efi_gpt_t *efi; + uint64_t avail_space = 0; + int efisize = EFI_LABEL_SIZE * 2; + + dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP); + dk_ioc.dki_lba = 1; + dk_ioc.dki_length = efisize; + dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data; + efi = dk_ioc.dki_data; + + if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc, + FKIOCTL, kcred, NULL) == 0) { + uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA); + + zfs_dbgmsg("vdev %s, capacity %llu, altern lba %llu", + vd->vdev_path, capacity, efi_altern_lba); + if (capacity > efi_altern_lba) + avail_space = (capacity - efi_altern_lba) * blksz; + } + kmem_free(dk_ioc.dki_data, efisize); + return (avail_space); +} + static int -vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) { spa_t *spa = vd->vdev_spa; vdev_disk_t *dvd; @@ -274,16 +307,6 @@ skip_open: } /* - * If we own the whole disk, try to enable disk write caching. - * We ignore errors because it's OK if we can't do it. - */ - if (vd->vdev_wholedisk == 1) { - int wce = 1; - (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, - FKIOCTL, kcred, NULL); - } - - /* * Determine the device's minimum transfer size. * If the ioctl isn't supported, assume DEV_BSIZE. */ @@ -293,6 +316,25 @@ skip_open: *ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1; + if (vd->vdev_wholedisk == 1) { + uint64_t capacity = dkmext.dki_capacity - 1; + uint64_t blksz = dkmext.dki_lbsize; + int wce = 1; + + /* + * If we own the whole disk, try to enable disk write caching. + * We ignore errors because it's OK if we can't do it. + */ + (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, + FKIOCTL, kcred, NULL); + + *max_psize = *psize + vdev_disk_get_space(vd, capacity, blksz); + zfs_dbgmsg("capacity change: vdev %s, psize %llu, " + "max_psize %llu", vd->vdev_path, *psize, *max_psize); + } else { + *max_psize = *psize; + } + /* * Clear the nowritecache bit, so that on a vdev_reopen() we will * try again. diff --git a/uts/common/fs/zfs/vdev_file.c b/uts/common/fs/zfs/vdev_file.c index 8c22aa5..043fa51 100644 --- a/uts/common/fs/zfs/vdev_file.c +++ b/uts/common/fs/zfs/vdev_file.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -47,7 +48,8 @@ vdev_file_rele(vdev_t *vd) } static int -vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) { vdev_file_t *vf; vnode_t *vp; @@ -112,7 +114,7 @@ skip_open: return (error); } - *psize = vattr.va_size; + *max_psize = *psize = vattr.va_size; *ashift = SPA_MINBLOCKSHIFT; return (0); diff --git a/uts/common/fs/zfs/vdev_label.c b/uts/common/fs/zfs/vdev_label.c index c08ed8b..b943647 100644 --- a/uts/common/fs/zfs/vdev_label.c +++ b/uts/common/fs/zfs/vdev_label.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -121,6 +123,8 @@ * txg Transaction group in which this label was written * pool_guid Unique identifier for this pool * vdev_tree An nvlist describing vdev tree. + * features_for_read + * An nvlist of the features necessary for reading the MOS. * * Each leaf device label also contains the following: * @@ -428,8 +432,13 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config) kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); } +/* + * Returns the configuration from the label of the given vdev. If 'label' is + * VDEV_BEST_LABEL, each label of the vdev will be read until a valid + * configuration is found; otherwise, only the specified label will be read. + */ nvlist_t * -vdev_label_read_config(vdev_t *vd) +vdev_label_read_config(vdev_t *vd, int label) { spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; @@ -447,6 +456,8 @@ vdev_label_read_config(vdev_t *vd) retry: for (int l = 0; l < VDEV_LABELS; l++) { + if (label >= 0 && label < VDEV_LABELS && label != l) + continue; zio = zio_root(spa, NULL, NULL, flags); @@ -496,7 +507,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, /* * Read the label, if any, and perform some basic sanity checks. */ - if ((label = vdev_label_read_config(vd)) == NULL) + if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL) return (B_FALSE); (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, @@ -833,7 +844,7 @@ retry: * come back up, we fail to see the uberblock for txg + 1 because, say, * it was on a mirrored device and the replica to which we wrote txg + 1 * is now offline. If we then make some changes and sync txg + 1, and then - * the missing replica comes back, then for a new seconds we'll have two + * the missing replica comes back, then for a few seconds we'll have two * conflicting uberblocks on disk with the same txg. The solution is simple: * among uberblocks with equal txg, choose the one with the latest timestamp. 
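 *
 * Editorial context, not from this change: the 'label' argument added
 * to vdev_label_read_config() above indexes the four copies ZFS keeps
 * per leaf vdev (labels 0 and 1 at the front of the device, 2 and 3 at
 * the end); passing VDEV_BEST_LABEL scans all of them for the first
 * valid config.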
*/ @@ -853,46 +864,50 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) return (0); } +struct ubl_cbdata { + uberblock_t *ubl_ubbest; /* Best uberblock */ + vdev_t *ubl_vd; /* vdev associated with the above */ + int ubl_label; /* Label associated with the above */ +}; + static void vdev_uberblock_load_done(zio_t *zio) { + vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; uberblock_t *ub = zio->io_data; - uberblock_t *ubbest = rio->io_private; + struct ubl_cbdata *cbp = rio->io_private; - ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd)); + ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); if (ub->ub_txg <= spa->spa_load_max_txg && - vdev_uberblock_compare(ub, ubbest) > 0) - *ubbest = *ub; + vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { + /* + * Keep track of the vdev and label in which this + * uberblock was found. We will use this information + * later to obtain the config nvlist associated with + * this uberblock. + */ + *cbp->ubl_ubbest = *ub; + cbp->ubl_vd = vd; + cbp->ubl_label = vdev_label_number(vd->vdev_psize, + zio->io_offset); + } mutex_exit(&rio->io_lock); } zio_buf_free(zio->io_data, zio->io_size); } -void -vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) +static void +vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, + struct ubl_cbdata *cbp) { - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; - - if (vd == rvd) { - ASSERT(zio == NULL); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - zio = zio_root(spa, NULL, ubbest, flags); - bzero(ubbest, sizeof (uberblock_t)); - } - - ASSERT(zio != NULL); - for (int c = 0; c < vd->vdev_children; c++) - vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); + vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { for (int l = 0; l < VDEV_LABELS; l++) { @@ -905,11 +920,45 @@ vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) } } } +} - if (vd == rvd) { - (void) zio_wait(zio); - spa_config_exit(spa, SCL_ALL, FTAG); +/* + * Reads the 'best' uberblock from disk along with its associated + * configuration. First, we read the uberblock array of each label of each + * vdev, keeping track of the uberblock with the highest txg in each array. + * Then, we read the configuration from the same label as the best uberblock. 
+ */ +void +vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) +{ + int i; + zio_t *zio; + spa_t *spa = rvd->vdev_spa; + struct ubl_cbdata cb; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; + + ASSERT(ub); + ASSERT(config); + + bzero(ub, sizeof (uberblock_t)); + *config = NULL; + + cb.ubl_ubbest = ub; + cb.ubl_vd = NULL; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + zio = zio_root(spa, NULL, &cb, flags); + vdev_uberblock_load_impl(zio, rvd, flags, &cb); + (void) zio_wait(zio); + if (cb.ubl_vd != NULL) { + for (i = cb.ubl_label % 2; i < VDEV_LABELS; i += 2) { + *config = vdev_label_read_config(cb.ubl_vd, i); + if (*config != NULL) + break; + } } + spa_config_exit(spa, SCL_ALL, FTAG); } /* diff --git a/uts/common/fs/zfs/vdev_mirror.c b/uts/common/fs/zfs/vdev_mirror.c index 698c027..a28ca3e 100644 --- a/uts/common/fs/zfs/vdev_mirror.c +++ b/uts/common/fs/zfs/vdev_mirror.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> @@ -127,7 +131,8 @@ vdev_mirror_map_alloc(zio_t *zio) } static int -vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *ashift) { int numerrors = 0; int lasterror = 0; @@ -149,6 +154,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *ashift = MAX(*ashift, cvd->vdev_ashift); } diff --git a/uts/common/fs/zfs/vdev_missing.c b/uts/common/fs/zfs/vdev_missing.c index 6a5588d..3bd8c90 100644 --- a/uts/common/fs/zfs/vdev_missing.c +++ b/uts/common/fs/zfs/vdev_missing.c @@ -24,6 +24,10 @@ */ /* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +/* * The 'missing' vdev is a special vdev type used only during import. It * signifies a placeholder in the root vdev for some vdev that we know is * missing. We pass it down to the kernel to allow the rest of the @@ -40,7 +44,8 @@ /* ARGSUSED */ static int -vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) { /* * Really this should just fail. But then the root vdev will be in the @@ -49,6 +54,7 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * will fail the GUID sum check before ever trying to open the pool. */ *psize = 0; + *max_psize = 0; *ashift = 0; return (0); } diff --git a/uts/common/fs/zfs/vdev_raidz.c b/uts/common/fs/zfs/vdev_raidz.c index 4b0f560..030ea42 100644 --- a/uts/common/fs/zfs/vdev_raidz.c +++ b/uts/common/fs/zfs/vdev_raidz.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -1441,7 +1442,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) } static int -vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *ashift) { vdev_t *cvd; uint64_t nparity = vd->vdev_nparity; @@ -1469,10 +1471,12 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *ashift = MAX(*ashift, cvd->vdev_ashift); } *asize *= vd->vdev_children; + *max_asize *= vd->vdev_children; if (numerrors > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; diff --git a/uts/common/fs/zfs/vdev_root.c b/uts/common/fs/zfs/vdev_root.c index 879f78f..1abc79d 100644 --- a/uts/common/fs/zfs/vdev_root.c +++ b/uts/common/fs/zfs/vdev_root.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> @@ -50,7 +54,8 @@ too_many_errors(vdev_t *vd, int numerrors) } static int -vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *ashift) { int lasterror = 0; int numerrors = 0; @@ -77,6 +82,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) } *asize = 0; + *max_asize = 0; *ashift = 0; return (0); diff --git a/uts/common/fs/zfs/zap.c b/uts/common/fs/zfs/zap.c index 288a4d9..fa1d99f 100644 --- a/uts/common/fs/zfs/zap.c +++ b/uts/common/fs/zfs/zap.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -946,6 +947,19 @@ fzap_prefetch(zap_name_t *zn) * Helper functions for consumers. */ +uint64_t +zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, + const char *name, dmu_tx_t *tx) +{ + uint64_t new_obj; + + VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0); + VERIFY(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, + tx) == 0); + + return (new_obj); +} + int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name) diff --git a/uts/common/fs/zfs/zap_micro.c b/uts/common/fs/zfs/zap_micro.c index 2d89c20..3e80fb9 100644 --- a/uts/common/fs/zfs/zap_micro.c +++ b/uts/common/fs/zfs/zap_micro.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zio.h> @@ -460,7 +461,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); - ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif @@ -584,7 +585,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); - ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif @@ -1403,7 +1404,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, } /* - * We lock the zap with adding == FALSE. Because, if we pass + * We lock the zap with adding == FALSE. Because, if we pass * the actual value of add, it could trigger a mzap_upgrade(). 
* At present we are just evaluating the possibility of this operation * and hence we do not want to trigger an upgrade. diff --git a/uts/common/fs/zfs/zfeature.c b/uts/common/fs/zfs/zfeature.c new file mode 100644 index 0000000..ba72208 --- /dev/null +++ b/uts/common/fs/zfs/zfeature.c @@ -0,0 +1,414 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/zfeature.h> +#include <sys/dmu.h> +#include <sys/nvpair.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> +#include "zfeature_common.h" +#include <sys/spa_impl.h> + +/* + * ZFS Feature Flags + * ----------------- + * + * ZFS feature flags are used to provide fine-grained versioning to the ZFS + * on-disk format. Once enabled on a pool, feature flags replace the old + * spa_version() number. + * + * Each new on-disk format change will be given a uniquely identifying string + * guid rather than a version number. This avoids the problem of different + * organizations creating new on-disk formats with the same version number. To + * keep feature guids unique, they should consist of the reverse dns name of the + * organization which implemented the feature and a short name for the feature, + * separated by a colon (e.g. com.delphix:async_destroy). + * + * Reference Counts + * ---------------- + * + * Within each pool, features can be in one of three states: disabled, enabled, + * or active. These states are differentiated by a reference count stored on + * disk for each feature: + * + * 1) If there is no reference count stored on disk, the feature is disabled. + * 2) If the reference count is 0, a system administrator has enabled the + * feature, but the feature has not been used yet, so no on-disk + * format changes have been made. + * 3) If the reference count is greater than 0, the feature is active. + * The format changes required by the feature are currently on disk. + * Note that if the feature's format changes are reversed, the feature + * may choose to set its reference count back to 0. + * + * Feature flags make no differentiation between non-zero reference counts + * for an active feature (e.g. a reference count of 1 means the same thing as a + * reference count of 27834721), but feature implementations may choose to use + * the reference count to store meaningful information. For example, a new RAID + * implementation might set the reference count to the number of vdevs using + * it. If all those disks are removed from the pool, the feature goes back to + * having a reference count of 0. + * + * It is the responsibility of the individual features to maintain a non-zero + * reference count as long as the feature's format changes are present on disk.
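 *
 * Illustrative sketch, not from this change: the three states expressed
 * as a lookup, using the zap_lookup() signature from sys/zap.h ('mos',
 * 'zapobj' and 'guid' are placeholders):
 *
 *	uint64_t refcount;
 *	int err = zap_lookup(mos, zapobj, guid,
 *	    sizeof (uint64_t), 1, &refcount);
 *	if (err == ENOENT)
 *		state = DISABLED;
 *	else
 *		state = (refcount == 0) ? ENABLED : ACTIVE;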
+ * + * Dependencies + * ------------ + * + * Each feature may depend on other features. The only effect of this + * relationship is that when a feature is enabled, all of its dependencies are + * automatically enabled as well. Any future work to support disabling of + * features would need to ensure that features cannot be disabled if other + * enabled features depend on them. + * + * On-disk Format + * -------------- + * + * When feature flags are enabled, spa_version() is set to SPA_VERSION_FEATURES + * (5000). In order for this to work, the pool is automatically upgraded to + * SPA_VERSION_BEFORE_FEATURES (28) first, so all on-disk format changes that + * predate feature flags will be in use. + * + * Information about features is stored in 3 ZAP objects in the pool's MOS. + * These objects are linked to by the following names in the pool directory + * object: + * + * 1) features_for_read: feature guid -> reference count + * Features needed to open the pool for reading. + * 2) features_for_write: feature guid -> reference count + * Features needed to open the pool for writing. + * 3) feature_descriptions: feature guid -> descriptive string + * A human readable string. + * + * All enabled features appear in either features_for_read or + * features_for_write, but not both. + * + * To open a pool in read-only mode, only the features listed in + * features_for_read need to be supported. + * + * To open the pool in read-write mode, features in both features_for_read and + * features_for_write need to be supported. + * + * Some features may be required to read the ZAP objects containing feature + * information. To allow software to check for compatibility with these features + * before the pool is opened, their names must be stored in the label in a + * new "features_for_read" entry (note that features that are only required + * to write to a pool never need to be stored in the label since the + * features_for_write ZAP object can be read before the pool is written to). + * To save space in the label, features must be explicitly marked as needing to + * be written to the label. Also, reference counts are not stored in the label; + * instead, any feature whose reference count drops to 0 is removed from the + * label. + * + * Adding New Features + * ------------------- + * + * Features must be registered in the zpool_feature_init() function in + * zfeature_common.c using the zfeature_register() function. This function + * has arguments to specify if the feature should be stored in the + * features_for_read or features_for_write ZAP object and if it needs to be + * written to the label when active. + * + * Once a feature is registered, it will appear as a "feature@<feature name>" + * property which can be set by an administrator. Feature implementors should + * use the spa_feature_is_enabled() and spa_feature_is_active() functions to + * query the state of a feature and the spa_feature_incr() and + * spa_feature_decr() functions to change an enabled feature's reference count. + * Reference counts may only be updated in the syncing context. + * + * Features may not perform enable-time initialization. Instead, any such + * initialization should occur when the feature is first used. This design + * enforces that on-disk changes be made only when features are used. Code + * should only check if a feature is enabled using spa_feature_is_enabled(), + * not by relying on any feature-specific metadata existing. If a feature is + * enabled, but the feature's metadata is not on disk yet, then it should be + * created as needed.
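 *
 * Illustrative aside, not from this change (pool name hypothetical):
 * once registered, the administrator-facing property surface looks like
 *
 *	# zpool set feature@async_destroy=enabled tank
 *	# zpool get feature@async_destroy tank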
+ * + * As an example, consider the com.delphix:async_destroy feature. This feature + * relies on the existence of a bptree in the MOS that stores blocks for + * asynchronous freeing. This bptree is not created when async_destroy is + * enabled. Instead, when a dataset is destroyed, spa_feature_is_enabled() is + * called to check if async_destroy is enabled. If it is and the bptree object + * does not exist yet, the bptree object is created as part of the dataset + * destroy, and async_destroy's reference count is incremented to indicate it + * has made an on-disk format change. Later, after the destroyed dataset's + * blocks have all been asynchronously freed, there is no longer any use for the + * bptree object, so it is destroyed and async_destroy's reference count is + * decremented back to 0 to indicate that it has undone its on-disk format + * changes. + */ + +typedef enum { + FEATURE_ACTION_ENABLE, + FEATURE_ACTION_INCR, + FEATURE_ACTION_DECR, +} feature_action_t; + +/* + * Checks that the features active in the specified object are supported by + * this software. Adds each unsupported feature (name -> description) to + * the supplied nvlist. + */ +boolean_t +feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj, + nvlist_t *unsup_feat) +{ + boolean_t supported; + zap_cursor_t zc; + zap_attribute_t za; + + supported = B_TRUE; + for (zap_cursor_init(&zc, os, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + + if (za.za_first_integer != 0 && + !zfeature_is_supported(za.za_name)) { + supported = B_FALSE; + + if (unsup_feat != NULL) { + char *desc = ""; + char buf[MAXPATHLEN]; + + if (zap_lookup(os, desc_obj, za.za_name, + 1, sizeof (buf), buf) == 0) + desc = buf; + + VERIFY(nvlist_add_string(unsup_feat, za.za_name, + desc) == 0); + } + } + } + zap_cursor_fini(&zc); + + return (supported); +} + +static int +feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj, + zfeature_info_t *feature, uint64_t *res) +{ + int err; + uint64_t refcount; + uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj; + + ASSERT(0 != zapobj); + + err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1, + &refcount); + if (err != 0) { + if (err == ENOENT) + return (ENOTSUP); + else + return (err); + } + *res = refcount; + return (0); } + +static int +feature_do_action(objset_t *os, uint64_t read_obj, uint64_t write_obj, + uint64_t desc_obj, zfeature_info_t *feature, feature_action_t action, + dmu_tx_t *tx) +{ + int error; + uint64_t refcount; + uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj; + + ASSERT(0 != zapobj); + ASSERT(zfeature_is_valid_guid(feature->fi_guid)); + + error = zap_lookup(os, zapobj, feature->fi_guid, + sizeof (uint64_t), 1, &refcount); + + /* + * If we can't ascertain the status of the specified feature, an I/O + * error occurred. + */ + if (error != 0 && error != ENOENT) + return (error); + + switch (action) { + case FEATURE_ACTION_ENABLE: + /* + * If the feature is already enabled, ignore the request.
+ */ + if (error == 0) + return (0); + refcount = 0; + break; + case FEATURE_ACTION_INCR: + if (error == ENOENT) + return (ENOTSUP); + if (refcount == UINT64_MAX) + return (EOVERFLOW); + refcount++; + break; + case FEATURE_ACTION_DECR: + if (error == ENOENT) + return (ENOTSUP); + if (refcount == 0) + return (EOVERFLOW); + refcount--; + break; + default: + ASSERT(0); + break; + } + + if (action == FEATURE_ACTION_ENABLE) { + int i; + + for (i = 0; feature->fi_depends[i] != NULL; i++) { + zfeature_info_t *dep = feature->fi_depends[i]; + + error = feature_do_action(os, read_obj, write_obj, + desc_obj, dep, FEATURE_ACTION_ENABLE, tx); + if (error != 0) + return (error); + } + } + + error = zap_update(os, zapobj, feature->fi_guid, + sizeof (uint64_t), 1, &refcount, tx); + if (error != 0) + return (error); + + if (action == FEATURE_ACTION_ENABLE) { + error = zap_update(os, desc_obj, + feature->fi_guid, 1, strlen(feature->fi_desc) + 1, + feature->fi_desc, tx); + if (error != 0) + return (error); + } + + if (action == FEATURE_ACTION_INCR && refcount == 1 && feature->fi_mos) { + spa_activate_mos_feature(dmu_objset_spa(os), feature->fi_guid); + } + + if (action == FEATURE_ACTION_DECR && refcount == 0) { + spa_deactivate_mos_feature(dmu_objset_spa(os), + feature->fi_guid); + } + + return (0); +} + +void +spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx) +{ + /* + * We create feature flags ZAP objects in two instances: during pool + * creation and during pool upgrade. + */ + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on && + tx->tx_txg == TXG_INITIAL)); + + spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURES_FOR_READ, tx); + spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURES_FOR_WRITE, tx); + spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURE_DESCRIPTIONS, tx); +} + +/* + * Enable any required dependencies, then enable the requested feature. + */ +void +spa_feature_enable(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + spa->spa_feat_desc_obj, feature, FEATURE_ACTION_ENABLE, tx)); +} + +/* + * If the specified feature has not yet been enabled, this function returns + * ENOTSUP; otherwise, this function increments the feature's refcount (or + * returns EOVERFLOW if the refcount cannot be incremented). This function must + * be called from syncing context. + */ +void +spa_feature_incr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + spa->spa_feat_desc_obj, feature, FEATURE_ACTION_INCR, tx)); +} + +/* + * If the specified feature has not yet been enabled, this function returns + * ENOTSUP; otherwise, this function decrements the feature's refcount (or + * returns EOVERFLOW if the refcount is already 0). This function must + * be called from syncing context. 
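 *
 * Illustrative sketch, not from this change: a consumer typically pairs
 * these calls in syncing context. The feature-table lookup below is
 * assumed from zfeature_common.h and is not shown in this diff:
 *
 *	zfeature_info_t *feat =
 *	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
 *	if (!spa_feature_is_active(spa, feat))
 *		spa_feature_incr(spa, feat, tx);
 *
 * and, once the feature's last on-disk trace has been undone:
 *
 *	spa_feature_decr(spa, feat, tx);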
+ */ +void +spa_feature_decr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + spa->spa_feat_desc_obj, feature, FEATURE_ACTION_DECR, tx)); +} + +boolean_t +spa_feature_is_enabled(spa_t *spa, zfeature_info_t *feature) +{ + int err; + uint64_t refcount; + + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_refcount(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + feature, &refcount); + ASSERT(err == 0 || err == ENOTSUP); + return (err == 0); +} + +boolean_t +spa_feature_is_active(spa_t *spa, zfeature_info_t *feature) +{ + int err; + uint64_t refcount; + + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_refcount(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + feature, &refcount); + ASSERT(err == 0 || err == ENOTSUP); + return (err == 0 && refcount > 0); +} diff --git a/uts/common/fs/zfs/zfs_acl.c b/uts/common/fs/zfs/zfs_acl.c index 843b5ff..2b93fc8 100644 --- a/uts/common/fs/zfs/zfs_acl.c +++ b/uts/common/fs/zfs/zfs_acl.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ #include <sys/types.h> @@ -1330,75 +1331,8 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); } -/* - * Update access mask for prepended ACE - * - * This applies the "groupmask" value for aclmode property. - */ -static void -zfs_acl_prepend_fixup(zfs_acl_t *aclp, void *acep, void *origacep, - mode_t mode, uint64_t owner) -{ - int rmask, wmask, xmask; - int user_ace; - uint16_t aceflags; - uint32_t origmask, acepmask; - uint64_t fuid; - - aceflags = aclp->z_ops.ace_flags_get(acep); - fuid = aclp->z_ops.ace_who_get(acep); - origmask = aclp->z_ops.ace_mask_get(origacep); - acepmask = aclp->z_ops.ace_mask_get(acep); - - user_ace = (!(aceflags & - (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP))); - - if (user_ace && (fuid == owner)) { - rmask = S_IRUSR; - wmask = S_IWUSR; - xmask = S_IXUSR; - } else { - rmask = S_IRGRP; - wmask = S_IWGRP; - xmask = S_IXGRP; - } - - if (origmask & ACE_READ_DATA) { - if (mode & rmask) { - acepmask &= ~ACE_READ_DATA; - } else { - acepmask |= ACE_READ_DATA; - } - } - - if (origmask & ACE_WRITE_DATA) { - if (mode & wmask) { - acepmask &= ~ACE_WRITE_DATA; - } else { - acepmask |= ACE_WRITE_DATA; - } - } - - if (origmask & ACE_APPEND_DATA) { - if (mode & wmask) { - acepmask &= ~ACE_APPEND_DATA; - } else { - acepmask |= ACE_APPEND_DATA; - } - } - - if (origmask & ACE_EXECUTE) { - if (mode & xmask) { - acepmask &= ~ACE_EXECUTE; - } else { - acepmask |= ACE_EXECUTE; - } - } - aclp->z_ops.ace_mask_set(acep, acepmask); -} - static void -zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) +zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t trim, zfs_acl_t *aclp) { void *acep = NULL; uint64_t who; @@ -1410,30 +1344,31 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) zfs_acl_node_t *newnode; size_t abstract_size = aclp->z_ops.ace_abstract_size(); void *zacep; - uint32_t owner, group, everyone; - uint32_t deny1, deny2, allow0; + boolean_t isdir; + trivial_acl_t masks; new_count = new_bytes = 0; - acl_trivial_access_masks((mode_t)mode, &allow0, &deny1, &deny2, - &owner, 
&group, &everyone); + isdir = (vtype == VDIR); + + acl_trivial_access_masks((mode_t)mode, isdir, &masks); newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); zacep = newnode->z_acldata; - if (allow0) { - zfs_set_ace(aclp, zacep, allow0, ALLOW, -1, ACE_OWNER); + if (masks.allow0) { + zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; - } if (deny1) { - zfs_set_ace(aclp, zacep, deny1, DENY, -1, ACE_OWNER); + } if (masks.deny1) { + zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } - if (deny2) { - zfs_set_ace(aclp, zacep, deny2, DENY, -1, OWNING_GROUP); + if (masks.deny2) { + zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; @@ -1452,10 +1387,17 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) continue; } + /* + * If this ACL has any inheritable ACEs, mark that in + * the hints (which are later masked into the pflags) + * so create knows to do inheritance. + */ + if (isdir && (inherit_flags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + if ((type != ALLOW && type != DENY) || (inherit_flags & ACE_INHERIT_ONLY_ACE)) { - if (inherit_flags) - aclp->z_hints |= ZFS_INHERIT_ACE; switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: @@ -1468,20 +1410,13 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) /* * Limit permissions to be no greater than - * group permissions + * group permissions. + * The "aclinherit" and "aclmode" properties + * affect policy for create and chmod(2), + * respectively. 
*/ - if (zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) { - if (!(mode & S_IRGRP)) - access_mask &= ~ACE_READ_DATA; - if (!(mode & S_IWGRP)) - access_mask &= - ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - if (!(mode & S_IXGRP)) - access_mask &= ~ACE_EXECUTE; - access_mask &= - ~(ACE_WRITE_OWNER|ACE_WRITE_ACL| - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS); - } + if ((type == ALLOW) && trim) + access_mask &= masks.group; } zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); ace_size = aclp->z_ops.ace_size(acep); @@ -1489,11 +1424,11 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) new_count++; new_bytes += ace_size; } - zfs_set_ace(aclp, zacep, owner, 0, -1, ACE_OWNER); + zfs_set_ace(aclp, zacep, masks.owner, 0, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); - zfs_set_ace(aclp, zacep, group, 0, -1, OWNING_GROUP); + zfs_set_ace(aclp, zacep, masks.group, 0, -1, OWNING_GROUP); zacep = (void *)((uintptr_t)zacep + abstract_size); - zfs_set_ace(aclp, zacep, everyone, 0, -1, ACE_EVERYONE); + zfs_set_ace(aclp, zacep, masks.everyone, 0, -1, ACE_EVERYONE); new_count += 3; new_bytes += abstract_size * 3; @@ -1505,17 +1440,27 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) list_insert_tail(&aclp->z_acl, newnode); } -void +int zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) { + int error = 0; + mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); - *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); - (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; - zfs_acl_chmod(zp->z_zfsvfs, mode, *aclp); + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) + *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + else + error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); + + if (error == 0) { + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(ZTOV(zp)->v_type, mode, + (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); + } mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); - ASSERT(*aclp); + + return (error); } /* @@ -1763,8 +1708,8 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, if (acl_ids->z_aclp == NULL) { mutex_enter(&dzp->z_acl_lock); mutex_enter(&dzp->z_lock); - if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR && - (dzp->z_pflags & ZFS_INHERIT_ACE)) && + if (!(flag & IS_ROOT_NODE) && + (dzp->z_pflags & ZFS_INHERIT_ACE) && !(dzp->z_pflags & ZFS_XATTR)) { VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, &paclp, B_FALSE)); @@ -1781,7 +1726,9 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, if (need_chmod) { acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ? ZFS_ACL_AUTO_INHERIT : 0; - zfs_acl_chmod(zfsvfs, acl_ids->z_mode, acl_ids->z_aclp); + zfs_acl_chmod(vap->va_type, acl_ids->z_mode, + (zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED), + acl_ids->z_aclp); } } diff --git a/uts/common/fs/zfs/zfs_fm.c b/uts/common/fs/zfs/zfs_fm.c index 0b48126..fa5903a 100644 --- a/uts/common/fs/zfs/zfs_fm.c +++ b/uts/common/fs/zfs/zfs_fm.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. 
+ */ + #include <sys/spa.h> #include <sys/spa_impl.h> #include <sys/vdev.h> @@ -709,6 +713,10 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, if (report->zcr_ereport == NULL) { report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo); + if (report->zcr_ckinfo != NULL) { + kmem_free(report->zcr_ckinfo, + sizeof (*report->zcr_ckinfo)); + } kmem_free(report, sizeof (*report)); return; } diff --git a/uts/common/fs/zfs/zfs_ioctl.c b/uts/common/fs/zfs/zfs_ioctl.c index 1b63c9b..137816e 100644 --- a/uts/common/fs/zfs/zfs_ioctl.c +++ b/uts/common/fs/zfs/zfs_ioctl.c @@ -18,8 +18,13 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2011 Martin Matuska + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -48,6 +53,7 @@ #include <sys/dsl_prop.h> #include <sys/dsl_deleg.h> #include <sys/dmu_objset.h> +#include <sys/dmu_impl.h> #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/sunldi.h> @@ -347,17 +353,37 @@ zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) return (zfs_dozonecheck_impl(dataset, zoned, cr)); } +/* + * If name ends in a '@', then require recursive permissions. + */ int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; + boolean_t descendent = B_FALSE; + dsl_dataset_t *ds; + char *at; + + at = strchr(name, '@'); + if (at != NULL && at[1] == '\0') { + *at = '\0'; + descendent = B_TRUE; + } - error = zfs_dozonecheck(name, cr); + error = dsl_dataset_hold(name, FTAG, &ds); + if (at != NULL) + *at = '@'; + if (error != 0) + return (error); + + error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error) - error = dsl_deleg_access(name, perm, cr); + error = dsl_deleg_access_impl(ds, descendent, perm, cr); } + + dsl_dataset_rele(ds, FTAG); return (error); } @@ -371,7 +397,7 @@ zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, if (error == 0) { error = secpolicy_zfs(cr); if (error) - error = dsl_deleg_access_impl(ds, perm, cr); + error = dsl_deleg_access_impl(ds, B_FALSE, perm, cr); } return (error); } @@ -678,21 +704,14 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) /* * Destroying snapshots with delegated permissions requires * descendent mount and destroy permissions. - * Reassemble the full filesystem@snap name so dsl_deleg_access() - * can do the correct permission check. - * - * Since this routine is used when doing a recursive destroy of snapshots - * and destroying snapshots requires descendent permissions, a successfull - * check of the top level snapshot applies to snapshots of all descendent - * datasets as well. */ static int -zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_destroy_recursive(zfs_cmd_t *zc, cred_t *cr) { int error; char *dsname; - dsname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); + dsname = kmem_asprintf("%s@", zc->zc_name); error = zfs_secpolicy_destroy_perms(dsname, cr); @@ -1116,6 +1135,8 @@ getzfsvfs(const char *dsname, zfsvfs_t **zfvp) /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_vfs will be NULL, and it will be opened as the owner. + * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, + * which prevents all vnode ops from running. 
*/ static int zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) @@ -1179,7 +1200,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); - if (version < SPA_VERSION_INITIAL || version > SPA_VERSION) { + if (!SPA_VERSION_IS_SUPPORTED(version)) { error = EINVAL; goto pool_props_bad; } @@ -1303,6 +1324,15 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of the pool + * + * outputs: + * zc_cookie real errno + * zc_nvlist_dst config nvlist + * zc_nvlist_dst_size size of config nvlist + */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { @@ -1404,7 +1434,8 @@ zfs_ioc_pool_upgrade(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) { + if (zc->zc_cookie < spa_version(spa) || + !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (EINVAL); } @@ -1448,6 +1479,20 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) } static int +zfs_ioc_pool_reguid(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error == 0) { + error = spa_change_guid(spa); + spa_close(spa, FTAG); + } + return (error); +} + +static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { int error; @@ -1744,9 +1789,12 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) * inconsistent. So this is a bit of a workaround... * XXX reading with out owning */ - if (!zc->zc_objset_stats.dds_inconsistent) { - if (dmu_objset_type(os) == DMU_OST_ZVOL) - VERIFY(zvol_get_stats(os, nv) == 0); + if (!zc->zc_objset_stats.dds_inconsistent && + dmu_objset_type(os) == DMU_OST_ZVOL) { + error = zvol_get_stats(os, nv); + if (error == EIO) + return (error); + VERIFY3S(error, ==, 0); } error = put_nvlist(zc, nv); nvlist_free(nv); @@ -1943,8 +1991,10 @@ top: uint64_t cookie = 0; int len = sizeof (zc->zc_name) - (p - zc->zc_name); - while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) - (void) dmu_objset_prefetch(p, NULL); + while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) { + if (!dataset_name_hidden(zc->zc_name)) + (void) dmu_objset_prefetch(zc->zc_name, NULL); + } } do { @@ -1953,8 +2003,7 @@ top: NULL, &zc->zc_cookie); if (error == ENOENT) error = ESRCH; - } while (error == 0 && dataset_name_hidden(zc->zc_name) && - !(zc->zc_iflags & FKIOCTL)); + } while (error == 0 && dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* @@ -2232,6 +2281,8 @@ retry: if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = EINVAL; + } else { + err = EINVAL; } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { @@ -3098,25 +3149,45 @@ zfs_unmount_snap(const char *name, void *arg) /* * inputs: - * zc_name name of filesystem - * zc_value short name of snapshot + * zc_name name of filesystem, snaps must be under it + * zc_nvlist_src[_size] full names of snapshots to destroy * zc_defer_destroy mark for deferred destroy * - * outputs: none + * outputs: + * zc_name on failure, name of failed snapshot */ static int -zfs_ioc_destroy_snaps(zfs_cmd_t *zc) +zfs_ioc_destroy_snaps_nvl(zfs_cmd_t *zc) { - int err; + int err, len; + nvlist_t *nvl; + nvpair_t *pair; - if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); - err = dmu_objset_find(zc->zc_name, - zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN); - if (err) + if ((err = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl)) != 0) return (err); - 
return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value, - zc->zc_defer_destroy)); + + len = strlen(zc->zc_name); + for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL; + pair = nvlist_next_nvpair(nvl, pair)) { + const char *name = nvpair_name(pair); + /* + * The snap name must be underneath the zc_name. This ensures + * that our permission checks were legitimate. + */ + if (strncmp(zc->zc_name, name, len) != 0 || + (name[len] != '@' && name[len] != '/')) { + nvlist_free(nvl); + return (EINVAL); + } + + (void) zfs_unmount_snap(name, NULL); + } + + err = dmu_snapshots_destroy_nvl(nvl, zc->zc_defer_destroy, + zc->zc_name); + nvlist_free(nvl); + return (err); } /* @@ -3759,6 +3830,8 @@ out: * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) + * zc_guid if set, estimate size of stream only. zc_cookie is ignored. + * output size in zc_objset_type. * * outputs: none */ @@ -3767,13 +3840,13 @@ zfs_ioc_send(zfs_cmd_t *zc) { objset_t *fromsnap = NULL; objset_t *tosnap; - file_t *fp; int error; offset_t off; dsl_dataset_t *ds; dsl_dataset_t *dsfrom = NULL; spa_t *spa; dsl_pool_t *dp; + boolean_t estimate = (zc->zc_guid != 0); error = spa_open(zc->zc_name, &spa, FTAG); if (error) @@ -3814,26 +3887,75 @@ zfs_ioc_send(zfs_cmd_t *zc) spa_close(spa, FTAG); } - fp = getf(zc->zc_cookie); - if (fp == NULL) { - dsl_dataset_rele(ds, FTAG); - if (dsfrom) - dsl_dataset_rele(dsfrom, FTAG); - return (EBADF); - } + if (estimate) { + error = dmu_send_estimate(tosnap, fromsnap, zc->zc_obj, + &zc->zc_objset_type); + } else { + file_t *fp = getf(zc->zc_cookie); + if (fp == NULL) { + dsl_dataset_rele(ds, FTAG); + if (dsfrom) + dsl_dataset_rele(dsfrom, FTAG); + return (EBADF); + } - off = fp->f_offset; - error = dmu_sendbackup(tosnap, fromsnap, zc->zc_obj, fp->f_vnode, &off); + off = fp->f_offset; + error = dmu_send(tosnap, fromsnap, zc->zc_obj, + zc->zc_cookie, fp->f_vnode, &off); - if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) - fp->f_offset = off; - releasef(zc->zc_cookie); + if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) + fp->f_offset = off; + releasef(zc->zc_cookie); + } if (dsfrom) dsl_dataset_rele(dsfrom, FTAG); dsl_dataset_rele(ds, FTAG); return (error); } +/* + * inputs: + * zc_name name of snapshot on which to report progress + * zc_cookie file descriptor of send stream + * + * outputs: + * zc_cookie number of bytes written in send stream thus far + */ +static int +zfs_ioc_send_progress(zfs_cmd_t *zc) +{ + dsl_dataset_t *ds; + dmu_sendarg_t *dsp = NULL; + int error; + + if ((error = dsl_dataset_hold(zc->zc_name, FTAG, &ds)) != 0) + return (error); + + mutex_enter(&ds->ds_sendstream_lock); + + /* + * Iterate over all the send streams currently active on this dataset. + * If there's one which matches the specified file descriptor _and_ the + * stream was started by the current process, return the progress of + * that stream. 
+ */ + for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; + dsp = list_next(&ds->ds_sendstreams, dsp)) { + if (dsp->dsa_outfd == zc->zc_cookie && + dsp->dsa_proc == curproc) + break; + } + + if (dsp != NULL) + zc->zc_cookie = *(dsp->dsa_off); + else + error = ENOENT; + + mutex_exit(&ds->ds_sendstream_lock); + dsl_dataset_rele(ds, FTAG); + return (error); +} + static int zfs_ioc_inject_fault(zfs_cmd_t *zc) { @@ -3968,6 +4090,22 @@ zfs_ioc_clear(zfs_cmd_t *zc) return (error); } +static int +zfs_ioc_pool_reopen(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error) + return (error); + + spa_vdev_state_enter(spa, SCL_NONE); + vdev_reopen(spa->spa_root_vdev); + (void) spa_vdev_state_exit(spa, NULL, 0); + spa_close(spa, FTAG); + return (0); +} /* * inputs: * zc_name name of filesystem @@ -4622,6 +4760,70 @@ zfs_ioc_get_holds(zfs_cmd_t *zc) } /* + * inputs: + * zc_name name of new filesystem or snapshot + * zc_value full name of old snapshot + * + * outputs: + * zc_cookie space in bytes + * zc_objset_type compressed space in bytes + * zc_perm_action uncompressed space in bytes + */ +static int +zfs_ioc_space_written(zfs_cmd_t *zc) +{ + int error; + dsl_dataset_t *new, *old; + + error = dsl_dataset_hold(zc->zc_name, FTAG, &new); + if (error != 0) + return (error); + error = dsl_dataset_hold(zc->zc_value, FTAG, &old); + if (error != 0) { + dsl_dataset_rele(new, FTAG); + return (error); + } + + error = dsl_dataset_space_written(old, new, &zc->zc_cookie, + &zc->zc_objset_type, &zc->zc_perm_action); + dsl_dataset_rele(old, FTAG); + dsl_dataset_rele(new, FTAG); + return (error); +} + +/* + * inputs: + * zc_name full name of last snapshot + * zc_value full name of first snapshot + * + * outputs: + * zc_cookie space in bytes + * zc_objset_type compressed space in bytes + * zc_perm_action uncompressed space in bytes + */ +static int +zfs_ioc_space_snaps(zfs_cmd_t *zc) +{ + int error; + dsl_dataset_t *new, *old; + + error = dsl_dataset_hold(zc->zc_name, FTAG, &new); + if (error != 0) + return (error); + error = dsl_dataset_hold(zc->zc_value, FTAG, &old); + if (error != 0) { + dsl_dataset_rele(new, FTAG); + return (error); + } + + error = dsl_dataset_space_wouldfree(old, new, &zc->zc_cookie, + &zc->zc_objset_type, &zc->zc_perm_action); + dsl_dataset_rele(old, FTAG); + dsl_dataset_rele(new, FTAG); + return (error); +} + +/* * pool create, destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export * do the logging of those commands. 
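
The batched destroy in zfs_ioc_destroy_snaps_nvl() above replaces one ioctl per snapshot with a single call, so the kernel has to verify that every name in the nvlist really sits beneath zc_name before trusting the one recursive permission check done on "zc_name@". A minimal standalone sketch of that strncmp()-plus-separator test follows; the helper name snap_under_fs is illustrative, not part of the source:

    #include <stdio.h>
    #include <string.h>

    /*
     * Return nonzero if 'name' is a snapshot at or below the filesystem
     * 'fs': either "fs@snap" or "fs/child@snap".  The character right
     * after the prefix must be '@' or '/'; without that check,
     * "tank/ab@snap" would wrongly pass a test against "tank/a".
     */
    static int
    snap_under_fs(const char *fs, const char *name)
    {
            size_t len = strlen(fs);

            if (strncmp(fs, name, len) != 0)
                    return (0);
            return (name[len] == '@' || name[len] == '/');
    }

    int
    main(void)
    {
            printf("%d\n", snap_under_fs("tank/a", "tank/a@snap1"));   /* 1 */
            printf("%d\n", snap_under_fs("tank/a", "tank/a/b@snap1")); /* 1 */
            printf("%d\n", snap_under_fs("tank/a", "tank/ab@snap1"));  /* 0 */
            return (0);
    }

Rejecting any name that fails this test with EINVAL is what makes the single recursive check on "zc_name@" sufficient for the whole batch.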
@@ -4683,7 +4885,7 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, - { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE, + { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_FALSE, POOL_CHECK_NONE }, { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, POOL_CHECK_NONE }, @@ -4697,8 +4899,6 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { POOL_CHECK_NONE }, { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, - { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, DATASET_NAME, - B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, POOL_NAME, B_FALSE, @@ -4742,7 +4942,19 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { { zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_obj_to_stats, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, - POOL_CHECK_SUSPENDED } + POOL_CHECK_SUSPENDED }, + { zfs_ioc_space_written, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + POOL_CHECK_SUSPENDED }, + { zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + POOL_CHECK_SUSPENDED }, + { zfs_ioc_destroy_snaps_nvl, zfs_secpolicy_destroy_recursive, + DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, + { zfs_ioc_pool_reguid, zfs_secpolicy_config, POOL_NAME, B_TRUE, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, + { zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME, B_TRUE, + POOL_CHECK_SUSPENDED }, + { zfs_ioc_send_progress, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + POOL_CHECK_NONE } }; int diff --git a/uts/common/fs/zfs/zfs_vfsops.c b/uts/common/fs/zfs/zfs_vfsops.c index 4970552..14b888b 100644 --- a/uts/common/fs/zfs/zfs_vfsops.c +++ b/uts/common/fs/zfs/zfs_vfsops.c @@ -384,6 +384,14 @@ vscan_changed_cb(void *arg, uint64_t newval) } static void +acl_mode_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_mode = newval; +} + +static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; @@ -514,6 +522,8 @@ zfs_register_callbacks(vfs_t *vfsp) error = error ? error : dsl_prop_register(ds, "snapdir", snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, + "aclmode", acl_mode_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); error = error ? 
error : dsl_prop_register(ds, "vscan", vscan_changed_cb, zfsvfs); @@ -554,6 +564,7 @@ unregister: (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); @@ -1236,6 +1247,9 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs) VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs) == 0); + VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, + zfsvfs) == 0); + VERIFY(dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs) == 0); diff --git a/uts/common/fs/zfs/zfs_vnops.c b/uts/common/fs/zfs/zfs_vnops.c index a072007..0c39274 100644 --- a/uts/common/fs/zfs/zfs_vnops.c +++ b/uts/common/fs/zfs/zfs_vnops.c @@ -2975,7 +2975,8 @@ top: uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - zfs_acl_chmod_setattr(zp, &aclp, new_mode); + if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) + goto out; mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { @@ -4193,6 +4194,14 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, ZFS_VERIFY_ZP(zp); /* + * There's nothing to do if no data is cached. + */ + if (!vn_has_cached_data(vp)) { + ZFS_EXIT(zfsvfs); + return (0); + } + + /* * Align this request to the file block size in case we kluster. * XXX - this can result in pretty aggresive locking, which can * impact simultanious read/write access. One option might be diff --git a/uts/common/fs/zfs/zil.c b/uts/common/fs/zfs/zil.c index c66313f..081242c 100644 --- a/uts/common/fs/zfs/zil.c +++ b/uts/common/fs/zfs/zil.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -560,7 +561,7 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); - ASSERT(!keep_first); + VERIFY(!keep_first); while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { list_remove(&zilog->zl_lwb_list, lwb); if (lwb->lwb_buf != NULL) @@ -1661,20 +1662,9 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) void zil_free(zilog_t *zilog) { - lwb_t *head_lwb; - zilog->zl_stop_sync = 1; - /* - * After zil_close() there should only be one lwb with a buffer. 
- */ - head_lwb = list_head(&zilog->zl_lwb_list); - if (head_lwb) { - ASSERT(head_lwb == list_tail(&zilog->zl_lwb_list)); - list_remove(&zilog->zl_lwb_list, head_lwb); - zio_buf_free(head_lwb->lwb_buf, head_lwb->lwb_sz); - kmem_cache_free(zil_lwb_cache, head_lwb); - } + ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); avl_destroy(&zilog->zl_vdev_tree); @@ -1714,6 +1704,10 @@ zil_open(objset_t *os, zil_get_data_t *get_data) { zilog_t *zilog = dmu_objset_zil(os); + ASSERT(zilog->zl_clean_taskq == NULL); + ASSERT(zilog->zl_get_data == NULL); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + zilog->zl_get_data = get_data; zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, 2, 2, TASKQ_PREPOPULATE); @@ -1727,7 +1721,7 @@ zil_open(objset_t *os, zil_get_data_t *get_data) void zil_close(zilog_t *zilog) { - lwb_t *tail_lwb; + lwb_t *lwb; uint64_t txg = 0; zil_commit(zilog, 0); /* commit all itx */ @@ -1739,9 +1733,9 @@ zil_close(zilog_t *zilog) * destroy the zl_clean_taskq. */ mutex_enter(&zilog->zl_lock); - tail_lwb = list_tail(&zilog->zl_lwb_list); - if (tail_lwb != NULL) - txg = tail_lwb->lwb_max_txg; + lwb = list_tail(&zilog->zl_lwb_list); + if (lwb != NULL) + txg = lwb->lwb_max_txg; mutex_exit(&zilog->zl_lock); if (txg) txg_wait_synced(zilog->zl_dmu_pool, txg); @@ -1749,6 +1743,19 @@ zil_close(zilog_t *zilog) taskq_destroy(zilog->zl_clean_taskq); zilog->zl_clean_taskq = NULL; zilog->zl_get_data = NULL; + + /* + * We should have only one LWB left on the list; remove it now. + */ + mutex_enter(&zilog->zl_lock); + lwb = list_head(&zilog->zl_lwb_list); + if (lwb != NULL) { + ASSERT(lwb == list_tail(&zilog->zl_lwb_list)); + list_remove(&zilog->zl_lwb_list, lwb); + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + kmem_cache_free(zil_lwb_cache, lwb); + } + mutex_exit(&zilog->zl_lock); } /* diff --git a/uts/common/fs/zfs/zio.c b/uts/common/fs/zfs/zio.c index eb509c5..cfb5733 100644 --- a/uts/common/fs/zfs/zio.c +++ b/uts/common/fs/zfs/zio.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. */ #include <sys/zfs_context.h> @@ -78,6 +80,7 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #ifdef _KERNEL extern vmem_t *zio_alloc_arena; #endif +extern int zfs_mg_alloc_failures; /* * An allocating zio is one that either currently has the DVA allocate @@ -158,6 +161,12 @@ zio_init(void) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } + /* + * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs + * to fail 3 times per txg or 8 failures, whichever is greater. 
+ */ + zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); + zio_inject_init(); } @@ -610,7 +619,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && - zp->zp_type < DMU_OT_NUMTYPES && + DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa) && @@ -894,7 +903,7 @@ zio_read_bp_init(zio_t *zio) zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } - if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) + if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) @@ -1053,7 +1062,7 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; - int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0); + int flags = (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and @@ -1077,8 +1086,15 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); - (void) taskq_dispatch(spa->spa_zio_taskq[t][q], - (task_func_t *)zio_execute, zio, flags); + + /* + * NB: We are assuming that the zio can only be dispatched + * to a single taskq at a time. It would be a grievous error + * to dispatch the zio to another taskq at the same time. + */ + ASSERT(zio->io_tqent.tqent_next == NULL); + taskq_dispatch_ent(spa->spa_zio_taskq[t][q], + (task_func_t *)zio_execute, zio, flags, &zio->io_tqent); } static boolean_t @@ -2114,6 +2130,7 @@ zio_dva_allocate(zio_t *zio) metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = zio->io_bp; int error; + int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); @@ -2126,10 +2143,21 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); + /* + * The dump device does not support gang blocks so allocation on + * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid + * the "fast" gang feature. + */ + flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; + flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? + METASLAB_GANG_CHILD : 0; error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_copies, zio->io_txg, NULL, 0); + zio->io_prop.zp_copies, zio->io_txg, NULL, flags); if (error) { + spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " + "size %llu, error %d", spa_name(spa), zio, zio->io_size, + error); if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) return (zio_write_gang_block(zio)); zio->io_error = error; @@ -2191,13 +2219,22 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, ASSERT(txg > spa_syncing_txg(spa)); - if (use_slog) + /* + * ZIL blocks are always contiguous (i.e. not gang blocks) so we + * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" + * when allocating them. 
+ */ + if (use_slog) { error = metaslab_alloc(spa, spa_log_class(spa), size, - new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); + new_bp, 1, txg, old_bp, + METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); + } - if (error) + if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); + new_bp, 1, txg, old_bp, + METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); + } if (error == 0) { BP_SET_LSIZE(new_bp, size); @@ -2869,9 +2906,11 @@ zio_done(zio_t *zio) * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ - (void) taskq_dispatch( + ASSERT(zio->io_tqent.tqent_next == NULL); + (void) taskq_dispatch_ent( spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], - (task_func_t *)zio_reexecute, zio, TQ_SLEEP); + (task_func_t *)zio_reexecute, zio, 0, + &zio->io_tqent); } return (ZIO_PIPELINE_STOP); } @@ -2950,3 +2989,45 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_checksum_verify, zio_done }; + +/* dnp is the dnode for zb1->zb_object */ +boolean_t +zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, + const zbookmark_t *zb2) +{ + uint64_t zb1nextL0, zb2thisobj; + + ASSERT(zb1->zb_objset == zb2->zb_objset); + ASSERT(zb2->zb_level == 0); + + /* + * A bookmark in the deadlist is considered to be after + * everything else. + */ + if (zb2->zb_object == DMU_DEADLIST_OBJECT) + return (B_TRUE); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) + return (B_FALSE); + + zb1nextL0 = (zb1->zb_blkid + 1) << + ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + + zb2thisobj = zb2->zb_object ? zb2->zb_object : + zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + + if (zb1->zb_object == DMU_META_DNODE_OBJECT) { + uint64_t nextobj = zb1nextL0 * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; + return (nextobj <= zb2thisobj); + } + + if (zb1->zb_object < zb2thisobj) + return (B_TRUE); + if (zb1->zb_object > zb2thisobj) + return (B_FALSE); + if (zb2->zb_object == DMU_META_DNODE_OBJECT) + return (B_FALSE); + return (zb1nextL0 <= zb2->zb_blkid); +} diff --git a/uts/common/fs/zfs/zvol.c b/uts/common/fs/zfs/zvol.c index 47b6c5a..df9a16b 100644 --- a/uts/common/fs/zfs/zvol.c +++ b/uts/common/fs/zfs/zvol.c @@ -20,10 +20,12 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Portions Copyright 2010 Robert Milkowski + * + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ -/* Portions Copyright 2010 Robert Milkowski */ - /* * ZFS volume emulation driver. * @@ -342,6 +344,24 @@ zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) } /* + * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we + * implement DKIOCFREE/free-long-range. + */ +static int +zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap) +{ + uint64_t offset, length; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + offset = lr->lr_offset; + length = lr->lr_length; + + return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length)); +} + +/* * Replay a TX_WRITE ZIL transaction that didn't get committed * after a system failure */ @@ -391,7 +411,7 @@ zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap) /* * Callback vectors for replaying records. - * Only TX_WRITE is needed for zvol. + * Only TX_WRITE and TX_TRUNCATE are needed for zvol. 
*/ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* 0 no such transaction type */ @@ -404,7 +424,7 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_LINK */ zvol_replay_err, /* TX_RENAME */ zvol_replay_write, /* TX_WRITE */ - zvol_replay_err, /* TX_TRUNCATE */ + zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ zvol_replay_err, /* TX_ACL */ zvol_replay_err, /* TX_CREATE_ACL */ @@ -1512,7 +1532,32 @@ zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid, */ /* + * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. + */ +static void +zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, + boolean_t sync) +{ + itx_t *itx; + lr_truncate_t *lr; + zilog_t *zilog = zv->zv_zilog; + + if (zil_replaying(zilog, tx)) + return; + + itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); + lr = (lr_truncate_t *)&itx->itx_lr; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = off; + lr->lr_length = len; + + itx->itx_sync = sync; + zil_itx_assign(zilog, itx, tx); +} + +/* * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I). + * Also a dirtbag dkio ioctl for unmap/free-block functionality. */ /*ARGSUSED*/ int @@ -1631,6 +1676,65 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) zfs_range_unlock(rl); break; + case DKIOCFREE: + { + dkioc_free_t df; + dmu_tx_t *tx; + + if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) { + error = EFAULT; + break; + } + + /* + * Apply Postel's Law to length-checking. If they overshoot, + * just blank out until the end, if there's a need to blank + * out anything. + */ + if (df.df_start >= zv->zv_volsize) + break; /* No need to do anything... */ + if (df.df_start + df.df_length > zv->zv_volsize) + df.df_length = DMU_OBJECT_END; + + rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length, + RL_WRITER); + tx = dmu_tx_create(zv->zv_objset); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + } else { + zvol_log_truncate(zv, tx, df.df_start, + df.df_length, B_TRUE); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, + df.df_start, df.df_length); + } + + zfs_range_unlock(rl); + + if (error == 0) { + /* + * If the write-cache is disabled or 'sync' property + * is set to 'always' then treat this as a synchronous + * operation (i.e. commit to zil). + */ + if (!(zv->zv_flags & ZVOL_WCE) || + (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + + /* + * If the caller really wants synchronous writes, and + * can't wait for them, don't return until the write + * is done. + */ + if (df.df_flags & DF_WAIT_SYNC) { + txg_wait_synced( + dmu_objset_pool(zv->zv_objset), 0); + } + } + break; + } + default: error = ENOTTY; break; diff --git a/uts/common/os/fm.c b/uts/common/os/fm.c index 4efcff4..eff91ae 100644 --- a/uts/common/os/fm.c +++ b/uts/common/os/fm.c @@ -79,7 +79,7 @@ * URL and SUNW-MSG-ID value to display for fm_panic(), defined below. These * values must be kept in sync with the FMA source code in usr/src/cmd/fm. 
*/ -static const char *fm_url = "http://www.sun.com/msg"; +static const char *fm_url = "http://illumos.org/msg"; static const char *fm_msgid = "SUNOS-8000-0G"; static char *volatile fm_panicstr = NULL; diff --git a/uts/common/sys/ccompile.h b/uts/common/sys/ccompile.h index c9857b08..690bb7a 100644 --- a/uts/common/sys/ccompile.h +++ b/uts/common/sys/ccompile.h @@ -27,8 +27,6 @@ #ifndef _SYS_CCOMPILE_H #define _SYS_CCOMPILE_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains definitions designed to enable different compilers * to be used harmoniously on Solaris systems. @@ -79,6 +77,27 @@ extern "C" { */ #define __sun_attr___noreturn__ __attribute__((__noreturn__)) +/* + * The function is 'extern inline' and expects GNU C89 behaviour, not C99 + * behaviour. + * + * Should only be used on 'extern inline' definitions for GCC. + */ +#if __GNUC_VERSION >= 40200 +#define __sun_attr___gnu_inline__ __attribute__((__gnu_inline__)) +#else +#define __sun_attr___gnu_inline__ +#endif + +/* + * The function has control flow such that it may return multiple times (in + * the manner of setjmp or vfork) + */ +#if __GNUC_VERSION >= 40100 +#define __sun_attr___returns_twice__ __attribute__((__returns_twice__)) +#else +#define __sun_attr___returns_twice__ +#endif /* * This is an appropriate label for functions that do not @@ -116,10 +135,11 @@ extern "C" { #define __KPRINTFLIKE(__n) __sun_attr__((__KPRINTFLIKE__(__n))) #define __KVPRINTFLIKE(__n) __sun_attr__((__KVPRINTFLIKE__(__n))) #define __NORETURN __sun_attr__((__noreturn__)) +#define __GNU_INLINE __inline__ __sun_attr__((__gnu_inline__)) +#define __RETURNS_TWICE __sun_attr__((__returns_twice__)) #define __CONST __sun_attr__((__const__)) #define __PURE __sun_attr__((__pure__)) - #ifdef __cplusplus } #endif diff --git a/uts/common/sys/cmn_err.h b/uts/common/sys/cmn_err.h index e710d8e..736c77b 100644 --- a/uts/common/sys/cmn_err.h +++ b/uts/common/sys/cmn_err.h @@ -26,17 +26,19 @@ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ #ifndef _SYS_CMN_ERR_H #define _SYS_CMN_ERR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #if defined(_KERNEL) && !defined(_ASM) #include <sys/va_list.h> #endif +#include <sys/dditypes.h> + #ifdef __cplusplus extern "C" { #endif @@ -56,47 +58,40 @@ extern "C" { /*PRINTFLIKE2*/ extern void cmn_err(int, const char *, ...) __KPRINTFLIKE(2); -#pragma rarely_called(cmn_err) extern void vzcmn_err(zoneid_t, int, const char *, __va_list) __KVPRINTFLIKE(3); -#pragma rarely_called(vzcmn_err) + +extern void dev_err(dev_info_t *, int, char *, ...) + __KPRINTFLIKE(3); extern void vcmn_err(int, const char *, __va_list) __KVPRINTFLIKE(2); -#pragma rarely_called(vcmn_err) /*PRINTFLIKE3*/ extern void zcmn_err(zoneid_t, int, const char *, ...) __KPRINTFLIKE(3); -#pragma rarely_called(zcmn_err) /*PRINTFLIKE1*/ extern void printf(const char *, ...) __KPRINTFLIKE(1); -#pragma rarely_called(printf) extern void vzprintf(zoneid_t, const char *, __va_list) __KVPRINTFLIKE(2); -#pragma rarely_called(vzprintf) /*PRINTFLIKE2*/ extern void zprintf(zoneid_t, const char *, ...) __KPRINTFLIKE(2); -#pragma rarely_called(zprintf) extern void vprintf(const char *, __va_list) __KVPRINTFLIKE(1); -#pragma rarely_called(vprintf) /*PRINTFLIKE1*/ extern void uprintf(const char *, ...) 
__KPRINTFLIKE(1); -#pragma rarely_called(uprintf) extern void vuprintf(const char *, __va_list) __KVPRINTFLIKE(1); -#pragma rarely_called(vuprintf) /*PRINTFLIKE3*/ extern size_t snprintf(char *, size_t, const char *, ...) @@ -112,11 +107,9 @@ extern char *vsprintf(char *, const char *, __va_list) /*PRINTFLIKE1*/ extern void panic(const char *, ...) __KPRINTFLIKE(1) __NORETURN; -#pragma rarely_called(panic) extern void vpanic(const char *, __va_list) __KVPRINTFLIKE(1) __NORETURN; -#pragma rarely_called(vpanic) #endif /* _KERNEL */ #endif /* !_ASM */ diff --git a/uts/common/sys/dtrace.h b/uts/common/sys/dtrace.h index 007502d..c15799a 100644 --- a/uts/common/sys/dtrace.h +++ b/uts/common/sys/dtrace.h @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + #ifndef _SYS_DTRACE_H #define _SYS_DTRACE_H @@ -202,6 +206,7 @@ typedef enum dtrace_probespec { #define DIF_VAR_ARGS 0x0000 /* arguments array */ #define DIF_VAR_REGS 0x0001 /* registers array */ #define DIF_VAR_UREGS 0x0002 /* user registers array */ +#define DIF_VAR_VMREGS 0x0003 /* virtual machine registers array */ #define DIF_VAR_CURTHREAD 0x0100 /* thread pointer */ #define DIF_VAR_TIMESTAMP 0x0101 /* timestamp */ #define DIF_VAR_VTIMESTAMP 0x0102 /* virtual timestamp */ @@ -280,8 +285,10 @@ typedef enum dtrace_probespec { #define DIF_SUBR_INET_NTOP 41 #define DIF_SUBR_INET_NTOA 42 #define DIF_SUBR_INET_NTOA6 43 +#define DIF_SUBR_TOUPPER 44 +#define DIF_SUBR_TOLOWER 45 -#define DIF_SUBR_MAX 43 /* max subroutine value */ +#define DIF_SUBR_MAX 45 /* max subroutine value */ typedef uint32_t dif_instr_t; @@ -390,6 +397,8 @@ typedef struct dtrace_difv { #define DTRACEACT_PRINTF 3 /* printf() action */ #define DTRACEACT_PRINTA 4 /* printa() action */ #define DTRACEACT_LIBACT 5 /* library-controlled action */ +#define DTRACEACT_TRACEMEM 6 /* tracemem() action */ +#define DTRACEACT_TRACEMEM_DYNSIZE 7 /* dynamic tracemem() size */ #define DTRACEACT_PROC 0x0100 #define DTRACEACT_USTACK (DTRACEACT_PROC + 1) @@ -455,6 +464,7 @@ typedef struct dtrace_difv { #define DTRACEAGG_STDDEV (DTRACEACT_AGGREGATION + 6) #define DTRACEAGG_QUANTIZE (DTRACEACT_AGGREGATION + 7) #define DTRACEAGG_LQUANTIZE (DTRACEACT_AGGREGATION + 8) +#define DTRACEAGG_LLQUANTIZE (DTRACEACT_AGGREGATION + 9) #define DTRACEACT_ISAGG(x) \ (DTRACEACT_CLASS(x) == DTRACEACT_AGGREGATION) @@ -489,6 +499,31 @@ typedef struct dtrace_difv { (int32_t)(((x) & DTRACE_LQUANTIZE_BASEMASK) >> \ DTRACE_LQUANTIZE_BASESHIFT) +#define DTRACE_LLQUANTIZE_FACTORSHIFT 48 +#define DTRACE_LLQUANTIZE_FACTORMASK ((uint64_t)UINT16_MAX << 48) +#define DTRACE_LLQUANTIZE_LOWSHIFT 32 +#define DTRACE_LLQUANTIZE_LOWMASK ((uint64_t)UINT16_MAX << 32) +#define DTRACE_LLQUANTIZE_HIGHSHIFT 16 +#define DTRACE_LLQUANTIZE_HIGHMASK ((uint64_t)UINT16_MAX << 16) +#define DTRACE_LLQUANTIZE_NSTEPSHIFT 0 +#define DTRACE_LLQUANTIZE_NSTEPMASK UINT16_MAX + +#define DTRACE_LLQUANTIZE_FACTOR(x) \ + (uint16_t)(((x) & DTRACE_LLQUANTIZE_FACTORMASK) >> \ + DTRACE_LLQUANTIZE_FACTORSHIFT) + +#define DTRACE_LLQUANTIZE_LOW(x) \ + (uint16_t)(((x) & DTRACE_LLQUANTIZE_LOWMASK) >> \ + DTRACE_LLQUANTIZE_LOWSHIFT) + +#define DTRACE_LLQUANTIZE_HIGH(x) \ + (uint16_t)(((x) & DTRACE_LLQUANTIZE_HIGHMASK) >> \ + DTRACE_LLQUANTIZE_HIGHSHIFT) + +#define DTRACE_LLQUANTIZE_NSTEP(x) \ + (uint16_t)(((x) & DTRACE_LLQUANTIZE_NSTEPMASK) >> \ + DTRACE_LLQUANTIZE_NSTEPSHIFT) + #define DTRACE_USTACK_NFRAMES(x) (uint32_t)((x) & UINT32_MAX) #define DTRACE_USTACK_STRSIZE(x) (uint32_t)((x) >> 32) 
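
The four DTRACE_LLQUANTIZE_* accessors above all follow one pattern: mask a 16-bit field out of a packed 64-bit aggregation argument, then shift it down. Four parameters have to fit in a single uint64_t, which is why each is limited to 16 bits. A small userland sketch of the round trip, assuming a hypothetical llquantize_pack() encoder (in DTrace itself the D compiler builds this packed argument; the header macros only decode it):

    #include <stdio.h>
    #include <stdint.h>

    #define FACTORSHIFT     48      /* DTRACE_LLQUANTIZE_FACTORSHIFT */
    #define LOWSHIFT        32      /* DTRACE_LLQUANTIZE_LOWSHIFT */
    #define HIGHSHIFT       16      /* DTRACE_LLQUANTIZE_HIGHSHIFT */
    #define NSTEPSHIFT      0       /* DTRACE_LLQUANTIZE_NSTEPSHIFT */

    /* Hypothetical encoder: pack the four llquantize() parameters. */
    static uint64_t
    llquantize_pack(uint16_t factor, uint16_t low, uint16_t high,
        uint16_t nstep)
    {
            return ((uint64_t)factor << FACTORSHIFT |
                (uint64_t)low << LOWSHIFT |
                (uint64_t)high << HIGHSHIFT |
                (uint64_t)nstep << NSTEPSHIFT);
    }

    int
    main(void)
    {
            /* e.g. factor 10, magnitudes 0..6, 20 linear steps per order */
            uint64_t arg = llquantize_pack(10, 0, 6, 20);

            /*
             * Truncating to 16 bits after the shift extracts the same
             * field the DTRACE_LLQUANTIZE_*MASK macros select.
             */
            uint16_t factor = (uint16_t)(arg >> FACTORSHIFT);
            uint16_t low = (uint16_t)(arg >> LOWSHIFT);
            uint16_t high = (uint16_t)(arg >> HIGHSHIFT);
            uint16_t nstep = (uint16_t)(arg >> NSTEPSHIFT);

            printf("factor=%u low=%u high=%u nstep=%u\n",
                (unsigned)factor, (unsigned)low, (unsigned)high,
                (unsigned)nstep);
            return (0);
    }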
#define DTRACE_USTACK_ARG(x, y) \ @@ -1321,7 +1356,7 @@ typedef struct dof_helper { * dtps_resume() <-- Resume specified probe * dtps_getargdesc() <-- Get the argument description for args[X] * dtps_getargval() <-- Get the value for an argX or args[X] variable - * dtps_usermode() <-- Find out if the probe was fired in user mode + * dtps_mode() <-- Return the mode of the fired probe * dtps_destroy() <-- Destroy all state associated with this probe * * 1.2 void dtps_provide(void *arg, const dtrace_probedesc_t *spec) @@ -1570,24 +1605,32 @@ typedef struct dof_helper { * This is called from within dtrace_probe() meaning that interrupts * are disabled. No locks should be taken within this entry point. * - * 1.10 int dtps_usermode(void *arg, dtrace_id_t id, void *parg) + * 1.10 int dtps_mode(void *arg, dtrace_id_t id, void *parg) * * 1.10.1 Overview * - * Called to determine if the probe was fired in a user context. + * Called to determine the mode of a fired probe. * * 1.10.2 Arguments and notes * * The first argument is the cookie as passed to dtrace_register(). The - * second argument is the identifier of the current probe. The third + * second argument is the identifier of the current probe. The third * argument is the probe argument as passed to dtrace_probe_create(). This * entry point must not be left NULL for providers whose probes allow for - * mixed mode tracing, that is to say those probes that can fire during - * kernel- _or_ user-mode execution + * mixed mode tracing, that is to say those unanchored probes that can fire + * during kernel- or user-mode execution. * * 1.10.3 Return value * - * A boolean value. + * A bitwise OR that encapsulates both the mode (either DTRACE_MODE_KERNEL + * or DTRACE_MODE_USER) and the policy when the privilege of the enabling + * is insufficient for that mode (either DTRACE_MODE_NOPRIV_DROP or + * DTRACE_MODE_NOPRIV_RESTRICT). If the policy is DTRACE_MODE_NOPRIV_DROP, + * insufficient privilege will result in the probe firing being silently + * ignored for the enabling; if the policy is DTRACE_NODE_NOPRIV_RESTRICT, + * insufficient privilege will not prevent probe processing for the + * enabling, but restrictions will be in place that induce a UPRIV fault + * upon attempt to examine probe arguments or current process state. * * 1.10.4 Caller's context * @@ -1978,10 +2021,15 @@ typedef struct dtrace_pops { dtrace_argdesc_t *desc); uint64_t (*dtps_getargval)(void *arg, dtrace_id_t id, void *parg, int argno, int aframes); - int (*dtps_usermode)(void *arg, dtrace_id_t id, void *parg); + int (*dtps_mode)(void *arg, dtrace_id_t id, void *parg); void (*dtps_destroy)(void *arg, dtrace_id_t id, void *parg); } dtrace_pops_t; +#define DTRACE_MODE_KERNEL 0x01 +#define DTRACE_MODE_USER 0x02 +#define DTRACE_MODE_NOPRIV_DROP 0x10 +#define DTRACE_MODE_NOPRIV_RESTRICT 0x20 + typedef uintptr_t dtrace_provider_id_t; extern int dtrace_register(const char *, const dtrace_pattr_t *, uint32_t, diff --git a/uts/common/sys/dtrace_impl.h b/uts/common/sys/dtrace_impl.h index fed537e..3bebd0c 100644 --- a/uts/common/sys/dtrace_impl.h +++ b/uts/common/sys/dtrace_impl.h @@ -24,11 +24,13 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + #ifndef _SYS_DTRACE_IMPL_H #define _SYS_DTRACE_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -419,8 +421,11 @@ typedef struct dtrace_buffer { uint32_t dtb_errors; /* number of errors */ uint32_t dtb_xamot_errors; /* errors in inactive buffer */ #ifndef _LP64 - uint64_t dtb_pad1; + uint64_t dtb_pad1; /* pad out to 64 bytes */ #endif + uint64_t dtb_switched; /* time of last switch */ + uint64_t dtb_interval; /* observed switch interval */ + uint64_t dtb_pad2[6]; /* pad to avoid false sharing */ } dtrace_buffer_t; /* @@ -924,7 +929,8 @@ typedef struct dtrace_mstate { * Access flag used by dtrace_mstate.dtms_access. */ #define DTRACE_ACCESS_KERNEL 0x1 /* the priv to read kmem */ - +#define DTRACE_ACCESS_PROC 0x2 /* the priv for proc state */ +#define DTRACE_ACCESS_ARGS 0x4 /* the priv to examine args */ /* * DTrace Activity @@ -1139,7 +1145,7 @@ struct dtrace_provider { dtrace_pops_t dtpv_pops; /* provider operations */ char *dtpv_name; /* provider name */ void *dtpv_arg; /* provider argument */ - uint_t dtpv_defunct; /* boolean: defunct provider */ + hrtime_t dtpv_defunct; /* when made defunct */ struct dtrace_provider *dtpv_next; /* next provider */ }; @@ -1246,6 +1252,7 @@ extern void dtrace_copyoutstr(uintptr_t, uintptr_t, size_t, volatile uint16_t *); extern void dtrace_getpcstack(pc_t *, int, int, uint32_t *); extern ulong_t dtrace_getreg(struct regs *, uint_t); +extern uint64_t dtrace_getvmreg(uint_t, volatile uint16_t *); extern int dtrace_getstackdepth(int); extern void dtrace_getupcstack(uint64_t *, int); extern void dtrace_getufpstack(uint64_t *, uint64_t *, int); diff --git a/uts/common/sys/fs/zfs.h b/uts/common/sys/fs/zfs.h index da0b12b..4cf7bba 100644 --- a/uts/common/sys/fs/zfs.h +++ b/uts/common/sys/fs/zfs.h @@ -21,6 +21,9 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -87,7 +90,7 @@ typedef enum { ZFS_PROP_READONLY, ZFS_PROP_ZONED, ZFS_PROP_SNAPDIR, - ZFS_PROP_PRIVATE, /* not exposed to user, temporary */ + ZFS_PROP_ACLMODE, ZFS_PROP_ACLINHERIT, ZFS_PROP_CREATETXG, /* not exposed to the user */ ZFS_PROP_NAME, /* not exposed to the user */ @@ -122,6 +125,9 @@ typedef enum { ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, ZFS_PROP_SYNC, + ZFS_PROP_REFRATIO, + ZFS_PROP_WRITTEN, + ZFS_PROP_CLONES, ZFS_NUM_PROPS } zfs_prop_t; @@ -161,9 +167,15 @@ typedef enum { ZPOOL_PROP_FREE, ZPOOL_PROP_ALLOCATED, ZPOOL_PROP_READONLY, + ZPOOL_PROP_COMMENT, + ZPOOL_PROP_EXPANDSZ, + ZPOOL_PROP_FREEING, ZPOOL_NUM_PROPS } zpool_prop_t; +/* Small enough to not hog a whole line of printout in zpool(1M). 
*/ +#define ZPROP_MAX_COMMENT 32 + #define ZPROP_CONT -2 #define ZPROP_INVAL -1 @@ -218,6 +230,7 @@ const char *zfs_prop_to_name(zfs_prop_t); zfs_prop_t zfs_name_to_prop(const char *); boolean_t zfs_prop_user(const char *); boolean_t zfs_prop_userquota(const char *); +boolean_t zfs_prop_written(const char *); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); @@ -231,6 +244,8 @@ const char *zpool_prop_to_name(zpool_prop_t); const char *zpool_prop_default_string(zpool_prop_t); uint64_t zpool_prop_default_numeric(zpool_prop_t); boolean_t zpool_prop_readonly(zpool_prop_t); +boolean_t zpool_prop_feature(const char *); +boolean_t zpool_prop_unsupported(const char *name); int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); @@ -338,6 +353,7 @@ typedef enum { #define SPA_VERSION_26 26ULL #define SPA_VERSION_27 27ULL #define SPA_VERSION_28 28ULL +#define SPA_VERSION_5000 5000ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk @@ -345,8 +361,8 @@ typedef enum { * and do the appropriate changes. Also bump the version number in * usr/src/grub/capability. */ -#define SPA_VERSION SPA_VERSION_28 -#define SPA_VERSION_STRING "28" +#define SPA_VERSION SPA_VERSION_5000 +#define SPA_VERSION_STRING "5000" /* * Symbolic names for the changes that caused a SPA_VERSION switch. @@ -397,6 +413,12 @@ typedef enum { #define SPA_VERSION_DEADLISTS SPA_VERSION_26 #define SPA_VERSION_FAST_SNAP SPA_VERSION_27 #define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 +#define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 +#define SPA_VERSION_FEATURES SPA_VERSION_5000 + +#define SPA_VERSION_IS_SUPPORTED(v) \ + (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ + ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) /* * ZPL version - rev'd whenever an incompatible on-disk format change @@ -488,11 +510,17 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVERING "resilvering" +#define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ #define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ +#define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ +#define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ +#define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ +#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" +#define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. 
This is because a device can be in multiple states, such @@ -571,6 +599,7 @@ typedef enum vdev_aux { VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ + VDEV_AUX_UNSUP_FEAT, /* unsupported features */ VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ @@ -661,6 +690,7 @@ typedef struct vdev_stat { uint64_t vs_space; /* total capacity */ uint64_t vs_dspace; /* deflated capacity */ uint64_t vs_rsize; /* replaceable dev size */ + uint64_t vs_esize; /* expandable dev size */ uint64_t vs_ops[ZIO_TYPES]; /* operation count */ uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */ uint64_t vs_read_errors; /* read errors */ @@ -752,7 +782,6 @@ typedef enum zfs_ioc { ZFS_IOC_ERROR_LOG, ZFS_IOC_CLEAR, ZFS_IOC_PROMOTE, - ZFS_IOC_DESTROY_SNAPS, ZFS_IOC_SNAPSHOT, ZFS_IOC_DSOBJ_TO_DSNAME, ZFS_IOC_OBJ_TO_PATH, @@ -774,7 +803,13 @@ typedef enum zfs_ioc { ZFS_IOC_NEXT_OBJ, ZFS_IOC_DIFF, ZFS_IOC_TMP_SNAPSHOT, - ZFS_IOC_OBJ_TO_STATS + ZFS_IOC_OBJ_TO_STATS, + ZFS_IOC_SPACE_WRITTEN, + ZFS_IOC_SPACE_SNAPS, + ZFS_IOC_DESTROY_SNAPS_NVL, + ZFS_IOC_POOL_REGUID, + ZFS_IOC_POOL_REOPEN, + ZFS_IOC_SEND_PROGRESS } zfs_ioc_t; /* @@ -837,6 +872,7 @@ typedef enum { * ESC_ZFS_RESILVER_START * ESC_ZFS_RESILVER_END * ESC_ZFS_POOL_DESTROY + * ESC_ZFS_POOL_REGUID * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 diff --git a/uts/common/sys/nvpair.h b/uts/common/sys/nvpair.h index 30ff4e0..ad25eff 100644 --- a/uts/common/sys/nvpair.h +++ b/uts/common/sys/nvpair.h @@ -20,12 +20,14 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #ifndef _SYS_NVPAIR_H #define _SYS_NVPAIR_H #include <sys/types.h> +#include <sys/time.h> #include <sys/errno.h> #include <sys/va_list.h> @@ -274,6 +276,73 @@ int nvpair_value_hrtime(nvpair_t *, hrtime_t *); int nvpair_value_double(nvpair_t *, double *); #endif +nvlist_t *fnvlist_alloc(void); +void fnvlist_free(nvlist_t *); +size_t fnvlist_size(nvlist_t *); +char *fnvlist_pack(nvlist_t *, size_t *); +void fnvlist_pack_free(char *, size_t); +nvlist_t *fnvlist_unpack(char *, size_t); +nvlist_t *fnvlist_dup(nvlist_t *); +void fnvlist_merge(nvlist_t *, nvlist_t *); + +void fnvlist_add_boolean(nvlist_t *, const char *); +void fnvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); +void fnvlist_add_byte(nvlist_t *, const char *, uchar_t); +void fnvlist_add_int8(nvlist_t *, const char *, int8_t); +void fnvlist_add_uint8(nvlist_t *, const char *, uint8_t); +void fnvlist_add_int16(nvlist_t *, const char *, int16_t); +void fnvlist_add_uint16(nvlist_t *, const char *, uint16_t); +void fnvlist_add_int32(nvlist_t *, const char *, int32_t); +void fnvlist_add_uint32(nvlist_t *, const char *, uint32_t); +void fnvlist_add_int64(nvlist_t *, const char *, int64_t); +void fnvlist_add_uint64(nvlist_t *, const char *, uint64_t); +void fnvlist_add_string(nvlist_t *, const char *, const char *); +void fnvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); +void fnvlist_add_nvpair(nvlist_t *, nvpair_t *); +void fnvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t); +void fnvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t); +void fnvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); +void fnvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); +void fnvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); +void fnvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); +void fnvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); +void fnvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); +void fnvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); +void fnvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); +void fnvlist_add_string_array(nvlist_t *, const char *, char * const *, uint_t); +void fnvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); + +void fnvlist_remove(nvlist_t *, const char *); +void fnvlist_remove_nvpair(nvlist_t *, nvpair_t *); + +nvpair_t *fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name); +boolean_t fnvlist_lookup_boolean(nvlist_t *nvl, const char *name); +boolean_t fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name); +uchar_t fnvlist_lookup_byte(nvlist_t *nvl, const char *name); +int8_t fnvlist_lookup_int8(nvlist_t *nvl, const char *name); +int16_t fnvlist_lookup_int16(nvlist_t *nvl, const char *name); +int32_t fnvlist_lookup_int32(nvlist_t *nvl, const char *name); +int64_t fnvlist_lookup_int64(nvlist_t *nvl, const char *name); +uint8_t fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name); +uint16_t fnvlist_lookup_uint16(nvlist_t *nvl, const char *name); +uint32_t fnvlist_lookup_uint32(nvlist_t *nvl, const char *name); +uint64_t fnvlist_lookup_uint64(nvlist_t *nvl, const char *name); +char *fnvlist_lookup_string(nvlist_t *nvl, const char *name); +nvlist_t *fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name); + +boolean_t fnvpair_value_boolean_value(nvpair_t *nvp); +uchar_t fnvpair_value_byte(nvpair_t *nvp); +int8_t fnvpair_value_int8(nvpair_t *nvp); +int16_t fnvpair_value_int16(nvpair_t 
*nvp); +int32_t fnvpair_value_int32(nvpair_t *nvp); +int64_t fnvpair_value_int64(nvpair_t *nvp); +uint8_t fnvpair_value_uint8_t(nvpair_t *nvp); +uint16_t fnvpair_value_uint16(nvpair_t *nvp); +uint32_t fnvpair_value_uint32(nvpair_t *nvp); +uint64_t fnvpair_value_uint64(nvpair_t *nvp); +char *fnvpair_value_string(nvpair_t *nvp); +nvlist_t *fnvpair_value_nvlist(nvpair_t *nvp); + #ifdef __cplusplus } #endif diff --git a/uts/common/sys/sysevent/eventdefs.h b/uts/common/sys/sysevent/eventdefs.h index 3ed9bb2..5a75c5d 100644 --- a/uts/common/sys/sysevent/eventdefs.h +++ b/uts/common/sys/sysevent/eventdefs.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ #ifndef _SYS_SYSEVENT_EVENTDEFS_H @@ -256,6 +257,7 @@ extern "C" { #define ESC_ZFS_SCRUB_FINISH "ESC_ZFS_scrub_finish" #define ESC_ZFS_VDEV_SPARE "ESC_ZFS_vdev_spare" #define ESC_ZFS_BOOTFS_VDEV_ATTACH "ESC_ZFS_bootfs_vdev_attach" +#define ESC_ZFS_POOL_REGUID "ESC_ZFS_pool_reguid" /* * datalink subclass definitions. diff --git a/uts/common/sys/sysmacros.h b/uts/common/sys/sysmacros.h index 89a672d..71042eb 100644 --- a/uts/common/sys/sysmacros.h +++ b/uts/common/sys/sysmacros.h @@ -25,6 +25,8 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2011, 2012 Nexenta Systems, Inc. All rights reserved. */ #ifndef _SYS_SYSMACROS_H @@ -364,12 +366,18 @@ extern unsigned char bcd_to_byte[256]; #error One of _BIT_FIELDS_LTOH or _BIT_FIELDS_HTOL must be defined #endif /* _BIT_FIELDS_LTOH */ -#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof) - /* avoid any possibility of clashing with <stddef.h> version */ +#if defined(_KERNEL) && !defined(_KMEMUSER) +#if !defined(offsetof) #define offsetof(s, m) ((size_t)(&(((s *)0)->m))) -#endif +#endif /* !offsetof */ + +#define container_of(m, s, name) \ + (void *)((uintptr_t)(m) - (uintptr_t)offsetof(s, name)) + +#define ARRAY_SIZE(x) (sizeof (x) / sizeof (x[0])) +#endif /* _KERNEL, !_KMEMUSER */ #ifdef __cplusplus }
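
The container_of() macro added to sysmacros.h at the end of the diff inverts offsetof(): given a pointer to a member embedded somewhere inside a larger structure, it recovers a pointer to the enclosing structure. A userland sketch using the same definition; the example types (struct widget, struct node) are invented for illustration and not from the source:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Same definition the diff adds to sysmacros.h. */
    #define container_of(m, s, name) \
            (void *)((uintptr_t)(m) - (uintptr_t)offsetof(s, name))

    struct node {
            struct node *next;
    };

    /* A payload structure with its list linkage embedded in the middle. */
    struct widget {
            int w_id;
            struct node w_link;
    };

    int
    main(void)
    {
            struct widget w = { .w_id = 42, .w_link = { NULL } };
            struct node *np = &w.w_link;

            /* Step back from the embedded member to the containing widget. */
            struct widget *wp = container_of(np, struct widget, w_link);
            printf("w_id = %d\n", wp->w_id);        /* prints 42 */
            return (0);
    }

The companion ARRAY_SIZE() in the same hunk is the usual sizeof-ratio idiom: it yields the element count of a true array, and silently gives the wrong answer if handed a pointer, which is why both macros are kept under the _KERNEL guard rather than exported to arbitrary user code.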