diff options
Diffstat (limited to 'sys/contrib')
191 files changed, 88133 insertions, 0 deletions
diff --git a/sys/contrib/opensolaris/common/acl/acl_common.c b/sys/contrib/opensolaris/common/acl/acl_common.c new file mode 100644 index 0000000..2f32e7a --- /dev/null +++ b/sys/contrib/opensolaris/common/acl/acl_common.c @@ -0,0 +1,217 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/acl.h> +#include <sys/stat.h> +#if defined(_KERNEL) +#include <sys/systm.h> +#include <sys/debug.h> +#else +#include <errno.h> +#include <stdlib.h> +#include <strings.h> +#include <assert.h> +#define ASSERT assert +#endif + + +ace_t trivial_acl[] = { + {-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE}, + {-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| + ACE_WRITE_NAMED_ATTRS, ACE_OWNER, ACE_ACCESS_ALLOWED_ACE_TYPE}, + {-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, ACE_ACCESS_DENIED_ACE_TYPE}, + {-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, ACE_ACCESS_ALLOWED_ACE_TYPE}, + {-1, ACE_WRITE_ACL|ACE_WRITE_OWNER| ACE_WRITE_ATTRIBUTES| + ACE_WRITE_NAMED_ATTRS, ACE_EVERYONE, ACE_ACCESS_DENIED_ACE_TYPE}, + {-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE, ACE_EVERYONE, ACE_ACCESS_ALLOWED_ACE_TYPE} +}; + + +void +adjust_ace_pair(ace_t *pair, mode_t mode) +{ + if (mode & S_IROTH) + pair[1].a_access_mask |= ACE_READ_DATA; + else + pair[0].a_access_mask |= ACE_READ_DATA; + if (mode & S_IWOTH) + pair[1].a_access_mask |= + ACE_WRITE_DATA|ACE_APPEND_DATA; + else + pair[0].a_access_mask |= + ACE_WRITE_DATA|ACE_APPEND_DATA; + if (mode & S_IXOTH) + pair[1].a_access_mask |= ACE_EXECUTE; + else + pair[0].a_access_mask |= ACE_EXECUTE; +} + +/* + * ace_trivial: + * determine whether an ace_t acl is trivial + * + * Trivialness implys that the acl is composed of only + * owner, group, everyone entries. ACL can't + * have read_acl denied, and write_owner/write_acl/write_attributes + * can only be owner@ entry. + */ +int +ace_trivial(ace_t *acep, int aclcnt) +{ + int i; + int owner_seen = 0; + int group_seen = 0; + int everyone_seen = 0; + + for (i = 0; i != aclcnt; i++) { + switch (acep[i].a_flags & 0xf040) { + case ACE_OWNER: + if (group_seen || everyone_seen) + return (1); + owner_seen++; + break; + case ACE_GROUP|ACE_IDENTIFIER_GROUP: + if (everyone_seen || owner_seen == 0) + return (1); + group_seen++; + break; + + case ACE_EVERYONE: + if (owner_seen == 0 || group_seen == 0) + return (1); + everyone_seen++; + break; + default: + return (1); + + } + + if (acep[i].a_flags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| + ACE_INHERIT_ONLY_ACE)) + return (1); + + /* + * Special check for some special bits + * + * Don't allow anybody to deny reading basic + * attributes or a files ACL. + */ + if ((acep[i].a_access_mask & + (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + (acep[i].a_type == ACE_ACCESS_DENIED_ACE_TYPE)) + return (1); + + /* + * Allow on owner@ to allow + * write_acl/write_owner/write_attributes + */ + if (acep[i].a_type == ACE_ACCESS_ALLOWED_ACE_TYPE && + (!(acep[i].a_flags & ACE_OWNER) && (acep[i].a_access_mask & + (ACE_WRITE_OWNER|ACE_WRITE_ACL|ACE_WRITE_ATTRIBUTES)))) + return (1); + } + + if ((owner_seen == 0) || (group_seen == 0) || (everyone_seen == 0)) + return (1); + + return (0); +} + + +/* + * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified. + * v = Ptr to array/vector of objs + * n = # objs in the array + * s = size of each obj (must be multiples of a word size) + * f = ptr to function to compare two objs + * returns (-1 = less than, 0 = equal, 1 = greater than + */ +void +ksort(caddr_t v, int n, int s, int (*f)()) +{ + int g, i, j, ii; + unsigned int *p1, *p2; + unsigned int tmp; + + /* No work to do */ + if (v == NULL || n <= 1) + return; + + /* Sanity check on arguments */ + ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0); + ASSERT(s > 0); + for (g = n / 2; g > 0; g /= 2) { + for (i = g; i < n; i++) { + for (j = i - g; j >= 0 && + (*f)(v + j * s, v + (j + g) * s) == 1; + j -= g) { + p1 = (void *)(v + j * s); + p2 = (void *)(v + (j + g) * s); + for (ii = 0; ii < s / 4; ii++) { + tmp = *p1; + *p1++ = *p2; + *p2++ = tmp; + } + } + } + } +} + +/* + * Compare two acls, all fields. Returns: + * -1 (less than) + * 0 (equal) + * +1 (greater than) + */ +int +cmp2acls(void *a, void *b) +{ + aclent_t *x = (aclent_t *)a; + aclent_t *y = (aclent_t *)b; + + /* Compare types */ + if (x->a_type < y->a_type) + return (-1); + if (x->a_type > y->a_type) + return (1); + /* Equal types; compare id's */ + if (x->a_id < y->a_id) + return (-1); + if (x->a_id > y->a_id) + return (1); + /* Equal ids; compare perms */ + if (x->a_perm < y->a_perm) + return (-1); + if (x->a_perm > y->a_perm) + return (1); + /* Totally equal */ + return (0); +} diff --git a/sys/contrib/opensolaris/common/acl/acl_common.h b/sys/contrib/opensolaris/common/acl/acl_common.h new file mode 100644 index 0000000..2227ad7 --- /dev/null +++ b/sys/contrib/opensolaris/common/acl/acl_common.h @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ACL_ACL_UTILS_H +#define _ACL_ACL_UTILS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + + +#include <sys/types.h> +#include <sys/acl.h> +#include <sys/stat.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern ace_t trivial_acl[6]; + +extern int acltrivial(const char *); +extern void adjust_ace_pair(ace_t *pair, mode_t mode); +extern int ace_trivial(ace_t *acep, int aclcnt); +void ksort(caddr_t v, int n, int s, int (*f)()); +int cmp2acls(void *a, void *b); + + + + +#ifdef __cplusplus +} +#endif + +#endif /* _ACL_ACL_UTILS_H */ diff --git a/sys/contrib/opensolaris/common/avl/avl.c b/sys/contrib/opensolaris/common/avl/avl.c new file mode 100644 index 0000000..bbdbe08 --- /dev/null +++ b/sys/contrib/opensolaris/common/avl/avl.c @@ -0,0 +1,968 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + + +/* + * AVL - generic AVL tree implementation for kernel use + * + * A complete description of AVL trees can be found in many CS textbooks. + * + * Here is a very brief overview. An AVL tree is a binary search tree that is + * almost perfectly balanced. By "almost" perfectly balanced, we mean that at + * any given node, the left and right subtrees are allowed to differ in height + * by at most 1 level. + * + * This relaxation from a perfectly balanced binary tree allows doing + * insertion and deletion relatively efficiently. Searching the tree is + * still a fast operation, roughly O(log(N)). + * + * The key to insertion and deletion is a set of tree maniuplations called + * rotations, which bring unbalanced subtrees back into the semi-balanced state. + * + * This implementation of AVL trees has the following peculiarities: + * + * - The AVL specific data structures are physically embedded as fields + * in the "using" data structures. To maintain generality the code + * must constantly translate between "avl_node_t *" and containing + * data structure "void *"s by adding/subracting the avl_offset. + * + * - Since the AVL data is always embedded in other structures, there is + * no locking or memory allocation in the AVL routines. This must be + * provided for by the enclosing data structure's semantics. Typically, + * avl_insert()/_add()/_remove()/avl_insert_here() require some kind of + * exclusive write lock. Other operations require a read lock. + * + * - The implementation uses iteration instead of explicit recursion, + * since it is intended to run on limited size kernel stacks. Since + * there is no recursion stack present to move "up" in the tree, + * there is an explicit "parent" link in the avl_node_t. + * + * - The left/right children pointers of a node are in an array. + * In the code, variables (instead of constants) are used to represent + * left and right indices. The implementation is written as if it only + * dealt with left handed manipulations. By changing the value assigned + * to "left", the code also works for right handed trees. The + * following variables/terms are frequently used: + * + * int left; // 0 when dealing with left children, + * // 1 for dealing with right children + * + * int left_heavy; // -1 when left subtree is taller at some node, + * // +1 when right subtree is taller + * + * int right; // will be the opposite of left (0 or 1) + * int right_heavy;// will be the opposite of left_heavy (-1 or 1) + * + * int direction; // 0 for "<" (ie. left child); 1 for ">" (right) + * + * Though it is a little more confusing to read the code, the approach + * allows using half as much code (and hence cache footprint) for tree + * manipulations and eliminates many conditional branches. + * + * - The avl_index_t is an opaque "cookie" used to find nodes at or + * adjacent to where a new value would be inserted in the tree. The value + * is a modified "avl_node_t *". The bottom bit (normally 0 for a + * pointer) is set to indicate if that the new node has a value greater + * than the value of the indicated "avl_node_t *". + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/debug.h> +#include <sys/avl.h> + +/* + * Small arrays to translate between balance (or diff) values and child indeces. + * + * Code that deals with binary tree data structures will randomly use + * left and right children when examining a tree. C "if()" statements + * which evaluate randomly suffer from very poor hardware branch prediction. + * In this code we avoid some of the branch mispredictions by using the + * following translation arrays. They replace random branches with an + * additional memory reference. Since the translation arrays are both very + * small the data should remain efficiently in cache. + */ +static const int avl_child2balance[2] = {-1, 1}; +static const int avl_balance2child[] = {0, 0, 1}; + + +/* + * Walk from one node to the previous valued node (ie. an infix walk + * towards the left). At any given node we do one of 2 things: + * + * - If there is a left child, go to it, then to it's rightmost descendant. + * + * - otherwise we return thru parent nodes until we've come from a right child. + * + * Return Value: + * NULL - if at the end of the nodes + * otherwise next node + */ +void * +avl_walk(avl_tree_t *tree, void *oldnode, int left) +{ + size_t off = tree->avl_offset; + avl_node_t *node = AVL_DATA2NODE(oldnode, off); + int right = 1 - left; + int was_child; + + + /* + * nowhere to walk to if tree is empty + */ + if (node == NULL) + return (NULL); + + /* + * Visit the previous valued node. There are two possibilities: + * + * If this node has a left child, go down one left, then all + * the way right. + */ + if (node->avl_child[left] != NULL) { + for (node = node->avl_child[left]; + node->avl_child[right] != NULL; + node = node->avl_child[right]) + ; + /* + * Otherwise, return thru left children as far as we can. + */ + } else { + for (;;) { + was_child = AVL_XCHILD(node); + node = AVL_XPARENT(node); + if (node == NULL) + return (NULL); + if (was_child == right) + break; + } + } + + return (AVL_NODE2DATA(node, off)); +} + +/* + * Return the lowest valued node in a tree or NULL. + * (leftmost child from root of tree) + */ +void * +avl_first(avl_tree_t *tree) +{ + avl_node_t *node; + avl_node_t *prev = NULL; + size_t off = tree->avl_offset; + + for (node = tree->avl_root; node != NULL; node = node->avl_child[0]) + prev = node; + + if (prev != NULL) + return (AVL_NODE2DATA(prev, off)); + return (NULL); +} + +/* + * Return the highest valued node in a tree or NULL. + * (rightmost child from root of tree) + */ +void * +avl_last(avl_tree_t *tree) +{ + avl_node_t *node; + avl_node_t *prev = NULL; + size_t off = tree->avl_offset; + + for (node = tree->avl_root; node != NULL; node = node->avl_child[1]) + prev = node; + + if (prev != NULL) + return (AVL_NODE2DATA(prev, off)); + return (NULL); +} + +/* + * Access the node immediately before or after an insertion point. + * + * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child + * + * Return value: + * NULL: no node in the given direction + * "void *" of the found tree node + */ +void * +avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) +{ + int child = AVL_INDEX2CHILD(where); + avl_node_t *node = AVL_INDEX2NODE(where); + void *data; + size_t off = tree->avl_offset; + + if (node == NULL) { + ASSERT(tree->avl_root == NULL); + return (NULL); + } + data = AVL_NODE2DATA(node, off); + if (child != direction) + return (data); + + return (avl_walk(tree, data, direction)); +} + + +/* + * Search for the node which contains "value". The algorithm is a + * simple binary tree search. + * + * return value: + * NULL: the value is not in the AVL tree + * *where (if not NULL) is set to indicate the insertion point + * "void *" of the found tree node + */ +void * +avl_find(avl_tree_t *tree, void *value, avl_index_t *where) +{ + avl_node_t *node; + avl_node_t *prev = NULL; + int child = 0; + int diff; + size_t off = tree->avl_offset; + + for (node = tree->avl_root; node != NULL; + node = node->avl_child[child]) { + + prev = node; + + diff = tree->avl_compar(value, AVL_NODE2DATA(node, off)); + ASSERT(-1 <= diff && diff <= 1); + if (diff == 0) { +#ifdef DEBUG + if (where != NULL) + *where = 0; +#endif + return (AVL_NODE2DATA(node, off)); + } + child = avl_balance2child[1 + diff]; + + } + + if (where != NULL) + *where = AVL_MKINDEX(prev, child); + + return (NULL); +} + + +/* + * Perform a rotation to restore balance at the subtree given by depth. + * + * This routine is used by both insertion and deletion. The return value + * indicates: + * 0 : subtree did not change height + * !0 : subtree was reduced in height + * + * The code is written as if handling left rotations, right rotations are + * symmetric and handled by swapping values of variables right/left[_heavy] + * + * On input balance is the "new" balance at "node". This value is either + * -2 or +2. + */ +static int +avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance) +{ + int left = !(balance < 0); /* when balance = -2, left will be 0 */ + int right = 1 - left; + int left_heavy = balance >> 1; + int right_heavy = -left_heavy; + avl_node_t *parent = AVL_XPARENT(node); + avl_node_t *child = node->avl_child[left]; + avl_node_t *cright; + avl_node_t *gchild; + avl_node_t *gright; + avl_node_t *gleft; + int which_child = AVL_XCHILD(node); + int child_bal = AVL_XBALANCE(child); + + /* BEGIN CSTYLED */ + /* + * case 1 : node is overly left heavy, the left child is balanced or + * also left heavy. This requires the following rotation. + * + * (node bal:-2) + * / \ + * / \ + * (child bal:0 or -1) + * / \ + * / \ + * cright + * + * becomes: + * + * (child bal:1 or 0) + * / \ + * / \ + * (node bal:-1 or 0) + * / \ + * / \ + * cright + * + * we detect this situation by noting that child's balance is not + * right_heavy. + */ + /* END CSTYLED */ + if (child_bal != right_heavy) { + + /* + * compute new balance of nodes + * + * If child used to be left heavy (now balanced) we reduced + * the height of this sub-tree -- used in "return...;" below + */ + child_bal += right_heavy; /* adjust towards right */ + + /* + * move "cright" to be node's left child + */ + cright = child->avl_child[right]; + node->avl_child[left] = cright; + if (cright != NULL) { + AVL_SETPARENT(cright, node); + AVL_SETCHILD(cright, left); + } + + /* + * move node to be child's right child + */ + child->avl_child[right] = node; + AVL_SETBALANCE(node, -child_bal); + AVL_SETCHILD(node, right); + AVL_SETPARENT(node, child); + + /* + * update the pointer into this subtree + */ + AVL_SETBALANCE(child, child_bal); + AVL_SETCHILD(child, which_child); + AVL_SETPARENT(child, parent); + if (parent != NULL) + parent->avl_child[which_child] = child; + else + tree->avl_root = child; + + return (child_bal == 0); + } + + /* BEGIN CSTYLED */ + /* + * case 2 : When node is left heavy, but child is right heavy we use + * a different rotation. + * + * (node b:-2) + * / \ + * / \ + * / \ + * (child b:+1) + * / \ + * / \ + * (gchild b: != 0) + * / \ + * / \ + * gleft gright + * + * becomes: + * + * (gchild b:0) + * / \ + * / \ + * / \ + * (child b:?) (node b:?) + * / \ / \ + * / \ / \ + * gleft gright + * + * computing the new balances is more complicated. As an example: + * if gchild was right_heavy, then child is now left heavy + * else it is balanced + */ + /* END CSTYLED */ + gchild = child->avl_child[right]; + gleft = gchild->avl_child[left]; + gright = gchild->avl_child[right]; + + /* + * move gright to left child of node and + * + * move gleft to right child of node + */ + node->avl_child[left] = gright; + if (gright != NULL) { + AVL_SETPARENT(gright, node); + AVL_SETCHILD(gright, left); + } + + child->avl_child[right] = gleft; + if (gleft != NULL) { + AVL_SETPARENT(gleft, child); + AVL_SETCHILD(gleft, right); + } + + /* + * move child to left child of gchild and + * + * move node to right child of gchild and + * + * fixup parent of all this to point to gchild + */ + balance = AVL_XBALANCE(gchild); + gchild->avl_child[left] = child; + AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0)); + AVL_SETPARENT(child, gchild); + AVL_SETCHILD(child, left); + + gchild->avl_child[right] = node; + AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0)); + AVL_SETPARENT(node, gchild); + AVL_SETCHILD(node, right); + + AVL_SETBALANCE(gchild, 0); + AVL_SETPARENT(gchild, parent); + AVL_SETCHILD(gchild, which_child); + if (parent != NULL) + parent->avl_child[which_child] = gchild; + else + tree->avl_root = gchild; + + return (1); /* the new tree is always shorter */ +} + + +/* + * Insert a new node into an AVL tree at the specified (from avl_find()) place. + * + * Newly inserted nodes are always leaf nodes in the tree, since avl_find() + * searches out to the leaf positions. The avl_index_t indicates the node + * which will be the parent of the new node. + * + * After the node is inserted, a single rotation further up the tree may + * be necessary to maintain an acceptable AVL balance. + */ +void +avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where) +{ + avl_node_t *node; + avl_node_t *parent = AVL_INDEX2NODE(where); + int old_balance; + int new_balance; + int which_child = AVL_INDEX2CHILD(where); + size_t off = tree->avl_offset; + + ASSERT(tree); +#ifdef _LP64 + ASSERT(((uintptr_t)new_data & 0x7) == 0); +#endif + + node = AVL_DATA2NODE(new_data, off); + + /* + * First, add the node to the tree at the indicated position. + */ + ++tree->avl_numnodes; + + node->avl_child[0] = NULL; + node->avl_child[1] = NULL; + + AVL_SETCHILD(node, which_child); + AVL_SETBALANCE(node, 0); + AVL_SETPARENT(node, parent); + if (parent != NULL) { + ASSERT(parent->avl_child[which_child] == NULL); + parent->avl_child[which_child] = node; + } else { + ASSERT(tree->avl_root == NULL); + tree->avl_root = node; + } + /* + * Now, back up the tree modifying the balance of all nodes above the + * insertion point. If we get to a highly unbalanced ancestor, we + * need to do a rotation. If we back out of the tree we are done. + * If we brought any subtree into perfect balance (0), we are also done. + */ + for (;;) { + node = parent; + if (node == NULL) + return; + + /* + * Compute the new balance + */ + old_balance = AVL_XBALANCE(node); + new_balance = old_balance + avl_child2balance[which_child]; + + /* + * If we introduced equal balance, then we are done immediately + */ + if (new_balance == 0) { + AVL_SETBALANCE(node, 0); + return; + } + + /* + * If both old and new are not zero we went + * from -1 to -2 balance, do a rotation. + */ + if (old_balance != 0) + break; + + AVL_SETBALANCE(node, new_balance); + parent = AVL_XPARENT(node); + which_child = AVL_XCHILD(node); + } + + /* + * perform a rotation to fix the tree and return + */ + (void) avl_rotation(tree, node, new_balance); +} + +/* + * Insert "new_data" in "tree" in the given "direction" either after or + * before (AVL_AFTER, AVL_BEFORE) the data "here". + * + * Insertions can only be done at empty leaf points in the tree, therefore + * if the given child of the node is already present we move to either + * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since + * every other node in the tree is a leaf, this always works. + * + * To help developers using this interface, we assert that the new node + * is correctly ordered at every step of the way in DEBUG kernels. + */ +void +avl_insert_here( + avl_tree_t *tree, + void *new_data, + void *here, + int direction) +{ + avl_node_t *node; + int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */ +#ifdef DEBUG + int diff; +#endif + + ASSERT(tree != NULL); + ASSERT(new_data != NULL); + ASSERT(here != NULL); + ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER); + + /* + * If corresponding child of node is not NULL, go to the neighboring + * node and reverse the insertion direction. + */ + node = AVL_DATA2NODE(here, tree->avl_offset); + +#ifdef DEBUG + diff = tree->avl_compar(new_data, here); + ASSERT(-1 <= diff && diff <= 1); + ASSERT(diff != 0); + ASSERT(diff > 0 ? child == 1 : child == 0); +#endif + + if (node->avl_child[child] != NULL) { + node = node->avl_child[child]; + child = 1 - child; + while (node->avl_child[child] != NULL) { +#ifdef DEBUG + diff = tree->avl_compar(new_data, + AVL_NODE2DATA(node, tree->avl_offset)); + ASSERT(-1 <= diff && diff <= 1); + ASSERT(diff != 0); + ASSERT(diff > 0 ? child == 1 : child == 0); +#endif + node = node->avl_child[child]; + } +#ifdef DEBUG + diff = tree->avl_compar(new_data, + AVL_NODE2DATA(node, tree->avl_offset)); + ASSERT(-1 <= diff && diff <= 1); + ASSERT(diff != 0); + ASSERT(diff > 0 ? child == 1 : child == 0); +#endif + } + ASSERT(node->avl_child[child] == NULL); + + avl_insert(tree, new_data, AVL_MKINDEX(node, child)); +} + +/* + * Add a new node to an AVL tree. + */ +void +avl_add(avl_tree_t *tree, void *new_node) +{ + avl_index_t where; + + /* + * This is unfortunate. We want to call panic() here, even for + * non-DEBUG kernels. In userland, however, we can't depend on anything + * in libc or else the rtld build process gets confused. So, all we can + * do in userland is resort to a normal ASSERT(). + */ + if (avl_find(tree, new_node, &where) != NULL) +#ifdef _KERNEL + panic("avl_find() succeeded inside avl_add()"); +#else + ASSERT(0); +#endif + avl_insert(tree, new_node, where); +} + +/* + * Delete a node from the AVL tree. Deletion is similar to insertion, but + * with 2 complications. + * + * First, we may be deleting an interior node. Consider the following subtree: + * + * d c c + * / \ / \ / \ + * b e b e b e + * / \ / \ / + * a c a a + * + * When we are deleting node (d), we find and bring up an adjacent valued leaf + * node, say (c), to take the interior node's place. In the code this is + * handled by temporarily swapping (d) and (c) in the tree and then using + * common code to delete (d) from the leaf position. + * + * Secondly, an interior deletion from a deep tree may require more than one + * rotation to fix the balance. This is handled by moving up the tree through + * parents and applying rotations as needed. The return value from + * avl_rotation() is used to detect when a subtree did not change overall + * height due to a rotation. + */ +void +avl_remove(avl_tree_t *tree, void *data) +{ + avl_node_t *delete; + avl_node_t *parent; + avl_node_t *node; + avl_node_t tmp; + int old_balance; + int new_balance; + int left; + int right; + int which_child; + size_t off = tree->avl_offset; + + ASSERT(tree); + + delete = AVL_DATA2NODE(data, off); + + /* + * Deletion is easiest with a node that has at most 1 child. + * We swap a node with 2 children with a sequentially valued + * neighbor node. That node will have at most 1 child. Note this + * has no effect on the ordering of the remaining nodes. + * + * As an optimization, we choose the greater neighbor if the tree + * is right heavy, otherwise the left neighbor. This reduces the + * number of rotations needed. + */ + if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) { + + /* + * choose node to swap from whichever side is taller + */ + old_balance = AVL_XBALANCE(delete); + left = avl_balance2child[old_balance + 1]; + right = 1 - left; + + /* + * get to the previous value'd node + * (down 1 left, as far as possible right) + */ + for (node = delete->avl_child[left]; + node->avl_child[right] != NULL; + node = node->avl_child[right]) + ; + + /* + * create a temp placeholder for 'node' + * move 'node' to delete's spot in the tree + */ + tmp = *node; + + *node = *delete; + if (node->avl_child[left] == node) + node->avl_child[left] = &tmp; + + parent = AVL_XPARENT(node); + if (parent != NULL) + parent->avl_child[AVL_XCHILD(node)] = node; + else + tree->avl_root = node; + AVL_SETPARENT(node->avl_child[left], node); + AVL_SETPARENT(node->avl_child[right], node); + + /* + * Put tmp where node used to be (just temporary). + * It always has a parent and at most 1 child. + */ + delete = &tmp; + parent = AVL_XPARENT(delete); + parent->avl_child[AVL_XCHILD(delete)] = delete; + which_child = (delete->avl_child[1] != 0); + if (delete->avl_child[which_child] != NULL) + AVL_SETPARENT(delete->avl_child[which_child], delete); + } + + + /* + * Here we know "delete" is at least partially a leaf node. It can + * be easily removed from the tree. + */ + ASSERT(tree->avl_numnodes > 0); + --tree->avl_numnodes; + parent = AVL_XPARENT(delete); + which_child = AVL_XCHILD(delete); + if (delete->avl_child[0] != NULL) + node = delete->avl_child[0]; + else + node = delete->avl_child[1]; + + /* + * Connect parent directly to node (leaving out delete). + */ + if (node != NULL) { + AVL_SETPARENT(node, parent); + AVL_SETCHILD(node, which_child); + } + if (parent == NULL) { + tree->avl_root = node; + return; + } + parent->avl_child[which_child] = node; + + + /* + * Since the subtree is now shorter, begin adjusting parent balances + * and performing any needed rotations. + */ + do { + + /* + * Move up the tree and adjust the balance + * + * Capture the parent and which_child values for the next + * iteration before any rotations occur. + */ + node = parent; + old_balance = AVL_XBALANCE(node); + new_balance = old_balance - avl_child2balance[which_child]; + parent = AVL_XPARENT(node); + which_child = AVL_XCHILD(node); + + /* + * If a node was in perfect balance but isn't anymore then + * we can stop, since the height didn't change above this point + * due to a deletion. + */ + if (old_balance == 0) { + AVL_SETBALANCE(node, new_balance); + break; + } + + /* + * If the new balance is zero, we don't need to rotate + * else + * need a rotation to fix the balance. + * If the rotation doesn't change the height + * of the sub-tree we have finished adjusting. + */ + if (new_balance == 0) + AVL_SETBALANCE(node, new_balance); + else if (!avl_rotation(tree, node, new_balance)) + break; + } while (parent != NULL); +} + +/* + * initialize a new AVL tree + */ +void +avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *), + size_t size, size_t offset) +{ + ASSERT(tree); + ASSERT(compar); + ASSERT(size > 0); + ASSERT(size >= offset + sizeof (avl_node_t)); +#ifdef _LP64 + ASSERT((offset & 0x7) == 0); +#endif + + tree->avl_compar = compar; + tree->avl_root = NULL; + tree->avl_numnodes = 0; + tree->avl_size = size; + tree->avl_offset = offset; +} + +/* + * Delete a tree. + */ +/* ARGSUSED */ +void +avl_destroy(avl_tree_t *tree) +{ + ASSERT(tree); + ASSERT(tree->avl_numnodes == 0); + ASSERT(tree->avl_root == NULL); +} + + +/* + * Return the number of nodes in an AVL tree. + */ +ulong_t +avl_numnodes(avl_tree_t *tree) +{ + ASSERT(tree); + return (tree->avl_numnodes); +} + + +#define CHILDBIT (1L) + +/* + * Post-order tree walk used to visit all tree nodes and destroy the tree + * in post order. This is used for destroying a tree w/o paying any cost + * for rebalancing it. + * + * example: + * + * void *cookie = NULL; + * my_data_t *node; + * + * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL) + * free(node); + * avl_destroy(tree); + * + * The cookie is really an avl_node_t to the current node's parent and + * an indication of which child you looked at last. + * + * On input, a cookie value of CHILDBIT indicates the tree is done. + */ +void * +avl_destroy_nodes(avl_tree_t *tree, void **cookie) +{ + avl_node_t *node; + avl_node_t *parent; + int child; + void *first; + size_t off = tree->avl_offset; + + /* + * Initial calls go to the first node or it's right descendant. + */ + if (*cookie == NULL) { + first = avl_first(tree); + + /* + * deal with an empty tree + */ + if (first == NULL) { + *cookie = (void *)CHILDBIT; + return (NULL); + } + + node = AVL_DATA2NODE(first, off); + parent = AVL_XPARENT(node); + goto check_right_side; + } + + /* + * If there is no parent to return to we are done. + */ + parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT); + if (parent == NULL) { + if (tree->avl_root != NULL) { + ASSERT(tree->avl_numnodes == 1); + tree->avl_root = NULL; + tree->avl_numnodes = 0; + } + return (NULL); + } + + /* + * Remove the child pointer we just visited from the parent and tree. + */ + child = (uintptr_t)(*cookie) & CHILDBIT; + parent->avl_child[child] = NULL; + ASSERT(tree->avl_numnodes > 1); + --tree->avl_numnodes; + + /* + * If we just did a right child or there isn't one, go up to parent. + */ + if (child == 1 || parent->avl_child[1] == NULL) { + node = parent; + parent = AVL_XPARENT(parent); + goto done; + } + + /* + * Do parent's right child, then leftmost descendent. + */ + node = parent->avl_child[1]; + while (node->avl_child[0] != NULL) { + parent = node; + node = node->avl_child[0]; + } + + /* + * If here, we moved to a left child. It may have one + * child on the right (when balance == +1). + */ +check_right_side: + if (node->avl_child[1] != NULL) { + ASSERT(AVL_XBALANCE(node) == 1); + parent = node; + node = node->avl_child[1]; + ASSERT(node->avl_child[0] == NULL && + node->avl_child[1] == NULL); + } else { + ASSERT(AVL_XBALANCE(node) <= 0); + } + +done: + if (parent == NULL) { + *cookie = (void *)CHILDBIT; + ASSERT(node == tree->avl_root); + } else { + *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node)); + } + + return (AVL_NODE2DATA(node, off)); +} diff --git a/sys/contrib/opensolaris/common/nvpair/nvpair.c b/sys/contrib/opensolaris/common/nvpair/nvpair.c new file mode 100644 index 0000000..a1e6190 --- /dev/null +++ b/sys/contrib/opensolaris/common/nvpair/nvpair.c @@ -0,0 +1,2953 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/debug.h> +#include <sys/nvpair.h> +#include <sys/nvpair_impl.h> +#include <rpc/types.h> +#include <rpc/xdr.h> + +#if defined(_KERNEL) && !defined(_BOOT) +#include <sys/varargs.h> +#else +#include <stdarg.h> +#include <strings.h> +#endif + +#ifndef offsetof +#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#endif + + +/* + * nvpair.c - Provides kernel & userland interfaces for manipulating + * name-value pairs. + * + * Overview Diagram + * + * +--------------+ + * | nvlist_t | + * |--------------| + * | nvl_version | + * | nvl_nvflag | + * | nvl_priv -+-+ + * | nvl_flag | | + * | nvl_pad | | + * +--------------+ | + * V + * +--------------+ last i_nvp in list + * | nvpriv_t | +---------------------> + * |--------------| | + * +--+- nvp_list | | +------------+ + * | | nvp_last -+--+ + nv_alloc_t | + * | | nvp_curr | |------------| + * | | nvp_nva -+----> | nva_ops | + * | | nvp_stat | | nva_arg | + * | +--------------+ +------------+ + * | + * +-------+ + * V + * +---------------------+ +-------------------+ + * | i_nvp_t | +-->| i_nvp_t | +--> + * |---------------------| | |-------------------| | + * | nvi_next -+--+ | nvi_next -+--+ + * | nvi_prev (NULL) | <----+ nvi_prev | + * | . . . . . . . . . . | | . . . . . . . . . | + * | nvp (nvpair_t) | | nvp (nvpair_t) | + * | - nvp_size | | - nvp_size | + * | - nvp_name_sz | | - nvp_name_sz | + * | - nvp_value_elem | | - nvp_value_elem | + * | - nvp_type | | - nvp_type | + * | - data ... | | - data ... | + * +---------------------+ +-------------------+ + * + * + * + * +---------------------+ +---------------------+ + * | i_nvp_t | +--> +-->| i_nvp_t (last) | + * |---------------------| | | |---------------------| + * | nvi_next -+--+ ... --+ | nvi_next (NULL) | + * <-+- nvi_prev |<-- ... <----+ nvi_prev | + * | . . . . . . . . . | | . . . . . . . . . | + * | nvp (nvpair_t) | | nvp (nvpair_t) | + * | - nvp_size | | - nvp_size | + * | - nvp_name_sz | | - nvp_name_sz | + * | - nvp_value_elem | | - nvp_value_elem | + * | - DATA_TYPE_NVLIST | | - nvp_type | + * | - data (embedded) | | - data ... | + * | nvlist name | +---------------------+ + * | +--------------+ | + * | | nvlist_t | | + * | |--------------| | + * | | nvl_version | | + * | | nvl_nvflag | | + * | | nvl_priv --+---+----> + * | | nvl_flag | | + * | | nvl_pad | | + * | +--------------+ | + * +---------------------+ + * + * + * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will + * allow value to be aligned on 8 byte boundary + * + * name_len is the length of the name string including the null terminator + * so it must be >= 1 + */ +#define NVP_SIZE_CALC(name_len, data_len) \ + (NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len)) + +static int i_get_value_size(data_type_t type, const void *data, uint_t nelem); +static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type, + uint_t nelem, const void *data); + +#define NV_STAT_EMBEDDED 0x1 +#define EMBEDDED_NVL(nvp) ((nvlist_t *)(void *)NVP_VALUE(nvp)) +#define EMBEDDED_NVL_ARRAY(nvp) ((nvlist_t **)(void *)NVP_VALUE(nvp)) + +#define NVP_VALOFF(nvp) (NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz)) +#define NVPAIR2I_NVP(nvp) \ + ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp))) + + +int +nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...) +{ + va_list valist; + int err = 0; + + nva->nva_ops = nvo; + nva->nva_arg = NULL; + + va_start(valist, nvo); + if (nva->nva_ops->nv_ao_init != NULL) + err = nva->nva_ops->nv_ao_init(nva, valist); + va_end(valist); + + return (err); +} + +void +nv_alloc_reset(nv_alloc_t *nva) +{ + if (nva->nva_ops->nv_ao_reset != NULL) + nva->nva_ops->nv_ao_reset(nva); +} + +void +nv_alloc_fini(nv_alloc_t *nva) +{ + if (nva->nva_ops->nv_ao_fini != NULL) + nva->nva_ops->nv_ao_fini(nva); +} + +nv_alloc_t * +nvlist_lookup_nv_alloc(nvlist_t *nvl) +{ + nvpriv_t *priv; + + if (nvl == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (NULL); + + return (priv->nvp_nva); +} + +static void * +nv_mem_zalloc(nvpriv_t *nvp, size_t size) +{ + nv_alloc_t *nva = nvp->nvp_nva; + void *buf; + + if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL) + bzero(buf, size); + + return (buf); +} + +static void +nv_mem_free(nvpriv_t *nvp, void *buf, size_t size) +{ + nv_alloc_t *nva = nvp->nvp_nva; + + nva->nva_ops->nv_ao_free(nva, buf, size); +} + +static void +nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat) +{ + bzero(priv, sizeof (priv)); + + priv->nvp_nva = nva; + priv->nvp_stat = stat; +} + +static nvpriv_t * +nv_priv_alloc(nv_alloc_t *nva) +{ + nvpriv_t *priv; + + /* + * nv_mem_alloc() cannot called here because it needs the priv + * argument. + */ + if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL) + return (NULL); + + nv_priv_init(priv, nva, 0); + + return (priv); +} + +/* + * Embedded lists need their own nvpriv_t's. We create a new + * nvpriv_t using the parameters and allocator from the parent + * list's nvpriv_t. + */ +static nvpriv_t * +nv_priv_alloc_embedded(nvpriv_t *priv) +{ + nvpriv_t *emb_priv; + + if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL) + return (NULL); + + nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED); + + return (emb_priv); +} + +static void +nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv) +{ + nvl->nvl_version = NV_VERSION; + nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE); + nvl->nvl_priv = (uint64_t)(uintptr_t)priv; + nvl->nvl_flag = 0; + nvl->nvl_pad = 0; +} + +/* + * nvlist_alloc - Allocate nvlist. + */ +/*ARGSUSED1*/ +int +nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag) +{ +#if defined(_KERNEL) && !defined(_BOOT) + return (nvlist_xalloc(nvlp, nvflag, + (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); +#else + return (nvlist_xalloc(nvlp, nvflag, nv_alloc_nosleep)); +#endif +} + +int +nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva) +{ + nvpriv_t *priv; + + if (nvlp == NULL || nva == NULL) + return (EINVAL); + + if ((priv = nv_priv_alloc(nva)) == NULL) + return (ENOMEM); + + if ((*nvlp = nv_mem_zalloc(priv, + NV_ALIGN(sizeof (nvlist_t)))) == NULL) { + nv_mem_free(priv, priv, sizeof (nvpriv_t)); + return (ENOMEM); + } + + nvlist_init(*nvlp, nvflag, priv); + + return (0); +} + +/* + * nvp_buf_alloc - Allocate i_nvp_t for storing a new nv pair. + */ +static nvpair_t * +nvp_buf_alloc(nvlist_t *nvl, size_t len) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + i_nvp_t *buf; + nvpair_t *nvp; + size_t nvsize; + + /* + * Allocate the buffer + */ + nvsize = len + offsetof(i_nvp_t, nvi_nvp); + + if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL) + return (NULL); + + nvp = &buf->nvi_nvp; + nvp->nvp_size = len; + + return (nvp); +} + +/* + * nvp_buf_free - de-Allocate an i_nvp_t. + */ +static void +nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp); + + nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize); +} + +/* + * nvp_buf_link - link a new nv pair into the nvlist. + */ +static void +nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + i_nvp_t *curr = NVPAIR2I_NVP(nvp); + + /* Put element at end of nvlist */ + if (priv->nvp_list == NULL) { + priv->nvp_list = priv->nvp_last = curr; + } else { + curr->nvi_prev = priv->nvp_last; + priv->nvp_last->nvi_next = curr; + priv->nvp_last = curr; + } +} + +/* + * nvp_buf_unlink - unlink an removed nvpair out of the nvlist. + */ +static void +nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + i_nvp_t *curr = NVPAIR2I_NVP(nvp); + + /* + * protect nvlist_next_nvpair() against walking on freed memory. + */ + if (priv->nvp_curr == curr) + priv->nvp_curr = curr->nvi_next; + + if (curr == priv->nvp_list) + priv->nvp_list = curr->nvi_next; + else + curr->nvi_prev->nvi_next = curr->nvi_next; + + if (curr == priv->nvp_last) + priv->nvp_last = curr->nvi_prev; + else + curr->nvi_next->nvi_prev = curr->nvi_prev; +} + +/* + * take a nvpair type and number of elements and make sure the are valid + */ +static int +i_validate_type_nelem(data_type_t type, uint_t nelem) +{ + switch (type) { + case DATA_TYPE_BOOLEAN: + if (nelem != 0) + return (EINVAL); + break; + case DATA_TYPE_BOOLEAN_VALUE: + case DATA_TYPE_BYTE: + case DATA_TYPE_INT8: + case DATA_TYPE_UINT8: + case DATA_TYPE_INT16: + case DATA_TYPE_UINT16: + case DATA_TYPE_INT32: + case DATA_TYPE_UINT32: + case DATA_TYPE_INT64: + case DATA_TYPE_UINT64: + case DATA_TYPE_STRING: + case DATA_TYPE_HRTIME: + case DATA_TYPE_NVLIST: + if (nelem != 1) + return (EINVAL); + break; + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_BYTE_ARRAY: + case DATA_TYPE_INT8_ARRAY: + case DATA_TYPE_UINT8_ARRAY: + case DATA_TYPE_INT16_ARRAY: + case DATA_TYPE_UINT16_ARRAY: + case DATA_TYPE_INT32_ARRAY: + case DATA_TYPE_UINT32_ARRAY: + case DATA_TYPE_INT64_ARRAY: + case DATA_TYPE_UINT64_ARRAY: + case DATA_TYPE_STRING_ARRAY: + case DATA_TYPE_NVLIST_ARRAY: + /* we allow arrays with 0 elements */ + break; + default: + return (EINVAL); + } + return (0); +} + +/* + * Verify nvp_name_sz and check the name string length. + */ +static int +i_validate_nvpair_name(nvpair_t *nvp) +{ + if ((nvp->nvp_name_sz <= 0) || + (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0))) + return (EFAULT); + + /* verify the name string, make sure its terminated */ + if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0') + return (EFAULT); + + return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ? 0 : EFAULT); +} + +static int +i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data) +{ + switch (type) { + case DATA_TYPE_BOOLEAN_VALUE: + if (*(boolean_t *)data != B_TRUE && + *(boolean_t *)data != B_FALSE) + return (EINVAL); + break; + case DATA_TYPE_BOOLEAN_ARRAY: { + int i; + + for (i = 0; i < nelem; i++) + if (((boolean_t *)data)[i] != B_TRUE && + ((boolean_t *)data)[i] != B_FALSE) + return (EINVAL); + break; + } + default: + break; + } + + return (0); +} + +/* + * This function takes a pointer to what should be a nvpair and it's size + * and then verifies that all the nvpair fields make sense and can be + * trusted. This function is used when decoding packed nvpairs. + */ +static int +i_validate_nvpair(nvpair_t *nvp) +{ + data_type_t type = NVP_TYPE(nvp); + int size1, size2; + + /* verify nvp_name_sz, check the name string length */ + if (i_validate_nvpair_name(nvp) != 0) + return (EFAULT); + + if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0) + return (EFAULT); + + /* + * verify nvp_type, nvp_value_elem, and also possibly + * verify string values and get the value size. + */ + size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp)); + size1 = nvp->nvp_size - NVP_VALOFF(nvp); + if (size2 < 0 || size1 != NV_ALIGN(size2)) + return (EFAULT); + + return (0); +} + +static int +nvlist_copy_pairs(nvlist_t *snvl, nvlist_t *dnvl) +{ + nvpriv_t *priv; + i_nvp_t *curr; + + if ((priv = (nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL) + return (EINVAL); + + for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { + nvpair_t *nvp = &curr->nvi_nvp; + int err; + + if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp), NVP_TYPE(nvp), + NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0) + return (err); + } + + return (0); +} + +/* + * Frees all memory allocated for an nvpair (like embedded lists) with + * the exception of the nvpair buffer itself. + */ +static void +nvpair_free(nvpair_t *nvp) +{ + switch (NVP_TYPE(nvp)) { + case DATA_TYPE_NVLIST: + nvlist_free(EMBEDDED_NVL(nvp)); + break; + case DATA_TYPE_NVLIST_ARRAY: { + nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); + int i; + + for (i = 0; i < NVP_NELEM(nvp); i++) + if (nvlp[i] != NULL) + nvlist_free(nvlp[i]); + break; + } + default: + break; + } +} + +/* + * nvlist_free - free an unpacked nvlist + */ +void +nvlist_free(nvlist_t *nvl) +{ + nvpriv_t *priv; + i_nvp_t *curr; + + if (nvl == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return; + + /* + * Unpacked nvlist are linked through i_nvp_t + */ + curr = priv->nvp_list; + while (curr != NULL) { + nvpair_t *nvp = &curr->nvi_nvp; + curr = curr->nvi_next; + + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + } + + if (!(priv->nvp_stat & NV_STAT_EMBEDDED)) + nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t))); + else + nvl->nvl_priv = 0; + + nv_mem_free(priv, priv, sizeof (nvpriv_t)); +} + +static int +nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + i_nvp_t *curr; + + if (nvp == NULL) + return (0); + + for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) + if (&curr->nvi_nvp == nvp) + return (1); + + return (0); +} + +/* + * Make a copy of nvlist + */ +/*ARGSUSED1*/ +int +nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag) +{ +#if defined(_KERNEL) && !defined(_BOOT) + return (nvlist_xdup(nvl, nvlp, + (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); +#else + return (nvlist_xdup(nvl, nvlp, nv_alloc_nosleep)); +#endif +} + +int +nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva) +{ + int err; + nvlist_t *ret; + + if (nvl == NULL || nvlp == NULL) + return (EINVAL); + + if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0) + return (err); + + if ((err = nvlist_copy_pairs(nvl, ret)) != 0) + nvlist_free(ret); + else + *nvlp = ret; + + return (err); +} + +/* + * Remove all with matching name + */ +int +nvlist_remove_all(nvlist_t *nvl, const char *name) +{ + nvpriv_t *priv; + i_nvp_t *curr; + int error = ENOENT; + + if (nvl == NULL || name == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (EINVAL); + + curr = priv->nvp_list; + while (curr != NULL) { + nvpair_t *nvp = &curr->nvi_nvp; + + curr = curr->nvi_next; + if (strcmp(name, NVP_NAME(nvp)) != 0) + continue; + + nvp_buf_unlink(nvl, nvp); + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + + error = 0; + } + + return (error); +} + +/* + * Remove first one with matching name and type + */ +int +nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type) +{ + nvpriv_t *priv; + i_nvp_t *curr; + + if (nvl == NULL || name == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (EINVAL); + + curr = priv->nvp_list; + while (curr != NULL) { + nvpair_t *nvp = &curr->nvi_nvp; + + if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type) { + nvp_buf_unlink(nvl, nvp); + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + + return (0); + } + curr = curr->nvi_next; + } + + return (ENOENT); +} + +/* + * This function calculates the size of an nvpair value. + * + * The data argument controls the behavior in case of the data types + * DATA_TYPE_STRING and + * DATA_TYPE_STRING_ARRAY + * Is data == NULL then the size of the string(s) is excluded. + */ +static int +i_get_value_size(data_type_t type, const void *data, uint_t nelem) +{ + uint64_t value_sz; + + if (i_validate_type_nelem(type, nelem) != 0) + return (-1); + + /* Calculate required size for holding value */ + switch (type) { + case DATA_TYPE_BOOLEAN: + value_sz = 0; + break; + case DATA_TYPE_BOOLEAN_VALUE: + value_sz = sizeof (boolean_t); + break; + case DATA_TYPE_BYTE: + value_sz = sizeof (uchar_t); + break; + case DATA_TYPE_INT8: + value_sz = sizeof (int8_t); + break; + case DATA_TYPE_UINT8: + value_sz = sizeof (uint8_t); + break; + case DATA_TYPE_INT16: + value_sz = sizeof (int16_t); + break; + case DATA_TYPE_UINT16: + value_sz = sizeof (uint16_t); + break; + case DATA_TYPE_INT32: + value_sz = sizeof (int32_t); + break; + case DATA_TYPE_UINT32: + value_sz = sizeof (uint32_t); + break; + case DATA_TYPE_INT64: + value_sz = sizeof (int64_t); + break; + case DATA_TYPE_UINT64: + value_sz = sizeof (uint64_t); + break; + case DATA_TYPE_STRING: + if (data == NULL) + value_sz = 0; + else + value_sz = strlen(data) + 1; + break; + case DATA_TYPE_BOOLEAN_ARRAY: + value_sz = (uint64_t)nelem * sizeof (boolean_t); + break; + case DATA_TYPE_BYTE_ARRAY: + value_sz = (uint64_t)nelem * sizeof (uchar_t); + break; + case DATA_TYPE_INT8_ARRAY: + value_sz = (uint64_t)nelem * sizeof (int8_t); + break; + case DATA_TYPE_UINT8_ARRAY: + value_sz = (uint64_t)nelem * sizeof (uint8_t); + break; + case DATA_TYPE_INT16_ARRAY: + value_sz = (uint64_t)nelem * sizeof (int16_t); + break; + case DATA_TYPE_UINT16_ARRAY: + value_sz = (uint64_t)nelem * sizeof (uint16_t); + break; + case DATA_TYPE_INT32_ARRAY: + value_sz = (uint64_t)nelem * sizeof (int32_t); + break; + case DATA_TYPE_UINT32_ARRAY: + value_sz = (uint64_t)nelem * sizeof (uint32_t); + break; + case DATA_TYPE_INT64_ARRAY: + value_sz = (uint64_t)nelem * sizeof (int64_t); + break; + case DATA_TYPE_UINT64_ARRAY: + value_sz = (uint64_t)nelem * sizeof (uint64_t); + break; + case DATA_TYPE_STRING_ARRAY: + value_sz = (uint64_t)nelem * sizeof (uint64_t); + + if (data != NULL) { + char *const *strs = data; + uint_t i; + + /* no alignment requirement for strings */ + for (i = 0; i < nelem; i++) { + if (strs[i] == NULL) + return (-1); + value_sz += strlen(strs[i]) + 1; + } + } + break; + case DATA_TYPE_HRTIME: + value_sz = sizeof (hrtime_t); + break; + case DATA_TYPE_NVLIST: + value_sz = NV_ALIGN(sizeof (nvlist_t)); + break; + case DATA_TYPE_NVLIST_ARRAY: + value_sz = (uint64_t)nelem * sizeof (uint64_t) + + (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t)); + break; + default: + return (-1); + } + + return (value_sz > INT32_MAX ? -1 : (int)value_sz); +} + +static int +nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl) +{ + nvpriv_t *priv; + int err; + + if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t) + nvl->nvl_priv)) == NULL) + return (ENOMEM); + + nvlist_init(emb_nvl, onvl->nvl_nvflag, priv); + + if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) { + nvlist_free(emb_nvl); + emb_nvl->nvl_priv = 0; + } + + return (err); +} + +/* + * nvlist_add_common - Add new <name,value> pair to nvlist + */ +static int +nvlist_add_common(nvlist_t *nvl, const char *name, + data_type_t type, uint_t nelem, const void *data) +{ + nvpair_t *nvp; + uint_t i; + + int nvp_sz, name_sz, value_sz; + int err = 0; + + if (name == NULL || nvl == NULL || nvl->nvl_priv == 0) + return (EINVAL); + + if (nelem != 0 && data == NULL) + return (EINVAL); + + /* + * Verify type and nelem and get the value size. + * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY + * is the size of the string(s) included. + */ + if ((value_sz = i_get_value_size(type, data, nelem)) < 0) + return (EINVAL); + + if (i_validate_nvpair_value(type, nelem, data) != 0) + return (EINVAL); + + /* + * If we're adding an nvlist or nvlist array, ensure that we are not + * adding the input nvlist to itself, which would cause recursion, + * and ensure that no NULL nvlist pointers are present. + */ + switch (type) { + case DATA_TYPE_NVLIST: + if (data == nvl || data == NULL) + return (EINVAL); + break; + case DATA_TYPE_NVLIST_ARRAY: { + nvlist_t **onvlp = (nvlist_t **)data; + for (i = 0; i < nelem; i++) { + if (onvlp[i] == nvl || onvlp[i] == NULL) + return (EINVAL); + } + break; + } + default: + break; + } + + /* calculate sizes of the nvpair elements and the nvpair itself */ + name_sz = strlen(name) + 1; + + nvp_sz = NVP_SIZE_CALC(name_sz, value_sz); + + if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL) + return (ENOMEM); + + ASSERT(nvp->nvp_size == nvp_sz); + nvp->nvp_name_sz = name_sz; + nvp->nvp_value_elem = nelem; + nvp->nvp_type = type; + bcopy(name, NVP_NAME(nvp), name_sz); + + switch (type) { + case DATA_TYPE_BOOLEAN: + break; + case DATA_TYPE_STRING_ARRAY: { + char *const *strs = data; + char *buf = NVP_VALUE(nvp); + char **cstrs = (void *)buf; + + /* skip pre-allocated space for pointer array */ + buf += nelem * sizeof (uint64_t); + for (i = 0; i < nelem; i++) { + int slen = strlen(strs[i]) + 1; + bcopy(strs[i], buf, slen); + cstrs[i] = buf; + buf += slen; + } + break; + } + case DATA_TYPE_NVLIST: { + nvlist_t *nnvl = EMBEDDED_NVL(nvp); + nvlist_t *onvl = (nvlist_t *)data; + + if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) { + nvp_buf_free(nvl, nvp); + return (err); + } + break; + } + case DATA_TYPE_NVLIST_ARRAY: { + nvlist_t **onvlp = (nvlist_t **)data; + nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); + nvlist_t *embedded = (nvlist_t *) + ((uintptr_t)nvlp + nelem * sizeof (uint64_t)); + + for (i = 0; i < nelem; i++) { + if ((err = nvlist_copy_embedded(nvl, + onvlp[i], embedded)) != 0) { + /* + * Free any successfully created lists + */ + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + return (err); + } + + nvlp[i] = embedded++; + } + break; + } + default: + bcopy(data, NVP_VALUE(nvp), value_sz); + } + + /* if unique name, remove before add */ + if (nvl->nvl_nvflag & NV_UNIQUE_NAME) + (void) nvlist_remove_all(nvl, name); + else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE) + (void) nvlist_remove(nvl, name, type); + + nvp_buf_link(nvl, nvp); + + return (0); +} + +int +nvlist_add_boolean(nvlist_t *nvl, const char *name) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL)); +} + +int +nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val)); +} + +int +nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val)); +} + +int +nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val)); +} + +int +nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val)); +} + +int +nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val)); +} + +int +nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val)); +} + +int +nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val)); +} + +int +nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val)); +} + +int +nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val)); +} + +int +nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val)); +} + +int +nvlist_add_string(nvlist_t *nvl, const char *name, const char *val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val)); +} + +int +nvlist_add_boolean_array(nvlist_t *nvl, const char *name, + boolean_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a)); +} + +int +nvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); +} + +int +nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); +} + +int +nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); +} + +int +nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); +} + +int +nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); +} + +int +nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); +} + +int +nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); +} + +int +nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); +} + +int +nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); +} + +int +nvlist_add_string_array(nvlist_t *nvl, const char *name, + char *const *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); +} + +int +nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val)); +} + +int +nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val)); +} + +int +nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); +} + +/* reading name-value pairs */ +nvpair_t * +nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv; + i_nvp_t *curr; + + if (nvl == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (NULL); + + curr = NVPAIR2I_NVP(nvp); + + /* + * Ensure that nvp is an valid pointer. + */ + if (nvp == NULL) + curr = priv->nvp_list; + else if (priv->nvp_curr == curr) + curr = curr->nvi_next; + else if (nvlist_contains_nvp(nvl, nvp) == 0) + curr = NULL; + + priv->nvp_curr = curr; + + return (curr != NULL ? &curr->nvi_nvp : NULL); +} + +char * +nvpair_name(nvpair_t *nvp) +{ + return (NVP_NAME(nvp)); +} + +data_type_t +nvpair_type(nvpair_t *nvp) +{ + return (NVP_TYPE(nvp)); +} + +static int +nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data) +{ + if (nvp == NULL || nvpair_type(nvp) != type) + return (EINVAL); + + /* + * For non-array types, we copy the data. + * For array types (including string), we set a pointer. + */ + switch (type) { + case DATA_TYPE_BOOLEAN: + if (nelem != NULL) + *nelem = 0; + break; + + case DATA_TYPE_BOOLEAN_VALUE: + case DATA_TYPE_BYTE: + case DATA_TYPE_INT8: + case DATA_TYPE_UINT8: + case DATA_TYPE_INT16: + case DATA_TYPE_UINT16: + case DATA_TYPE_INT32: + case DATA_TYPE_UINT32: + case DATA_TYPE_INT64: + case DATA_TYPE_UINT64: + case DATA_TYPE_HRTIME: + if (data == NULL) + return (EINVAL); + bcopy(NVP_VALUE(nvp), data, + (size_t)i_get_value_size(type, NULL, 1)); + if (nelem != NULL) + *nelem = 1; + break; + + case DATA_TYPE_NVLIST: + case DATA_TYPE_STRING: + if (data == NULL) + return (EINVAL); + *(void **)data = (void *)NVP_VALUE(nvp); + if (nelem != NULL) + *nelem = 1; + break; + + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_BYTE_ARRAY: + case DATA_TYPE_INT8_ARRAY: + case DATA_TYPE_UINT8_ARRAY: + case DATA_TYPE_INT16_ARRAY: + case DATA_TYPE_UINT16_ARRAY: + case DATA_TYPE_INT32_ARRAY: + case DATA_TYPE_UINT32_ARRAY: + case DATA_TYPE_INT64_ARRAY: + case DATA_TYPE_UINT64_ARRAY: + case DATA_TYPE_STRING_ARRAY: + case DATA_TYPE_NVLIST_ARRAY: + if (nelem == NULL || data == NULL) + return (EINVAL); + if ((*nelem = NVP_NELEM(nvp)) != 0) + *(void **)data = (void *)NVP_VALUE(nvp); + else + *(void **)data = NULL; + break; + + default: + return (ENOTSUP); + } + + return (0); +} + +static int +nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type, + uint_t *nelem, void *data) +{ + nvpriv_t *priv; + nvpair_t *nvp; + i_nvp_t *curr; + + if (name == NULL || nvl == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (EINVAL); + + if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE))) + return (ENOTSUP); + + for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { + nvp = &curr->nvi_nvp; + + if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type) + return (nvpair_value_common(nvp, type, nelem, data)); + } + + return (ENOENT); +} + +int +nvlist_lookup_boolean(nvlist_t *nvl, const char *name) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL)); +} + +int +nvlist_lookup_boolean_value(nvlist_t *nvl, const char *name, boolean_t *val) +{ + return (nvlist_lookup_common(nvl, name, + DATA_TYPE_BOOLEAN_VALUE, NULL, val)); +} + +int +nvlist_lookup_byte(nvlist_t *nvl, const char *name, uchar_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val)); +} + +int +nvlist_lookup_int8(nvlist_t *nvl, const char *name, int8_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val)); +} + +int +nvlist_lookup_uint8(nvlist_t *nvl, const char *name, uint8_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val)); +} + +int +nvlist_lookup_int16(nvlist_t *nvl, const char *name, int16_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val)); +} + +int +nvlist_lookup_uint16(nvlist_t *nvl, const char *name, uint16_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val)); +} + +int +nvlist_lookup_int32(nvlist_t *nvl, const char *name, int32_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val)); +} + +int +nvlist_lookup_uint32(nvlist_t *nvl, const char *name, uint32_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val)); +} + +int +nvlist_lookup_int64(nvlist_t *nvl, const char *name, int64_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val)); +} + +int +nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val)); +} + +int +nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val)); +} + +int +nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val)); +} + +int +nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name, + boolean_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, + DATA_TYPE_BOOLEAN_ARRAY, n, a)); +} + +int +nvlist_lookup_byte_array(nvlist_t *nvl, const char *name, + uchar_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); +} + +int +nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); +} + +int +nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name, + uint8_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); +} + +int +nvlist_lookup_int16_array(nvlist_t *nvl, const char *name, + int16_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); +} + +int +nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name, + uint16_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); +} + +int +nvlist_lookup_int32_array(nvlist_t *nvl, const char *name, + int32_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); +} + +int +nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name, + uint32_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); +} + +int +nvlist_lookup_int64_array(nvlist_t *nvl, const char *name, + int64_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); +} + +int +nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name, + uint64_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); +} + +int +nvlist_lookup_string_array(nvlist_t *nvl, const char *name, + char ***a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); +} + +int +nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name, + nvlist_t ***a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); +} + +int +nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val)); +} + +int +nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...) +{ + va_list ap; + char *name; + int noentok = (flag & NV_FLAG_NOENTOK ? 1 : 0); + int ret = 0; + + va_start(ap, flag); + while (ret == 0 && (name = va_arg(ap, char *)) != NULL) { + data_type_t type; + void *val; + uint_t *nelem; + + switch (type = va_arg(ap, data_type_t)) { + case DATA_TYPE_BOOLEAN: + ret = nvlist_lookup_common(nvl, name, type, NULL, NULL); + break; + + case DATA_TYPE_BOOLEAN_VALUE: + case DATA_TYPE_BYTE: + case DATA_TYPE_INT8: + case DATA_TYPE_UINT8: + case DATA_TYPE_INT16: + case DATA_TYPE_UINT16: + case DATA_TYPE_INT32: + case DATA_TYPE_UINT32: + case DATA_TYPE_INT64: + case DATA_TYPE_UINT64: + case DATA_TYPE_HRTIME: + case DATA_TYPE_STRING: + case DATA_TYPE_NVLIST: + val = va_arg(ap, void *); + ret = nvlist_lookup_common(nvl, name, type, NULL, val); + break; + + case DATA_TYPE_BYTE_ARRAY: + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_INT8_ARRAY: + case DATA_TYPE_UINT8_ARRAY: + case DATA_TYPE_INT16_ARRAY: + case DATA_TYPE_UINT16_ARRAY: + case DATA_TYPE_INT32_ARRAY: + case DATA_TYPE_UINT32_ARRAY: + case DATA_TYPE_INT64_ARRAY: + case DATA_TYPE_UINT64_ARRAY: + case DATA_TYPE_STRING_ARRAY: + case DATA_TYPE_NVLIST_ARRAY: + val = va_arg(ap, void *); + nelem = va_arg(ap, uint_t *); + ret = nvlist_lookup_common(nvl, name, type, nelem, val); + break; + + default: + ret = EINVAL; + } + + if (ret == ENOENT && noentok) + ret = 0; + } + va_end(ap); + + return (ret); +} + +int +nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val)); +} + +int +nvpair_value_byte(nvpair_t *nvp, uchar_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val)); +} + +int +nvpair_value_int8(nvpair_t *nvp, int8_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val)); +} + +int +nvpair_value_uint8(nvpair_t *nvp, uint8_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val)); +} + +int +nvpair_value_int16(nvpair_t *nvp, int16_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val)); +} + +int +nvpair_value_uint16(nvpair_t *nvp, uint16_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val)); +} + +int +nvpair_value_int32(nvpair_t *nvp, int32_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val)); +} + +int +nvpair_value_uint32(nvpair_t *nvp, uint32_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val)); +} + +int +nvpair_value_int64(nvpair_t *nvp, int64_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val)); +} + +int +nvpair_value_uint64(nvpair_t *nvp, uint64_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val)); +} + +int +nvpair_value_string(nvpair_t *nvp, char **val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val)); +} + +int +nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val)); +} + +int +nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val)); +} + +int +nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val)); +} + +int +nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val)); +} + +int +nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val)); +} + +int +nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val)); +} + +int +nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val)); +} + +int +nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val)); +} + +int +nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT32_ARRAY, nelem, val)); +} + +int +nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val)); +} + +int +nvpair_value_uint64_array(nvpair_t *nvp, uint64_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val)); +} + +int +nvpair_value_string_array(nvpair_t *nvp, char ***val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val)); +} + +int +nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val)); +} + +int +nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val)); +} + +/* + * Add specified pair to the list. + */ +int +nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + if (nvl == NULL || nvp == NULL) + return (EINVAL); + + return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp), + NVP_NELEM(nvp), NVP_VALUE(nvp))); +} + +/* + * Merge the supplied nvlists and put the result in dst. + * The merged list will contain all names specified in both lists, + * the values are taken from nvl in the case of duplicates. + * Return 0 on success. + */ +/*ARGSUSED*/ +int +nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag) +{ + if (nvl == NULL || dst == NULL) + return (EINVAL); + + if (dst != nvl) + return (nvlist_copy_pairs(nvl, dst)); + + return (0); +} + +/* + * Encoding related routines + */ +#define NVS_OP_ENCODE 0 +#define NVS_OP_DECODE 1 +#define NVS_OP_GETSIZE 2 + +typedef struct nvs_ops nvs_ops_t; + +typedef struct { + int nvs_op; + const nvs_ops_t *nvs_ops; + void *nvs_private; + nvpriv_t *nvs_priv; +} nvstream_t; + +/* + * nvs operations are: + * - nvs_nvlist + * encoding / decoding of a nvlist header (nvlist_t) + * calculates the size used for header and end detection + * + * - nvs_nvpair + * responsible for the first part of encoding / decoding of an nvpair + * calculates the decoded size of an nvpair + * + * - nvs_nvp_op + * second part of encoding / decoding of an nvpair + * + * - nvs_nvp_size + * calculates the encoding size of an nvpair + * + * - nvs_nvl_fini + * encodes the end detection mark (zeros). + */ +struct nvs_ops { + int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *); + int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *); + int (*nvs_nvp_op)(nvstream_t *, nvpair_t *); + int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *); + int (*nvs_nvl_fini)(nvstream_t *); +}; + +typedef struct { + char nvh_encoding; /* nvs encoding method */ + char nvh_endian; /* nvs endian */ + char nvh_reserved1; /* reserved for future use */ + char nvh_reserved2; /* reserved for future use */ +} nvs_header_t; + +static int +nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + i_nvp_t *curr; + + /* + * Walk nvpair in list and encode each nvpair + */ + for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) + if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0) + return (EFAULT); + + return (nvs->nvs_ops->nvs_nvl_fini(nvs)); +} + +static int +nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl) +{ + nvpair_t *nvp; + size_t nvsize; + int err; + + /* + * Get decoded size of next pair in stream, alloc + * memory for nvpair_t, then decode the nvpair + */ + while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) { + if (nvsize == 0) /* end of list */ + break; + + /* make sure len makes sense */ + if (nvsize < NVP_SIZE_CALC(1, 0)) + return (EFAULT); + + if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL) + return (ENOMEM); + + if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) { + nvp_buf_free(nvl, nvp); + return (err); + } + + if (i_validate_nvpair(nvp) != 0) { + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + return (EFAULT); + } + + nvp_buf_link(nvl, nvp); + } + return (err); +} + +static int +nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + i_nvp_t *curr; + uint64_t nvsize = *buflen; + size_t size; + + /* + * Get encoded size of nvpairs in nvlist + */ + for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { + if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0) + return (EINVAL); + + if ((nvsize += size) > INT32_MAX) + return (EINVAL); + } + + *buflen = nvsize; + return (0); +} + +static int +nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) +{ + int err; + + if (nvl->nvl_priv == 0) + return (EFAULT); + + /* + * Perform the operation, starting with header, then each nvpair + */ + if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0) + return (err); + + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + err = nvs_encode_pairs(nvs, nvl); + break; + + case NVS_OP_DECODE: + err = nvs_decode_pairs(nvs, nvl); + break; + + case NVS_OP_GETSIZE: + err = nvs_getsize_pairs(nvs, nvl, buflen); + break; + + default: + err = EINVAL; + } + + return (err); +} + +static int +nvs_embedded(nvstream_t *nvs, nvlist_t *embedded) +{ + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + return (nvs_operation(nvs, embedded, NULL)); + + case NVS_OP_DECODE: { + nvpriv_t *priv; + int err; + + if (embedded->nvl_version != NV_VERSION) + return (ENOTSUP); + + if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL) + return (ENOMEM); + + nvlist_init(embedded, embedded->nvl_nvflag, priv); + + if ((err = nvs_operation(nvs, embedded, NULL)) != 0) + nvlist_free(embedded); + return (err); + } + default: + break; + } + + return (EINVAL); +} + +static int +nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size) +{ + size_t nelem = NVP_NELEM(nvp); + nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); + int i; + + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + for (i = 0; i < nelem; i++) + if (nvs_embedded(nvs, nvlp[i]) != 0) + return (EFAULT); + break; + + case NVS_OP_DECODE: { + size_t len = nelem * sizeof (uint64_t); + nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len); + + bzero(nvlp, len); /* don't trust packed data */ + for (i = 0; i < nelem; i++) { + if (nvs_embedded(nvs, embedded) != 0) { + nvpair_free(nvp); + return (EFAULT); + } + + nvlp[i] = embedded++; + } + break; + } + case NVS_OP_GETSIZE: { + uint64_t nvsize = 0; + + for (i = 0; i < nelem; i++) { + size_t nvp_sz = 0; + + if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0) + return (EINVAL); + + if ((nvsize += nvp_sz) > INT32_MAX) + return (EINVAL); + } + + *size = nvsize; + break; + } + default: + return (EINVAL); + } + + return (0); +} + +static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *); +static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *); + +/* + * Common routine for nvlist operations: + * encode, decode, getsize (encoded size). + */ +static int +nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding, + int nvs_op) +{ + int err = 0; + nvstream_t nvs; + int nvl_endian; +#ifdef _LITTLE_ENDIAN + int host_endian = 1; +#else + int host_endian = 0; +#endif /* _LITTLE_ENDIAN */ + nvs_header_t *nvh = (void *)buf; + + if (buflen == NULL || nvl == NULL || + (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (EINVAL); + + nvs.nvs_op = nvs_op; + + /* + * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and + * a buffer is allocated. The first 4 bytes in the buffer are + * used for encoding method and host endian. + */ + switch (nvs_op) { + case NVS_OP_ENCODE: + if (buf == NULL || *buflen < sizeof (nvs_header_t)) + return (EINVAL); + + nvh->nvh_encoding = encoding; + nvh->nvh_endian = nvl_endian = host_endian; + nvh->nvh_reserved1 = 0; + nvh->nvh_reserved2 = 0; + break; + + case NVS_OP_DECODE: + if (buf == NULL || *buflen < sizeof (nvs_header_t)) + return (EINVAL); + + /* get method of encoding from first byte */ + encoding = nvh->nvh_encoding; + nvl_endian = nvh->nvh_endian; + break; + + case NVS_OP_GETSIZE: + nvl_endian = host_endian; + + /* + * add the size for encoding + */ + *buflen = sizeof (nvs_header_t); + break; + + default: + return (ENOTSUP); + } + + /* + * Create an nvstream with proper encoding method + */ + switch (encoding) { + case NV_ENCODE_NATIVE: + /* + * check endianness, in case we are unpacking + * from a file + */ + if (nvl_endian != host_endian) + return (ENOTSUP); + err = nvs_native(&nvs, nvl, buf, buflen); + break; + case NV_ENCODE_XDR: + err = nvs_xdr(&nvs, nvl, buf, buflen); + break; + default: + err = ENOTSUP; + break; + } + + return (err); +} + +int +nvlist_size(nvlist_t *nvl, size_t *size, int encoding) +{ + return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE)); +} + +/* + * Pack nvlist into contiguous memory + */ +/*ARGSUSED1*/ +int +nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, + int kmflag) +{ +#if defined(_KERNEL) && !defined(_BOOT) + return (nvlist_xpack(nvl, bufp, buflen, encoding, + (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); +#else + return (nvlist_xpack(nvl, bufp, buflen, encoding, nv_alloc_nosleep)); +#endif +} + +int +nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, + nv_alloc_t *nva) +{ + nvpriv_t nvpriv; + size_t alloc_size; + char *buf; + int err; + + if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL) + return (EINVAL); + + if (*bufp != NULL) + return (nvlist_common(nvl, *bufp, buflen, encoding, + NVS_OP_ENCODE)); + + /* + * Here is a difficult situation: + * 1. The nvlist has fixed allocator properties. + * All other nvlist routines (like nvlist_add_*, ...) use + * these properties. + * 2. When using nvlist_pack() the user can specify his own + * allocator properties (e.g. by using KM_NOSLEEP). + * + * We use the user specified properties (2). A clearer solution + * will be to remove the kmflag from nvlist_pack(), but we will + * not change the interface. + */ + nv_priv_init(&nvpriv, nva, 0); + + if (err = nvlist_size(nvl, &alloc_size, encoding)) + return (err); + + if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL) + return (ENOMEM); + + if ((err = nvlist_common(nvl, buf, &alloc_size, encoding, + NVS_OP_ENCODE)) != 0) { + nv_mem_free(&nvpriv, buf, alloc_size); + } else { + *buflen = alloc_size; + *bufp = buf; + } + + return (err); +} + +/* + * Unpack buf into an nvlist_t + */ +/*ARGSUSED1*/ +int +nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag) +{ +#if defined(_KERNEL) && !defined(_BOOT) + return (nvlist_xunpack(buf, buflen, nvlp, + (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); +#else + return (nvlist_xunpack(buf, buflen, nvlp, nv_alloc_nosleep)); +#endif +} + +int +nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva) +{ + nvlist_t *nvl; + int err; + + if (nvlp == NULL) + return (EINVAL); + + if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0) + return (err); + + if ((err = nvlist_common(nvl, buf, &buflen, 0, NVS_OP_DECODE)) != 0) + nvlist_free(nvl); + else + *nvlp = nvl; + + return (err); +} + +/* + * Native encoding functions + */ +typedef struct { + /* + * This structure is used when decoding a packed nvpair in + * the native format. n_base points to a buffer containing the + * packed nvpair. n_end is a pointer to the end of the buffer. + * (n_end actually points to the first byte past the end of the + * buffer.) n_curr is a pointer that lies between n_base and n_end. + * It points to the current data that we are decoding. + * The amount of data left in the buffer is equal to n_end - n_curr. + * n_flag is used to recognize a packed embedded list. + */ + caddr_t n_base; + caddr_t n_end; + caddr_t n_curr; + uint_t n_flag; +} nvs_native_t; + +static int +nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf, + size_t buflen) +{ + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + case NVS_OP_DECODE: + nvs->nvs_private = native; + native->n_curr = native->n_base = buf; + native->n_end = buf + buflen; + native->n_flag = 0; + return (0); + + case NVS_OP_GETSIZE: + nvs->nvs_private = native; + native->n_curr = native->n_base = native->n_end = NULL; + native->n_flag = 0; + return (0); + default: + return (EINVAL); + } +} + +/*ARGSUSED*/ +static void +nvs_native_destroy(nvstream_t *nvs) +{ +} + +static int +native_cp(nvstream_t *nvs, void *buf, size_t size) +{ + nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; + + if (native->n_curr + size > native->n_end) + return (EFAULT); + + /* + * The bcopy() below eliminates alignment requirement + * on the buffer (stream) and is preferred over direct access. + */ + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + bcopy(buf, native->n_curr, size); + break; + case NVS_OP_DECODE: + bcopy(native->n_curr, buf, size); + break; + default: + return (EINVAL); + } + + native->n_curr += size; + return (0); +} + +/* + * operate on nvlist_t header + */ +static int +nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) +{ + nvs_native_t *native = nvs->nvs_private; + + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + case NVS_OP_DECODE: + if (native->n_flag) + return (0); /* packed embedded list */ + + native->n_flag = 1; + + /* copy version and nvflag of the nvlist_t */ + if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 || + native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0) + return (EFAULT); + + return (0); + + case NVS_OP_GETSIZE: + /* + * if calculate for packed embedded list + * 4 for end of the embedded list + * else + * 2 * sizeof (int32_t) for nvl_version and nvl_nvflag + * and 4 for end of the entire list + */ + if (native->n_flag) { + *size += 4; + } else { + native->n_flag = 1; + *size += 2 * sizeof (int32_t) + 4; + } + + return (0); + + default: + return (EINVAL); + } +} + +static int +nvs_native_nvl_fini(nvstream_t *nvs) +{ + if (nvs->nvs_op == NVS_OP_ENCODE) { + nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; + /* + * Add 4 zero bytes at end of nvlist. They are used + * for end detection by the decode routine. + */ + if (native->n_curr + sizeof (int) > native->n_end) + return (EFAULT); + + bzero(native->n_curr, sizeof (int)); + native->n_curr += sizeof (int); + } + + return (0); +} + +static int +nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp) +{ + if (nvs->nvs_op == NVS_OP_ENCODE) { + nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; + nvlist_t *packed = (void *) + (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); + /* + * Null out the pointer that is meaningless in the packed + * structure. The address may not be aligned, so we have + * to use bzero. + */ + bzero(&packed->nvl_priv, sizeof (packed->nvl_priv)); + } + + return (nvs_embedded(nvs, EMBEDDED_NVL(nvp))); +} + +static int +nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp) +{ + if (nvs->nvs_op == NVS_OP_ENCODE) { + nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; + char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp); + size_t len = NVP_NELEM(nvp) * sizeof (uint64_t); + nvlist_t *packed = (nvlist_t *)((uintptr_t)value + len); + int i; + /* + * Null out pointers that are meaningless in the packed + * structure. The addresses may not be aligned, so we have + * to use bzero. + */ + bzero(value, len); + + for (i = 0; i < NVP_NELEM(nvp); i++, packed++) + /* + * Null out the pointer that is meaningless in the + * packed structure. The address may not be aligned, + * so we have to use bzero. + */ + bzero(&packed->nvl_priv, sizeof (packed->nvl_priv)); + } + + return (nvs_embedded_nvl_array(nvs, nvp, NULL)); +} + +static void +nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp) +{ + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: { + nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; + uint64_t *strp = (void *) + (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); + /* + * Null out pointers that are meaningless in the packed + * structure. The addresses may not be aligned, so we have + * to use bzero. + */ + bzero(strp, NVP_NELEM(nvp) * sizeof (uint64_t)); + break; + } + case NVS_OP_DECODE: { + char **strp = (void *)NVP_VALUE(nvp); + char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t)); + int i; + + for (i = 0; i < NVP_NELEM(nvp); i++) { + strp[i] = buf; + buf += strlen(buf) + 1; + } + break; + } + } +} + +static int +nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp) +{ + data_type_t type; + int value_sz; + int ret = 0; + + /* + * We do the initial bcopy of the data before we look at + * the nvpair type, because when we're decoding, we won't + * have the correct values for the pair until we do the bcopy. + */ + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + case NVS_OP_DECODE: + if (native_cp(nvs, nvp, nvp->nvp_size) != 0) + return (EFAULT); + break; + default: + return (EINVAL); + } + + /* verify nvp_name_sz, check the name string length */ + if (i_validate_nvpair_name(nvp) != 0) + return (EFAULT); + + type = NVP_TYPE(nvp); + + /* + * Verify type and nelem and get the value size. + * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY + * is the size of the string(s) excluded. + */ + if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0) + return (EFAULT); + + if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size) + return (EFAULT); + + switch (type) { + case DATA_TYPE_NVLIST: + ret = nvpair_native_embedded(nvs, nvp); + break; + case DATA_TYPE_NVLIST_ARRAY: + ret = nvpair_native_embedded_array(nvs, nvp); + break; + case DATA_TYPE_STRING_ARRAY: + nvpair_native_string_array(nvs, nvp); + break; + default: + break; + } + + return (ret); +} + +static int +nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) +{ + uint64_t nvp_sz = nvp->nvp_size; + + switch (NVP_TYPE(nvp)) { + case DATA_TYPE_NVLIST: { + size_t nvsize = 0; + + if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0) + return (EINVAL); + + nvp_sz += nvsize; + break; + } + case DATA_TYPE_NVLIST_ARRAY: { + size_t nvsize; + + if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0) + return (EINVAL); + + nvp_sz += nvsize; + break; + } + default: + break; + } + + if (nvp_sz > INT32_MAX) + return (EINVAL); + + *size = nvp_sz; + + return (0); +} + +static int +nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) +{ + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + return (nvs_native_nvp_op(nvs, nvp)); + + case NVS_OP_DECODE: { + nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; + int32_t decode_len; + + /* try to read the size value from the stream */ + if (native->n_curr + sizeof (int32_t) > native->n_end) + return (EFAULT); + bcopy(native->n_curr, &decode_len, sizeof (int32_t)); + + /* sanity check the size value */ + if (decode_len < 0 || + decode_len > native->n_end - native->n_curr) + return (EFAULT); + + *size = decode_len; + + /* + * If at the end of the stream then move the cursor + * forward, otherwise nvpair_native_op() will read + * the entire nvpair at the same cursor position. + */ + if (*size == 0) + native->n_curr += sizeof (int32_t); + break; + } + + default: + return (EINVAL); + } + + return (0); +} + +static const nvs_ops_t nvs_native_ops = { + nvs_native_nvlist, + nvs_native_nvpair, + nvs_native_nvp_op, + nvs_native_nvp_size, + nvs_native_nvl_fini +}; + +static int +nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) +{ + nvs_native_t native; + int err; + + nvs->nvs_ops = &nvs_native_ops; + + if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t), + *buflen - sizeof (nvs_header_t))) != 0) + return (err); + + err = nvs_operation(nvs, nvl, buflen); + + nvs_native_destroy(nvs); + + return (err); +} + +/* + * XDR encoding functions + * + * An xdr packed nvlist is encoded as: + * + * - encoding methode and host endian (4 bytes) + * - nvl_version (4 bytes) + * - nvl_nvflag (4 bytes) + * + * - encoded nvpairs, the format of one xdr encoded nvpair is: + * - encoded size of the nvpair (4 bytes) + * - decoded size of the nvpair (4 bytes) + * - name string, (4 + sizeof(NV_ALIGN4(string)) + * a string is coded as size (4 bytes) and data + * - data type (4 bytes) + * - number of elements in the nvpair (4 bytes) + * - data + * + * - 2 zero's for end of the entire list (8 bytes) + */ +static int +nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen) +{ + /* xdr data must be 4 byte aligned */ + if ((ulong_t)buf % 4 != 0) + return (EFAULT); + + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE); + nvs->nvs_private = xdr; + return (0); + case NVS_OP_DECODE: + xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE); + nvs->nvs_private = xdr; + return (0); + case NVS_OP_GETSIZE: + nvs->nvs_private = NULL; + return (0); + default: + return (EINVAL); + } +} + +static void +nvs_xdr_destroy(nvstream_t *nvs) +{ + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + case NVS_OP_DECODE: + xdr_destroy((XDR *)nvs->nvs_private); + break; + default: + break; + } +} + +static int +nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) +{ + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + case NVS_OP_DECODE: { + XDR *xdr = nvs->nvs_private; + + if (!xdr_int(xdr, &nvl->nvl_version) || + !xdr_u_int(xdr, &nvl->nvl_nvflag)) + return (EFAULT); + break; + } + case NVS_OP_GETSIZE: { + /* + * 2 * 4 for nvl_version + nvl_nvflag + * and 8 for end of the entire list + */ + *size += 2 * 4 + 8; + break; + } + default: + return (EINVAL); + } + return (0); +} + +static int +nvs_xdr_nvl_fini(nvstream_t *nvs) +{ + if (nvs->nvs_op == NVS_OP_ENCODE) { + XDR *xdr = nvs->nvs_private; + int zero = 0; + + if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero)) + return (EFAULT); + } + + return (0); +} + +/* + * The format of xdr encoded nvpair is: + * encode_size, decode_size, name string, data type, nelem, data + */ +static int +nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) +{ + data_type_t type; + char *buf; + char *buf_end = (char *)nvp + nvp->nvp_size; + int value_sz; + uint_t nelem, buflen; + bool_t ret = FALSE; + XDR *xdr = nvs->nvs_private; + + ASSERT(xdr != NULL && nvp != NULL); + + /* name string */ + if ((buf = NVP_NAME(nvp)) >= buf_end) + return (EFAULT); + buflen = buf_end - buf; + + if (!xdr_string(xdr, &buf, buflen - 1)) + return (EFAULT); + nvp->nvp_name_sz = strlen(buf) + 1; + + /* type and nelem */ + if (!xdr_int(xdr, (int *)&nvp->nvp_type) || + !xdr_int(xdr, &nvp->nvp_value_elem)) + return (EFAULT); + + type = NVP_TYPE(nvp); + nelem = nvp->nvp_value_elem; + + /* + * Verify type and nelem and get the value size. + * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY + * is the size of the string(s) excluded. + */ + if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0) + return (EFAULT); + + /* if there is no data to extract then return */ + if (nelem == 0) + return (0); + + /* value */ + if ((buf = NVP_VALUE(nvp)) >= buf_end) + return (EFAULT); + buflen = buf_end - buf; + + if (buflen < value_sz) + return (EFAULT); + + switch (type) { + case DATA_TYPE_NVLIST: + if (nvs_embedded(nvs, (void *)buf) == 0) + return (0); + break; + + case DATA_TYPE_NVLIST_ARRAY: + if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0) + return (0); + break; + + case DATA_TYPE_BOOLEAN: + ret = TRUE; + break; + + case DATA_TYPE_BYTE: + case DATA_TYPE_INT8: + case DATA_TYPE_UINT8: + ret = xdr_char(xdr, buf); + break; + + case DATA_TYPE_INT16: + ret = xdr_short(xdr, (void *)buf); + break; + + case DATA_TYPE_UINT16: + ret = xdr_u_short(xdr, (void *)buf); + break; + + case DATA_TYPE_BOOLEAN_VALUE: + case DATA_TYPE_INT32: + ret = xdr_int(xdr, (void *)buf); + break; + + case DATA_TYPE_UINT32: + ret = xdr_u_int(xdr, (void *)buf); + break; + + case DATA_TYPE_INT64: + ret = xdr_longlong_t(xdr, (void *)buf); + break; + + case DATA_TYPE_UINT64: + ret = xdr_u_longlong_t(xdr, (void *)buf); + break; + + case DATA_TYPE_HRTIME: + /* + * NOTE: must expose the definition of hrtime_t here + */ + ret = xdr_longlong_t(xdr, (void *)buf); + break; + + case DATA_TYPE_STRING: + ret = xdr_string(xdr, &buf, buflen - 1); + break; + + case DATA_TYPE_BYTE_ARRAY: + ret = xdr_opaque(xdr, buf, nelem); + break; + + case DATA_TYPE_INT8_ARRAY: + case DATA_TYPE_UINT8_ARRAY: + ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t), + (xdrproc_t)xdr_char); + break; + + case DATA_TYPE_INT16_ARRAY: + ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t), + sizeof (int16_t), (xdrproc_t)xdr_short); + break; + + case DATA_TYPE_UINT16_ARRAY: + ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t), + sizeof (uint16_t), (xdrproc_t)xdr_u_short); + break; + + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_INT32_ARRAY: + ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t), + sizeof (int32_t), (xdrproc_t)xdr_int); + break; + + case DATA_TYPE_UINT32_ARRAY: + ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t), + sizeof (uint32_t), (xdrproc_t)xdr_u_int); + break; + + case DATA_TYPE_INT64_ARRAY: + ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t), + sizeof (int64_t), (xdrproc_t)xdr_longlong_t); + break; + + case DATA_TYPE_UINT64_ARRAY: + ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t), + sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t); + break; + + case DATA_TYPE_STRING_ARRAY: { + size_t len = nelem * sizeof (uint64_t); + char **strp = (void *)buf; + int i; + + if (nvs->nvs_op == NVS_OP_DECODE) + bzero(buf, len); /* don't trust packed data */ + + for (i = 0; i < nelem; i++) { + if (buflen <= len) + return (EFAULT); + + buf += len; + buflen -= len; + + if (xdr_string(xdr, &buf, buflen - 1) != TRUE) + return (EFAULT); + + if (nvs->nvs_op == NVS_OP_DECODE) + strp[i] = buf; + len = strlen(buf) + 1; + } + ret = TRUE; + break; + } + default: + break; + } + + return (ret == TRUE ? 0 : EFAULT); +} + +static int +nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) +{ + data_type_t type = NVP_TYPE(nvp); + /* + * encode_size + decode_size + name string size + data type + nelem + * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + */ + uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4; + + switch (type) { + case DATA_TYPE_BOOLEAN: + break; + + case DATA_TYPE_BOOLEAN_VALUE: + case DATA_TYPE_BYTE: + case DATA_TYPE_INT8: + case DATA_TYPE_UINT8: + case DATA_TYPE_INT16: + case DATA_TYPE_UINT16: + case DATA_TYPE_INT32: + case DATA_TYPE_UINT32: + nvp_sz += 4; /* 4 is the minimum xdr unit */ + break; + + case DATA_TYPE_INT64: + case DATA_TYPE_UINT64: + case DATA_TYPE_HRTIME: + nvp_sz += 8; + break; + + case DATA_TYPE_STRING: + nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp))); + break; + + case DATA_TYPE_BYTE_ARRAY: + nvp_sz += NV_ALIGN4(NVP_NELEM(nvp)); + break; + + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_INT8_ARRAY: + case DATA_TYPE_UINT8_ARRAY: + case DATA_TYPE_INT16_ARRAY: + case DATA_TYPE_UINT16_ARRAY: + case DATA_TYPE_INT32_ARRAY: + case DATA_TYPE_UINT32_ARRAY: + nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp); + break; + + case DATA_TYPE_INT64_ARRAY: + case DATA_TYPE_UINT64_ARRAY: + nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp); + break; + + case DATA_TYPE_STRING_ARRAY: { + int i; + char **strs = (void *)NVP_VALUE(nvp); + + for (i = 0; i < NVP_NELEM(nvp); i++) + nvp_sz += 4 + NV_ALIGN4(strlen(strs[i])); + + break; + } + + case DATA_TYPE_NVLIST: + case DATA_TYPE_NVLIST_ARRAY: { + size_t nvsize = 0; + int old_nvs_op = nvs->nvs_op; + int err; + + nvs->nvs_op = NVS_OP_GETSIZE; + if (type == DATA_TYPE_NVLIST) + err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize); + else + err = nvs_embedded_nvl_array(nvs, nvp, &nvsize); + nvs->nvs_op = old_nvs_op; + + if (err != 0) + return (EINVAL); + + nvp_sz += nvsize; + break; + } + + default: + return (EINVAL); + } + + if (nvp_sz > INT32_MAX) + return (EINVAL); + + *size = nvp_sz; + + return (0); +} + + +/* + * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates + * the largest nvpair that could be encoded in the buffer. + * + * See comments above nvpair_xdr_op() for the format of xdr encoding. + * The size of a xdr packed nvpair without any data is 5 words. + * + * Using the size of the data directly as an estimate would be ok + * in all cases except one. If the data type is of DATA_TYPE_STRING_ARRAY + * then the actual nvpair has space for an array of pointers to index + * the strings. These pointers are not encoded into the packed xdr buffer. + * + * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are + * of length 0, then each string is endcoded in xdr format as a single word. + * Therefore when expanded to an nvpair there will be 2.25 word used for + * each string. (a int64_t allocated for pointer usage, and a single char + * for the null termination.) + * + * This is the calculation performed by the NVS_XDR_MAX_LEN macro. + */ +#define NVS_XDR_HDR_LEN ((size_t)(5 * 4)) +#define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ? \ + 0 : ((size_t)(y) - NVS_XDR_HDR_LEN)) +#define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \ + (NVS_XDR_DATA_LEN(x) * 2) + \ + NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4))) + +static int +nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) +{ + XDR *xdr = nvs->nvs_private; + int32_t encode_len, decode_len; + + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: { + size_t nvsize; + + if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0) + return (EFAULT); + + decode_len = nvp->nvp_size; + encode_len = nvsize; + if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) + return (EFAULT); + + return (nvs_xdr_nvp_op(nvs, nvp)); + } + case NVS_OP_DECODE: { + struct xdr_bytesrec bytesrec; + + /* get the encode and decode size */ + if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) + return (EFAULT); + *size = decode_len; + + /* are we at the end of the stream? */ + if (*size == 0) + return (0); + + /* sanity check the size parameter */ + if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec)) + return (EFAULT); + + if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail)) + return (EFAULT); + break; + } + + default: + return (EINVAL); + } + return (0); +} + +static const struct nvs_ops nvs_xdr_ops = { + nvs_xdr_nvlist, + nvs_xdr_nvpair, + nvs_xdr_nvp_op, + nvs_xdr_nvp_size, + nvs_xdr_nvl_fini +}; + +static int +nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) +{ + XDR xdr; + int err; + + nvs->nvs_ops = &nvs_xdr_ops; + + if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t), + *buflen - sizeof (nvs_header_t))) != 0) + return (err); + + err = nvs_operation(nvs, nvl, buflen); + + nvs_xdr_destroy(nvs); + + return (err); +} diff --git a/sys/contrib/opensolaris/common/nvpair/nvpair_alloc_fixed.c b/sys/contrib/opensolaris/common/nvpair/nvpair_alloc_fixed.c new file mode 100644 index 0000000..620171e --- /dev/null +++ b/sys/contrib/opensolaris/common/nvpair/nvpair_alloc_fixed.c @@ -0,0 +1,118 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/nvpair.h> +#include <sys/sysmacros.h> +#if defined(_KERNEL) && !defined(_BOOT) +#include <sys/varargs.h> +#else +#include <stdarg.h> +#include <strings.h> +#endif + +/* + * This allocator is very simple. + * - it uses a pre-allocated buffer for memory allocations. + * - it does _not_ free memory in the pre-allocated buffer. + * + * The reason for the selected implemention is simplicity. + * This allocator is designed for the usage in interrupt context when + * the caller may not wait for free memory. + */ + +/* pre-allocated buffer for memory allocations */ +typedef struct nvbuf { + uintptr_t nvb_buf; /* address of pre-allocated buffer */ + uintptr_t nvb_lim; /* limit address in the buffer */ + uintptr_t nvb_cur; /* current address in the buffer */ +} nvbuf_t; + +/* + * Initialize the pre-allocated buffer allocator. The caller needs to supply + * + * buf address of pre-allocated buffer + * bufsz size of pre-allocated buffer + * + * nv_fixed_init() calculates the remaining members of nvbuf_t. + */ +static int +nv_fixed_init(nv_alloc_t *nva, va_list valist) +{ + uintptr_t base = va_arg(valist, uintptr_t); + uintptr_t lim = base + va_arg(valist, size_t); + nvbuf_t *nvb = (nvbuf_t *)P2ROUNDUP(base, sizeof (uintptr_t)); + + if (base == 0 || (uintptr_t)&nvb[1] > lim) + return (EINVAL); + + nvb->nvb_buf = (uintptr_t)&nvb[0]; + nvb->nvb_cur = (uintptr_t)&nvb[1]; + nvb->nvb_lim = lim; + nva->nva_arg = nvb; + + return (0); +} + +static void * +nv_fixed_alloc(nv_alloc_t *nva, size_t size) +{ + nvbuf_t *nvb = nva->nva_arg; + uintptr_t new = nvb->nvb_cur; + + if (size == 0 || new + size > nvb->nvb_lim) + return (NULL); + + nvb->nvb_cur = P2ROUNDUP(new + size, sizeof (uintptr_t)); + + return ((void *)new); +} + +/*ARGSUSED*/ +static void +nv_fixed_free(nv_alloc_t *nva, void *buf, size_t size) +{ + /* don't free memory in the pre-allocated buffer */ +} + +static void +nv_fixed_reset(nv_alloc_t *nva) +{ + nvbuf_t *nvb = nva->nva_arg; + + nvb->nvb_cur = (uintptr_t)&nvb[1]; +} + +const nv_alloc_ops_t nv_fixed_ops_def = { + nv_fixed_init, /* nv_ao_init() */ + NULL, /* nv_ao_fini() */ + nv_fixed_alloc, /* nv_ao_alloc() */ + nv_fixed_free, /* nv_ao_free() */ + nv_fixed_reset /* nv_ao_reset() */ +}; + +const nv_alloc_ops_t *nv_fixed_ops = &nv_fixed_ops_def; diff --git a/sys/contrib/opensolaris/common/zfs/zfs_namecheck.c b/sys/contrib/opensolaris/common/zfs/zfs_namecheck.c new file mode 100644 index 0000000..2004d86 --- /dev/null +++ b/sys/contrib/opensolaris/common/zfs/zfs_namecheck.c @@ -0,0 +1,287 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Common name validation routines for ZFS. These routines are shared by the + * userland code as well as the ioctl() layer to ensure that we don't + * inadvertently expose a hole through direct ioctl()s that never gets tested. + * In userland, however, we want significantly more information about _why_ the + * name is invalid. In the kernel, we only care whether it's valid or not. + * Each routine therefore takes a 'namecheck_err_t' which describes exactly why + * the name failed to validate. + * + * Each function returns 0 on success, -1 on error. + */ + +#if defined(_KERNEL) +#include <sys/systm.h> +#else +#include <string.h> +#endif + +#include <sys/param.h> +#include "zfs_namecheck.h" + +static int +valid_char(char c) +{ + return ((c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || + c == '-' || c == '_' || c == '.' || c == ':'); +} + +/* + * Snapshot names must be made up of alphanumeric characters plus the following + * characters: + * + * [-_.:] + */ +int +snapshot_namecheck(const char *path, namecheck_err_t *why, char *what) +{ + const char *loc; + + if (strlen(path) >= MAXNAMELEN) { + if (why) + *why = NAME_ERR_TOOLONG; + return (-1); + } + + if (path[0] == '\0') { + if (why) + *why = NAME_ERR_EMPTY_COMPONENT; + return (-1); + } + + for (loc = path; *loc; loc++) { + if (!valid_char(*loc)) { + if (why) { + *why = NAME_ERR_INVALCHAR; + *what = *loc; + } + return (-1); + } + } + return (0); +} + +/* + * Dataset names must be of the following form: + * + * [component][/]*[component][@component] + * + * Where each component is made up of alphanumeric characters plus the following + * characters: + * + * [-_.:] + */ +int +dataset_namecheck(const char *path, namecheck_err_t *why, char *what) +{ + const char *loc, *end; + int found_snapshot; + + /* + * Make sure the name is not too long. + * + * ZFS_MAXNAMELEN is the maximum dataset length used in the userland + * which is the same as MAXNAMELEN used in the kernel. + * If ZFS_MAXNAMELEN value is changed, make sure to cleanup all + * places using MAXNAMELEN. + */ + if (strlen(path) >= MAXNAMELEN) { + if (why) + *why = NAME_ERR_TOOLONG; + return (-1); + } + + /* Explicitly check for a leading slash. */ + if (path[0] == '/') { + if (why) + *why = NAME_ERR_LEADING_SLASH; + return (-1); + } + + if (path[0] == '\0') { + if (why) + *why = NAME_ERR_EMPTY_COMPONENT; + return (-1); + } + + loc = path; + found_snapshot = 0; + for (;;) { + /* Find the end of this component */ + end = loc; + while (*end != '/' && *end != '@' && *end != '\0') + end++; + + if (*end == '\0' && end[-1] == '/') { + /* trailing slashes are not allowed */ + if (why) + *why = NAME_ERR_TRAILING_SLASH; + return (-1); + } + + /* Zero-length components are not allowed */ + if (loc == end) { + if (why) { + /* + * Make sure this is really a zero-length + * component and not a '@@'. + */ + if (*end == '@' && found_snapshot) { + *why = NAME_ERR_MULTIPLE_AT; + } else { + *why = NAME_ERR_EMPTY_COMPONENT; + } + } + + return (-1); + } + + /* Validate the contents of this component */ + while (loc != end) { + if (!valid_char(*loc)) { + if (why) { + *why = NAME_ERR_INVALCHAR; + *what = *loc; + } + return (-1); + } + loc++; + } + + /* If we've reached the end of the string, we're OK */ + if (*end == '\0') + return (0); + + if (*end == '@') { + /* + * If we've found an @ symbol, indicate that we're in + * the snapshot component, and report a second '@' + * character as an error. + */ + if (found_snapshot) { + if (why) + *why = NAME_ERR_MULTIPLE_AT; + return (-1); + } + + found_snapshot = 1; + } + + /* + * If there is a '/' in a snapshot name + * then report an error + */ + if (*end == '/' && found_snapshot) { + if (why) + *why = NAME_ERR_TRAILING_SLASH; + return (-1); + } + + /* Update to the next component */ + loc = end + 1; + } +} + +/* + * For pool names, we have the same set of valid characters as described in + * dataset names, with the additional restriction that the pool name must begin + * with a letter. The pool names 'raidz' and 'mirror' are also reserved names + * that cannot be used. + */ +int +pool_namecheck(const char *pool, namecheck_err_t *why, char *what) +{ + const char *c; + + /* + * Make sure the name is not too long. + * + * ZPOOL_MAXNAMELEN is the maximum pool length used in the userland + * which is the same as MAXNAMELEN used in the kernel. + * If ZPOOL_MAXNAMELEN value is changed, make sure to cleanup all + * places using MAXNAMELEN. + */ + if (strlen(pool) >= MAXNAMELEN) { + if (why) + *why = NAME_ERR_TOOLONG; + return (-1); + } + + c = pool; + while (*c != '\0') { + if (!valid_char(*c)) { + if (why) { + *why = NAME_ERR_INVALCHAR; + *what = *c; + } + return (-1); + } + c++; + } + + if (!(*pool >= 'a' && *pool <= 'z') && + !(*pool >= 'A' && *pool <= 'Z')) { + if (why) + *why = NAME_ERR_NOLETTER; + return (-1); + } + + if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) { + if (why) + *why = NAME_ERR_RESERVED; + return (-1); + } + + if (pool[0] == 'c' && (pool[1] >= '0' && pool[1] <= '9')) { + if (why) + *why = NAME_ERR_DISKLIKE; + return (-1); + } + + return (0); +} + +/* + * Check if the dataset name is private for internal usage. + * '$' is reserved for internal dataset names. e.g. "$MOS" + * + * Return 1 if the given name is used internally. + * Return 0 if it is not. + */ +int +dataset_name_hidden(const char *name) +{ + if (strchr(name, '$') != NULL) + return (1); + + return (0); +} diff --git a/sys/contrib/opensolaris/common/zfs/zfs_namecheck.h b/sys/contrib/opensolaris/common/zfs/zfs_namecheck.h new file mode 100644 index 0000000..7e0cda9 --- /dev/null +++ b/sys/contrib/opensolaris/common/zfs/zfs_namecheck.h @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZFS_NAMECHECK_H +#define _ZFS_NAMECHECK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + NAME_ERR_LEADING_SLASH, /* name begins with leading slash */ + NAME_ERR_EMPTY_COMPONENT, /* name contains an empty component */ + NAME_ERR_TRAILING_SLASH, /* name ends with a slash */ + NAME_ERR_INVALCHAR, /* invalid character found */ + NAME_ERR_MULTIPLE_AT, /* multiple '@' characters found */ + NAME_ERR_NOLETTER, /* pool doesn't begin with a letter */ + NAME_ERR_RESERVED, /* entire name is reserved */ + NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */ + NAME_ERR_TOOLONG, /* name is too long */ +} namecheck_err_t; + +int pool_namecheck(const char *, namecheck_err_t *, char *); +int dataset_namecheck(const char *, namecheck_err_t *, char *); +int dataset_name_hidden(const char *); +int snapshot_namecheck(const char *, namecheck_err_t *, char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_NAMECHECK_H */ diff --git a/sys/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/contrib/opensolaris/common/zfs/zfs_prop.c new file mode 100644 index 0000000..7125619 --- /dev/null +++ b/sys/contrib/opensolaris/common/zfs/zfs_prop.c @@ -0,0 +1,657 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Master property table. + * + * This table keeps track of all the properties supported by ZFS, and their + * various attributes. Not all of these are needed by the kernel, and several + * are only used by a single libzfs client. But having them here centralizes + * all property information in one location. + * + * name The human-readable string representing this property + * proptype Basic type (string, boolean, number) + * default Default value for the property. Sadly, C only allows + * you to initialize the first member of a union, so we + * have two default members for each property. + * attr Attributes (readonly, inheritable) for the property + * types Valid dataset types to which this applies + * values String describing acceptable values for the property + * colname The column header for 'zfs list' + * colfmt The column formatting for 'zfs list' + * + * This table must match the order of property types in libzfs.h. + */ + +#include <sys/zio.h> +#include <sys/spa.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_ioctl.h> + +#include "zfs_prop.h" + +#if defined(_KERNEL) +#include <sys/systm.h> +#else +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#endif + +typedef enum { + prop_default, + prop_readonly, + prop_inherit +} prop_attr_t; + +typedef struct { + const char *pd_name; + zfs_proptype_t pd_proptype; + uint64_t pd_numdefault; + const char *pd_strdefault; + prop_attr_t pd_attr; + int pd_types; + const char *pd_values; + const char *pd_colname; + boolean_t pd_rightalign; + boolean_t pd_visible; +} prop_desc_t; + +static prop_desc_t zfs_prop_table[] = { + { "type", prop_type_string, 0, NULL, prop_readonly, + ZFS_TYPE_ANY, "filesystem | volume | snapshot", "TYPE", B_TRUE, + B_TRUE }, + { "creation", prop_type_number, 0, NULL, prop_readonly, + ZFS_TYPE_ANY, "<date>", "CREATION", B_FALSE, B_TRUE }, + { "used", prop_type_number, 0, NULL, prop_readonly, + ZFS_TYPE_ANY, "<size>", "USED", B_TRUE, B_TRUE }, + { "available", prop_type_number, 0, NULL, prop_readonly, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL", B_TRUE, + B_TRUE }, + { "referenced", prop_type_number, 0, NULL, prop_readonly, + ZFS_TYPE_ANY, + "<size>", "REFER", B_TRUE, B_TRUE }, + { "compressratio", prop_type_number, 0, NULL, prop_readonly, + ZFS_TYPE_ANY, "<1.00x or higher if compressed>", "RATIO", B_TRUE, + B_TRUE }, + { "mounted", prop_type_boolean, 0, NULL, prop_readonly, + ZFS_TYPE_FILESYSTEM, "yes | no | -", "MOUNTED", B_TRUE, B_TRUE }, + { "origin", prop_type_string, 0, NULL, prop_readonly, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN", + B_FALSE, B_TRUE }, + { "quota", prop_type_number, 0, NULL, prop_default, + ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA", B_TRUE, B_TRUE }, + { "reservation", prop_type_number, 0, NULL, prop_default, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "<size> | none", "RESERV", B_TRUE, B_TRUE }, + { "volsize", prop_type_number, 0, NULL, prop_default, + ZFS_TYPE_VOLUME, "<size>", "VOLSIZE", B_TRUE, B_TRUE }, + { "volblocksize", prop_type_number, 8192, NULL, prop_readonly, + ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK", B_TRUE, + B_TRUE }, + { "recordsize", prop_type_number, SPA_MAXBLOCKSIZE, NULL, + prop_inherit, + ZFS_TYPE_FILESYSTEM, + "512 to 128k, power of 2", "RECSIZE", B_TRUE, B_TRUE }, + { "mountpoint", prop_type_string, 0, "/", prop_inherit, + ZFS_TYPE_FILESYSTEM, + "<path> | legacy | none", "MOUNTPOINT", B_FALSE, B_TRUE }, + { "sharenfs", prop_type_string, 0, "off", prop_inherit, + ZFS_TYPE_FILESYSTEM, + "on | off | exports(5) options", "SHARENFS", B_FALSE, B_TRUE }, + { "checksum", prop_type_index, ZIO_CHECKSUM_DEFAULT, "on", + prop_inherit, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM", B_TRUE, + B_TRUE }, + { "compression", prop_type_index, ZIO_COMPRESS_DEFAULT, "off", + prop_inherit, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", B_TRUE, B_TRUE }, + { "atime", prop_type_boolean, 1, NULL, prop_inherit, + ZFS_TYPE_FILESYSTEM, + "on | off", "ATIME", B_TRUE, B_TRUE }, + { "devices", prop_type_boolean, 1, NULL, prop_inherit, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + "on | off", "DEVICES", B_TRUE, B_TRUE }, + { "exec", prop_type_boolean, 1, NULL, prop_inherit, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + "on | off", "EXEC", B_TRUE, B_TRUE }, + { "setuid", prop_type_boolean, 1, NULL, prop_inherit, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID", + B_TRUE, B_TRUE }, + { "readonly", prop_type_boolean, 0, NULL, prop_inherit, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "on | off", "RDONLY", B_TRUE, B_TRUE }, + { "jailed", prop_type_boolean, 0, NULL, prop_inherit, + ZFS_TYPE_FILESYSTEM, + "on | off", "JAILED", B_TRUE, B_TRUE }, + { "snapdir", prop_type_index, ZFS_SNAPDIR_HIDDEN, "hidden", + prop_inherit, + ZFS_TYPE_FILESYSTEM, + "hidden | visible", "SNAPDIR", B_TRUE, B_TRUE }, + { "aclmode", prop_type_index, ZFS_ACL_GROUPMASK, "groupmask", + prop_inherit, ZFS_TYPE_FILESYSTEM, + "discard | groupmask | passthrough", "ACLMODE", B_TRUE, B_TRUE }, + { "aclinherit", prop_type_index, ZFS_ACL_SECURE, "secure", + prop_inherit, ZFS_TYPE_FILESYSTEM, + "discard | noallow | secure | passthrough", "ACLINHERIT", B_TRUE, + B_TRUE }, + { "createtxg", prop_type_number, 0, NULL, prop_readonly, + ZFS_TYPE_ANY, NULL, NULL, B_FALSE, B_FALSE }, + { "name", prop_type_string, 0, NULL, prop_readonly, + ZFS_TYPE_ANY, NULL, "NAME", B_FALSE, B_FALSE }, + { "canmount", prop_type_boolean, 1, NULL, prop_default, + ZFS_TYPE_FILESYSTEM, + "on | off", "CANMOUNT", B_TRUE, B_TRUE }, + { "shareiscsi", prop_type_string, 0, "off", prop_inherit, + ZFS_TYPE_ANY, + "on | off | type=<type>", "SHAREISCSI", B_FALSE, B_TRUE }, + { "iscsioptions", prop_type_string, 0, NULL, prop_inherit, + ZFS_TYPE_VOLUME, NULL, "ISCSIOPTIONS", B_FALSE, B_FALSE }, + { "xattr", prop_type_boolean, 1, NULL, prop_inherit, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + "on | off", "XATTR", B_TRUE, B_TRUE }, + { "numclones", prop_type_number, 0, NULL, prop_readonly, + ZFS_TYPE_SNAPSHOT, NULL, NULL, B_FALSE, B_FALSE }, + { "copies", prop_type_index, 1, "1", prop_inherit, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "1 | 2 | 3", "COPIES", B_TRUE, B_TRUE }, + { "bootfs", prop_type_string, 0, NULL, prop_default, + ZFS_TYPE_POOL, "<filesystem>", "BOOTFS", B_FALSE, B_TRUE }, +}; + +#define ZFS_PROP_COUNT ((sizeof (zfs_prop_table))/(sizeof (prop_desc_t))) + +/* + * Returns TRUE if the property applies to the given dataset types. + */ +int +zfs_prop_valid_for_type(zfs_prop_t prop, int types) +{ + return ((zfs_prop_table[prop].pd_types & types) != 0); +} + +/* + * Determine if the specified property is visible or not. + */ +boolean_t +zfs_prop_is_visible(zfs_prop_t prop) +{ + if (prop < 0) + return (B_FALSE); + + return (zfs_prop_table[prop].pd_visible); +} + +/* + * Iterate over all properties, calling back into the specified function + * for each property. We will continue to iterate until we either + * reach the end or the callback function something other than + * ZFS_PROP_CONT. + */ +zfs_prop_t +zfs_prop_iter_common(zfs_prop_f func, void *cb, zfs_type_t type, + boolean_t show_all) +{ + int i; + + for (i = 0; i < ZFS_PROP_COUNT; i++) { + if (zfs_prop_valid_for_type(i, type) && + (zfs_prop_is_visible(i) || show_all)) { + if (func(i, cb) != ZFS_PROP_CONT) + return (i); + } + } + return (ZFS_PROP_CONT); +} + +zfs_prop_t +zfs_prop_iter(zfs_prop_f func, void *cb, boolean_t show_all) +{ + return (zfs_prop_iter_common(func, cb, ZFS_TYPE_ANY, show_all)); +} + +zpool_prop_t +zpool_prop_iter(zpool_prop_f func, void *cb, boolean_t show_all) +{ + return (zfs_prop_iter_common(func, cb, ZFS_TYPE_POOL, show_all)); +} + +zfs_proptype_t +zfs_prop_get_type(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_proptype); +} + +static boolean_t +propname_match(const char *p, zfs_prop_t prop, size_t len) +{ + const char *propname = zfs_prop_table[prop].pd_name; +#ifndef _KERNEL + const char *colname = zfs_prop_table[prop].pd_colname; + int c; +#endif + +#ifndef _KERNEL + if (colname == NULL) + return (B_FALSE); +#endif + + if (len == strlen(propname) && + strncmp(p, propname, len) == 0) + return (B_TRUE); + +#ifndef _KERNEL + if (len != strlen(colname)) + return (B_FALSE); + + for (c = 0; c < len; c++) + if (p[c] != tolower(colname[c])) + break; + + return (colname[c] == '\0'); +#else + return (B_FALSE); +#endif +} + +zfs_prop_t +zfs_name_to_prop_cb(zfs_prop_t prop, void *cb_data) +{ + const char *propname = cb_data; + + if (propname_match(propname, prop, strlen(propname))) + return (prop); + + return (ZFS_PROP_CONT); +} + +/* + * Given a property name and its type, returns the corresponding property ID. + */ +zfs_prop_t +zfs_name_to_prop_common(const char *propname, zfs_type_t type) +{ + zfs_prop_t prop; + + prop = zfs_prop_iter_common(zfs_name_to_prop_cb, (void *)propname, + type, B_TRUE); + return (prop == ZFS_PROP_CONT ? ZFS_PROP_INVAL : prop); +} + +/* + * Given a zfs dataset property name, returns the corresponding property ID. + */ +zfs_prop_t +zfs_name_to_prop(const char *propname) +{ + return (zfs_name_to_prop_common(propname, ZFS_TYPE_ANY)); +} + +/* + * Given a pool property name, returns the corresponding property ID. + */ +zpool_prop_t +zpool_name_to_prop(const char *propname) +{ + return (zfs_name_to_prop_common(propname, ZFS_TYPE_POOL)); +} + +/* + * For user property names, we allow all lowercase alphanumeric characters, plus + * a few useful punctuation characters. + */ +static int +valid_char(char c) +{ + return ((c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || + c == '-' || c == '_' || c == '.' || c == ':'); +} + +/* + * Returns true if this is a valid user-defined property (one with a ':'). + */ +boolean_t +zfs_prop_user(const char *name) +{ + int i; + char c; + boolean_t foundsep = B_FALSE; + + for (i = 0; i < strlen(name); i++) { + c = name[i]; + if (!valid_char(c)) + return (B_FALSE); + if (c == ':') + foundsep = B_TRUE; + } + + if (!foundsep) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Return the default value for the given property. + */ +const char * +zfs_prop_default_string(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_strdefault); +} + +uint64_t +zfs_prop_default_numeric(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_numdefault); +} + +/* + * Returns TRUE if the property is readonly. + */ +int +zfs_prop_readonly(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_attr == prop_readonly); +} + +/* + * Given a dataset property ID, returns the corresponding name. + * Assuming the zfs dataset propety ID is valid. + */ +const char * +zfs_prop_to_name(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_name); +} + +/* + * Given a pool property ID, returns the corresponding name. + * Assuming the pool propety ID is valid. + */ +const char * +zpool_prop_to_name(zpool_prop_t prop) +{ + return (zfs_prop_table[prop].pd_name); +} + +/* + * Returns TRUE if the property is inheritable. + */ +int +zfs_prop_inheritable(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_attr == prop_inherit); +} + +typedef struct zfs_index { + const char *name; + uint64_t index; +} zfs_index_t; + +static zfs_index_t checksum_table[] = { + { "on", ZIO_CHECKSUM_ON }, + { "off", ZIO_CHECKSUM_OFF }, + { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 }, + { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 }, + { "sha256", ZIO_CHECKSUM_SHA256 }, + { NULL } +}; + +static zfs_index_t compress_table[] = { + { "on", ZIO_COMPRESS_ON }, + { "off", ZIO_COMPRESS_OFF }, + { "lzjb", ZIO_COMPRESS_LZJB }, + { "gzip", ZIO_COMPRESS_GZIP_6 }, /* the default gzip level */ + { "gzip-1", ZIO_COMPRESS_GZIP_1 }, + { "gzip-2", ZIO_COMPRESS_GZIP_2 }, + { "gzip-3", ZIO_COMPRESS_GZIP_3 }, + { "gzip-4", ZIO_COMPRESS_GZIP_4 }, + { "gzip-5", ZIO_COMPRESS_GZIP_5 }, + { "gzip-6", ZIO_COMPRESS_GZIP_6 }, + { "gzip-7", ZIO_COMPRESS_GZIP_7 }, + { "gzip-8", ZIO_COMPRESS_GZIP_8 }, + { "gzip-9", ZIO_COMPRESS_GZIP_9 }, + { NULL } +}; + +static zfs_index_t snapdir_table[] = { + { "hidden", ZFS_SNAPDIR_HIDDEN }, + { "visible", ZFS_SNAPDIR_VISIBLE }, + { NULL } +}; + +static zfs_index_t acl_mode_table[] = { + { "discard", ZFS_ACL_DISCARD }, + { "groupmask", ZFS_ACL_GROUPMASK }, + { "passthrough", ZFS_ACL_PASSTHROUGH }, + { NULL } +}; + +static zfs_index_t acl_inherit_table[] = { + { "discard", ZFS_ACL_DISCARD }, + { "noallow", ZFS_ACL_NOALLOW }, + { "secure", ZFS_ACL_SECURE }, + { "passthrough", ZFS_ACL_PASSTHROUGH }, + { NULL } +}; + +static zfs_index_t copies_table[] = { + { "1", 1 }, + { "2", 2 }, + { "3", 3 }, + { NULL } +}; + +static zfs_index_t * +zfs_prop_index_table(zfs_prop_t prop) +{ + switch (prop) { + case ZFS_PROP_CHECKSUM: + return (checksum_table); + case ZFS_PROP_COMPRESSION: + return (compress_table); + case ZFS_PROP_SNAPDIR: + return (snapdir_table); + case ZFS_PROP_ACLMODE: + return (acl_mode_table); + case ZFS_PROP_ACLINHERIT: + return (acl_inherit_table); + case ZFS_PROP_COPIES: + return (copies_table); + default: + return (NULL); + } +} + + +/* + * Tables of index types, plus functions to convert between the user view + * (strings) and internal representation (uint64_t). + */ +int +zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index) +{ + zfs_index_t *table; + int i; + + if ((table = zfs_prop_index_table(prop)) == NULL) + return (-1); + + for (i = 0; table[i].name != NULL; i++) { + if (strcmp(string, table[i].name) == 0) { + *index = table[i].index; + return (0); + } + } + + return (-1); +} + +int +zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string) +{ + zfs_index_t *table; + int i; + + if ((table = zfs_prop_index_table(prop)) == NULL) + return (-1); + + for (i = 0; table[i].name != NULL; i++) { + if (table[i].index == index) { + *string = table[i].name; + return (0); + } + } + + return (-1); +} + +#ifndef _KERNEL + +/* + * Returns a string describing the set of acceptable values for the given + * zfs property, or NULL if it cannot be set. + */ +const char * +zfs_prop_values(zfs_prop_t prop) +{ + if (zfs_prop_table[prop].pd_types == ZFS_TYPE_POOL) + return (NULL); + + return (zfs_prop_table[prop].pd_values); +} + +/* + * Returns a string describing the set of acceptable values for the given + * zpool property, or NULL if it cannot be set. + */ +const char * +zpool_prop_values(zfs_prop_t prop) +{ + if (zfs_prop_table[prop].pd_types != ZFS_TYPE_POOL) + return (NULL); + + return (zfs_prop_table[prop].pd_values); +} + +/* + * Returns TRUE if this property is a string type. Note that index types + * (compression, checksum) are treated as strings in userland, even though they + * are stored numerically on disk. + */ +int +zfs_prop_is_string(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_proptype == prop_type_string || + zfs_prop_table[prop].pd_proptype == prop_type_index); +} + +/* + * Returns the column header for the given property. Used only in + * 'zfs list -o', but centralized here with the other property information. + */ +const char * +zfs_prop_column_name(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_colname); +} + +/* + * Returns whether the given property should be displayed right-justified for + * 'zfs list'. + */ +boolean_t +zfs_prop_align_right(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_rightalign); +} + +/* + * Determines the minimum width for the column, and indicates whether it's fixed + * or not. Only string columns are non-fixed. + */ +size_t +zfs_prop_width(zfs_prop_t prop, boolean_t *fixed) +{ + prop_desc_t *pd = &zfs_prop_table[prop]; + zfs_index_t *idx; + size_t ret; + int i; + + *fixed = B_TRUE; + + /* + * Start with the width of the column name. + */ + ret = strlen(pd->pd_colname); + + /* + * For fixed-width values, make sure the width is large enough to hold + * any possible value. + */ + switch (pd->pd_proptype) { + case prop_type_number: + /* + * The maximum length of a human-readable number is 5 characters + * ("20.4M", for example). + */ + if (ret < 5) + ret = 5; + /* + * 'creation' is handled specially because it's a number + * internally, but displayed as a date string. + */ + if (prop == ZFS_PROP_CREATION) + *fixed = B_FALSE; + break; + case prop_type_boolean: + /* + * The maximum length of a boolean value is 3 characters, for + * "off". + */ + if (ret < 3) + ret = 3; + break; + case prop_type_index: + idx = zfs_prop_index_table(prop); + for (i = 0; idx[i].name != NULL; i++) { + if (strlen(idx[i].name) > ret) + ret = strlen(idx[i].name); + } + break; + + case prop_type_string: + *fixed = B_FALSE; + break; + } + + return (ret); +} + +#endif diff --git a/sys/contrib/opensolaris/common/zfs/zfs_prop.h b/sys/contrib/opensolaris/common/zfs/zfs_prop.h new file mode 100644 index 0000000..133e740 --- /dev/null +++ b/sys/contrib/opensolaris/common/zfs/zfs_prop.h @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZFS_PROP_H +#define _ZFS_PROP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/fs/zfs.h> +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * For index types (e.g. compression and checksum), we want the numeric value + * in the kernel, but the string value in userland. + */ +typedef enum { + prop_type_number, /* numeric value */ + prop_type_string, /* string value */ + prop_type_boolean, /* boolean value */ + prop_type_index /* numeric value indexed by string */ +} zfs_proptype_t; + +zfs_proptype_t zfs_prop_get_type(zfs_prop_t); +size_t zfs_prop_width(zfs_prop_t, boolean_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_PROP_H */ diff --git a/sys/contrib/opensolaris/uts/common/Makefile.files b/sys/contrib/opensolaris/uts/common/Makefile.files new file mode 100644 index 0000000..1800e79 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/Makefile.files @@ -0,0 +1,101 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# +# This Makefile defines all file modules for the directory uts/common +# and its children. These are the source files which may be considered +# common to all SunOS systems. + +ZFS_COMMON_OBJS += \ + arc.o \ + bplist.o \ + dbuf.o \ + dmu.o \ + dmu_send.o \ + dmu_object.o \ + dmu_objset.o \ + dmu_traverse.o \ + dmu_tx.o \ + dnode.o \ + dnode_sync.o \ + dsl_dir.o \ + dsl_dataset.o \ + dsl_pool.o \ + dsl_synctask.o \ + dmu_zfetch.o \ + dsl_prop.o \ + fletcher.o \ + gzip.o \ + lzjb.o \ + metaslab.o \ + refcount.o \ + sha256.o \ + spa.o \ + spa_config.o \ + spa_errlog.o \ + spa_history.o \ + spa_misc.o \ + space_map.o \ + txg.o \ + uberblock.o \ + unique.o \ + vdev.o \ + vdev_cache.o \ + vdev_label.o \ + vdev_mirror.o \ + vdev_missing.o \ + vdev_queue.o \ + vdev_raidz.o \ + vdev_root.o \ + zap.o \ + zap_leaf.o \ + zap_micro.o \ + zfs_byteswap.o \ + zfs_fm.o \ + zfs_znode.o \ + zil.o \ + zio.o \ + zio_checksum.o \ + zio_compress.o \ + zio_inject.o + +ZFS_SHARED_OBJS += \ + zfs_namecheck.o \ + zfs_prop.o + +ZFS_OBJS += \ + $(ZFS_COMMON_OBJS) \ + $(ZFS_SHARED_OBJS) \ + zfs_acl.o \ + zfs_ctldir.o \ + zfs_dir.o \ + zfs_ioctl.o \ + zfs_log.o \ + zfs_replay.o \ + zfs_rlock.o \ + zfs_vfsops.o \ + zfs_vnops.o \ + zvol.o diff --git a/sys/contrib/opensolaris/uts/common/arch/amd64/atomic.S b/sys/contrib/opensolaris/uts/common/arch/amd64/atomic.S new file mode 100644 index 0000000..1fa3c05 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/arch/amd64/atomic.S @@ -0,0 +1,564 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "%Z%%M% %I% %E% SMI" + + .file "%M%" + +#define _ASM +#include <sys/asm_linkage.h> + +#if defined(_KERNEL) + /* + * Legacy kernel interfaces; they will go away (eventually). + */ + ANSI_PRAGMA_WEAK2(cas8,atomic_cas_8,function) + ANSI_PRAGMA_WEAK2(cas32,atomic_cas_32,function) + ANSI_PRAGMA_WEAK2(cas64,atomic_cas_64,function) + ANSI_PRAGMA_WEAK2(caslong,atomic_cas_ulong,function) + ANSI_PRAGMA_WEAK2(casptr,atomic_cas_ptr,function) + ANSI_PRAGMA_WEAK2(atomic_and_long,atomic_and_ulong,function) + ANSI_PRAGMA_WEAK2(atomic_or_long,atomic_or_ulong,function) +#endif + + ENTRY(atomic_inc_8) + ALTENTRY(atomic_inc_uchar) + lock + incb (%rdi) + ret + SET_SIZE(atomic_inc_uchar) + SET_SIZE(atomic_inc_8) + + ENTRY(atomic_inc_16) + ALTENTRY(atomic_inc_ushort) + lock + incw (%rdi) + ret + SET_SIZE(atomic_inc_ushort) + SET_SIZE(atomic_inc_16) + + ENTRY(atomic_inc_32) + ALTENTRY(atomic_inc_uint) + lock + incl (%rdi) + ret + SET_SIZE(atomic_inc_uint) + SET_SIZE(atomic_inc_32) + + ENTRY(atomic_inc_64) + ALTENTRY(atomic_inc_ulong) + lock + incq (%rdi) + ret + SET_SIZE(atomic_inc_ulong) + SET_SIZE(atomic_inc_64) + + ENTRY(atomic_inc_8_nv) + ALTENTRY(atomic_inc_uchar_nv) + movb (%rdi), %al +1: + leaq 1(%rax), %rcx + lock + cmpxchgb %cl, (%rdi) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_inc_uchar_nv) + SET_SIZE(atomic_inc_8_nv) + + ENTRY(atomic_inc_16_nv) + ALTENTRY(atomic_inc_ushort_nv) + movw (%rdi), %ax +1: + leaq 1(%rax), %rcx + lock + cmpxchgw %cx, (%rdi) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_inc_ushort_nv) + SET_SIZE(atomic_inc_16_nv) + + ENTRY(atomic_inc_32_nv) + ALTENTRY(atomic_inc_uint_nv) + movl (%rdi), %eax +1: + leaq 1(%rax), %rcx + lock + cmpxchgl %ecx, (%rdi) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_inc_uint_nv) + SET_SIZE(atomic_inc_32_nv) + + ENTRY(atomic_inc_64_nv) + ALTENTRY(atomic_inc_ulong_nv) + movq (%rdi), %rax +1: + leaq 1(%rax), %rcx + lock + cmpxchgq %rcx, (%rdi) + jne 1b + movq %rcx, %rax + ret + SET_SIZE(atomic_inc_ulong_nv) + SET_SIZE(atomic_inc_64_nv) + + ENTRY(atomic_dec_8) + ALTENTRY(atomic_dec_uchar) + lock + decb (%rdi) + ret + SET_SIZE(atomic_dec_uchar) + SET_SIZE(atomic_dec_8) + + ENTRY(atomic_dec_16) + ALTENTRY(atomic_dec_ushort) + lock + decw (%rdi) + ret + SET_SIZE(atomic_dec_ushort) + SET_SIZE(atomic_dec_16) + + ENTRY(atomic_dec_32) + ALTENTRY(atomic_dec_uint) + lock + decl (%rdi) + ret + SET_SIZE(atomic_dec_uint) + SET_SIZE(atomic_dec_32) + + ENTRY(atomic_dec_64) + ALTENTRY(atomic_dec_ulong) + lock + decq (%rdi) + ret + SET_SIZE(atomic_dec_ulong) + SET_SIZE(atomic_dec_64) + + ENTRY(atomic_dec_8_nv) + ALTENTRY(atomic_dec_uchar_nv) + movb (%rdi), %al +1: + leaq -1(%rax), %rcx + lock + cmpxchgb %cl, (%rdi) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_dec_uchar_nv) + SET_SIZE(atomic_dec_8_nv) + + ENTRY(atomic_dec_16_nv) + ALTENTRY(atomic_dec_ushort_nv) + movw (%rdi), %ax +1: + leaq -1(%rax), %rcx + lock + cmpxchgw %cx, (%rdi) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_dec_ushort_nv) + SET_SIZE(atomic_dec_16_nv) + + ENTRY(atomic_dec_32_nv) + ALTENTRY(atomic_dec_uint_nv) + movl (%rdi), %eax +1: + leaq -1(%rax), %rcx + lock + cmpxchgl %ecx, (%rdi) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_dec_uint_nv) + SET_SIZE(atomic_dec_32_nv) + + ENTRY(atomic_dec_64_nv) + ALTENTRY(atomic_dec_ulong_nv) + movq (%rdi), %rax +1: + leaq -1(%rax), %rcx + lock + cmpxchgq %rcx, (%rdi) + jne 1b + movq %rcx, %rax + ret + SET_SIZE(atomic_dec_ulong_nv) + SET_SIZE(atomic_dec_64_nv) + + ENTRY(atomic_or_8) + ALTENTRY(atomic_or_uchar) + lock + orb %sil, (%rdi) + ret + SET_SIZE(atomic_or_uchar) + SET_SIZE(atomic_or_8) + + ENTRY(atomic_or_16) + ALTENTRY(atomic_or_ushort) + lock + orw %si, (%rdi) + ret + SET_SIZE(atomic_or_ushort) + SET_SIZE(atomic_or_16) + + ENTRY(atomic_or_32) + ALTENTRY(atomic_or_uint) + lock + orl %esi, (%rdi) + ret + SET_SIZE(atomic_or_uint) + SET_SIZE(atomic_or_32) + + ENTRY(atomic_or_64) + ALTENTRY(atomic_or_ulong) + lock + orq %rsi, (%rdi) + ret + SET_SIZE(atomic_or_ulong) + SET_SIZE(atomic_or_64) + + ENTRY(atomic_and_8) + ALTENTRY(atomic_and_uchar) + lock + andb %sil, (%rdi) + ret + SET_SIZE(atomic_and_uchar) + SET_SIZE(atomic_and_8) + + ENTRY(atomic_and_16) + ALTENTRY(atomic_and_ushort) + lock + andw %si, (%rdi) + ret + SET_SIZE(atomic_and_ushort) + SET_SIZE(atomic_and_16) + + ENTRY(atomic_and_32) + ALTENTRY(atomic_and_uint) + lock + andl %esi, (%rdi) + ret + SET_SIZE(atomic_and_uint) + SET_SIZE(atomic_and_32) + + ENTRY(atomic_and_64) + ALTENTRY(atomic_and_ulong) + lock + andq %rsi, (%rdi) + ret + SET_SIZE(atomic_and_ulong) + SET_SIZE(atomic_and_64) + + ENTRY(atomic_add_8_nv) + ALTENTRY(atomic_add_char_nv) + movb (%rdi), %al +1: + movb %sil, %cl + addb %al, %cl + lock + cmpxchgb %cl, (%rdi) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_add_char_nv) + SET_SIZE(atomic_add_8_nv) + + ENTRY(atomic_add_16_nv) + ALTENTRY(atomic_add_short_nv) + movw (%rdi), %ax +1: + movw %si, %cx + addw %ax, %cx + lock + cmpxchgw %cx, (%rdi) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_add_short_nv) + SET_SIZE(atomic_add_16_nv) + + ENTRY(atomic_add_32_nv) + ALTENTRY(atomic_add_int_nv) + movl (%rdi), %eax +1: + movl %esi, %ecx + addl %eax, %ecx + lock + cmpxchgl %ecx, (%rdi) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_add_int_nv) + SET_SIZE(atomic_add_32_nv) + + ENTRY(atomic_add_64_nv) + ALTENTRY(atomic_add_ptr_nv) + ALTENTRY(atomic_add_long_nv) + movq (%rdi), %rax +1: + movq %rsi, %rcx + addq %rax, %rcx + lock + cmpxchgq %rcx, (%rdi) + jne 1b + movq %rcx, %rax + ret + SET_SIZE(atomic_add_long_nv) + SET_SIZE(atomic_add_ptr_nv) + SET_SIZE(atomic_add_64_nv) + + ENTRY(atomic_and_8_nv) + ALTENTRY(atomic_and_uchar_nv) + movb (%rdi), %al +1: + movb %sil, %cl + andb %al, %cl + lock + cmpxchgb %cl, (%rdi) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_and_uchar_nv) + SET_SIZE(atomic_and_8_nv) + + ENTRY(atomic_and_16_nv) + ALTENTRY(atomic_and_ushort_nv) + movw (%rdi), %ax +1: + movw %si, %cx + andw %ax, %cx + lock + cmpxchgw %cx, (%rdi) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_and_ushort_nv) + SET_SIZE(atomic_and_16_nv) + + ENTRY(atomic_and_32_nv) + ALTENTRY(atomic_and_uint_nv) + movl (%rdi), %eax +1: + movl %esi, %ecx + andl %eax, %ecx + lock + cmpxchgl %ecx, (%rdi) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_and_uint_nv) + SET_SIZE(atomic_and_32_nv) + + ENTRY(atomic_and_64_nv) + ALTENTRY(atomic_and_ulong_nv) + movq (%rdi), %rax +1: + movq %rsi, %rcx + andq %rax, %rcx + lock + cmpxchgq %rcx, (%rdi) + jne 1b + movq %rcx, %rax + ret + SET_SIZE(atomic_and_ulong_nv) + SET_SIZE(atomic_and_64_nv) + + ENTRY(atomic_or_8_nv) + ALTENTRY(atomic_or_uchar_nv) + movb (%rdi), %al +1: + movb %sil, %cl + orb %al, %cl + lock + cmpxchgb %cl, (%rdi) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_and_uchar_nv) + SET_SIZE(atomic_and_8_nv) + + ENTRY(atomic_or_16_nv) + ALTENTRY(atomic_or_ushort_nv) + movw (%rdi), %ax +1: + movw %si, %cx + orw %ax, %cx + lock + cmpxchgw %cx, (%rdi) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_or_ushort_nv) + SET_SIZE(atomic_or_16_nv) + + ENTRY(atomic_or_32_nv) + ALTENTRY(atomic_or_uint_nv) + movl (%rdi), %eax +1: + movl %esi, %ecx + orl %eax, %ecx + lock + cmpxchgl %ecx, (%rdi) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_or_uint_nv) + SET_SIZE(atomic_or_32_nv) + + ENTRY(atomic_or_64_nv) + ALTENTRY(atomic_or_ulong_nv) + movq (%rdi), %rax +1: + movq %rsi, %rcx + orq %rax, %rcx + lock + cmpxchgq %rcx, (%rdi) + jne 1b + movq %rcx, %rax + ret + SET_SIZE(atomic_or_ulong_nv) + SET_SIZE(atomic_or_64_nv) + + ENTRY(atomic_cas_8) + ALTENTRY(atomic_cas_uchar) + movzbl %sil, %eax + lock + cmpxchgb %dl, (%rdi) + ret + SET_SIZE(atomic_cas_uchar) + SET_SIZE(atomic_cas_8) + + ENTRY(atomic_cas_16) + ALTENTRY(atomic_cas_ushort) + movzwl %si, %eax + lock + cmpxchgw %dx, (%rdi) + ret + SET_SIZE(atomic_cas_ushort) + SET_SIZE(atomic_cas_16) + + ENTRY(atomic_cas_32) + ALTENTRY(atomic_cas_uint) + movl %esi, %eax + lock + cmpxchgl %edx, (%rdi) + ret + SET_SIZE(atomic_cas_uint) + SET_SIZE(atomic_cas_32) + + ENTRY(atomic_cas_64) + ALTENTRY(atomic_cas_ulong) + ALTENTRY(atomic_cas_ptr) + movq %rsi, %rax + lock + cmpxchgq %rdx, (%rdi) + ret + SET_SIZE(atomic_cas_ptr) + SET_SIZE(atomic_cas_ulong) + SET_SIZE(atomic_cas_64) + + ENTRY(atomic_swap_8) + ALTENTRY(atomic_swap_uchar) + movzbl %sil, %eax + lock + xchgb %al, (%rdi) + ret + SET_SIZE(atomic_swap_uchar) + SET_SIZE(atomic_swap_8) + + ENTRY(atomic_swap_16) + ALTENTRY(atomic_swap_ushort) + movzwl %si, %eax + lock + xchgw %ax, (%rdi) + ret + SET_SIZE(atomic_swap_ushort) + SET_SIZE(atomic_swap_16) + + ENTRY(atomic_swap_32) + ALTENTRY(atomic_swap_uint) + movl %esi, %eax + lock + xchgl %eax, (%rdi) + ret + SET_SIZE(atomic_swap_uint) + SET_SIZE(atomic_swap_32) + + ENTRY(atomic_swap_64) + ALTENTRY(atomic_swap_ulong) + ALTENTRY(atomic_swap_ptr) + movq %rsi, %rax + lock + xchgq %rax, (%rdi) + ret + SET_SIZE(atomic_swap_ptr) + SET_SIZE(atomic_swap_ulong) + SET_SIZE(atomic_swap_64) + + ENTRY(atomic_set_long_excl) + xorl %eax, %eax + lock + btsq %rsi, (%rdi) + jnc 1f + decl %eax +1: + ret + SET_SIZE(atomic_set_long_excl) + + ENTRY(atomic_clear_long_excl) + xorl %eax, %eax + lock + btrq %rsi, (%rdi) + jc 1f + decl %eax +1: + ret + SET_SIZE(atomic_clear_long_excl) + + ENTRY(membar_enter) + ALTENTRY(membar_exit) + mfence + ret + SET_SIZE(membar_exit) + SET_SIZE(membar_enter) + + ENTRY(membar_producer) + sfence + ret + SET_SIZE(membar_producer) + + ENTRY(membar_consumer) + lfence + ret + SET_SIZE(membar_consumer) + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/sys/contrib/opensolaris/uts/common/arch/i386/atomic.S b/sys/contrib/opensolaris/uts/common/arch/i386/atomic.S new file mode 100644 index 0000000..4a1d375 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/arch/i386/atomic.S @@ -0,0 +1,659 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "%Z%%M% %I% %E% SMI" + + .file "%M%" + +#define _ASM +#include <sys/asm_linkage.h> + +#if defined(_KERNEL) + /* + * Legacy kernel interfaces; they will go away (eventually). + */ + ANSI_PRAGMA_WEAK2(cas8,atomic_cas_8,function) + ANSI_PRAGMA_WEAK2(cas32,atomic_cas_32,function) + ANSI_PRAGMA_WEAK2(cas64,atomic_cas_64,function) + ANSI_PRAGMA_WEAK2(caslong,atomic_cas_ulong,function) + ANSI_PRAGMA_WEAK2(casptr,atomic_cas_ptr,function) + ANSI_PRAGMA_WEAK2(atomic_and_long,atomic_and_ulong,function) + ANSI_PRAGMA_WEAK2(atomic_or_long,atomic_or_ulong,function) +#endif + + ENTRY(atomic_inc_8) + ALTENTRY(atomic_inc_uchar) + movl 4(%esp), %eax + lock + incb (%eax) + ret + SET_SIZE(atomic_inc_uchar) + SET_SIZE(atomic_inc_8) + + ENTRY(atomic_inc_16) + ALTENTRY(atomic_inc_ushort) + movl 4(%esp), %eax + lock + incw (%eax) + ret + SET_SIZE(atomic_inc_ushort) + SET_SIZE(atomic_inc_16) + + ENTRY(atomic_inc_32) + ALTENTRY(atomic_inc_uint) + ALTENTRY(atomic_inc_ulong) + movl 4(%esp), %eax + lock + incl (%eax) + ret + SET_SIZE(atomic_inc_ulong) + SET_SIZE(atomic_inc_uint) + SET_SIZE(atomic_inc_32) + + ENTRY(atomic_inc_8_nv) + ALTENTRY(atomic_inc_uchar_nv) + movl 4(%esp), %edx + movb (%edx), %al +1: + leal 1(%eax), %ecx + lock + cmpxchgb %cl, (%edx) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_inc_uchar_nv) + SET_SIZE(atomic_inc_8_nv) + + ENTRY(atomic_inc_16_nv) + ALTENTRY(atomic_inc_ushort_nv) + movl 4(%esp), %edx + movw (%edx), %ax +1: + leal 1(%eax), %ecx + lock + cmpxchgw %cx, (%edx) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_inc_ushort_nv) + SET_SIZE(atomic_inc_16_nv) + + ENTRY(atomic_inc_32_nv) + ALTENTRY(atomic_inc_uint_nv) + ALTENTRY(atomic_inc_ulong_nv) + movl 4(%esp), %edx + movl (%edx), %eax +1: + leal 1(%eax), %ecx + lock + cmpxchgl %ecx, (%edx) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_inc_ulong_nv) + SET_SIZE(atomic_inc_uint_nv) + SET_SIZE(atomic_inc_32_nv) + + ENTRY(atomic_inc_64) + ALTENTRY(atomic_inc_64_nv) + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movl (%edi), %eax + movl 4(%edi), %edx +1: + xorl %ebx, %ebx + xorl %ecx, %ecx + incl %ebx + addl %eax, %ebx + adcl %edx, %ecx + lock + cmpxchg8b (%edi) + jne 1b + movl %ebx, %eax + movl %ecx, %edx + popl %ebx + popl %edi + ret + SET_SIZE(atomic_inc_64_nv) + SET_SIZE(atomic_inc_64) + + ENTRY(atomic_dec_8) + ALTENTRY(atomic_dec_uchar) + movl 4(%esp), %eax + lock + decb (%eax) + ret + SET_SIZE(atomic_dec_uchar) + SET_SIZE(atomic_dec_8) + + ENTRY(atomic_dec_16) + ALTENTRY(atomic_dec_ushort) + movl 4(%esp), %eax + lock + decw (%eax) + ret + SET_SIZE(atomic_dec_ushort) + SET_SIZE(atomic_dec_16) + + ENTRY(atomic_dec_32) + ALTENTRY(atomic_dec_uint) + ALTENTRY(atomic_dec_ulong) + movl 4(%esp), %eax + lock + decl (%eax) + ret + SET_SIZE(atomic_dec_ulong) + SET_SIZE(atomic_dec_uint) + SET_SIZE(atomic_dec_32) + + ENTRY(atomic_dec_8_nv) + ALTENTRY(atomic_dec_uchar_nv) + movl 4(%esp), %edx + movb (%edx), %al +1: + leal -1(%eax), %ecx + lock + cmpxchgb %cl, (%edx) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_dec_uchar_nv) + SET_SIZE(atomic_dec_8_nv) + + ENTRY(atomic_dec_16_nv) + ALTENTRY(atomic_dec_ushort_nv) + movl 4(%esp), %edx + movw (%edx), %ax +1: + leal -1(%eax), %ecx + lock + cmpxchgw %cx, (%edx) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_dec_ushort_nv) + SET_SIZE(atomic_dec_16_nv) + + ENTRY(atomic_dec_32_nv) + ALTENTRY(atomic_dec_uint_nv) + ALTENTRY(atomic_dec_ulong_nv) + movl 4(%esp), %edx + movl (%edx), %eax +1: + leal -1(%eax), %ecx + lock + cmpxchgl %ecx, (%edx) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_dec_ulong_nv) + SET_SIZE(atomic_dec_uint_nv) + SET_SIZE(atomic_dec_32_nv) + + ENTRY(atomic_dec_64) + ALTENTRY(atomic_dec_64_nv) + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movl (%edi), %eax + movl 4(%edi), %edx +1: + xorl %ebx, %ebx + xorl %ecx, %ecx + not %ecx + not %ebx + addl %eax, %ebx + adcl %edx, %ecx + lock + cmpxchg8b (%edi) + jne 1b + movl %ebx, %eax + movl %ecx, %edx + popl %ebx + popl %edi + ret + SET_SIZE(atomic_dec_64_nv) + SET_SIZE(atomic_dec_64) + + ENTRY(atomic_or_8) + ALTENTRY(atomic_or_uchar) + movl 4(%esp), %eax + movb 8(%esp), %cl + lock + orb %cl, (%eax) + ret + SET_SIZE(atomic_or_uchar) + SET_SIZE(atomic_or_8) + + ENTRY(atomic_or_16) + ALTENTRY(atomic_or_ushort) + movl 4(%esp), %eax + movw 8(%esp), %cx + lock + orw %cx, (%eax) + ret + SET_SIZE(atomic_or_ushort) + SET_SIZE(atomic_or_16) + + ENTRY(atomic_or_32) + ALTENTRY(atomic_or_uint) + ALTENTRY(atomic_or_ulong) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + orl %ecx, (%eax) + ret + SET_SIZE(atomic_or_ulong) + SET_SIZE(atomic_or_uint) + SET_SIZE(atomic_or_32) + + ENTRY(atomic_and_8) + ALTENTRY(atomic_and_uchar) + movl 4(%esp), %eax + movb 8(%esp), %cl + lock + andb %cl, (%eax) + ret + SET_SIZE(atomic_and_uchar) + SET_SIZE(atomic_and_8) + + ENTRY(atomic_and_16) + ALTENTRY(atomic_and_ushort) + movl 4(%esp), %eax + movw 8(%esp), %cx + lock + andw %cx, (%eax) + ret + SET_SIZE(atomic_and_ushort) + SET_SIZE(atomic_and_16) + + ENTRY(atomic_and_32) + ALTENTRY(atomic_and_uint) + ALTENTRY(atomic_and_ulong) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + andl %ecx, (%eax) + ret + SET_SIZE(atomic_and_ulong) + SET_SIZE(atomic_and_uint) + SET_SIZE(atomic_and_32) + + ENTRY(atomic_add_8_nv) + ALTENTRY(atomic_add_char_nv) + movl 4(%esp), %edx + movb (%edx), %al +1: + movl 8(%esp), %ecx + addb %al, %cl + lock + cmpxchgb %cl, (%edx) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_add_char_nv) + SET_SIZE(atomic_add_8_nv) + + ENTRY(atomic_add_16_nv) + ALTENTRY(atomic_add_short_nv) + movl 4(%esp), %edx + movw (%edx), %ax +1: + movl 8(%esp), %ecx + addw %ax, %cx + lock + cmpxchgw %cx, (%edx) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_add_short_nv) + SET_SIZE(atomic_add_16_nv) + + ENTRY(atomic_add_32_nv) + ALTENTRY(atomic_add_int_nv) + ALTENTRY(atomic_add_ptr_nv) + ALTENTRY(atomic_add_long_nv) + movl 4(%esp), %edx + movl (%edx), %eax +1: + movl 8(%esp), %ecx + addl %eax, %ecx + lock + cmpxchgl %ecx, (%edx) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_add_long_nv) + SET_SIZE(atomic_add_ptr_nv) + SET_SIZE(atomic_add_int_nv) + SET_SIZE(atomic_add_32_nv) + + ENTRY(atomic_add_64) + ALTENTRY(atomic_add_64_nv) + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movl (%edi), %eax + movl 4(%edi), %edx +1: + movl 16(%esp), %ebx + movl 20(%esp), %ecx + addl %eax, %ebx + adcl %edx, %ecx + lock + cmpxchg8b (%edi) + jne 1b + movl %ebx, %eax + movl %ecx, %edx + popl %ebx + popl %edi + ret + SET_SIZE(atomic_add_64_nv) + SET_SIZE(atomic_add_64) + + ENTRY(atomic_or_8_nv) + ALTENTRY(atomic_or_uchar_nv) + movl 4(%esp), %edx + movb (%edx), %al +1: + movl 8(%esp), %ecx + orb %al, %cl + lock + cmpxchgb %cl, (%edx) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_or_uchar_nv) + SET_SIZE(atomic_or_8_nv) + + ENTRY(atomic_or_16_nv) + ALTENTRY(atomic_or_ushort_nv) + movl 4(%esp), %edx + movw (%edx), %ax +1: + movl 8(%esp), %ecx + orw %ax, %cx + lock + cmpxchgw %cx, (%edx) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_or_ushort_nv) + SET_SIZE(atomic_or_16_nv) + + ENTRY(atomic_or_32_nv) + ALTENTRY(atomic_or_uint_nv) + ALTENTRY(atomic_or_ulong_nv) + movl 4(%esp), %edx + movl (%edx), %eax +1: + movl 8(%esp), %ecx + orl %eax, %ecx + lock + cmpxchgl %ecx, (%edx) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_or_ulong_nv) + SET_SIZE(atomic_or_uint_nv) + SET_SIZE(atomic_or_32_nv) + + ENTRY(atomic_or_64) + ALTENTRY(atomic_or_64_nv) + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movl (%edi), %eax + movl 4(%edi), %edx +1: + movl 16(%esp), %ebx + movl 20(%esp), %ecx + orl %eax, %ebx + orl %edx, %ecx + lock + cmpxchg8b (%edi) + jne 1b + movl %ebx, %eax + movl %ecx, %edx + popl %ebx + popl %edi + ret + SET_SIZE(atomic_or_64_nv) + SET_SIZE(atomic_or_64) + + ENTRY(atomic_and_8_nv) + ALTENTRY(atomic_and_uchar_nv) + movl 4(%esp), %edx + movb (%edx), %al +1: + movl 8(%esp), %ecx + andb %al, %cl + lock + cmpxchgb %cl, (%edx) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_and_uchar_nv) + SET_SIZE(atomic_and_8_nv) + + ENTRY(atomic_and_16_nv) + ALTENTRY(atomic_and_ushort_nv) + movl 4(%esp), %edx + movw (%edx), %ax +1: + movl 8(%esp), %ecx + andw %ax, %cx + lock + cmpxchgw %cx, (%edx) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_and_ushort_nv) + SET_SIZE(atomic_and_16_nv) + + ENTRY(atomic_and_32_nv) + ALTENTRY(atomic_and_uint_nv) + ALTENTRY(atomic_and_ulong_nv) + movl 4(%esp), %edx + movl (%edx), %eax +1: + movl 8(%esp), %ecx + andl %eax, %ecx + lock + cmpxchgl %ecx, (%edx) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_and_ulong_nv) + SET_SIZE(atomic_and_uint_nv) + SET_SIZE(atomic_and_32_nv) + + ENTRY(atomic_and_64) + ALTENTRY(atomic_and_64_nv) + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movl (%edi), %eax + movl 4(%edi), %edx +1: + movl 16(%esp), %ebx + movl 20(%esp), %ecx + andl %eax, %ebx + andl %edx, %ecx + lock + cmpxchg8b (%edi) + jne 1b + movl %ebx, %eax + movl %ecx, %edx + popl %ebx + popl %edi + ret + SET_SIZE(atomic_and_64_nv) + SET_SIZE(atomic_and_64) + + ENTRY(atomic_cas_8) + ALTENTRY(atomic_cas_uchar) + movl 4(%esp), %edx + movzbl 8(%esp), %eax + movb 12(%esp), %cl + lock + cmpxchgb %cl, (%edx) + ret + SET_SIZE(atomic_cas_uchar) + SET_SIZE(atomic_cas_8) + + ENTRY(atomic_cas_16) + ALTENTRY(atomic_cas_ushort) + movl 4(%esp), %edx + movzwl 8(%esp), %eax + movw 12(%esp), %cx + lock + cmpxchgw %cx, (%edx) + ret + SET_SIZE(atomic_cas_ushort) + SET_SIZE(atomic_cas_16) + + ENTRY(atomic_cas_32) + ALTENTRY(atomic_cas_uint) + ALTENTRY(atomic_cas_ulong) + ALTENTRY(atomic_cas_ptr) + movl 4(%esp), %edx + movl 8(%esp), %eax + movl 12(%esp), %ecx + lock + cmpxchgl %ecx, (%edx) + ret + SET_SIZE(atomic_cas_ptr) + SET_SIZE(atomic_cas_ulong) + SET_SIZE(atomic_cas_uint) + SET_SIZE(atomic_cas_32) + + ENTRY(atomic_cas_64) + pushl %ebx + pushl %esi + movl 12(%esp), %esi + movl 16(%esp), %eax + movl 20(%esp), %edx + movl 24(%esp), %ebx + movl 28(%esp), %ecx + lock + cmpxchg8b (%esi) + popl %esi + popl %ebx + ret + SET_SIZE(atomic_cas_64) + + ENTRY(atomic_swap_8) + ALTENTRY(atomic_swap_uchar) + movl 4(%esp), %edx + movzbl 8(%esp), %eax + lock + xchgb %al, (%edx) + ret + SET_SIZE(atomic_swap_uchar) + SET_SIZE(atomic_swap_8) + + ENTRY(atomic_swap_16) + ALTENTRY(atomic_swap_ushort) + movl 4(%esp), %edx + movzwl 8(%esp), %eax + lock + xchgw %ax, (%edx) + ret + SET_SIZE(atomic_swap_ushort) + SET_SIZE(atomic_swap_16) + + ENTRY(atomic_swap_32) + ALTENTRY(atomic_swap_uint) + ALTENTRY(atomic_swap_ptr) + ALTENTRY(atomic_swap_ulong) + movl 4(%esp), %edx + movl 8(%esp), %eax + lock + xchgl %eax, (%edx) + ret + SET_SIZE(atomic_swap_ulong) + SET_SIZE(atomic_swap_ptr) + SET_SIZE(atomic_swap_uint) + SET_SIZE(atomic_swap_32) + + ENTRY(atomic_swap_64) + pushl %esi + pushl %ebx + movl 12(%esp), %esi + movl 16(%esp), %ebx + movl 20(%esp), %ecx + movl (%esi), %eax + movl 4(%esi), %edx +1: + lock + cmpxchg8b (%esi) + jne 1b + popl %ebx + popl %esi + ret + SET_SIZE(atomic_swap_64) + + ENTRY(atomic_set_long_excl) + movl 4(%esp), %edx + movl 8(%esp), %ecx + xorl %eax, %eax + lock + btsl %ecx, (%edx) + jnc 1f + decl %eax +1: + ret + SET_SIZE(atomic_set_long_excl) + + ENTRY(atomic_clear_long_excl) + movl 4(%esp), %edx + movl 8(%esp), %ecx + xorl %eax, %eax + lock + btrl %ecx, (%edx) + jc 1f + decl %eax +1: + ret + SET_SIZE(atomic_clear_long_excl) + + ENTRY(membar_enter) + ALTENTRY(membar_exit) + ALTENTRY(membar_producer) + ALTENTRY(membar_consumer) + lock + xorl $0, (%esp) + ret + SET_SIZE(membar_consumer) + SET_SIZE(membar_producer) + SET_SIZE(membar_exit) + SET_SIZE(membar_enter) + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/sys/contrib/opensolaris/uts/common/fs/dnlc.c b/sys/contrib/opensolaris/uts/common/fs/dnlc.c new file mode 100644 index 0000000..48daf36 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/dnlc.c @@ -0,0 +1,824 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/dnlc.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/sysmacros.h> +#include <sys/kstat.h> +#include <sys/atomic.h> +#include <sys/taskq.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> + +#define TRACE_0(...) do { } while (0) +#define TRACE_2(...) do { } while (0) +#define TRACE_4(...) do { } while (0) + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dnlc, CTLFLAG_RW, 0, "ZFS namecache"); + +/* + * Directory name lookup cache. + * Based on code originally done by Robert Elz at Melbourne. + * + * Names found by directory scans are retained in a cache + * for future reference. Each hash chain is ordered by LRU + * Cache is indexed by hash value obtained from (vp, name) + * where the vp refers to the directory containing the name. + */ + +/* + * Tunable nc_hashavelen is the average length desired for this chain, from + * which the size of the nc_hash table is derived at create time. + */ +#define NC_HASHAVELEN_DEFAULT 4 +int nc_hashavelen = NC_HASHAVELEN_DEFAULT; + +/* + * NC_MOVETOFRONT is the move-to-front threshold: if the hash lookup + * depth exceeds this value, we move the looked-up entry to the front of + * its hash chain. The idea is to make sure that the most frequently + * accessed entries are found most quickly (by keeping them near the + * front of their hash chains). + */ +#define NC_MOVETOFRONT 2 + +/* + * + * DNLC_MAX_RELE is used to size an array on the stack when releasing + * vnodes. This array is used rather than calling VN_RELE() inline because + * all dnlc locks must be dropped by that time in order to avoid a + * possible deadlock. This deadlock occurs when the dnlc holds the last + * reference to the vnode and so the VOP_INACTIVE vector is called which + * can in turn call back into the dnlc. A global array was used but had + * many problems: + * 1) Actually doesn't have an upper bound on the array size as + * entries can be added after starting the purge. + * 2) The locking scheme causes a hang. + * 3) Caused serialisation on the global lock. + * 4) The array was often unnecessarily huge. + * + * Note the current value 8 allows up to 4 cache entries (to be purged + * from each hash chain), before having to cycle around and retry. + * This ought to be ample given that nc_hashavelen is typically very small. + */ +#define DNLC_MAX_RELE 8 /* must be even */ + +/* + * Hash table of name cache entries for fast lookup, dynamically + * allocated at startup. + */ +nc_hash_t *nc_hash; + +/* + * Rotors. Used to select entries on a round-robin basis. + */ +static nc_hash_t *dnlc_purge_fs1_rotor; +static nc_hash_t *dnlc_free_rotor; + +/* + * # of dnlc entries (uninitialized) + * + * the initial value was chosen as being + * a random string of bits, probably not + * normally chosen by a systems administrator + */ +int ncsize = -1; +TUNABLE_INT("vfs.zfs.dnlc.ncsize", &ncsize); +SYSCTL_INT(_vfs_zfs_dnlc, OID_AUTO, ncsize, CTLFLAG_RDTUN, &ncsize, 0, + "Number of DNLC entries"); +volatile uint32_t dnlc_nentries = 0; /* current num of name cache entries */ +SYSCTL_UINT(_vfs_zfs_dnlc, OID_AUTO, nentries, CTLFLAG_RD, + __DEVOLATILE(u_int *, &dnlc_nentries), 0, "Number of DNLC entries in use"); +static int nc_hashsz; /* size of hash table */ +static int nc_hashmask; /* size of hash table minus 1 */ + +/* + * The dnlc_reduce_cache() taskq queue is activated when there are + * ncsize name cache entries and if no parameter is provided, it reduces + * the size down to dnlc_nentries_low_water, which is by default one + * hundreth less (or 99%) of ncsize. + * + * If a parameter is provided to dnlc_reduce_cache(), then we reduce + * the size down based on ncsize_onepercent - where ncsize_onepercent + * is 1% of ncsize; however, we never let dnlc_reduce_cache() reduce + * the size below 3% of ncsize (ncsize_min_percent). + */ +#define DNLC_LOW_WATER_DIVISOR_DEFAULT 100 +uint_t dnlc_low_water_divisor = DNLC_LOW_WATER_DIVISOR_DEFAULT; +uint_t dnlc_nentries_low_water; +int dnlc_reduce_idle = 1; /* no locking needed */ +uint_t ncsize_onepercent; +uint_t ncsize_min_percent; + +/* + * If dnlc_nentries hits dnlc_max_nentries (twice ncsize) + * then this means the dnlc_reduce_cache() taskq is failing to + * keep up. In this case we refuse to add new entries to the dnlc + * until the taskq catches up. + */ +uint_t dnlc_max_nentries; /* twice ncsize */ +SYSCTL_UINT(_vfs_zfs_dnlc, OID_AUTO, max_nentries, CTLFLAG_RD, + &dnlc_max_nentries, 0, "Maximum number of DNLC entries"); +uint64_t dnlc_max_nentries_cnt = 0; /* statistic on times we failed */ + +/* + * Tunable to define when we should just remove items from + * the end of the chain. + */ +#define DNLC_LONG_CHAIN 8 +uint_t dnlc_long_chain = DNLC_LONG_CHAIN; + +/* + * ncstats has been deprecated, due to the integer size of the counters + * which can easily overflow in the dnlc. + * It is maintained (at some expense) for compatability. + * The preferred interface is the kstat accessible nc_stats below. + */ +struct ncstats ncstats; + +struct nc_stats ncs = { + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "negative_cache_hits", KSTAT_DATA_UINT64 }, + { "enters", KSTAT_DATA_UINT64 }, + { "double_enters", KSTAT_DATA_UINT64 }, + { "purge_total_entries", KSTAT_DATA_UINT64 }, + { "purge_all", KSTAT_DATA_UINT64 }, + { "purge_vp", KSTAT_DATA_UINT64 }, + { "purge_vfs", KSTAT_DATA_UINT64 }, + { "purge_fs1", KSTAT_DATA_UINT64 }, + { "pick_free", KSTAT_DATA_UINT64 }, + { "pick_heuristic", KSTAT_DATA_UINT64 }, + { "pick_last", KSTAT_DATA_UINT64 }, +}; + +static int doingcache = 1; +TUNABLE_INT("vfs.zfs.dnlc.enable", &doingcache); +SYSCTL_INT(_vfs_zfs_dnlc, OID_AUTO, enable, CTLFLAG_RDTUN, &doingcache, 0, + "Enable/disable name cache"); + +vnode_t negative_cache_vnode; + +/* + * Insert entry at the front of the queue + */ +#define nc_inshash(ncp, hp) \ +{ \ + (ncp)->hash_next = (hp)->hash_next; \ + (ncp)->hash_prev = (ncache_t *)(hp); \ + (hp)->hash_next->hash_prev = (ncp); \ + (hp)->hash_next = (ncp); \ +} + +/* + * Remove entry from hash queue + */ +#define nc_rmhash(ncp) \ +{ \ + (ncp)->hash_prev->hash_next = (ncp)->hash_next; \ + (ncp)->hash_next->hash_prev = (ncp)->hash_prev; \ + (ncp)->hash_prev = NULL; \ + (ncp)->hash_next = NULL; \ +} + +/* + * Free an entry. + */ +#define dnlc_free(ncp) \ +{ \ + kmem_free((ncp), sizeof (ncache_t) + (ncp)->namlen); \ + atomic_add_32(&dnlc_nentries, -1); \ +} + + +/* + * Cached directory info. + * ====================== + */ + +/* + * Cached directory free space hash function. + * Needs the free space handle and the dcp to get the hash table size + * Returns the hash index. + */ +#define DDFHASH(handle, dcp) ((handle >> 2) & (dcp)->dc_fhash_mask) + +/* + * Cached directory name entry hash function. + * Uses the name and returns in the input arguments the hash and the name + * length. + */ +#define DNLC_DIR_HASH(name, hash, namelen) \ + { \ + char Xc, *Xcp; \ + hash = *name; \ + for (Xcp = (name + 1); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + Xc; \ + ASSERT((Xcp - (name)) <= ((1 << NBBY) - 1)); \ + namelen = Xcp - (name); \ + } + +/* special dircache_t pointer to indicate error should be returned */ +/* + * The anchor directory cache pointer can contain 3 types of values, + * 1) NULL: No directory cache + * 2) DC_RET_LOW_MEM (-1): There was a directory cache that found to be + * too big or a memory shortage occurred. This value remains in the + * pointer until a dnlc_dir_start() which returns the a DNOMEM error. + * This is kludgy but efficient and only visible in this source file. + * 3) A valid cache pointer. + */ +#define DC_RET_LOW_MEM (dircache_t *)1 +#define VALID_DIR_CACHE(dcp) ((dircache_t *)(dcp) > DC_RET_LOW_MEM) + +/* Prototypes */ +static ncache_t *dnlc_get(uchar_t namlen); +static ncache_t *dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash); +static void do_dnlc_reduce_cache(void *); + +static kstat_t *dnlc_ksp = NULL; + +/* + * Initialize the directory cache. + */ +static void +dnlc_init(void *arg __unused) +{ + nc_hash_t *hp; + int i; + + /* + * Set up the size of the dnlc (ncsize) and its low water mark. + */ + if (ncsize == -1) { + /* calculate a reasonable size for the low water */ +#if 0 + dnlc_nentries_low_water = 4 * (v.v_proc + maxusers) + 320; +#else + dnlc_nentries_low_water = vm_kmem_size / 20480; +#endif + ncsize = dnlc_nentries_low_water + + (dnlc_nentries_low_water / dnlc_low_water_divisor); + } else { + /* don't change the user specified ncsize */ + dnlc_nentries_low_water = + ncsize - (ncsize / dnlc_low_water_divisor); + } + if (ncsize <= 0) { + doingcache = 0; + ncsize = 0; + cmn_err(CE_NOTE, "name cache (dnlc) disabled"); + return; + } + dnlc_max_nentries = ncsize * 2; + ncsize_onepercent = ncsize / 100; + ncsize_min_percent = ncsize_onepercent * 3; + + /* + * Initialise the hash table. + * Compute hash size rounding to the next power of two. + */ + nc_hashsz = ncsize / nc_hashavelen; + nc_hashsz = 1 << highbit(nc_hashsz); + nc_hashmask = nc_hashsz - 1; + nc_hash = kmem_zalloc(nc_hashsz * sizeof (*nc_hash), KM_SLEEP); + for (i = 0; i < nc_hashsz; i++) { + hp = (nc_hash_t *)&nc_hash[i]; + mutex_init(&hp->hash_lock, NULL, MUTEX_DEFAULT, NULL); + hp->hash_next = (ncache_t *)hp; + hp->hash_prev = (ncache_t *)hp; + } + + /* + * Initialize rotors + */ + dnlc_free_rotor = dnlc_purge_fs1_rotor = &nc_hash[0]; + + /* + * Initialise the reference count of the negative cache vnode to 1 + * so that it never goes away (VOP_INACTIVE isn't called on it). + */ + negative_cache_vnode.v_count = 1; + negative_cache_vnode.v_holdcnt = 1; + mtx_init(&negative_cache_vnode.v_interlock, "vnode interlock", NULL, + MTX_DEF); + + /* + * Initialise kstats - both the old compatability raw kind and + * the more extensive named stats. + */ + dnlc_ksp = kstat_create("zfs", 0, "dnlcstats", "misc", KSTAT_TYPE_NAMED, + sizeof (ncs) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (dnlc_ksp) { + dnlc_ksp->ks_data = (void *) &ncs; + kstat_install(dnlc_ksp); + } +} + +static void +dnlc_fini(void *arg __unused) +{ + nc_hash_t *hp; + int i; + + if (dnlc_ksp != NULL) { + kstat_delete(dnlc_ksp); + dnlc_ksp = NULL; + } + + mtx_destroy(&negative_cache_vnode.v_interlock); + + for (i = 0; i < nc_hashsz; i++) { + hp = (nc_hash_t *)&nc_hash[i]; + mutex_destroy(&hp->hash_lock); + } + kmem_free(nc_hash, nc_hashsz * sizeof (*nc_hash)); +} + +/* + * Add a name to the directory cache. + * + * This function is basically identical with + * dnlc_enter(). The difference is that when the + * desired dnlc entry is found, the vnode in the + * ncache is compared with the vnode passed in. + * + * If they are not equal then the ncache is + * updated with the passed in vnode. Otherwise + * it just frees up the newly allocated dnlc entry. + */ +void +dnlc_update(vnode_t *dp, char *name, vnode_t *vp) +{ + ncache_t *ncp; + ncache_t *tcp; + vnode_t *tvp; + nc_hash_t *hp; + int hash; + uchar_t namlen; + + TRACE_0(TR_FAC_NFS, TR_DNLC_ENTER_START, "dnlc_update_start:"); + + if (!doingcache) { + TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END, + "dnlc_update_end:(%S) %d", "not caching", 0); + return; + } + + /* + * Get a new dnlc entry and initialize it now. + * If we fail to get a new entry, call dnlc_remove() to purge + * any existing dnlc entry including negative cache (DNLC_NO_VNODE) + * entry. + * Failure to clear an existing entry could result in false dnlc + * lookup (negative/stale entry). + */ + DNLCHASH(name, dp, hash, namlen); + if ((ncp = dnlc_get(namlen)) == NULL) { + dnlc_remove(dp, name); + return; + } + ncp->dp = dp; + VN_HOLD(dp); + ncp->vp = vp; + VN_HOLD(vp); + bcopy(name, ncp->name, namlen + 1); /* name and null */ + ncp->hash = hash; + hp = &nc_hash[hash & nc_hashmask]; + + mutex_enter(&hp->hash_lock); + if ((tcp = dnlc_search(dp, name, namlen, hash)) != NULL) { + if (tcp->vp != vp) { + tvp = tcp->vp; + tcp->vp = vp; + mutex_exit(&hp->hash_lock); + VN_RELE(tvp); + ncstats.enters++; + ncs.ncs_enters.value.ui64++; + TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END, + "dnlc_update_end:(%S) %d", "done", ncstats.enters); + } else { + mutex_exit(&hp->hash_lock); + VN_RELE(vp); + ncstats.dbl_enters++; + ncs.ncs_dbl_enters.value.ui64++; + TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END, + "dnlc_update_end:(%S) %d", + "dbl enter", ncstats.dbl_enters); + } + VN_RELE(dp); + dnlc_free(ncp); /* crfree done here */ + return; + } + /* + * insert the new entry, since it is not in dnlc yet + */ + nc_inshash(ncp, hp); + mutex_exit(&hp->hash_lock); + ncstats.enters++; + ncs.ncs_enters.value.ui64++; + TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END, + "dnlc_update_end:(%S) %d", "done", ncstats.enters); +} + +/* + * Look up a name in the directory name cache. + * + * Return a doubly-held vnode if found: one hold so that it may + * remain in the cache for other users, the other hold so that + * the cache is not re-cycled and the identity of the vnode is + * lost before the caller can use the vnode. + */ +vnode_t * +dnlc_lookup(vnode_t *dp, char *name) +{ + ncache_t *ncp; + nc_hash_t *hp; + vnode_t *vp; + int hash, depth; + uchar_t namlen; + + TRACE_2(TR_FAC_NFS, TR_DNLC_LOOKUP_START, + "dnlc_lookup_start:dp %x name %s", dp, name); + + if (!doingcache) { + TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END, + "dnlc_lookup_end:%S %d vp %x name %s", + "not_caching", 0, NULL, name); + return (NULL); + } + + DNLCHASH(name, dp, hash, namlen); + depth = 1; + hp = &nc_hash[hash & nc_hashmask]; + mutex_enter(&hp->hash_lock); + + for (ncp = hp->hash_next; ncp != (ncache_t *)hp; + ncp = ncp->hash_next) { + if (ncp->hash == hash && /* fast signature check */ + ncp->dp == dp && + ncp->namlen == namlen && + bcmp(ncp->name, name, namlen) == 0) { + /* + * Move this entry to the head of its hash chain + * if it's not already close. + */ + if (depth > NC_MOVETOFRONT) { + ncache_t *next = ncp->hash_next; + ncache_t *prev = ncp->hash_prev; + + prev->hash_next = next; + next->hash_prev = prev; + ncp->hash_next = next = hp->hash_next; + ncp->hash_prev = (ncache_t *)hp; + next->hash_prev = ncp; + hp->hash_next = ncp; + + ncstats.move_to_front++; + } + + /* + * Put a hold on the vnode now so its identity + * can't change before the caller has a chance to + * put a hold on it. + */ + vp = ncp->vp; + VN_HOLD(vp); + mutex_exit(&hp->hash_lock); + ncstats.hits++; + ncs.ncs_hits.value.ui64++; + if (vp == DNLC_NO_VNODE) { + ncs.ncs_neg_hits.value.ui64++; + } + TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END, + "dnlc_lookup_end:%S %d vp %x name %s", + "hit", ncstats.hits, vp, name); + return (vp); + } + depth++; + } + + mutex_exit(&hp->hash_lock); + ncstats.misses++; + ncs.ncs_misses.value.ui64++; + TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END, + "dnlc_lookup_end:%S %d vp %x name %s", "miss", ncstats.misses, + NULL, name); + return (NULL); +} + +/* + * Remove an entry in the directory name cache. + */ +void +dnlc_remove(vnode_t *dp, char *name) +{ + ncache_t *ncp; + nc_hash_t *hp; + uchar_t namlen; + int hash; + + if (!doingcache) + return; + DNLCHASH(name, dp, hash, namlen); + hp = &nc_hash[hash & nc_hashmask]; + + mutex_enter(&hp->hash_lock); + if (ncp = dnlc_search(dp, name, namlen, hash)) { + /* + * Free up the entry + */ + nc_rmhash(ncp); + mutex_exit(&hp->hash_lock); + VN_RELE(ncp->vp); + VN_RELE(ncp->dp); + dnlc_free(ncp); + return; + } + mutex_exit(&hp->hash_lock); +} + +/* + * Purge cache entries referencing a vfsp. Caller supplies a count + * of entries to purge; up to that many will be freed. A count of + * zero indicates that all such entries should be purged. Returns + * the number of entries that were purged. + */ +int +dnlc_purge_vfsp(vfs_t *vfsp, int count) +{ + nc_hash_t *nch; + ncache_t *ncp; + int n = 0; + int index; + int i; + vnode_t *nc_rele[DNLC_MAX_RELE]; + + if (!doingcache) + return (0); + + ncstats.purges++; + ncs.ncs_purge_vfs.value.ui64++; + + for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) { + index = 0; + mutex_enter(&nch->hash_lock); + ncp = nch->hash_next; + while (ncp != (ncache_t *)nch) { + ncache_t *np; + + np = ncp->hash_next; + ASSERT(ncp->dp != NULL); + ASSERT(ncp->vp != NULL); + if ((ncp->dp->v_vfsp == vfsp) || + (ncp->vp->v_vfsp == vfsp)) { + n++; + nc_rele[index++] = ncp->vp; + nc_rele[index++] = ncp->dp; + nc_rmhash(ncp); + dnlc_free(ncp); + ncs.ncs_purge_total.value.ui64++; + if (index == DNLC_MAX_RELE) { + ncp = np; + break; + } + if (count != 0 && n >= count) { + break; + } + } + ncp = np; + } + mutex_exit(&nch->hash_lock); + /* Release holds on all the vnodes now that we have no locks */ + for (i = 0; i < index; i++) { + VN_RELE(nc_rele[i]); + } + if (count != 0 && n >= count) { + return (n); + } + if (ncp != (ncache_t *)nch) { + nch--; /* Do current hash chain again */ + } + } + return (n); +} + +/* + * Utility routine to search for a cache entry. Return the + * ncache entry if found, NULL otherwise. + */ +static ncache_t * +dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash) +{ + nc_hash_t *hp; + ncache_t *ncp; + + hp = &nc_hash[hash & nc_hashmask]; + + for (ncp = hp->hash_next; ncp != (ncache_t *)hp; ncp = ncp->hash_next) { + if (ncp->hash == hash && + ncp->dp == dp && + ncp->namlen == namlen && + bcmp(ncp->name, name, namlen) == 0) + return (ncp); + } + return (NULL); +} + +#if ((1 << NBBY) - 1) < (MAXNAMELEN - 1) +#error ncache_t name length representation is too small +#endif + +void +dnlc_reduce_cache(void *reduce_percent) +{ + if (dnlc_reduce_idle && (dnlc_nentries >= ncsize || reduce_percent)) { + dnlc_reduce_idle = 0; + if ((taskq_dispatch(system_taskq, do_dnlc_reduce_cache, + reduce_percent, TQ_NOSLEEP)) == 0) + dnlc_reduce_idle = 1; + } +} + +/* + * Get a new name cache entry. + * If the dnlc_reduce_cache() taskq isn't keeping up with demand, or memory + * is short then just return NULL. If we're over ncsize then kick off a + * thread to free some in use entries down to dnlc_nentries_low_water. + * Caller must initialise all fields except namlen. + * Component names are defined to be less than MAXNAMELEN + * which includes a null. + */ +static ncache_t * +dnlc_get(uchar_t namlen) +{ + ncache_t *ncp; + + if (dnlc_nentries > dnlc_max_nentries) { + dnlc_max_nentries_cnt++; /* keep a statistic */ + return (NULL); + } + ncp = kmem_alloc(sizeof (ncache_t) + namlen, KM_NOSLEEP); + if (ncp == NULL) { + return (NULL); + } + ncp->namlen = namlen; + atomic_add_32(&dnlc_nentries, 1); + dnlc_reduce_cache(NULL); + return (ncp); +} + +/* + * Taskq routine to free up name cache entries to reduce the + * cache size to the low water mark if "reduce_percent" is not provided. + * If "reduce_percent" is provided, reduce cache size by + * (ncsize_onepercent * reduce_percent). + */ +/*ARGSUSED*/ +static void +do_dnlc_reduce_cache(void *reduce_percent) +{ + nc_hash_t *hp = dnlc_free_rotor, *start_hp = hp; + vnode_t *vp; + ncache_t *ncp; + int cnt; + uint_t low_water = dnlc_nentries_low_water; + + if (reduce_percent) { + uint_t reduce_cnt; + + /* + * Never try to reduce the current number + * of cache entries below 3% of ncsize. + */ + if (dnlc_nentries <= ncsize_min_percent) { + dnlc_reduce_idle = 1; + return; + } + reduce_cnt = ncsize_onepercent * + (uint_t)(uintptr_t)reduce_percent; + + if (reduce_cnt > dnlc_nentries || + dnlc_nentries - reduce_cnt < ncsize_min_percent) + low_water = ncsize_min_percent; + else + low_water = dnlc_nentries - reduce_cnt; + } + + do { + /* + * Find the first non empty hash queue without locking. + * Only look at each hash queue once to avoid an infinite loop. + */ + do { + if (++hp == &nc_hash[nc_hashsz]) + hp = nc_hash; + } while (hp->hash_next == (ncache_t *)hp && hp != start_hp); + + /* return if all hash queues are empty. */ + if (hp->hash_next == (ncache_t *)hp) { + dnlc_reduce_idle = 1; + return; + } + + mutex_enter(&hp->hash_lock); + for (cnt = 0, ncp = hp->hash_prev; ncp != (ncache_t *)hp; + ncp = ncp->hash_prev, cnt++) { + vp = ncp->vp; + /* + * A name cache entry with a reference count + * of one is only referenced by the dnlc. + * Also negative cache entries are purged first. + */ + if (!vn_has_cached_data(vp) && + ((vp->v_count == 1) || (vp == DNLC_NO_VNODE))) { + ncs.ncs_pick_heur.value.ui64++; + goto found; + } + /* + * Remove from the end of the chain if the + * chain is too long + */ + if (cnt > dnlc_long_chain) { + ncp = hp->hash_prev; + ncs.ncs_pick_last.value.ui64++; + vp = ncp->vp; + goto found; + } + } + /* check for race and continue */ + if (hp->hash_next == (ncache_t *)hp) { + mutex_exit(&hp->hash_lock); + continue; + } + + ncp = hp->hash_prev; /* pick the last one in the hash queue */ + ncs.ncs_pick_last.value.ui64++; + vp = ncp->vp; +found: + /* + * Remove from hash chain. + */ + nc_rmhash(ncp); + mutex_exit(&hp->hash_lock); + VN_RELE(vp); + VN_RELE(ncp->dp); + dnlc_free(ncp); + } while (dnlc_nentries > low_water); + + dnlc_free_rotor = hp; + dnlc_reduce_idle = 1; +} + +SYSINIT(dnlc, SI_SUB_DRIVERS, SI_ORDER_ANY, dnlc_init, NULL); +SYSUNINIT(dnlc, SI_SUB_DRIVERS, SI_ORDER_ANY, dnlc_fini, NULL); diff --git a/sys/contrib/opensolaris/uts/common/fs/gfs.c b/sys/contrib/opensolaris/uts/common/fs/gfs.c new file mode 100644 index 0000000..7e6d689 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/gfs.c @@ -0,0 +1,882 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Portions Copyright 2007 Shivakumar GN */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/kmem.h> +#include <sys/mman.h> +#include <sys/mutex.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/uio.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/cred.h> +#include <sys/kdb.h> + +#include <sys/gfs.h> + +/* + * Generic pseudo-filesystem routines. + * + * There are significant similarities between the implementation of certain file + * system entry points across different filesystems. While one could attempt to + * "choke up on the bat" and incorporate common functionality into a VOP + * preamble or postamble, such an approach is limited in the benefit it can + * provide. In this file we instead define a toolkit of routines which can be + * called from a filesystem (with in-kernel pseudo-filesystems being the focus + * of the exercise) in a more component-like fashion. + * + * There are three basic classes of routines: + * + * 1) Lowlevel support routines + * + * These routines are designed to play a support role for existing + * pseudo-filesystems (such as procfs). They simplify common tasks, + * without enforcing the filesystem to hand over management to GFS. The + * routines covered are: + * + * gfs_readdir_init() + * gfs_readdir_emit() + * gfs_readdir_emitn() + * gfs_readdir_pred() + * gfs_readdir_fini() + * gfs_lookup_dot() + * + * 2) Complete GFS management + * + * These routines take a more active role in management of the + * pseudo-filesystem. They handle the relationship between vnode private + * data and VFS data, as well as the relationship between vnodes in the + * directory hierarchy. + * + * In order to use these interfaces, the first member of every private + * v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control + * to GFS. + * + * gfs_file_create() + * gfs_dir_create() + * gfs_root_create() + * + * gfs_file_inactive() + * gfs_dir_inactive() + * gfs_dir_lookup() + * gfs_dir_readdir() + * + * gfs_vop_inactive() + * gfs_vop_lookup() + * gfs_vop_readdir() + * gfs_vop_map() + * + * 3) Single File pseudo-filesystems + * + * This routine creates a rooted file to be overlayed ontop of another + * file in the physical filespace. + * + * Note that the parent is NULL (actually the vfs), but there is nothing + * technically keeping such a file from utilizing the "Complete GFS + * management" set of routines. + * + * gfs_root_create_file() + */ + +/* + * Low level directory routines + * + * These routines provide some simple abstractions for reading directories. + * They are designed to be used by existing pseudo filesystems (namely procfs) + * that already have a complicated management infrastructure. + */ + +/* + * gfs_readdir_init: initiate a generic readdir + * st - a pointer to an uninitialized gfs_readdir_state_t structure + * name_max - the directory's maximum file name length + * ureclen - the exported file-space record length (1 for non-legacy FSs) + * uiop - the uiop passed to readdir + * parent - the parent directory's inode + * self - this directory's inode + * + * Returns 0 or a non-zero errno. + * + * Typical VOP_READDIR usage of gfs_readdir_*: + * + * if ((error = gfs_readdir_init(...)) != 0) + * return (error); + * eof = 0; + * while ((error = gfs_readdir_pred(..., &voffset)) != 0) { + * if (!consumer_entry_at(voffset)) + * voffset = consumer_next_entry(voffset); + * if (consumer_eof(voffset)) { + * eof = 1 + * break; + * } + * if ((error = gfs_readdir_emit(..., voffset, + * consumer_ino(voffset), consumer_name(voffset))) != 0) + * break; + * } + * return (gfs_readdir_fini(..., error, eofp, eof)); + * + * As you can see, a zero result from gfs_readdir_pred() or + * gfs_readdir_emit() indicates that processing should continue, + * whereas a non-zero result indicates that the loop should terminate. + * Most consumers need do nothing more than let gfs_readdir_fini() + * determine what the cause of failure was and return the appropriate + * value. + */ +int +gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, + uio_t *uiop, ino64_t parent, ino64_t self) +{ + if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 || + (uiop->uio_loffset % ureclen) != 0) + return (EINVAL); + + st->grd_ureclen = ureclen; + st->grd_oresid = uiop->uio_resid; + st->grd_namlen = name_max; + st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP); + st->grd_parent = parent; + st->grd_self = self; + + return (0); +} + +/* + * gfs_readdir_emit_int: internal routine to emit directory entry + * + * st - the current readdir state, which must have d_ino and d_name + * set + * uiop - caller-supplied uio pointer + * next - the offset of the next entry + */ +static int +gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, + int *ncookies, u_long **cookies) +{ + int reclen, namlen; + + namlen = strlen(st->grd_dirent->d_name); + reclen = DIRENT64_RECLEN(namlen); + + if (reclen > uiop->uio_resid) { + /* + * Error if no entries were returned yet + */ + if (uiop->uio_resid == st->grd_oresid) + return (EINVAL); + return (-1); + } + + st->grd_dirent->d_reclen = (ushort_t)reclen; + st->grd_dirent->d_namlen = namlen; + + if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop)) + return (EFAULT); + + uiop->uio_loffset = next; + if (*cookies != NULL) { + **cookies = next; + (*cookies)++; + (*ncookies)--; + KASSERT(*ncookies >= 0, ("ncookies=%d", *ncookies)); + } + + return (0); +} + +/* + * gfs_readdir_emit: emit a directory entry + * voff - the virtual offset (obtained from gfs_readdir_pred) + * ino - the entry's inode + * name - the entry's name + * + * Returns a 0 on success, a non-zero errno on failure, or -1 if the + * readdir loop should terminate. A non-zero result (either errno or + * -1) from this function is typically passed directly to + * gfs_readdir_fini(). + */ +int +gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, + ino64_t ino, const char *name, int *ncookies, u_long **cookies) +{ + offset_t off = (voff + 2) * st->grd_ureclen; + + st->grd_dirent->d_ino = ino; + (void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen); + + /* + * Inter-entry offsets are invalid, so we assume a record size of + * grd_ureclen and explicitly set the offset appropriately. + */ + return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen, ncookies, + cookies)); +} + +/* + * gfs_readdir_pred: readdir loop predicate + * voffp - a pointer in which the next virtual offset should be stored + * + * Returns a 0 on success, a non-zero errno on failure, or -1 if the + * readdir loop should terminate. A non-zero result (either errno or + * -1) from this function is typically passed directly to + * gfs_readdir_fini(). + */ +int +gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp, + int *ncookies, u_long **cookies) +{ + offset_t off, voff; + int error; + +top: + if (uiop->uio_resid <= 0) + return (-1); + + off = uiop->uio_loffset / st->grd_ureclen; + voff = off - 2; + if (off == 0) { + if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self, + ".", ncookies, cookies)) == 0) + goto top; + } else if (off == 1) { + if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent, + "..", ncookies, cookies)) == 0) + goto top; + } else { + *voffp = voff; + return (0); + } + + return (error); +} + +/* + * gfs_readdir_fini: generic readdir cleanup + * error - if positive, an error to return + * eofp - the eofp passed to readdir + * eof - the eof value + * + * Returns a 0 on success, a non-zero errno on failure. This result + * should be returned from readdir. + */ +int +gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof) +{ + kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen)); + if (error > 0) + return (error); + if (eofp) + *eofp = eof; + return (0); +} + +/* + * gfs_lookup_dot + * + * Performs a basic check for "." and ".." directory entries. + */ +int +gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm) +{ + if (*nm == '\0' || strcmp(nm, ".") == 0) { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } else if (strcmp(nm, "..") == 0) { + if (pvp == NULL) { + ASSERT(dvp->v_flag & VROOT); + VN_HOLD(dvp); + *vpp = dvp; + } else { + VN_HOLD(pvp); + *vpp = pvp; + } + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread); + return (0); + } + + return (-1); +} + +/* + * gfs_file_create(): create a new GFS file + * + * size - size of private data structure (v_data) + * pvp - parent vnode (GFS directory) + * ops - vnode operations vector + * + * In order to use this interface, the parent vnode must have been created by + * gfs_dir_create(), and the private data stored in v_data must have a + * 'gfs_file_t' as its first field. + * + * Given these constraints, this routine will automatically: + * + * - Allocate v_data for the vnode + * - Initialize necessary fields in the vnode + * - Hold the parent + */ +vnode_t * +gfs_file_create(size_t size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops) +{ + gfs_file_t *fp; + vnode_t *vp; + int error; + + /* + * Allocate vnode and internal data structure + */ + fp = kmem_zalloc(size, KM_SLEEP); + error = getnewvnode("zfs", vfsp, ops, &vp); + ASSERT(error == 0); + vp->v_data = (caddr_t)fp; + + /* + * Set up various pointers + */ + fp->gfs_vnode = vp; + fp->gfs_parent = pvp; + fp->gfs_size = size; + fp->gfs_type = GFS_FILE; + + error = insmntque(vp, vfsp); + KASSERT(error == 0, ("insmntque() failed: error %d", error)); + + /* + * Initialize vnode and hold parent. + */ + if (pvp) + VN_HOLD(pvp); + + return (vp); +} + +/* + * gfs_dir_create: creates a new directory in the parent + * + * size - size of private data structure (v_data) + * pvp - parent vnode (GFS directory) + * ops - vnode operations vector + * entries - NULL-terminated list of static entries (if any) + * maxlen - maximum length of a directory entry + * readdir_cb - readdir callback (see gfs_dir_readdir) + * inode_cb - inode callback (see gfs_dir_readdir) + * lookup_cb - lookup callback (see gfs_dir_lookup) + * + * In order to use this function, the first member of the private vnode + * structure (v_data) must be a gfs_dir_t. For each directory, there are + * static entries, defined when the structure is initialized, and dynamic + * entries, retrieved through callbacks. + * + * If a directory has static entries, then it must supply a inode callback, + * which will compute the inode number based on the parent and the index. + * For a directory with dynamic entries, the caller must supply a readdir + * callback and a lookup callback. If a static lookup fails, we fall back to + * the supplied lookup callback, if any. + * + * This function also performs the same initialization as gfs_file_create(). + */ +vnode_t * +gfs_dir_create(size_t struct_size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops, + gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, + gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) +{ + vnode_t *vp; + gfs_dir_t *dp; + gfs_dirent_t *de; + + vp = gfs_file_create(struct_size, pvp, vfsp, ops); + vp->v_type = VDIR; + + dp = vp->v_data; + dp->gfsd_file.gfs_type = GFS_DIR; + dp->gfsd_maxlen = maxlen; + + if (entries != NULL) { + for (de = entries; de->gfse_name != NULL; de++) + dp->gfsd_nstatic++; + + dp->gfsd_static = kmem_alloc( + dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP); + bcopy(entries, dp->gfsd_static, + dp->gfsd_nstatic * sizeof (gfs_dirent_t)); + } + + dp->gfsd_readdir = readdir_cb; + dp->gfsd_lookup = lookup_cb; + dp->gfsd_inode = inode_cb; + + mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL); + + return (vp); +} + +/* + * gfs_root_create(): create a root vnode for a GFS filesystem + * + * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The + * only difference is that it takes a vfs_t instead of a vnode_t as its parent. + */ +vnode_t * +gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino, + gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, + gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) +{ + vnode_t *vp; + + VFS_HOLD(vfsp); + vp = gfs_dir_create(size, NULL, vfsp, ops, entries, inode_cb, + maxlen, readdir_cb, lookup_cb); + /* Manually set the inode */ + ((gfs_file_t *)vp->v_data)->gfs_ino = ino; + vp->v_flag |= VROOT; + + return (vp); +} + +/* + * gfs_file_inactive() + * + * Called from the VOP_INACTIVE() routine. If necessary, this routine will + * remove the given vnode from the parent directory and clean up any references + * in the VFS layer. + * + * If the vnode was not removed (due to a race with vget), then NULL is + * returned. Otherwise, a pointer to the private data is returned. + */ +void * +gfs_file_inactive(vnode_t *vp) +{ + int i; + gfs_dirent_t *ge = NULL; + gfs_file_t *fp = vp->v_data; + gfs_dir_t *dp = NULL; + void *data; + + if (fp->gfs_parent == NULL) + goto found; + + dp = fp->gfs_parent->v_data; + + /* + * First, see if this vnode is cached in the parent. + */ + gfs_dir_lock(dp); + + /* + * Find it in the set of static entries. + */ + for (i = 0; i < dp->gfsd_nstatic; i++) { + ge = &dp->gfsd_static[i]; + + if (ge->gfse_vnode == vp) + goto found; + } + + /* + * If 'ge' is NULL, then it is a dynamic entry. + */ + ge = NULL; + +found: + VI_LOCK(vp); + ASSERT(vp->v_count < 2); + /* + * Really remove this vnode + */ + data = vp->v_data; + if (ge != NULL) { + /* + * If this was a statically cached entry, simply set the + * cached vnode to NULL. + */ + ge->gfse_vnode = NULL; + } + if (vp->v_count == 1) { + vp->v_usecount--; + vdropl(vp); + } else { + VI_UNLOCK(vp); + } + + /* + * Free vnode and release parent + */ + if (fp->gfs_parent) { + gfs_dir_unlock(dp); + VI_LOCK(fp->gfs_parent); + fp->gfs_parent->v_usecount--; + VI_UNLOCK(fp->gfs_parent); + } else { + ASSERT(vp->v_vfsp != NULL); + VFS_RELE(vp->v_vfsp); + } + + return (data); +} + +/* + * gfs_dir_inactive() + * + * Same as above, but for directories. + */ +void * +gfs_dir_inactive(vnode_t *vp) +{ + gfs_dir_t *dp; + + ASSERT(vp->v_type == VDIR); + + if ((dp = gfs_file_inactive(vp)) != NULL) { + mutex_destroy(&dp->gfsd_lock); + if (dp->gfsd_nstatic) + kmem_free(dp->gfsd_static, + dp->gfsd_nstatic * sizeof (gfs_dirent_t)); + } + + return (dp); +} + +/* + * gfs_dir_lookup() + * + * Looks up the given name in the directory and returns the corresponding vnode, + * if found. + * + * First, we search statically defined entries, if any. If a match is found, + * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the + * existing vnode. Otherwise, we call the static entry's callback routine, + * caching the result if necessary. + * + * If no static entry is found, we invoke the lookup callback, if any. The + * arguments to this callback are: + * + * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp); + * + * pvp - parent vnode + * nm - name of entry + * vpp - pointer to resulting vnode + * + * Returns 0 on success, non-zero on error. + */ +int +gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) +{ + int i; + gfs_dirent_t *ge; + vnode_t *vp; + gfs_dir_t *dp = dvp->v_data; + int ret = 0; + + ASSERT(dvp->v_type == VDIR); + + if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) + return (0); + + gfs_dir_lock(dp); + + /* + * Search static entries. + */ + for (i = 0; i < dp->gfsd_nstatic; i++) { + ge = &dp->gfsd_static[i]; + + if (strcmp(ge->gfse_name, nm) == 0) { + if (ge->gfse_vnode) { + ASSERT(ge->gfse_flags & GFS_CACHE_VNODE); + vp = ge->gfse_vnode; + VN_HOLD(vp); + goto out; + } + + /* + * We drop the directory lock, as the constructor will + * need to do KM_SLEEP allocations. If we return from + * the constructor only to find that a parallel + * operation has completed, and GFS_CACHE_VNODE is set + * for this entry, we discard the result in favor of the + * cached vnode. + */ + gfs_dir_unlock(dp); + vp = ge->gfse_ctor(dvp); + gfs_dir_lock(dp); + + ((gfs_file_t *)vp->v_data)->gfs_index = i; + + /* Set the inode according to the callback. */ + ((gfs_file_t *)vp->v_data)->gfs_ino = + dp->gfsd_inode(dvp, i); + + if (ge->gfse_flags & GFS_CACHE_VNODE) { + if (ge->gfse_vnode == NULL) { + ge->gfse_vnode = vp; + } else { + /* + * A parallel constructor beat us to it; + * return existing vnode. We have to be + * careful because we can't release the + * current vnode while holding the + * directory lock; its inactive routine + * will try to lock this directory. + */ + vnode_t *oldvp = vp; + vp = ge->gfse_vnode; + VN_HOLD(vp); + + gfs_dir_unlock(dp); + VN_RELE(oldvp); + gfs_dir_lock(dp); + } + } + + goto out; + } + } + + /* + * See if there is a dynamic constructor. + */ + if (dp->gfsd_lookup) { + ino64_t ino; + gfs_file_t *fp; + + /* + * Once again, drop the directory lock, as the lookup routine + * will need to allocate memory, or otherwise deadlock on this + * directory. + */ + gfs_dir_unlock(dp); + ret = dp->gfsd_lookup(dvp, nm, &vp, &ino); + gfs_dir_lock(dp); + if (ret != 0) + goto out; + + fp = (gfs_file_t *)vp->v_data; + fp->gfs_index = -1; + fp->gfs_ino = ino; + } else { + /* + * No static entry found, and there is no lookup callback, so + * return ENOENT. + */ + ret = ENOENT; + } + +out: + gfs_dir_unlock(dp); + + if (ret == 0) + *vpp = vp; + else + *vpp = NULL; + + return (ret); +} + +/* + * gfs_dir_readdir: does a readdir() on the given directory + * + * dvp - directory vnode + * uiop - uio structure + * eofp - eof pointer + * data - arbitrary data passed to readdir callback + * + * This routine does all the readdir() dirty work. Even so, the caller must + * supply two callbacks in order to get full compatibility. + * + * If the directory contains static entries, an inode callback must be + * specified. This avoids having to create every vnode and call VOP_GETATTR() + * when reading the directory. This function has the following arguments: + * + * ino_t gfs_inode_cb(vnode_t *vp, int index); + * + * vp - vnode for the directory + * index - index in original gfs_dirent_t array + * + * Returns the inode number for the given entry. + * + * For directories with dynamic entries, a readdir callback must be provided. + * This is significantly more complex, thanks to the particulars of + * VOP_READDIR(). + * + * int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, + * offset_t *off, offset_t *nextoff, void *data) + * + * vp - directory vnode + * dp - directory entry, sized according to maxlen given to + * gfs_dir_create(). callback must fill in d_name and + * d_ino. + * eofp - callback must set to 1 when EOF has been reached + * off - on entry, the last offset read from the directory. Callback + * must set to the offset of the current entry, typically left + * untouched. + * nextoff - callback must set to offset of next entry. Typically + * (off + 1) + * data - caller-supplied data + * + * Return 0 on success, or error on failure. + */ +int +gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, + u_long **cookies, void *data) +{ + gfs_readdir_state_t gstate; + int error, eof = 0; + ino64_t ino, pino; + offset_t off, next; + gfs_dir_t *dp = dvp->v_data; + + ino = dp->gfsd_file.gfs_ino; + + if (dp->gfsd_file.gfs_parent == NULL) + pino = ino; /* root of filesystem */ + else + pino = ((gfs_file_t *) + (dp->gfsd_file.gfs_parent->v_data))->gfs_ino; + + if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop, + pino, ino)) != 0) + return (error); + + while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies, + cookies)) == 0 && !eof) { + + if (off >= 0 && off < dp->gfsd_nstatic) { + ino = dp->gfsd_inode(dvp, off); + + if ((error = gfs_readdir_emit(&gstate, uiop, + off, ino, dp->gfsd_static[off].gfse_name, ncookies, + cookies)) != 0) + break; + + } else if (dp->gfsd_readdir) { + off -= dp->gfsd_nstatic; + + if ((error = dp->gfsd_readdir(dvp, + gstate.grd_dirent, &eof, &off, &next, + data)) != 0 || eof) + break; + + off += dp->gfsd_nstatic + 2; + next += dp->gfsd_nstatic + 2; + + if ((error = gfs_readdir_emit_int(&gstate, uiop, + next, ncookies, cookies)) != 0) + break; + } else { + /* + * Offset is beyond the end of the static entries, and + * we have no dynamic entries. Set EOF. + */ + eof = 1; + } + } + + return (gfs_readdir_fini(&gstate, error, eofp, eof)); +} + +/* + * gfs_vop_readdir: VOP_READDIR() entry point + * + * For use directly in vnode ops table. Given a GFS directory, calls + * gfs_dir_readdir() as necessary. + */ +/* ARGSUSED */ +int +gfs_vop_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + int *a_eofflag; + int *ncookies; + u_long **a_cookies; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + uio_t *uiop = ap->a_uio; + int *eofp = ap->a_eofflag; + int ncookies = 0; + u_long *cookies = NULL; + int error; + + if (ap->a_ncookies) { + /* + * Minimum entry size is dirent size and 1 byte for a file name. + */ + ncookies = uiop->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); + cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK); + *ap->a_cookies = cookies; + *ap->a_ncookies = ncookies; + } + + error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL); + + if (error == 0) { + /* Subtract unused cookies */ + if (ap->a_ncookies) + *ap->a_ncookies -= ncookies; + } else if (ap->a_ncookies) { + free(*ap->a_cookies, M_TEMP); + *ap->a_cookies = NULL; + *ap->a_ncookies = 0; + } + + return (error); +} + +/* + * gfs_vop_inactive: VOP_INACTIVE() entry point + * + * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or + * gfs_dir_inactive() as necessary, and kmem_free()s associated private data. + */ +/* ARGSUSED */ +int +gfs_vop_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + gfs_file_t *fp = vp->v_data; + void *data; + + if (fp->gfs_type == GFS_DIR) + data = gfs_dir_inactive(vp); + else + data = gfs_file_inactive(vp); + + if (data != NULL) + kmem_free(data, fp->gfs_size); + vp->v_data = NULL; + return (0); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c new file mode 100644 index 0000000..59a376a --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -0,0 +1,2829 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * DVA-based Adjustable Replacement Cache + * + * While much of the theory of operation used here is + * based on the self-tuning, low overhead replacement cache + * presented by Megiddo and Modha at FAST 2003, there are some + * significant differences: + * + * 1. The Megiddo and Modha model assumes any page is evictable. + * Pages in its cache cannot be "locked" into memory. This makes + * the eviction algorithm simple: evict the last page in the list. + * This also make the performance characteristics easy to reason + * about. Our cache is not so simple. At any given moment, some + * subset of the blocks in the cache are un-evictable because we + * have handed out a reference to them. Blocks are only evictable + * when there are no external references active. This makes + * eviction far more problematic: we choose to evict the evictable + * blocks that are the "lowest" in the list. + * + * There are times when it is not possible to evict the requested + * space. In these circumstances we are unable to adjust the cache + * size. To prevent the cache growing unbounded at these times we + * implement a "cache throttle" that slowes the flow of new data + * into the cache until we can make space avaiable. + * + * 2. The Megiddo and Modha model assumes a fixed cache size. + * Pages are evicted when the cache is full and there is a cache + * miss. Our model has a variable sized cache. It grows with + * high use, but also tries to react to memory preasure from the + * operating system: decreasing its size when system memory is + * tight. + * + * 3. The Megiddo and Modha model assumes a fixed page size. All + * elements of the cache are therefor exactly the same size. So + * when adjusting the cache size following a cache miss, its simply + * a matter of choosing a single page to evict. In our model, we + * have variable sized cache blocks (rangeing from 512 bytes to + * 128K bytes). We therefor choose a set of blocks to evict to make + * space for a cache miss that approximates as closely as possible + * the space used by the new block. + * + * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" + * by N. Megiddo & D. Modha, FAST 2003 + */ + +/* + * The locking model: + * + * A new reference to a cache buffer can be obtained in two + * ways: 1) via a hash table lookup using the DVA as a key, + * or 2) via one of the ARC lists. The arc_read() inerface + * uses method 1, while the internal arc algorithms for + * adjusting the cache use method 2. We therefor provide two + * types of locks: 1) the hash table lock array, and 2) the + * arc list locks. + * + * Buffers do not have their own mutexs, rather they rely on the + * hash table mutexs for the bulk of their protection (i.e. most + * fields in the arc_buf_hdr_t are protected by these mutexs). + * + * buf_hash_find() returns the appropriate mutex (held) when it + * locates the requested buffer in the hash table. It returns + * NULL for the mutex if the buffer was not in the table. + * + * buf_hash_remove() expects the appropriate hash mutex to be + * already held before it is invoked. + * + * Each arc state also has a mutex which is used to protect the + * buffer list associated with the state. When attempting to + * obtain a hash table lock while holding an arc list lock you + * must use: mutex_tryenter() to avoid deadlock. Also note that + * the active state mutex must be held before the ghost state mutex. + * + * Arc buffers may have an associated eviction callback function. + * This function will be invoked prior to removing the buffer (e.g. + * in arc_do_user_evicts()). Note however that the data associated + * with the buffer may be evicted prior to the callback. The callback + * must be made with *no locks held* (to prevent deadlock). Additionally, + * the users of callbacks must ensure that their private data is + * protected from simultaneous callbacks from arc_buf_evict() + * and arc_do_user_evicts(). + * + * Note that the majority of the performance stats are manipulated + * with atomic operations. + */ + +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/zfs_context.h> +#include <sys/arc.h> +#include <sys/refcount.h> +#ifdef _KERNEL +#include <sys/dnlc.h> +#endif +#include <sys/callb.h> +#include <sys/kstat.h> +#include <sys/sdt.h> + +#define ARC_FREE_AT_ONCE 4194304 + +static kmutex_t arc_reclaim_thr_lock; +static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ +static uint8_t arc_thread_exit; + +#define ARC_REDUCE_DNLC_PERCENT 3 +uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; + +typedef enum arc_reclaim_strategy { + ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ + ARC_RECLAIM_CONS /* Conservative reclaim strategy */ +} arc_reclaim_strategy_t; + +/* number of seconds before growing cache again */ +static int arc_grow_retry = 60; + +/* + * minimum lifespan of a prefetch block in clock ticks + * (initialized in arc_init()) + */ +static int arc_min_prefetch_lifespan; + +static int arc_dead; + +/* + * These tunables are for performance analysis. + */ +uint64_t zfs_arc_max; +uint64_t zfs_arc_min; + +/* + * Note that buffers can be on one of 5 states: + * ARC_anon - anonymous (discussed below) + * ARC_mru - recently used, currently cached + * ARC_mru_ghost - recentely used, no longer in cache + * ARC_mfu - frequently used, currently cached + * ARC_mfu_ghost - frequently used, no longer in cache + * When there are no active references to the buffer, they + * are linked onto one of the lists in arc. These are the + * only buffers that can be evicted or deleted. + * + * Anonymous buffers are buffers that are not associated with + * a DVA. These are buffers that hold dirty block copies + * before they are written to stable storage. By definition, + * they are "ref'd" and are considered part of arc_mru + * that cannot be freed. Generally, they will aquire a DVA + * as they are written and migrate onto the arc_mru list. + */ + +typedef struct arc_state { + list_t arcs_list; /* linked list of evictable buffer in state */ + uint64_t arcs_lsize; /* total size of buffers in the linked list */ + uint64_t arcs_size; /* total size of all buffers in this state */ + kmutex_t arcs_mtx; +} arc_state_t; + +/* The 5 states: */ +static arc_state_t ARC_anon; +static arc_state_t ARC_mru; +static arc_state_t ARC_mru_ghost; +static arc_state_t ARC_mfu; +static arc_state_t ARC_mfu_ghost; + +typedef struct arc_stats { + kstat_named_t arcstat_hits; + kstat_named_t arcstat_misses; + kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_misses; + kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_misses; + kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_misses; + kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_misses; + kstat_named_t arcstat_mru_hits; + kstat_named_t arcstat_mru_ghost_hits; + kstat_named_t arcstat_mfu_hits; + kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_deleted; + kstat_named_t arcstat_recycle_miss; + kstat_named_t arcstat_mutex_miss; + kstat_named_t arcstat_evict_skip; + kstat_named_t arcstat_hash_elements; + kstat_named_t arcstat_hash_elements_max; + kstat_named_t arcstat_hash_collisions; + kstat_named_t arcstat_hash_chains; + kstat_named_t arcstat_hash_chain_max; + kstat_named_t arcstat_p; + kstat_named_t arcstat_c; + kstat_named_t arcstat_c_min; + kstat_named_t arcstat_c_max; + kstat_named_t arcstat_size; +} arc_stats_t; + +static arc_stats_t arc_stats = { + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "demand_data_hits", KSTAT_DATA_UINT64 }, + { "demand_data_misses", KSTAT_DATA_UINT64 }, + { "demand_metadata_hits", KSTAT_DATA_UINT64 }, + { "demand_metadata_misses", KSTAT_DATA_UINT64 }, + { "prefetch_data_hits", KSTAT_DATA_UINT64 }, + { "prefetch_data_misses", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, + { "mru_hits", KSTAT_DATA_UINT64 }, + { "mru_ghost_hits", KSTAT_DATA_UINT64 }, + { "mfu_hits", KSTAT_DATA_UINT64 }, + { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, + { "deleted", KSTAT_DATA_UINT64 }, + { "recycle_miss", KSTAT_DATA_UINT64 }, + { "mutex_miss", KSTAT_DATA_UINT64 }, + { "evict_skip", KSTAT_DATA_UINT64 }, + { "hash_elements", KSTAT_DATA_UINT64 }, + { "hash_elements_max", KSTAT_DATA_UINT64 }, + { "hash_collisions", KSTAT_DATA_UINT64 }, + { "hash_chains", KSTAT_DATA_UINT64 }, + { "hash_chain_max", KSTAT_DATA_UINT64 }, + { "p", KSTAT_DATA_UINT64 }, + { "c", KSTAT_DATA_UINT64 }, + { "c_min", KSTAT_DATA_UINT64 }, + { "c_max", KSTAT_DATA_UINT64 }, + { "size", KSTAT_DATA_UINT64 } +}; + +#define ARCSTAT(stat) (arc_stats.stat.value.ui64) + +#define ARCSTAT_INCR(stat, val) \ + atomic_add_64(&arc_stats.stat.value.ui64, (val)); + +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) + +#define ARCSTAT_MAX(stat, val) { \ + uint64_t m; \ + while ((val) > (m = arc_stats.stat.value.ui64) && \ + (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ + continue; \ +} + +#define ARCSTAT_MAXSTAT(stat) \ + ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) + +/* + * We define a macro to allow ARC hits/misses to be easily broken down by + * two separate conditions, giving a total of four different subtypes for + * each of hits and misses (so eight statistics total). + */ +#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ + if (cond1) { \ + if (cond2) { \ + ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ + } else { \ + ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ + } \ + } else { \ + if (cond2) { \ + ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ + } else { \ + ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ + } \ + } + +kstat_t *arc_ksp; +static arc_state_t *arc_anon; +static arc_state_t *arc_mru; +static arc_state_t *arc_mru_ghost; +static arc_state_t *arc_mfu; +static arc_state_t *arc_mfu_ghost; + +/* + * There are several ARC variables that are critical to export as kstats -- + * but we don't want to have to grovel around in the kstat whenever we wish to + * manipulate them. For these variables, we therefore define them to be in + * terms of the statistic variable. This assures that we are not introducing + * the possibility of inconsistency by having shadow copies of the variables, + * while still allowing the code to be readable. + */ +#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ +#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ +#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ +#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ + +static int arc_no_grow; /* Don't try to grow cache size */ +static uint64_t arc_tempreserve; + +typedef struct arc_callback arc_callback_t; + +struct arc_callback { + void *acb_private; + arc_done_func_t *acb_done; + arc_byteswap_func_t *acb_byteswap; + arc_buf_t *acb_buf; + zio_t *acb_zio_dummy; + arc_callback_t *acb_next; +}; + +typedef struct arc_write_callback arc_write_callback_t; + +struct arc_write_callback { + void *awcb_private; + arc_done_func_t *awcb_ready; + arc_done_func_t *awcb_done; + arc_buf_t *awcb_buf; +}; + +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + uint64_t b_cksum0; + + kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; + + arc_buf_hdr_t *b_hash_next; + arc_buf_t *b_buf; + uint32_t b_flags; + uint32_t b_datacnt; + + arc_callback_t *b_acb; + kcondvar_t b_cv; + + /* immutable */ + arc_buf_contents_t b_type; + uint64_t b_size; + spa_t *b_spa; + + /* protected by arc state mutex */ + arc_state_t *b_state; + list_node_t b_arc_node; + + /* updated atomically */ + clock_t b_arc_access; + + /* self protecting */ + refcount_t b_refcnt; +}; + +static arc_buf_t *arc_eviction_list; +static kmutex_t arc_eviction_mtx; +static arc_buf_hdr_t arc_eviction_hdr; +static void arc_get_data_buf(arc_buf_t *buf); +static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); + +#define GHOST_STATE(state) \ + ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) + +/* + * Private ARC flags. These flags are private ARC only flags that will show up + * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can + * be passed in as arc_flags in things like arc_read. However, these flags + * should never be passed and should only be set by ARC code. When adding new + * public flags, make sure not to smash the private ones. + */ + +#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ +#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ +#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ +#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ +#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ +#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ + +#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) +#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) +#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) +#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) +#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) + +/* + * Hash table routines + */ + +#define HT_LOCK_PAD 128 + +struct ht_lock { + kmutex_t ht_lock; +#ifdef _KERNEL + unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; +#endif +}; + +#define BUF_LOCKS 256 +typedef struct buf_hash_table { + uint64_t ht_mask; + arc_buf_hdr_t **ht_table; + struct ht_lock ht_locks[BUF_LOCKS]; +} buf_hash_table_t; + +static buf_hash_table_t buf_hash_table; + +#define BUF_HASH_INDEX(spa, dva, birth) \ + (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) +#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) +#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) +#define HDR_LOCK(buf) \ + (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) + +uint64_t zfs_crc64_table[256]; + +static uint64_t +buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) +{ + uintptr_t spav = (uintptr_t)spa; + uint8_t *vdva = (uint8_t *)dva; + uint64_t crc = -1ULL; + int i; + + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + + for (i = 0; i < sizeof (dva_t); i++) + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; + + crc ^= (spav>>8) ^ birth; + + return (crc); +} + +#define BUF_EMPTY(buf) \ + ((buf)->b_dva.dva_word[0] == 0 && \ + (buf)->b_dva.dva_word[1] == 0 && \ + (buf)->b_birth == 0) + +#define BUF_EQUAL(spa, dva, birth, buf) \ + ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((buf)->b_birth == birth) && ((buf)->b_spa == spa) + +static arc_buf_hdr_t * +buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) +{ + uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *buf; + + mutex_enter(hash_lock); + for (buf = buf_hash_table.ht_table[idx]; buf != NULL; + buf = buf->b_hash_next) { + if (BUF_EQUAL(spa, dva, birth, buf)) { + *lockp = hash_lock; + return (buf); + } + } + mutex_exit(hash_lock); + *lockp = NULL; + return (NULL); +} + +/* + * Insert an entry into the hash table. If there is already an element + * equal to elem in the hash table, then the already existing element + * will be returned and the new element will not be inserted. + * Otherwise returns NULL. + */ +static arc_buf_hdr_t * +buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) +{ + uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *fbuf; + uint32_t i; + + ASSERT(!HDR_IN_HASH_TABLE(buf)); + *lockp = hash_lock; + mutex_enter(hash_lock); + for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; + fbuf = fbuf->b_hash_next, i++) { + if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) + return (fbuf); + } + + buf->b_hash_next = buf_hash_table.ht_table[idx]; + buf_hash_table.ht_table[idx] = buf; + buf->b_flags |= ARC_IN_HASH_TABLE; + + /* collect some hash table performance data */ + if (i > 0) { + ARCSTAT_BUMP(arcstat_hash_collisions); + if (i == 1) + ARCSTAT_BUMP(arcstat_hash_chains); + + ARCSTAT_MAX(arcstat_hash_chain_max, i); + } + + ARCSTAT_BUMP(arcstat_hash_elements); + ARCSTAT_MAXSTAT(arcstat_hash_elements); + + return (NULL); +} + +static void +buf_hash_remove(arc_buf_hdr_t *buf) +{ + arc_buf_hdr_t *fbuf, **bufp; + uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + + ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); + ASSERT(HDR_IN_HASH_TABLE(buf)); + + bufp = &buf_hash_table.ht_table[idx]; + while ((fbuf = *bufp) != buf) { + ASSERT(fbuf != NULL); + bufp = &fbuf->b_hash_next; + } + *bufp = buf->b_hash_next; + buf->b_hash_next = NULL; + buf->b_flags &= ~ARC_IN_HASH_TABLE; + + /* collect some hash table performance data */ + ARCSTAT_BUMPDOWN(arcstat_hash_elements); + + if (buf_hash_table.ht_table[idx] && + buf_hash_table.ht_table[idx]->b_hash_next == NULL) + ARCSTAT_BUMPDOWN(arcstat_hash_chains); +} + +/* + * Global data structures and functions for the buf kmem cache. + */ +static kmem_cache_t *hdr_cache; +static kmem_cache_t *buf_cache; + +static void +buf_fini(void) +{ + int i; + + kmem_free(buf_hash_table.ht_table, + (buf_hash_table.ht_mask + 1) * sizeof (void *)); + for (i = 0; i < BUF_LOCKS; i++) + mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); + kmem_cache_destroy(hdr_cache); + kmem_cache_destroy(buf_cache); +} + +/* + * Constructor callback - called when the cache is empty + * and a new buf is requested. + */ +/* ARGSUSED */ +static int +hdr_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_hdr_t)); + refcount_create(&buf->b_refcnt); + cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); + return (0); +} + +/* + * Destructor callback - called when a cached buf is + * no longer required. + */ +/* ARGSUSED */ +static void +hdr_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *buf = vbuf; + + refcount_destroy(&buf->b_refcnt); + cv_destroy(&buf->b_cv); +} + +/* + * Reclaim callback -- invoked when memory is low. + */ +/* ARGSUSED */ +static void +hdr_recl(void *unused) +{ + dprintf("hdr_recl called\n"); + /* + * umem calls the reclaim func when we destroy the buf cache, + * which is after we do arc_fini(). + */ + if (!arc_dead) + cv_signal(&arc_reclaim_thr_cv); +} + +static void +buf_init(void) +{ + uint64_t *ct; + uint64_t hsize = 1ULL << 12; + int i, j; + + /* + * The hash table is big enough to fill all of physical memory + * with an average 64K block size. The table will take up + * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). + */ + while (hsize * 65536 < physmem * PAGESIZE) + hsize <<= 1; +retry: + buf_hash_table.ht_mask = hsize - 1; + buf_hash_table.ht_table = + kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); + if (buf_hash_table.ht_table == NULL) { + ASSERT(hsize > (1ULL << 8)); + hsize >>= 1; + goto retry; + } + + hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), + 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); + buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + + for (i = 0; i < 256; i++) + for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) + *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); + + for (i = 0; i < BUF_LOCKS; i++) { + mutex_init(&buf_hash_table.ht_locks[i].ht_lock, + NULL, MUTEX_DEFAULT, NULL); + } +} + +#define ARC_MINTIME (hz>>4) /* 62 ms */ + +static void +arc_cksum_verify(arc_buf_t *buf) +{ + zio_cksum_t zc; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum == NULL || + (buf->b_hdr->b_flags & ARC_IO_ERROR)) { + mutex_exit(&buf->b_hdr->b_freeze_lock); + return; + } + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) + panic("buffer modified while frozen!"); + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +static void +arc_cksum_compute(arc_buf_t *buf) +{ + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum != NULL) { + mutex_exit(&buf->b_hdr->b_freeze_lock); + return; + } + buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, + buf->b_hdr->b_freeze_cksum); + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +void +arc_buf_thaw(arc_buf_t *buf) +{ + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + if (buf->b_hdr->b_state != arc_anon) + panic("modifying non-anon buffer!"); + if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + panic("modifying buffer while i/o in progress!"); + arc_cksum_verify(buf); + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum != NULL) { + kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + buf->b_hdr->b_freeze_cksum = NULL; + } + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +void +arc_buf_freeze(arc_buf_t *buf) +{ + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + ASSERT(buf->b_hdr->b_freeze_cksum != NULL || + buf->b_hdr->b_state == arc_anon); + arc_cksum_compute(buf); +} + +static void +add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +{ + ASSERT(MUTEX_HELD(hash_lock)); + + if ((refcount_add(&ab->b_refcnt, tag) == 1) && + (ab->b_state != arc_anon)) { + uint64_t delta = ab->b_size * ab->b_datacnt; + + ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); + mutex_enter(&ab->b_state->arcs_mtx); + ASSERT(list_link_active(&ab->b_arc_node)); + list_remove(&ab->b_state->arcs_list, ab); + if (GHOST_STATE(ab->b_state)) { + ASSERT3U(ab->b_datacnt, ==, 0); + ASSERT3P(ab->b_buf, ==, NULL); + delta = ab->b_size; + } + ASSERT(delta > 0); + ASSERT3U(ab->b_state->arcs_lsize, >=, delta); + atomic_add_64(&ab->b_state->arcs_lsize, -delta); + mutex_exit(&ab->b_state->arcs_mtx); + /* remove the prefetch flag is we get a reference */ + if (ab->b_flags & ARC_PREFETCH) + ab->b_flags &= ~ARC_PREFETCH; + } +} + +static int +remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +{ + int cnt; + arc_state_t *state = ab->b_state; + + ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); + ASSERT(!GHOST_STATE(state)); + + if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && + (state != arc_anon)) { + ASSERT(!MUTEX_HELD(&state->arcs_mtx)); + mutex_enter(&state->arcs_mtx); + ASSERT(!list_link_active(&ab->b_arc_node)); + list_insert_head(&state->arcs_list, ab); + ASSERT(ab->b_datacnt > 0); + atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt); + ASSERT3U(state->arcs_size, >=, state->arcs_lsize); + mutex_exit(&state->arcs_mtx); + } + return (cnt); +} + +/* + * Move the supplied buffer to the indicated state. The mutex + * for the buffer must be held by the caller. + */ +static void +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) +{ + arc_state_t *old_state = ab->b_state; + int64_t refcnt = refcount_count(&ab->b_refcnt); + uint64_t from_delta, to_delta; + + ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(new_state != old_state); + ASSERT(refcnt == 0 || ab->b_datacnt > 0); + ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); + + from_delta = to_delta = ab->b_datacnt * ab->b_size; + + /* + * If this buffer is evictable, transfer it from the + * old state list to the new state list. + */ + if (refcnt == 0) { + if (old_state != arc_anon) { + int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); + + if (use_mutex) + mutex_enter(&old_state->arcs_mtx); + + ASSERT(list_link_active(&ab->b_arc_node)); + list_remove(&old_state->arcs_list, ab); + + /* + * If prefetching out of the ghost cache, + * we will have a non-null datacnt. + */ + if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { + /* ghost elements have a ghost size */ + ASSERT(ab->b_buf == NULL); + from_delta = ab->b_size; + } + ASSERT3U(old_state->arcs_lsize, >=, from_delta); + atomic_add_64(&old_state->arcs_lsize, -from_delta); + + if (use_mutex) + mutex_exit(&old_state->arcs_mtx); + } + if (new_state != arc_anon) { + int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); + + if (use_mutex) + mutex_enter(&new_state->arcs_mtx); + + list_insert_head(&new_state->arcs_list, ab); + + /* ghost elements have a ghost size */ + if (GHOST_STATE(new_state)) { + ASSERT(ab->b_datacnt == 0); + ASSERT(ab->b_buf == NULL); + to_delta = ab->b_size; + } + atomic_add_64(&new_state->arcs_lsize, to_delta); + ASSERT3U(new_state->arcs_size + to_delta, >=, + new_state->arcs_lsize); + + if (use_mutex) + mutex_exit(&new_state->arcs_mtx); + } + } + + ASSERT(!BUF_EMPTY(ab)); + if (new_state == arc_anon && old_state != arc_anon) { + buf_hash_remove(ab); + } + + /* adjust state sizes */ + if (to_delta) + atomic_add_64(&new_state->arcs_size, to_delta); + if (from_delta) { + ASSERT3U(old_state->arcs_size, >=, from_delta); + atomic_add_64(&old_state->arcs_size, -from_delta); + } + ab->b_state = new_state; +} + +arc_buf_t * +arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + + ASSERT3U(size, >, 0); + hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); + ASSERT(BUF_EMPTY(hdr)); + hdr->b_size = size; + hdr->b_type = type; + hdr->b_spa = spa; + hdr->b_state = arc_anon; + hdr->b_arc_access = 0; + mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = NULL; + hdr->b_buf = buf; + arc_get_data_buf(buf); + hdr->b_datacnt = 1; + hdr->b_flags = 0; + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + (void) refcount_add(&hdr->b_refcnt, tag); + + return (buf); +} + +static arc_buf_t * +arc_buf_clone(arc_buf_t *from) +{ + arc_buf_t *buf; + arc_buf_hdr_t *hdr = from->b_hdr; + uint64_t size = hdr->b_size; + + buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = hdr->b_buf; + hdr->b_buf = buf; + arc_get_data_buf(buf); + bcopy(from->b_data, buf->b_data, size); + hdr->b_datacnt += 1; + return (buf); +} + +void +arc_buf_add_ref(arc_buf_t *buf, void* tag) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + + /* + * Check to see if this buffer is currently being evicted via + * arc_do_user_evicts(). + */ + mutex_enter(&arc_eviction_mtx); + hdr = buf->b_hdr; + if (hdr == NULL) { + mutex_exit(&arc_eviction_mtx); + return; + } + hash_lock = HDR_LOCK(hdr); + mutex_exit(&arc_eviction_mtx); + + mutex_enter(hash_lock); + if (buf->b_data == NULL) { + /* + * This buffer is evicted. + */ + mutex_exit(hash_lock); + return; + } + + ASSERT(buf->b_hdr == hdr); + ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + add_reference(hdr, hash_lock, tag); + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_hits); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, hits); +} + +static void +arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) +{ + arc_buf_t **bufp; + + /* free up data associated with the buf */ + if (buf->b_data) { + arc_state_t *state = buf->b_hdr->b_state; + uint64_t size = buf->b_hdr->b_size; + arc_buf_contents_t type = buf->b_hdr->b_type; + + arc_cksum_verify(buf); + if (!recycle) { + if (type == ARC_BUFC_METADATA) { + zio_buf_free(buf->b_data, size); + } else { + ASSERT(type == ARC_BUFC_DATA); + zio_data_buf_free(buf->b_data, size); + } + atomic_add_64(&arc_size, -size); + } + if (list_link_active(&buf->b_hdr->b_arc_node)) { + ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); + ASSERT(state != arc_anon); + ASSERT3U(state->arcs_lsize, >=, size); + atomic_add_64(&state->arcs_lsize, -size); + } + ASSERT3U(state->arcs_size, >=, size); + atomic_add_64(&state->arcs_size, -size); + buf->b_data = NULL; + ASSERT(buf->b_hdr->b_datacnt > 0); + buf->b_hdr->b_datacnt -= 1; + } + + /* only remove the buf if requested */ + if (!all) + return; + + /* remove the buf from the hdr list */ + for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) + continue; + *bufp = buf->b_next; + + ASSERT(buf->b_efunc == NULL); + + /* clean up the buf */ + buf->b_hdr = NULL; + kmem_cache_free(buf_cache, buf); +} + +static void +arc_hdr_destroy(arc_buf_hdr_t *hdr) +{ + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + ASSERT3P(hdr->b_state, ==, arc_anon); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + + if (!BUF_EMPTY(hdr)) { + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + bzero(&hdr->b_dva, sizeof (dva_t)); + hdr->b_birth = 0; + hdr->b_cksum0 = 0; + } + while (hdr->b_buf) { + arc_buf_t *buf = hdr->b_buf; + + if (buf->b_efunc) { + mutex_enter(&arc_eviction_mtx); + ASSERT(buf->b_hdr != NULL); + arc_buf_destroy(hdr->b_buf, FALSE, FALSE); + hdr->b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + mutex_exit(&arc_eviction_mtx); + } else { + arc_buf_destroy(hdr->b_buf, FALSE, TRUE); + } + } + if (hdr->b_freeze_cksum != NULL) { + kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_freeze_cksum = NULL; + } + mutex_destroy(&hdr->b_freeze_lock); + + ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT3P(hdr->b_hash_next, ==, NULL); + ASSERT3P(hdr->b_acb, ==, NULL); + kmem_cache_free(hdr_cache, hdr); +} + +void +arc_buf_free(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + int hashed = hdr->b_state != arc_anon; + + ASSERT(buf->b_efunc == NULL); + ASSERT(buf->b_data != NULL); + + if (hashed) { + kmutex_t *hash_lock = HDR_LOCK(hdr); + + mutex_enter(hash_lock); + (void) remove_reference(hdr, hash_lock, tag); + if (hdr->b_datacnt > 1) + arc_buf_destroy(buf, FALSE, TRUE); + else + hdr->b_flags |= ARC_BUF_AVAILABLE; + mutex_exit(hash_lock); + } else if (HDR_IO_IN_PROGRESS(hdr)) { + int destroy_hdr; + /* + * We are in the middle of an async write. Don't destroy + * this buffer unless the write completes before we finish + * decrementing the reference count. + */ + mutex_enter(&arc_eviction_mtx); + (void) remove_reference(hdr, NULL, tag); + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); + mutex_exit(&arc_eviction_mtx); + if (destroy_hdr) + arc_hdr_destroy(hdr); + } else { + if (remove_reference(hdr, NULL, tag) > 0) { + ASSERT(HDR_IO_ERROR(hdr)); + arc_buf_destroy(buf, FALSE, TRUE); + } else { + arc_hdr_destroy(hdr); + } + } +} + +int +arc_buf_remove_ref(arc_buf_t *buf, void* tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + kmutex_t *hash_lock = HDR_LOCK(hdr); + int no_callback = (buf->b_efunc == NULL); + + if (hdr->b_state == arc_anon) { + arc_buf_free(buf, tag); + return (no_callback); + } + + mutex_enter(hash_lock); + ASSERT(hdr->b_state != arc_anon); + ASSERT(buf->b_data != NULL); + + (void) remove_reference(hdr, hash_lock, tag); + if (hdr->b_datacnt > 1) { + if (no_callback) + arc_buf_destroy(buf, FALSE, TRUE); + } else if (no_callback) { + ASSERT(hdr->b_buf == buf && buf->b_next == NULL); + hdr->b_flags |= ARC_BUF_AVAILABLE; + } + ASSERT(no_callback || hdr->b_datacnt > 1 || + refcount_is_zero(&hdr->b_refcnt)); + mutex_exit(hash_lock); + return (no_callback); +} + +int +arc_buf_size(arc_buf_t *buf) +{ + return (buf->b_hdr->b_size); +} + +/* + * Evict buffers from list until we've removed the specified number of + * bytes. Move the removed buffers to the appropriate evict state. + * If the recycle flag is set, then attempt to "recycle" a buffer: + * - look for a buffer to evict that is `bytes' long. + * - return the data block from this buffer rather than freeing it. + * This flag is used by callers that are trying to make space for a + * new buffer in a full arc cache. + */ +static void * +arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, + arc_buf_contents_t type) +{ + arc_state_t *evicted_state; + uint64_t bytes_evicted = 0, skipped = 0, missed = 0; + arc_buf_hdr_t *ab, *ab_prev = NULL; + kmutex_t *hash_lock; + boolean_t have_lock; + void *stolen = NULL; + + ASSERT(state == arc_mru || state == arc_mfu); + + evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; + + mutex_enter(&state->arcs_mtx); + mutex_enter(&evicted_state->arcs_mtx); + + for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { + ab_prev = list_prev(&state->arcs_list, ab); + /* prefetch buffers have a minimum lifespan */ + if (HDR_IO_IN_PROGRESS(ab) || + (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && + lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { + skipped++; + continue; + } + /* "lookahead" for better eviction candidate */ + if (recycle && ab->b_size != bytes && + ab_prev && ab_prev->b_size == bytes) + continue; + hash_lock = HDR_LOCK(ab); + have_lock = MUTEX_HELD(hash_lock); + if (have_lock || mutex_tryenter(hash_lock)) { + ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); + ASSERT(ab->b_datacnt > 0); + while (ab->b_buf) { + arc_buf_t *buf = ab->b_buf; + if (buf->b_data) { + bytes_evicted += ab->b_size; + if (recycle && ab->b_type == type && + ab->b_size == bytes) { + stolen = buf->b_data; + recycle = FALSE; + } + } + if (buf->b_efunc) { + mutex_enter(&arc_eviction_mtx); + arc_buf_destroy(buf, + buf->b_data == stolen, FALSE); + ab->b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + mutex_exit(&arc_eviction_mtx); + } else { + arc_buf_destroy(buf, + buf->b_data == stolen, TRUE); + } + } + ASSERT(ab->b_datacnt == 0); + arc_change_state(evicted_state, ab, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(ab)); + ab->b_flags = ARC_IN_HASH_TABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + if (!have_lock) + mutex_exit(hash_lock); + if (bytes >= 0 && bytes_evicted >= bytes) + break; + } else { + missed += 1; + } + } + + mutex_exit(&evicted_state->arcs_mtx); + mutex_exit(&state->arcs_mtx); + + if (bytes_evicted < bytes) + dprintf("only evicted %lld bytes from %x", + (longlong_t)bytes_evicted, state); + + if (skipped) + ARCSTAT_INCR(arcstat_evict_skip, skipped); + + if (missed) + ARCSTAT_INCR(arcstat_mutex_miss, missed); + + return (stolen); +} + +/* + * Remove buffers from list until we've removed the specified number of + * bytes. Destroy the buffers that are removed. + */ +static void +arc_evict_ghost(arc_state_t *state, int64_t bytes) +{ + arc_buf_hdr_t *ab, *ab_prev; + kmutex_t *hash_lock; + uint64_t bytes_deleted = 0; + uint64_t bufs_skipped = 0; + + ASSERT(GHOST_STATE(state)); +top: + mutex_enter(&state->arcs_mtx); + for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { + ab_prev = list_prev(&state->arcs_list, ab); + hash_lock = HDR_LOCK(ab); + if (mutex_tryenter(hash_lock)) { + ASSERT(!HDR_IO_IN_PROGRESS(ab)); + ASSERT(ab->b_buf == NULL); + arc_change_state(arc_anon, ab, hash_lock); + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_deleted); + bytes_deleted += ab->b_size; + arc_hdr_destroy(ab); + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); + if (bytes >= 0 && bytes_deleted >= bytes) + break; + } else { + if (bytes < 0) { + mutex_exit(&state->arcs_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; + } + bufs_skipped += 1; + } + } + mutex_exit(&state->arcs_mtx); + + if (bufs_skipped) { + ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); + ASSERT(bytes >= 0); + } + + if (bytes_deleted < bytes) + dprintf("only deleted %lld bytes from %p", + (longlong_t)bytes_deleted, state); +} + +static void +arc_adjust(void) +{ + int64_t top_sz, mru_over, arc_over, todelete; + + top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + + if (top_sz > arc_p && arc_mru->arcs_lsize > 0) { + int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p); + (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF); + top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + } + + mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; + + if (mru_over > 0) { + if (arc_mru_ghost->arcs_lsize > 0) { + todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over); + arc_evict_ghost(arc_mru_ghost, todelete); + } + } + + if ((arc_over = arc_size - arc_c) > 0) { + int64_t tbl_over; + + if (arc_mfu->arcs_lsize > 0) { + int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over); + (void) arc_evict(arc_mfu, toevict, FALSE, + ARC_BUFC_UNDEF); + } + + tbl_over = arc_size + arc_mru_ghost->arcs_lsize + + arc_mfu_ghost->arcs_lsize - arc_c*2; + + if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) { + todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over); + arc_evict_ghost(arc_mfu_ghost, todelete); + } + } +} + +static void +arc_do_user_evicts(void) +{ + mutex_enter(&arc_eviction_mtx); + while (arc_eviction_list != NULL) { + arc_buf_t *buf = arc_eviction_list; + arc_eviction_list = buf->b_next; + buf->b_hdr = NULL; + mutex_exit(&arc_eviction_mtx); + + if (buf->b_efunc != NULL) + VERIFY(buf->b_efunc(buf) == 0); + + buf->b_efunc = NULL; + buf->b_private = NULL; + kmem_cache_free(buf_cache, buf); + mutex_enter(&arc_eviction_mtx); + } + mutex_exit(&arc_eviction_mtx); +} + +/* + * Flush all *evictable* data from the cache. + * NOTE: this will not touch "active" (i.e. referenced) data. + */ +void +arc_flush(void) +{ + while (list_head(&arc_mru->arcs_list)) + (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF); + while (list_head(&arc_mfu->arcs_list)) + (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF); + + arc_evict_ghost(arc_mru_ghost, -1); + arc_evict_ghost(arc_mfu_ghost, -1); + + mutex_enter(&arc_reclaim_thr_lock); + arc_do_user_evicts(); + mutex_exit(&arc_reclaim_thr_lock); + ASSERT(arc_eviction_list == NULL); +} + +int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ + +void +arc_shrink(void) +{ + if (arc_c > arc_c_min) { + uint64_t to_free; + +#ifdef _KERNEL + to_free = arc_c >> arc_shrink_shift; +#else + to_free = arc_c >> arc_shrink_shift; +#endif + if (arc_c > arc_c_min + to_free) + atomic_add_64(&arc_c, -to_free); + else + arc_c = arc_c_min; + + atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); + if (arc_c > arc_size) + arc_c = MAX(arc_size, arc_c_min); + if (arc_p > arc_c) + arc_p = (arc_c >> 1); + ASSERT(arc_c >= arc_c_min); + ASSERT((int64_t)arc_p >= 0); + } + + if (arc_size > arc_c) + arc_adjust(); +} + +static int zfs_needfree = 0; + +static int +arc_reclaim_needed(void) +{ +#if 0 + uint64_t extra; +#endif + +#ifdef _KERNEL + + if (zfs_needfree) + return (1); + +#if 0 + /* + * check to make sure that swapfs has enough space so that anon + * reservations can still succeeed. anon_resvmem() checks that the + * availrmem is greater than swapfs_minfree, and the number of reserved + * swap pages. We also add a bit of extra here just to prevent + * circumstances from getting really dire. + */ + if (availrmem < swapfs_minfree + swapfs_reserve + extra) + return (1); + + /* + * If zio data pages are being allocated out of a separate heap segment, + * then check that the size of available vmem for this area remains + * above 1/4th free. This needs to be done when the size of the + * non-default segment is smaller than physical memory, so we could + * conceivably run out of VA in that segment before running out of + * physical memory. + */ + if (zio_arena != NULL) { + size_t arc_ziosize = + btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC)); + + if ((physmem > arc_ziosize) && + (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2)) + return (1); + } + +#if defined(__i386) + /* + * If we're on an i386 platform, it's possible that we'll exhaust the + * kernel heap space before we ever run out of available physical + * memory. Most checks of the size of the heap_area compare against + * tune.t_minarmem, which is the minimum available real memory that we + * can have in the system. However, this is generally fixed at 25 pages + * which is so low that it's useless. In this comparison, we seek to + * calculate the total heap-size, and reclaim if more than 3/4ths of the + * heap is allocated. (Or, in the caclulation, if less than 1/4th is + * free) + */ + if (btop(vmem_size(heap_arena, VMEM_FREE)) < + (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) + return (1); +#endif +#else + if (kmem_map->size > (vm_kmem_size * 3) / 4) + return (1); +#endif + +#else + if (spa_get_random(100) == 0) + return (1); +#endif + return (0); +} + +static void +arc_kmem_reap_now(arc_reclaim_strategy_t strat) +{ +#ifdef ZIO_USE_UMA + size_t i; + kmem_cache_t *prev_cache = NULL; + kmem_cache_t *prev_data_cache = NULL; + extern kmem_cache_t *zio_buf_cache[]; + extern kmem_cache_t *zio_data_buf_cache[]; +#endif + +#ifdef _KERNEL + /* + * First purge some DNLC entries, in case the DNLC is using + * up too much memory. + */ + dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); + +#if defined(__i386) + /* + * Reclaim unused memory from all kmem caches. + */ + kmem_reap(); +#endif +#endif + + /* + * An agressive reclamation will shrink the cache size as well as + * reap free buffers from the arc kmem caches. + */ + if (strat == ARC_RECLAIM_AGGR) + arc_shrink(); + +#ifdef ZIO_USE_UMA + for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { + if (zio_buf_cache[i] != prev_cache) { + prev_cache = zio_buf_cache[i]; + kmem_cache_reap_now(zio_buf_cache[i]); + } + if (zio_data_buf_cache[i] != prev_data_cache) { + prev_data_cache = zio_data_buf_cache[i]; + kmem_cache_reap_now(zio_data_buf_cache[i]); + } + } +#endif + kmem_cache_reap_now(buf_cache); + kmem_cache_reap_now(hdr_cache); +} + +static void +arc_reclaim_thread(void *dummy __unused) +{ + clock_t growtime = 0; + arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&arc_reclaim_thr_lock); + while (arc_thread_exit == 0) { + if (arc_reclaim_needed()) { + + if (arc_no_grow) { + if (last_reclaim == ARC_RECLAIM_CONS) { + last_reclaim = ARC_RECLAIM_AGGR; + } else { + last_reclaim = ARC_RECLAIM_CONS; + } + } else { + arc_no_grow = TRUE; + last_reclaim = ARC_RECLAIM_AGGR; + membar_producer(); + } + + /* reset the growth delay for every reclaim */ + growtime = lbolt + (arc_grow_retry * hz); + ASSERT(growtime > 0); + + if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) { + /* + * If zfs_needfree is TRUE our vm_lowmem hook + * was called and in that case we must free some + * memory, so switch to aggressive mode. + */ + arc_no_grow = TRUE; + last_reclaim = ARC_RECLAIM_AGGR; + } + arc_kmem_reap_now(last_reclaim); + } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) { + arc_no_grow = FALSE; + } + + if (zfs_needfree || + (2 * arc_c < arc_size + + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)) + arc_adjust(); + + if (arc_eviction_list != NULL) + arc_do_user_evicts(); + + if (arc_reclaim_needed()) { + zfs_needfree = 0; +#ifdef _KERNEL + wakeup(&zfs_needfree); +#endif + } + + /* block until needed, or one second, whichever is shorter */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&arc_reclaim_thr_cv, + &arc_reclaim_thr_lock, hz); + CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); + } + + arc_thread_exit = 0; + cv_broadcast(&arc_reclaim_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ + thread_exit(); +} + +/* + * Adapt arc info given the number of bytes we are trying to add and + * the state that we are comming from. This function is only called + * when we are adding new content to the cache. + */ +static void +arc_adapt(int bytes, arc_state_t *state) +{ + int mult; + + ASSERT(bytes > 0); + /* + * Adapt the target size of the MRU list: + * - if we just hit in the MRU ghost list, then increase + * the target size of the MRU list. + * - if we just hit in the MFU ghost list, then increase + * the target size of the MFU list by decreasing the + * target size of the MRU list. + */ + if (state == arc_mru_ghost) { + mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? + 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); + + arc_p = MIN(arc_c, arc_p + bytes * mult); + } else if (state == arc_mfu_ghost) { + mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? + 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); + + arc_p = MAX(0, (int64_t)arc_p - bytes * mult); + } + ASSERT((int64_t)arc_p >= 0); + + if (arc_reclaim_needed()) { + cv_signal(&arc_reclaim_thr_cv); + return; + } + + if (arc_no_grow) + return; + + if (arc_c >= arc_c_max) + return; + + /* + * If we're within (2 * maxblocksize) bytes of the target + * cache size, increment the target cache size + */ + if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { + atomic_add_64(&arc_c, (int64_t)bytes); + if (arc_c > arc_c_max) + arc_c = arc_c_max; + else if (state == arc_anon) + atomic_add_64(&arc_p, (int64_t)bytes); + if (arc_p > arc_c) + arc_p = arc_c; + } + ASSERT((int64_t)arc_p >= 0); +} + +/* + * Check if the cache has reached its limits and eviction is required + * prior to insert. + */ +static int +arc_evict_needed() +{ + if (arc_reclaim_needed()) + return (1); + + return (arc_size > arc_c); +} + +/* + * The buffer, supplied as the first argument, needs a data block. + * So, if we are at cache max, determine which cache should be victimized. + * We have the following cases: + * + * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> + * In this situation if we're out of space, but the resident size of the MFU is + * under the limit, victimize the MFU cache to satisfy this insertion request. + * + * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> + * Here, we've used up all of the available space for the MRU, so we need to + * evict from our own cache instead. Evict from the set of resident MRU + * entries. + * + * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> + * c minus p represents the MFU space in the cache, since p is the size of the + * cache that is dedicated to the MRU. In this situation there's still space on + * the MFU side, so the MRU side needs to be victimized. + * + * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> + * MFU's resident set is consuming more space than it has been allotted. In + * this situation, we must victimize our own cache, the MFU, for this insertion. + */ +static void +arc_get_data_buf(arc_buf_t *buf) +{ + arc_state_t *state = buf->b_hdr->b_state; + uint64_t size = buf->b_hdr->b_size; + arc_buf_contents_t type = buf->b_hdr->b_type; + + arc_adapt(size, state); + + /* + * We have not yet reached cache maximum size, + * just allocate a new buffer. + */ + if (!arc_evict_needed()) { + if (type == ARC_BUFC_METADATA) { + buf->b_data = zio_buf_alloc(size); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = zio_data_buf_alloc(size); + } + atomic_add_64(&arc_size, size); + goto out; + } + + /* + * If we are prefetching from the mfu ghost list, this buffer + * will end up on the mru list; so steal space from there. + */ + if (state == arc_mfu_ghost) + state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; + else if (state == arc_mru_ghost) + state = arc_mru; + + if (state == arc_mru || state == arc_anon) { + uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; + state = (arc_p > mru_used) ? arc_mfu : arc_mru; + } else { + /* MFU cases */ + uint64_t mfu_space = arc_c - arc_p; + state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; + } + if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { + if (type == ARC_BUFC_METADATA) { + buf->b_data = zio_buf_alloc(size); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = zio_data_buf_alloc(size); + } + atomic_add_64(&arc_size, size); + ARCSTAT_BUMP(arcstat_recycle_miss); + } + ASSERT(buf->b_data != NULL); +out: + /* + * Update the state size. Note that ghost states have a + * "ghost size" and so don't need to be updated. + */ + if (!GHOST_STATE(buf->b_hdr->b_state)) { + arc_buf_hdr_t *hdr = buf->b_hdr; + + atomic_add_64(&hdr->b_state->arcs_size, size); + if (list_link_active(&hdr->b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + atomic_add_64(&hdr->b_state->arcs_lsize, size); + } + /* + * If we are growing the cache, and we are adding anonymous + * data, and we have outgrown arc_p, update arc_p + */ + if (arc_size < arc_c && hdr->b_state == arc_anon && + arc_anon->arcs_size + arc_mru->arcs_size > arc_p) + arc_p = MIN(arc_c, arc_p + size); + } +} + +/* + * This routine is called whenever a buffer is accessed. + * NOTE: the hash lock is dropped in this function. + */ +static void +arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) +{ + ASSERT(MUTEX_HELD(hash_lock)); + + if (buf->b_state == arc_anon) { + /* + * This buffer is not in the cache, and does not + * appear in our "ghost" list. Add the new buffer + * to the MRU state. + */ + + ASSERT(buf->b_arc_access == 0); + buf->b_arc_access = lbolt; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + arc_change_state(arc_mru, buf, hash_lock); + + } else if (buf->b_state == arc_mru) { + /* + * If this buffer is here because of a prefetch, then either: + * - clear the flag if this is a "referencing" read + * (any subsequent access will bump this into the MFU state). + * or + * - move the buffer to the head of the list if this is + * another prefetch (to make it less likely to be evicted). + */ + if ((buf->b_flags & ARC_PREFETCH) != 0) { + if (refcount_count(&buf->b_refcnt) == 0) { + ASSERT(list_link_active(&buf->b_arc_node)); + mutex_enter(&arc_mru->arcs_mtx); + list_remove(&arc_mru->arcs_list, buf); + list_insert_head(&arc_mru->arcs_list, buf); + mutex_exit(&arc_mru->arcs_mtx); + } else { + buf->b_flags &= ~ARC_PREFETCH; + ARCSTAT_BUMP(arcstat_mru_hits); + } + buf->b_arc_access = lbolt; + return; + } + + /* + * This buffer has been "accessed" only once so far, + * but it is still in the cache. Move it to the MFU + * state. + */ + if (lbolt > buf->b_arc_access + ARC_MINTIME) { + /* + * More than 125ms have passed since we + * instantiated this buffer. Move it to the + * most frequently used state. + */ + buf->b_arc_access = lbolt; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(arc_mfu, buf, hash_lock); + } + ARCSTAT_BUMP(arcstat_mru_hits); + } else if (buf->b_state == arc_mru_ghost) { + arc_state_t *new_state; + /* + * This buffer has been "accessed" recently, but + * was evicted from the cache. Move it to the + * MFU state. + */ + + if (buf->b_flags & ARC_PREFETCH) { + new_state = arc_mru; + if (refcount_count(&buf->b_refcnt) > 0) + buf->b_flags &= ~ARC_PREFETCH; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + } else { + new_state = arc_mfu; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + } + + buf->b_arc_access = lbolt; + arc_change_state(new_state, buf, hash_lock); + + ARCSTAT_BUMP(arcstat_mru_ghost_hits); + } else if (buf->b_state == arc_mfu) { + /* + * This buffer has been accessed more than once and is + * still in the cache. Keep it in the MFU state. + * + * NOTE: an add_reference() that occurred when we did + * the arc_read() will have kicked this off the list. + * If it was a prefetch, we will explicitly move it to + * the head of the list now. + */ + if ((buf->b_flags & ARC_PREFETCH) != 0) { + ASSERT(refcount_count(&buf->b_refcnt) == 0); + ASSERT(list_link_active(&buf->b_arc_node)); + mutex_enter(&arc_mfu->arcs_mtx); + list_remove(&arc_mfu->arcs_list, buf); + list_insert_head(&arc_mfu->arcs_list, buf); + mutex_exit(&arc_mfu->arcs_mtx); + } + ARCSTAT_BUMP(arcstat_mfu_hits); + buf->b_arc_access = lbolt; + } else if (buf->b_state == arc_mfu_ghost) { + arc_state_t *new_state = arc_mfu; + /* + * This buffer has been accessed more than once but has + * been evicted from the cache. Move it back to the + * MFU state. + */ + + if (buf->b_flags & ARC_PREFETCH) { + /* + * This is a prefetch access... + * move this block back to the MRU state. + */ + ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); + new_state = arc_mru; + } + + buf->b_arc_access = lbolt; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(new_state, buf, hash_lock); + + ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + } else { + ASSERT(!"invalid arc state"); + } +} + +/* a generic arc_done_func_t which you can use */ +/* ARGSUSED */ +void +arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) +{ + bcopy(buf->b_data, arg, buf->b_hdr->b_size); + VERIFY(arc_buf_remove_ref(buf, arg) == 1); +} + +/* a generic arc_done_func_t which you can use */ +void +arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) +{ + arc_buf_t **bufp = arg; + if (zio && zio->io_error) { + VERIFY(arc_buf_remove_ref(buf, arg) == 1); + *bufp = NULL; + } else { + *bufp = buf; + } +} + +static void +arc_read_done(zio_t *zio) +{ + arc_buf_hdr_t *hdr, *found; + arc_buf_t *buf; + arc_buf_t *abuf; /* buffer we're assigning to callback */ + kmutex_t *hash_lock; + arc_callback_t *callback_list, *acb; + int freeable = FALSE; + + buf = zio->io_private; + hdr = buf->b_hdr; + + /* + * The hdr was inserted into hash-table and removed from lists + * prior to starting I/O. We should find this header, since + * it's in the hash table, and it should be legit since it's + * not possible to evict it during the I/O. The only possible + * reason for it not to be found is if we were freed during the + * read. + */ + found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, + &hash_lock); + + ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || + (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); + + /* byteswap if necessary */ + callback_list = hdr->b_acb; + ASSERT(callback_list != NULL); + if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) + callback_list->acb_byteswap(buf->b_data, hdr->b_size); + + arc_cksum_compute(buf); + + /* create copies of the data buffer for the callers */ + abuf = buf; + for (acb = callback_list; acb; acb = acb->acb_next) { + if (acb->acb_done) { + if (abuf == NULL) + abuf = arc_buf_clone(buf); + acb->acb_buf = abuf; + abuf = NULL; + } + } + hdr->b_acb = NULL; + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + ASSERT(!HDR_BUF_AVAILABLE(hdr)); + if (abuf == buf) + hdr->b_flags |= ARC_BUF_AVAILABLE; + + ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); + + if (zio->io_error != 0) { + hdr->b_flags |= ARC_IO_ERROR; + if (hdr->b_state != arc_anon) + arc_change_state(arc_anon, hdr, hash_lock); + if (HDR_IN_HASH_TABLE(hdr)) + buf_hash_remove(hdr); + freeable = refcount_is_zero(&hdr->b_refcnt); + /* convert checksum errors into IO errors */ + if (zio->io_error == ECKSUM) + zio->io_error = EIO; + } + + /* + * Broadcast before we drop the hash_lock to avoid the possibility + * that the hdr (and hence the cv) might be freed before we get to + * the cv_broadcast(). + */ + cv_broadcast(&hdr->b_cv); + + if (hash_lock) { + /* + * Only call arc_access on anonymous buffers. This is because + * if we've issued an I/O for an evicted buffer, we've already + * called arc_access (to prevent any simultaneous readers from + * getting confused). + */ + if (zio->io_error == 0 && hdr->b_state == arc_anon) + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + } else { + /* + * This block was freed while we waited for the read to + * complete. It has been removed from the hash table and + * moved to the anonymous state (so that it won't show up + * in the cache). + */ + ASSERT3P(hdr->b_state, ==, arc_anon); + freeable = refcount_is_zero(&hdr->b_refcnt); + } + + /* execute each callback and free its structure */ + while ((acb = callback_list) != NULL) { + if (acb->acb_done) + acb->acb_done(zio, acb->acb_buf, acb->acb_private); + + if (acb->acb_zio_dummy != NULL) { + acb->acb_zio_dummy->io_error = zio->io_error; + zio_nowait(acb->acb_zio_dummy); + } + + callback_list = acb->acb_next; + kmem_free(acb, sizeof (arc_callback_t)); + } + + if (freeable) + arc_hdr_destroy(hdr); +} + +/* + * "Read" the block block at the specified DVA (in bp) via the + * cache. If the block is found in the cache, invoke the provided + * callback immediately and return. Note that the `zio' parameter + * in the callback will be NULL in this case, since no IO was + * required. If the block is not in the cache pass the read request + * on to the spa with a substitute callback function, so that the + * requested block will be added to the cache. + * + * If a read request arrives for a block that has a read in-progress, + * either wait for the in-progress read to complete (and return the + * results); or, if this is a read with a "done" func, add a record + * to the read to invoke the "done" func when the read completes, + * and return; or just return. + * + * arc_read_done() will invoke all the requested "done" functions + * for readers of this block. + */ +int +arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, + arc_done_func_t *done, void *private, int priority, int flags, + uint32_t *arc_flags, zbookmark_t *zb) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + kmutex_t *hash_lock; + zio_t *rzio; + +top: + hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + if (hdr && hdr->b_datacnt > 0) { + + *arc_flags |= ARC_CACHED; + + if (HDR_IO_IN_PROGRESS(hdr)) { + + if (*arc_flags & ARC_WAIT) { + cv_wait(&hdr->b_cv, hash_lock); + mutex_exit(hash_lock); + goto top; + } + ASSERT(*arc_flags & ARC_NOWAIT); + + if (done) { + arc_callback_t *acb = NULL; + + acb = kmem_zalloc(sizeof (arc_callback_t), + KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + acb->acb_byteswap = swap; + if (pio != NULL) + acb->acb_zio_dummy = zio_null(pio, + spa, NULL, NULL, flags); + + ASSERT(acb->acb_done != NULL); + acb->acb_next = hdr->b_acb; + hdr->b_acb = acb; + add_reference(hdr, hash_lock, private); + mutex_exit(hash_lock); + return (0); + } + mutex_exit(hash_lock); + return (0); + } + + ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + + if (done) { + add_reference(hdr, hash_lock, private); + /* + * If this block is already in use, create a new + * copy of the data so that we will be guaranteed + * that arc_release() will always succeed. + */ + buf = hdr->b_buf; + ASSERT(buf); + ASSERT(buf->b_data); + if (HDR_BUF_AVAILABLE(hdr)) { + ASSERT(buf->b_efunc == NULL); + hdr->b_flags &= ~ARC_BUF_AVAILABLE; + } else { + buf = arc_buf_clone(buf); + } + } else if (*arc_flags & ARC_PREFETCH && + refcount_count(&hdr->b_refcnt) == 0) { + hdr->b_flags |= ARC_PREFETCH; + } + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_hits); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, hits); + + if (done) + done(NULL, buf, private); + } else { + uint64_t size = BP_GET_LSIZE(bp); + arc_callback_t *acb; + + if (hdr == NULL) { + /* this block is not in the cache */ + arc_buf_hdr_t *exists; + arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); + buf = arc_buf_alloc(spa, size, private, type); + hdr = buf->b_hdr; + hdr->b_dva = *BP_IDENTITY(bp); + hdr->b_birth = bp->blk_birth; + hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* somebody beat us to the hash insert */ + mutex_exit(hash_lock); + bzero(&hdr->b_dva, sizeof (dva_t)); + hdr->b_birth = 0; + hdr->b_cksum0 = 0; + (void) arc_buf_remove_ref(buf, private); + goto top; /* restart the IO request */ + } + /* if this is a prefetch, we don't have a reference */ + if (*arc_flags & ARC_PREFETCH) { + (void) remove_reference(hdr, hash_lock, + private); + hdr->b_flags |= ARC_PREFETCH; + } + if (BP_GET_LEVEL(bp) > 0) + hdr->b_flags |= ARC_INDIRECT; + } else { + /* this block is in the ghost cache */ + ASSERT(GHOST_STATE(hdr->b_state)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); + ASSERT(hdr->b_buf == NULL); + + /* if this is a prefetch, we don't have a reference */ + if (*arc_flags & ARC_PREFETCH) + hdr->b_flags |= ARC_PREFETCH; + else + add_reference(hdr, hash_lock, private); + buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = NULL; + hdr->b_buf = buf; + arc_get_data_buf(buf); + ASSERT(hdr->b_datacnt == 0); + hdr->b_datacnt = 1; + + } + + acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + acb->acb_byteswap = swap; + + ASSERT(hdr->b_acb == NULL); + hdr->b_acb = acb; + hdr->b_flags |= ARC_IO_IN_PROGRESS; + + /* + * If the buffer has been evicted, migrate it to a present state + * before issuing the I/O. Once we drop the hash-table lock, + * the header will be marked as I/O in progress and have an + * attached buffer. At this point, anybody who finds this + * buffer ought to notice that it's legit but has a pending I/O. + */ + + if (GHOST_STATE(hdr->b_state)) + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + + ASSERT3U(hdr->b_size, ==, size); + DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, + zbookmark_t *, zb); + ARCSTAT_BUMP(arcstat_misses); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, misses); + + rzio = zio_read(pio, spa, bp, buf->b_data, size, + arc_read_done, buf, priority, flags, zb); + + if (*arc_flags & ARC_WAIT) + return (zio_wait(rzio)); + + ASSERT(*arc_flags & ARC_NOWAIT); + zio_nowait(rzio); + } + return (0); +} + +/* + * arc_read() variant to support pool traversal. If the block is already + * in the ARC, make a copy of it; otherwise, the caller will do the I/O. + * The idea is that we don't want pool traversal filling up memory, but + * if the ARC already has the data anyway, we shouldn't pay for the I/O. + */ +int +arc_tryread(spa_t *spa, blkptr_t *bp, void *data) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_mtx; + int rc = 0; + + hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); + + if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { + arc_buf_t *buf = hdr->b_buf; + + ASSERT(buf); + while (buf->b_data == NULL) { + buf = buf->b_next; + ASSERT(buf); + } + bcopy(buf->b_data, data, hdr->b_size); + } else { + rc = ENOENT; + } + + if (hash_mtx) + mutex_exit(hash_mtx); + + return (rc); +} + +void +arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) +{ + ASSERT(buf->b_hdr != NULL); + ASSERT(buf->b_hdr->b_state != arc_anon); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); + buf->b_efunc = func; + buf->b_private = private; +} + +/* + * This is used by the DMU to let the ARC know that a buffer is + * being evicted, so the ARC should clean up. If this arc buf + * is not yet in the evicted state, it will be put there. + */ +int +arc_buf_evict(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + arc_buf_t **bufp; + + mutex_enter(&arc_eviction_mtx); + hdr = buf->b_hdr; + if (hdr == NULL) { + /* + * We are in arc_do_user_evicts(). + */ + ASSERT(buf->b_data == NULL); + mutex_exit(&arc_eviction_mtx); + return (0); + } + hash_lock = HDR_LOCK(hdr); + mutex_exit(&arc_eviction_mtx); + + mutex_enter(hash_lock); + + if (buf->b_data == NULL) { + /* + * We are on the eviction list. + */ + mutex_exit(hash_lock); + mutex_enter(&arc_eviction_mtx); + if (buf->b_hdr == NULL) { + /* + * We are already in arc_do_user_evicts(). + */ + mutex_exit(&arc_eviction_mtx); + return (0); + } else { + arc_buf_t copy = *buf; /* structure assignment */ + /* + * Process this buffer now + * but let arc_do_user_evicts() do the reaping. + */ + buf->b_efunc = NULL; + mutex_exit(&arc_eviction_mtx); + VERIFY(copy.b_efunc(©) == 0); + return (1); + } + } + + ASSERT(buf->b_hdr == hdr); + ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); + ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + + /* + * Pull this buffer off of the hdr + */ + bufp = &hdr->b_buf; + while (*bufp != buf) + bufp = &(*bufp)->b_next; + *bufp = buf->b_next; + + ASSERT(buf->b_data != NULL); + arc_buf_destroy(buf, FALSE, FALSE); + + if (hdr->b_datacnt == 0) { + arc_state_t *old_state = hdr->b_state; + arc_state_t *evicted_state; + + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + + evicted_state = + (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; + + mutex_enter(&old_state->arcs_mtx); + mutex_enter(&evicted_state->arcs_mtx); + + arc_change_state(evicted_state, hdr, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + hdr->b_flags = ARC_IN_HASH_TABLE; + + mutex_exit(&evicted_state->arcs_mtx); + mutex_exit(&old_state->arcs_mtx); + } + mutex_exit(hash_lock); + + VERIFY(buf->b_efunc(buf) == 0); + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_hdr = NULL; + kmem_cache_free(buf_cache, buf); + return (1); +} + +/* + * Release this buffer from the cache. This must be done + * after a read and prior to modifying the buffer contents. + * If the buffer has more than one reference, we must make + * make a new hdr for the buffer. + */ +void +arc_release(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + kmutex_t *hash_lock = HDR_LOCK(hdr); + + /* this buffer is not on any list */ + ASSERT(refcount_count(&hdr->b_refcnt) > 0); + + if (hdr->b_state == arc_anon) { + /* this buffer is already released */ + ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); + ASSERT(BUF_EMPTY(hdr)); + ASSERT(buf->b_efunc == NULL); + arc_buf_thaw(buf); + return; + } + + mutex_enter(hash_lock); + + /* + * Do we have more than one buf? + */ + if (hdr->b_buf != buf || buf->b_next != NULL) { + arc_buf_hdr_t *nhdr; + arc_buf_t **bufp; + uint64_t blksz = hdr->b_size; + spa_t *spa = hdr->b_spa; + arc_buf_contents_t type = hdr->b_type; + + ASSERT(hdr->b_datacnt > 1); + /* + * Pull the data off of this buf and attach it to + * a new anonymous buf. + */ + (void) remove_reference(hdr, hash_lock, tag); + bufp = &hdr->b_buf; + while (*bufp != buf) + bufp = &(*bufp)->b_next; + *bufp = (*bufp)->b_next; + buf->b_next = NULL; + + ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); + atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); + if (refcount_is_zero(&hdr->b_refcnt)) { + ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size); + atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size); + } + hdr->b_datacnt -= 1; + arc_cksum_verify(buf); + + mutex_exit(hash_lock); + + nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); + nhdr->b_size = blksz; + nhdr->b_spa = spa; + nhdr->b_type = type; + nhdr->b_buf = buf; + nhdr->b_state = arc_anon; + nhdr->b_arc_access = 0; + nhdr->b_flags = 0; + nhdr->b_datacnt = 1; + nhdr->b_freeze_cksum = NULL; + (void) refcount_add(&nhdr->b_refcnt, tag); + buf->b_hdr = nhdr; + atomic_add_64(&arc_anon->arcs_size, blksz); + + hdr = nhdr; + } else { + ASSERT(refcount_count(&hdr->b_refcnt) == 1); + ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + arc_change_state(arc_anon, hdr, hash_lock); + hdr->b_arc_access = 0; + mutex_exit(hash_lock); + bzero(&hdr->b_dva, sizeof (dva_t)); + hdr->b_birth = 0; + hdr->b_cksum0 = 0; + arc_buf_thaw(buf); + } + buf->b_efunc = NULL; + buf->b_private = NULL; +} + +int +arc_released(arc_buf_t *buf) +{ + return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); +} + +int +arc_has_callback(arc_buf_t *buf) +{ + return (buf->b_efunc != NULL); +} + +#ifdef ZFS_DEBUG +int +arc_referenced(arc_buf_t *buf) +{ + return (refcount_count(&buf->b_hdr->b_refcnt)); +} +#endif + +static void +arc_write_ready(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + + if (callback->awcb_ready) { + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); + callback->awcb_ready(zio, buf, callback->awcb_private); + } + arc_cksum_compute(buf); +} + +static void +arc_write_done(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; + + hdr->b_acb = NULL; + + /* this buffer is on no lists and is not in the hash table */ + ASSERT3P(hdr->b_state, ==, arc_anon); + + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + hdr->b_birth = zio->io_bp->blk_birth; + hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + /* + * If the block to be written was all-zero, we may have + * compressed it away. In this case no write was performed + * so there will be no dva/birth-date/checksum. The buffer + * must therefor remain anonymous (and uncached). + */ + if (!BUF_EMPTY(hdr)) { + arc_buf_hdr_t *exists; + kmutex_t *hash_lock; + + arc_cksum_verify(buf); + + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* + * This can only happen if we overwrite for + * sync-to-convergence, because we remove + * buffers from the hash table when we arc_free(). + */ + ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), + BP_IDENTITY(zio->io_bp))); + ASSERT3U(zio->io_bp_orig.blk_birth, ==, + zio->io_bp->blk_birth); + + ASSERT(refcount_is_zero(&exists->b_refcnt)); + arc_change_state(arc_anon, exists, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(exists); + exists = buf_hash_insert(hdr, &hash_lock); + ASSERT3P(exists, ==, NULL); + } + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + } else if (callback->awcb_done == NULL) { + int destroy_hdr; + /* + * This is an anonymous buffer with no user callback, + * destroy it if there are no active references. + */ + mutex_enter(&arc_eviction_mtx); + destroy_hdr = refcount_is_zero(&hdr->b_refcnt); + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + mutex_exit(&arc_eviction_mtx); + if (destroy_hdr) + arc_hdr_destroy(hdr); + } else { + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + } + + if (callback->awcb_done) { + ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + callback->awcb_done(zio, buf, callback->awcb_private); + } + + kmem_free(callback, sizeof (arc_write_callback_t)); +} + +zio_t * +arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, + uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, + int flags, zbookmark_t *zb) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + arc_write_callback_t *callback; + zio_t *zio; + + /* this is a private buffer - no locking required */ + ASSERT3P(hdr->b_state, ==, arc_anon); + ASSERT(BUF_EMPTY(hdr)); + ASSERT(!HDR_IO_ERROR(hdr)); + ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); + ASSERT(hdr->b_acb == 0); + callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); + callback->awcb_ready = ready; + callback->awcb_done = done; + callback->awcb_private = private; + callback->awcb_buf = buf; + hdr->b_flags |= ARC_IO_IN_PROGRESS; + zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, + buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, + priority, flags, zb); + + return (zio); +} + +int +arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private, uint32_t arc_flags) +{ + arc_buf_hdr_t *ab; + kmutex_t *hash_lock; + zio_t *zio; + + /* + * If this buffer is in the cache, release it, so it + * can be re-used. + */ + ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + if (ab != NULL) { + /* + * The checksum of blocks to free is not always + * preserved (eg. on the deadlist). However, if it is + * nonzero, it should match what we have in the cache. + */ + ASSERT(bp->blk_cksum.zc_word[0] == 0 || + ab->b_cksum0 == bp->blk_cksum.zc_word[0]); + if (ab->b_state != arc_anon) + arc_change_state(arc_anon, ab, hash_lock); + if (HDR_IO_IN_PROGRESS(ab)) { + /* + * This should only happen when we prefetch. + */ + ASSERT(ab->b_flags & ARC_PREFETCH); + ASSERT3U(ab->b_datacnt, ==, 1); + ab->b_flags |= ARC_FREED_IN_READ; + if (HDR_IN_HASH_TABLE(ab)) + buf_hash_remove(ab); + ab->b_arc_access = 0; + bzero(&ab->b_dva, sizeof (dva_t)); + ab->b_birth = 0; + ab->b_cksum0 = 0; + ab->b_buf->b_efunc = NULL; + ab->b_buf->b_private = NULL; + mutex_exit(hash_lock); + } else if (refcount_is_zero(&ab->b_refcnt)) { + mutex_exit(hash_lock); + arc_hdr_destroy(ab); + ARCSTAT_BUMP(arcstat_deleted); + } else { + /* + * We still have an active reference on this + * buffer. This can happen, e.g., from + * dbuf_unoverride(). + */ + ASSERT(!HDR_IN_HASH_TABLE(ab)); + ab->b_arc_access = 0; + bzero(&ab->b_dva, sizeof (dva_t)); + ab->b_birth = 0; + ab->b_cksum0 = 0; + ab->b_buf->b_efunc = NULL; + ab->b_buf->b_private = NULL; + mutex_exit(hash_lock); + } + } + + zio = zio_free(pio, spa, txg, bp, done, private); + + if (arc_flags & ARC_WAIT) + return (zio_wait(zio)); + + ASSERT(arc_flags & ARC_NOWAIT); + zio_nowait(zio); + + return (0); +} + +void +arc_tempreserve_clear(uint64_t tempreserve) +{ + atomic_add_64(&arc_tempreserve, -tempreserve); + ASSERT((int64_t)arc_tempreserve >= 0); +} + +int +arc_tempreserve_space(uint64_t tempreserve) +{ +#ifdef ZFS_DEBUG + /* + * Once in a while, fail for no reason. Everything should cope. + */ + if (spa_get_random(10000) == 0) { + dprintf("forcing random failure\n"); + return (ERESTART); + } +#endif + if (tempreserve > arc_c/4 && !arc_no_grow) + arc_c = MIN(arc_c_max, tempreserve * 4); + if (tempreserve > arc_c) + return (ENOMEM); + + /* + * Throttle writes when the amount of dirty data in the cache + * gets too large. We try to keep the cache less than half full + * of dirty blocks so that our sync times don't grow too large. + * Note: if two requests come in concurrently, we might let them + * both succeed, when one of them should fail. Not a huge deal. + * + * XXX The limit should be adjusted dynamically to keep the time + * to sync a dataset fixed (around 1-5 seconds?). + */ + + if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && + arc_tempreserve + arc_anon->arcs_size > arc_c / 4) { + dprintf("failing, arc_tempreserve=%lluK anon=%lluK " + "tempreserve=%lluK arc_c=%lluK\n", + arc_tempreserve>>10, arc_anon->arcs_lsize>>10, + tempreserve>>10, arc_c>>10); + return (ERESTART); + } + atomic_add_64(&arc_tempreserve, tempreserve); + return (0); +} + +#ifdef _KERNEL +static eventhandler_tag zfs_event_lowmem = NULL; + +static void +zfs_lowmem(void *arg __unused, int howto __unused) +{ + + zfs_needfree = 1; + cv_signal(&arc_reclaim_thr_cv); + while (zfs_needfree) + tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5); +} +#endif + +void +arc_init(void) +{ + mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); + + /* Convert seconds to clock ticks */ + arc_min_prefetch_lifespan = 1 * hz; + + /* Start out with 1/8 of all memory */ + arc_c = physmem * PAGESIZE / 8; +#if 0 +#ifdef _KERNEL + /* + * On architectures where the physical memory can be larger + * than the addressable space (intel in 32-bit mode), we may + * need to limit the cache to 1/8 of VM size. + */ + arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); +#endif +#endif + /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ + arc_c_min = MAX(arc_c / 4, 64<<20); + /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ + if (arc_c * 8 >= 1<<30) + arc_c_max = (arc_c * 8) - (1<<30); + else + arc_c_max = arc_c_min; + arc_c_max = MAX(arc_c * 6, arc_c_max); +#ifdef notyet + /* + * Allow the tunables to override our calculations if they are + * reasonable (ie. over 64MB) + */ + if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) + arc_c_max = zfs_arc_max; + if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) + arc_c_min = zfs_arc_min; +#endif + arc_c = arc_c_max; + arc_p = (arc_c >> 1); + + /* if kmem_flags are set, lets try to use less memory */ + if (kmem_debugging()) + arc_c = arc_c / 2; + if (arc_c < arc_c_min) + arc_c = arc_c_min; + + arc_anon = &ARC_anon; + arc_mru = &ARC_mru; + arc_mru_ghost = &ARC_mru_ghost; + arc_mfu = &ARC_mfu; + arc_mfu_ghost = &ARC_mfu_ghost; + arc_size = 0; + + mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + + list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_arc_node)); + + buf_init(); + + arc_thread_exit = 0; + arc_eviction_list = NULL; + mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); + bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); + + arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, + sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + + if (arc_ksp != NULL) { + arc_ksp->ks_data = &arc_stats; + kstat_install(arc_ksp); + } + + (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); + +#ifdef _KERNEL + zfs_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, zfs_lowmem, NULL, + EVENTHANDLER_PRI_FIRST); +#endif + + arc_dead = FALSE; +} + +void +arc_fini(void) +{ + mutex_enter(&arc_reclaim_thr_lock); + arc_thread_exit = 1; + cv_signal(&arc_reclaim_thr_cv); + while (arc_thread_exit != 0) + cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); + mutex_exit(&arc_reclaim_thr_lock); + + arc_flush(); + + arc_dead = TRUE; + + if (arc_ksp != NULL) { + kstat_delete(arc_ksp); + arc_ksp = NULL; + } + + mutex_destroy(&arc_eviction_mtx); + mutex_destroy(&arc_reclaim_thr_lock); + cv_destroy(&arc_reclaim_thr_cv); + + list_destroy(&arc_mru->arcs_list); + list_destroy(&arc_mru_ghost->arcs_list); + list_destroy(&arc_mfu->arcs_list); + list_destroy(&arc_mfu_ghost->arcs_list); + + mutex_destroy(&arc_anon->arcs_mtx); + mutex_destroy(&arc_mru->arcs_mtx); + mutex_destroy(&arc_mru_ghost->arcs_mtx); + mutex_destroy(&arc_mfu->arcs_mtx); + mutex_destroy(&arc_mfu_ghost->arcs_mtx); + + buf_fini(); + +#ifdef _KERNEL + if (zfs_event_lowmem != NULL) + EVENTHANDLER_DEREGISTER(vm_lowmem, zfs_event_lowmem); +#endif +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c new file mode 100644 index 0000000..4442b1f --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c @@ -0,0 +1,312 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/bplist.h> +#include <sys/zfs_context.h> + +static int +bplist_hold(bplist_t *bpl) +{ + ASSERT(MUTEX_HELD(&bpl->bpl_lock)); + if (bpl->bpl_dbuf == NULL) { + int err = dmu_bonus_hold(bpl->bpl_mos, + bpl->bpl_object, bpl, &bpl->bpl_dbuf); + if (err) + return (err); + bpl->bpl_phys = bpl->bpl_dbuf->db_data; + } + return (0); +} + +uint64_t +bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx) +{ + int size; + + size = spa_version(dmu_objset_spa(mos)) < ZFS_VERSION_BPLIST_ACCOUNT ? + BPLIST_SIZE_V0 : sizeof (bplist_phys_t); + + return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize, + DMU_OT_BPLIST_HDR, size, tx)); +} + +void +bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx) +{ + VERIFY(dmu_object_free(mos, object, tx) == 0); +} + +int +bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object) +{ + dmu_object_info_t doi; + int err; + + err = dmu_object_info(mos, object, &doi); + if (err) + return (err); + + mutex_enter(&bpl->bpl_lock); + + ASSERT(bpl->bpl_dbuf == NULL); + ASSERT(bpl->bpl_phys == NULL); + ASSERT(bpl->bpl_cached_dbuf == NULL); + ASSERT(bpl->bpl_queue == NULL); + ASSERT(object != 0); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR); + + bpl->bpl_mos = mos; + bpl->bpl_object = object; + bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1); + bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT; + bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t)); + + mutex_exit(&bpl->bpl_lock); + return (0); +} + +void +bplist_close(bplist_t *bpl) +{ + mutex_enter(&bpl->bpl_lock); + + ASSERT(bpl->bpl_queue == NULL); + + if (bpl->bpl_cached_dbuf) { + dmu_buf_rele(bpl->bpl_cached_dbuf, bpl); + bpl->bpl_cached_dbuf = NULL; + } + if (bpl->bpl_dbuf) { + dmu_buf_rele(bpl->bpl_dbuf, bpl); + bpl->bpl_dbuf = NULL; + bpl->bpl_phys = NULL; + } + + mutex_exit(&bpl->bpl_lock); +} + +boolean_t +bplist_empty(bplist_t *bpl) +{ + boolean_t rv; + + if (bpl->bpl_object == 0) + return (B_TRUE); + + mutex_enter(&bpl->bpl_lock); + VERIFY(0 == bplist_hold(bpl)); /* XXX */ + rv = (bpl->bpl_phys->bpl_entries == 0); + mutex_exit(&bpl->bpl_lock); + + return (rv); +} + +static int +bplist_cache(bplist_t *bpl, uint64_t blkid) +{ + int err = 0; + + if (bpl->bpl_cached_dbuf == NULL || + bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) { + if (bpl->bpl_cached_dbuf != NULL) + dmu_buf_rele(bpl->bpl_cached_dbuf, bpl); + err = dmu_buf_hold(bpl->bpl_mos, + bpl->bpl_object, blkid << bpl->bpl_blockshift, + bpl, &bpl->bpl_cached_dbuf); + ASSERT(err || bpl->bpl_cached_dbuf->db_size == + 1ULL << bpl->bpl_blockshift); + } + return (err); +} + +int +bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp) +{ + uint64_t blk, off; + blkptr_t *bparray; + int err; + + mutex_enter(&bpl->bpl_lock); + + err = bplist_hold(bpl); + if (err) { + mutex_exit(&bpl->bpl_lock); + return (err); + } + + if (*itorp >= bpl->bpl_phys->bpl_entries) { + mutex_exit(&bpl->bpl_lock); + return (ENOENT); + } + + blk = *itorp >> bpl->bpl_bpshift; + off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift); + + err = bplist_cache(bpl, blk); + if (err) { + mutex_exit(&bpl->bpl_lock); + return (err); + } + + bparray = bpl->bpl_cached_dbuf->db_data; + *bp = bparray[off]; + (*itorp)++; + mutex_exit(&bpl->bpl_lock); + return (0); +} + +int +bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx) +{ + uint64_t blk, off; + blkptr_t *bparray; + int err; + + ASSERT(!BP_IS_HOLE(bp)); + mutex_enter(&bpl->bpl_lock); + err = bplist_hold(bpl); + if (err) + return (err); + + blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift; + off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift); + + err = bplist_cache(bpl, blk); + if (err) { + mutex_exit(&bpl->bpl_lock); + return (err); + } + + dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx); + bparray = bpl->bpl_cached_dbuf->db_data; + bparray[off] = *bp; + + /* We never need the fill count. */ + bparray[off].blk_fill = 0; + + /* The bplist will compress better if we can leave off the checksum */ + bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum)); + + dmu_buf_will_dirty(bpl->bpl_dbuf, tx); + bpl->bpl_phys->bpl_entries++; + bpl->bpl_phys->bpl_bytes += + bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp); + if (bpl->bpl_havecomp) { + bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp); + bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp); + } + mutex_exit(&bpl->bpl_lock); + + return (0); +} + +/* + * Deferred entry; will be written later by bplist_sync(). + */ +void +bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp) +{ + bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP); + + ASSERT(!BP_IS_HOLE(bp)); + mutex_enter(&bpl->bpl_lock); + bpq->bpq_blk = *bp; + bpq->bpq_next = bpl->bpl_queue; + bpl->bpl_queue = bpq; + mutex_exit(&bpl->bpl_lock); +} + +void +bplist_sync(bplist_t *bpl, dmu_tx_t *tx) +{ + bplist_q_t *bpq; + + mutex_enter(&bpl->bpl_lock); + while ((bpq = bpl->bpl_queue) != NULL) { + bpl->bpl_queue = bpq->bpq_next; + mutex_exit(&bpl->bpl_lock); + VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx)); + kmem_free(bpq, sizeof (*bpq)); + mutex_enter(&bpl->bpl_lock); + } + mutex_exit(&bpl->bpl_lock); +} + +void +bplist_vacate(bplist_t *bpl, dmu_tx_t *tx) +{ + mutex_enter(&bpl->bpl_lock); + ASSERT3P(bpl->bpl_queue, ==, NULL); + VERIFY(0 == bplist_hold(bpl)); + dmu_buf_will_dirty(bpl->bpl_dbuf, tx); + VERIFY(0 == dmu_free_range(bpl->bpl_mos, + bpl->bpl_object, 0, -1ULL, tx)); + bpl->bpl_phys->bpl_entries = 0; + bpl->bpl_phys->bpl_bytes = 0; + if (bpl->bpl_havecomp) { + bpl->bpl_phys->bpl_comp = 0; + bpl->bpl_phys->bpl_uncomp = 0; + } + mutex_exit(&bpl->bpl_lock); +} + +int +bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + uint64_t itor = 0, comp = 0, uncomp = 0; + int err; + blkptr_t bp; + + mutex_enter(&bpl->bpl_lock); + + err = bplist_hold(bpl); + if (err) { + mutex_exit(&bpl->bpl_lock); + return (err); + } + + *usedp = bpl->bpl_phys->bpl_bytes; + if (bpl->bpl_havecomp) { + *compp = bpl->bpl_phys->bpl_comp; + *uncompp = bpl->bpl_phys->bpl_uncomp; + } + mutex_exit(&bpl->bpl_lock); + + if (!bpl->bpl_havecomp) { + while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { + comp += BP_GET_PSIZE(&bp); + uncomp += BP_GET_UCSIZE(&bp); + } + if (err == ENOENT) + err = 0; + *compp = comp; + *uncompp = uncomp; + } + + return (err); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c new file mode 100644 index 0000000..1fde66f --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -0,0 +1,2240 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dbuf.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dmu_tx.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_zfetch.h> + +static void dbuf_destroy(dmu_buf_impl_t *db); +static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, + int compress, dmu_tx_t *tx); +static arc_done_func_t dbuf_write_ready; +static arc_done_func_t dbuf_write_done; + +int zfs_mdcomp_disable = 0; +SYSCTL_DECL(_vfs_zfs); +TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); +SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, + &zfs_mdcomp_disable, 0, "Disable metadata compression"); + +/* + * Global data structures and functions for the dbuf cache. + */ +static kmem_cache_t *dbuf_cache; + +/* ARGSUSED */ +static int +dbuf_cons(void *vdb, void *unused, int kmflag) +{ + dmu_buf_impl_t *db = vdb; + bzero(db, sizeof (dmu_buf_impl_t)); + + mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); + refcount_create(&db->db_holds); + return (0); +} + +/* ARGSUSED */ +static void +dbuf_dest(void *vdb, void *unused) +{ + dmu_buf_impl_t *db = vdb; + mutex_destroy(&db->db_mtx); + cv_destroy(&db->db_changed); + refcount_destroy(&db->db_holds); +} + +/* + * dbuf hash table routines + */ +static dbuf_hash_table_t dbuf_hash_table; + +static uint64_t dbuf_hash_count; + +static uint64_t +dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) +{ + uintptr_t osv = (uintptr_t)os; + uint64_t crc = -1ULL; + + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; + + crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); + + return (crc); +} + +#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); + +#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ + ((dbuf)->db.db_object == (obj) && \ + (dbuf)->db_objset == (os) && \ + (dbuf)->db_level == (level) && \ + (dbuf)->db_blkid == (blkid)) + +dmu_buf_impl_t * +dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + objset_impl_t *os = dn->dn_objset; + uint64_t obj = dn->dn_object; + uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *db; + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { + if (DBUF_EQUAL(db, os, obj, level, blkid)) { + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING) { + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (db); + } + mutex_exit(&db->db_mtx); + } + } + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (NULL); +} + +/* + * Insert an entry into the hash table. If there is already an element + * equal to elem in the hash table, then the already existing element + * will be returned and the new element will not be inserted. + * Otherwise returns NULL. + */ +static dmu_buf_impl_t * +dbuf_hash_insert(dmu_buf_impl_t *db) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + objset_impl_t *os = db->db_objset; + uint64_t obj = db->db.db_object; + int level = db->db_level; + uint64_t blkid = db->db_blkid; + uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *dbf; + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { + if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { + mutex_enter(&dbf->db_mtx); + if (dbf->db_state != DB_EVICTING) { + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (dbf); + } + mutex_exit(&dbf->db_mtx); + } + } + + mutex_enter(&db->db_mtx); + db->db_hash_next = h->hash_table[idx]; + h->hash_table[idx] = db; + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + atomic_add_64(&dbuf_hash_count, 1); + + return (NULL); +} + +/* + * Remove an entry from the hash table. This operation will + * fail if there are any existing holds on the db. + */ +static void +dbuf_hash_remove(dmu_buf_impl_t *db) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, + db->db_level, db->db_blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *dbf, **dbp; + + /* + * We musn't hold db_mtx to maintin lock ordering: + * DBUF_HASH_MUTEX > db_mtx. + */ + ASSERT(refcount_is_zero(&db->db_holds)); + ASSERT(db->db_state == DB_EVICTING); + ASSERT(!MUTEX_HELD(&db->db_mtx)); + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + dbp = &h->hash_table[idx]; + while ((dbf = *dbp) != db) { + dbp = &dbf->db_hash_next; + ASSERT(dbf != NULL); + } + *dbp = db->db_hash_next; + db->db_hash_next = NULL; + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + atomic_add_64(&dbuf_hash_count, -1); +} + +static arc_evict_func_t dbuf_do_evict; + +static void +dbuf_evict_user(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_level != 0 || db->db_evict_func == NULL) + return; + + if (db->db_user_data_ptr_ptr) + *db->db_user_data_ptr_ptr = db->db.db_data; + db->db_evict_func(&db->db, db->db_user_ptr); + db->db_user_ptr = NULL; + db->db_user_data_ptr_ptr = NULL; + db->db_evict_func = NULL; +} + +void +dbuf_evict(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_buf == NULL); + ASSERT(db->db_data_pending == NULL); + + dbuf_clear(db); + dbuf_destroy(db); +} + +void +dbuf_init(void) +{ + uint64_t hsize = 1ULL << 16; + dbuf_hash_table_t *h = &dbuf_hash_table; + int i; + + /* + * The hash table is big enough to fill all of physical memory + * with an average 4K block size. The table will take up + * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). + */ + while (hsize * 4096 < physmem * PAGESIZE) + hsize <<= 1; + +retry: + h->hash_table_mask = hsize - 1; + h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); + if (h->hash_table == NULL) { + /* XXX - we should really return an error instead of assert */ + ASSERT(hsize > (1ULL << 10)); + hsize >>= 1; + goto retry; + } + + dbuf_cache = kmem_cache_create("dmu_buf_impl_t", + sizeof (dmu_buf_impl_t), + 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); + + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); +} + +void +dbuf_fini(void) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + int i; + + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_destroy(&h->hash_mutexes[i]); + kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); + kmem_cache_destroy(dbuf_cache); +} + +/* + * Other stuff. + */ + +#ifdef ZFS_DEBUG +static void +dbuf_verify(dmu_buf_impl_t *db) +{ + dnode_t *dn = db->db_dnode; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) + return; + + ASSERT(db->db_objset != NULL); + if (dn == NULL) { + ASSERT(db->db_parent == NULL); + ASSERT(db->db_blkptr == NULL); + } else { + ASSERT3U(db->db.db_object, ==, dn->dn_object); + ASSERT3P(db->db_objset, ==, dn->dn_objset); + ASSERT3U(db->db_level, <, dn->dn_nlevels); + ASSERT(db->db_blkid == DB_BONUS_BLKID || + list_head(&dn->dn_dbufs)); + } + if (db->db_blkid == DB_BONUS_BLKID) { + ASSERT(dn != NULL); + ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); + ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); + } else { + ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); + } + + if (db->db_level == 0) { + /* we can be momentarily larger in dnode_set_blksz() */ + if (db->db_blkid != DB_BONUS_BLKID && dn) { + ASSERT3U(db->db.db_size, >=, dn->dn_datablksz); + } + if (db->db.db_object == DMU_META_DNODE_OBJECT) { + dbuf_dirty_record_t *dr = db->db_data_pending; + /* + * it should only be modified in syncing + * context, so make sure we only have + * one copy of the data. + */ + ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); + } + } + + /* verify db->db_blkptr */ + if (db->db_blkptr) { + if (db->db_parent == dn->dn_dbuf) { + /* db is pointed to by the dnode */ + /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ + if (db->db.db_object == DMU_META_DNODE_OBJECT) + ASSERT(db->db_parent == NULL); + else + ASSERT(db->db_parent != NULL); + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + } else { + /* db is pointed to by an indirect block */ + int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; + ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); + ASSERT3U(db->db_parent->db.db_object, ==, + db->db.db_object); + /* + * dnode_grow_indblksz() can make this fail if we don't + * have the struct_rwlock. XXX indblksz no longer + * grows. safe to do this now? + */ + if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { + ASSERT3P(db->db_blkptr, ==, + ((blkptr_t *)db->db_parent->db.db_data + + db->db_blkid % epb)); + } + } + } + if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && + db->db.db_data && db->db_blkid != DB_BONUS_BLKID && + db->db_state != DB_FILL && !dn->dn_free_txg) { + /* + * If the blkptr isn't set but they have nonzero data, + * it had better be dirty, otherwise we'll lose that + * data when we evict this buffer. + */ + if (db->db_dirtycnt == 0) { + uint64_t *buf = db->db.db_data; + int i; + + for (i = 0; i < db->db.db_size >> 3; i++) { + ASSERT(buf[i] == 0); + } + } + } +} +#endif + +static void +dbuf_update_data(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + if (db->db_level == 0 && db->db_user_data_ptr_ptr) { + ASSERT(!refcount_is_zero(&db->db_holds)); + *db->db_user_data_ptr_ptr = db->db.db_data; + } +} + +static void +dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); + db->db_buf = buf; + if (buf != NULL) { + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; + if (!arc_released(buf)) + arc_set_callback(buf, dbuf_do_evict, db); + dbuf_update_data(db); + } else { + dbuf_evict_user(db); + db->db.db_data = NULL; + db->db_state = DB_UNCACHED; + } +} + +uint64_t +dbuf_whichblock(dnode_t *dn, uint64_t offset) +{ + if (dn->dn_datablkshift) { + return (offset >> dn->dn_datablkshift); + } else { + ASSERT3U(offset, <, dn->dn_datablksz); + return (0); + } +} + +static void +dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + + mutex_enter(&db->db_mtx); + ASSERT3U(db->db_state, ==, DB_READ); + /* + * All reads are synchronous, so we must have a hold on the dbuf + */ + ASSERT(refcount_count(&db->db_holds) > 0); + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + if (db->db_level == 0 && db->db_freed_in_flight) { + /* we were freed in flight; disregard any error */ + arc_release(buf, db); + bzero(buf->b_data, db->db.db_size); + arc_buf_freeze(buf); + db->db_freed_in_flight = FALSE; + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + } else if (zio == NULL || zio->io_error == 0) { + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + } else { + ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT3P(db->db_buf, ==, NULL); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + db->db_state = DB_UNCACHED; + } + cv_broadcast(&db->db_changed); + mutex_exit(&db->db_mtx); + dbuf_rele(db, NULL); +} + +static void +dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) +{ + blkptr_t *bp; + zbookmark_t zb; + uint32_t aflags = ARC_NOWAIT; + + ASSERT(!refcount_is_zero(&db->db_holds)); + /* We need the struct_rwlock to prevent db_blkptr from changing. */ + ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_state == DB_UNCACHED); + ASSERT(db->db_buf == NULL); + + if (db->db_blkid == DB_BONUS_BLKID) { + ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); + db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); + if (db->db.db_size < DN_MAX_BONUSLEN) + bzero(db->db.db_data, DN_MAX_BONUSLEN); + bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data, + db->db.db_size); + dbuf_update_data(db); + db->db_state = DB_CACHED; + mutex_exit(&db->db_mtx); + return; + } + + if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) + bp = NULL; + else + bp = db->db_blkptr; + + if (bp == NULL) + dprintf_dbuf(db, "blkptr: %s\n", "NULL"); + else + dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); + + if (bp == NULL || BP_IS_HOLE(bp)) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + + ASSERT(bp == NULL || BP_IS_HOLE(bp)); + dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + db->db.db_size, db, type)); + bzero(db->db.db_data, db->db.db_size); + db->db_state = DB_CACHED; + *flags |= DB_RF_CACHED; + mutex_exit(&db->db_mtx); + return; + } + + db->db_state = DB_READ; + mutex_exit(&db->db_mtx); + + zb.zb_objset = db->db_objset->os_dsl_dataset ? + db->db_objset->os_dsl_dataset->ds_object : 0; + zb.zb_object = db->db.db_object; + zb.zb_level = db->db_level; + zb.zb_blkid = db->db_blkid; + + dbuf_add_ref(db, NULL); + /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ + ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES); + (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, + db->db_level > 0 ? byteswap_uint64_array : + dmu_ot[db->db_dnode->dn_type].ot_byteswap, + dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, + (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, + &aflags, &zb); + if (aflags & ARC_CACHED) + *flags |= DB_RF_CACHED; +} + +int +dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +{ + int err = 0; + int havepzio = (zio != NULL); + int prefetch; + + /* + * We don't have to hold the mutex to check db_state because it + * can't be freed while we have a hold on the buffer. + */ + ASSERT(!refcount_is_zero(&db->db_holds)); + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); + + prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && + (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL; + + mutex_enter(&db->db_mtx); + if (db->db_state == DB_CACHED) { + mutex_exit(&db->db_mtx); + if (prefetch) + dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + db->db.db_size, TRUE); + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&db->db_dnode->dn_struct_rwlock); + } else if (db->db_state == DB_UNCACHED) { + if (zio == NULL) { + zio = zio_root(db->db_dnode->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + } + dbuf_read_impl(db, zio, &flags); + + /* dbuf_read_impl has dropped db_mtx for us */ + + if (prefetch) + dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + db->db.db_size, flags & DB_RF_CACHED); + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&db->db_dnode->dn_struct_rwlock); + + if (!havepzio) + err = zio_wait(zio); + } else { + mutex_exit(&db->db_mtx); + if (prefetch) + dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + db->db.db_size, TRUE); + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&db->db_dnode->dn_struct_rwlock); + + mutex_enter(&db->db_mtx); + if ((flags & DB_RF_NEVERWAIT) == 0) { + while (db->db_state == DB_READ || + db->db_state == DB_FILL) { + ASSERT(db->db_state == DB_READ || + (flags & DB_RF_HAVESTRUCT) == 0); + cv_wait(&db->db_changed, &db->db_mtx); + } + if (db->db_state == DB_UNCACHED) + err = EIO; + } + mutex_exit(&db->db_mtx); + } + + ASSERT(err || havepzio || db->db_state == DB_CACHED); + return (err); +} + +static void +dbuf_noread(dmu_buf_impl_t *db) +{ + ASSERT(!refcount_is_zero(&db->db_holds)); + ASSERT(db->db_blkid != DB_BONUS_BLKID); + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + db->db.db_size, db, type)); + db->db_state = DB_FILL; + } else { + ASSERT3U(db->db_state, ==, DB_CACHED); + } + mutex_exit(&db->db_mtx); +} + +/* + * This is our just-in-time copy function. It makes a copy of + * buffers, that have been modified in a previous transaction + * group, before we modify them in the current active group. + * + * This function is used in two places: when we are dirtying a + * buffer for the first time in a txg, and when we are freeing + * a range in a dnode that includes this buffer. + * + * Note that when we are called from dbuf_free_range() we do + * not put a hold on the buffer, we just traverse the active + * dbuf list for the dnode. + */ +static void +dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) +{ + dbuf_dirty_record_t *dr = db->db_last_dirty; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db.db_data != NULL); + ASSERT(db->db_level == 0); + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); + + if (dr == NULL || + (dr->dt.dl.dr_data != + ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf))) + return; + + /* + * If the last dirty record for this dbuf has not yet synced + * and its referencing the dbuf data, either: + * reset the reference to point to a new copy, + * or (if there a no active holders) + * just null out the current db_data pointer. + */ + ASSERT(dr->dr_txg >= txg - 2); + if (db->db_blkid == DB_BONUS_BLKID) { + /* Note that the data bufs here are zio_bufs */ + dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); + bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); + } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + int size = db->db.db_size; + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + dr->dt.dl.dr_data = arc_buf_alloc( + db->db_dnode->dn_objset->os_spa, size, db, type); + bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); + } else { + dbuf_set_data(db, NULL); + } +} + +void +dbuf_unoverride(dbuf_dirty_record_t *dr) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + uint64_t txg = dr->dr_txg; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); + ASSERT(db->db_level == 0); + + if (db->db_blkid == DB_BONUS_BLKID || + dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) + return; + + /* free this block */ + if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { + /* XXX can get silent EIO here */ + (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, + txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); + } + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + /* + * Release the already-written buffer, so we leave it in + * a consistent dirty state. Note that all callers are + * modifying the buffer, so they will immediately do + * another (redundant) arc_release(). Therefore, leave + * the buf thawed to save the effort of freezing & + * immediately re-thawing it. + */ + arc_release(dr->dt.dl.dr_data, db); +} + +void +dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db, *db_next; + uint64_t txg = tx->tx_txg; + + dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks); + mutex_enter(&dn->dn_dbufs_mtx); + for (db = list_head(&dn->dn_dbufs); db; db = db_next) { + db_next = list_next(&dn->dn_dbufs, db); + ASSERT(db->db_blkid != DB_BONUS_BLKID); + if (db->db_level != 0) + continue; + dprintf_dbuf(db, "found buf %s\n", ""); + if (db->db_blkid < blkid || + db->db_blkid >= blkid+nblks) + continue; + + /* found a level 0 buffer in the range */ + if (dbuf_undirty(db, tx)) + continue; + + mutex_enter(&db->db_mtx); + if (db->db_state == DB_UNCACHED || + db->db_state == DB_EVICTING) { + ASSERT(db->db.db_data == NULL); + mutex_exit(&db->db_mtx); + continue; + } + if (db->db_state == DB_READ || db->db_state == DB_FILL) { + /* will be handled in dbuf_read_done or dbuf_rele */ + db->db_freed_in_flight = TRUE; + mutex_exit(&db->db_mtx); + continue; + } + if (refcount_count(&db->db_holds) == 0) { + ASSERT(db->db_buf); + dbuf_clear(db); + continue; + } + /* The dbuf is referenced */ + + if (db->db_last_dirty != NULL) { + dbuf_dirty_record_t *dr = db->db_last_dirty; + + if (dr->dr_txg == txg) { + /* + * This buffer is "in-use", re-adjust the file + * size to reflect that this buffer may + * contain new data when we sync. + */ + if (db->db_blkid > dn->dn_maxblkid) + dn->dn_maxblkid = db->db_blkid; + dbuf_unoverride(dr); + } else { + /* + * This dbuf is not dirty in the open context. + * Either uncache it (if its not referenced in + * the open context) or reset its contents to + * empty. + */ + dbuf_fix_old_data(db, txg); + } + } + /* clear the contents if its cached */ + if (db->db_state == DB_CACHED) { + ASSERT(db->db.db_data != NULL); + arc_release(db->db_buf, db); + bzero(db->db.db_data, db->db.db_size); + arc_buf_freeze(db->db_buf); + } + + mutex_exit(&db->db_mtx); + } + mutex_exit(&dn->dn_dbufs_mtx); +} + +static int +dbuf_new_block(dmu_buf_impl_t *db) +{ + dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; + uint64_t birth_txg = 0; + + /* Don't count meta-objects */ + if (ds == NULL) + return (FALSE); + + /* + * We don't need any locking to protect db_blkptr: + * If it's syncing, then db_last_dirty will be set + * so we'll ignore db_blkptr. + */ + ASSERT(MUTEX_HELD(&db->db_mtx)); + /* If we have been dirtied since the last snapshot, its not new */ + if (db->db_last_dirty) + birth_txg = db->db_last_dirty->dr_txg; + else if (db->db_blkptr) + birth_txg = db->db_blkptr->blk_birth; + + if (birth_txg) + return (!dsl_dataset_block_freeable(ds, birth_txg)); + else + return (TRUE); +} + +void +dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) +{ + arc_buf_t *buf, *obuf; + int osize = db->db.db_size; + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + + ASSERT(db->db_blkid != DB_BONUS_BLKID); + + /* XXX does *this* func really need the lock? */ + ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); + + /* + * This call to dbuf_will_dirty() with the dn_struct_rwlock held + * is OK, because there can be no other references to the db + * when we are changing its size, so no concurrent DB_FILL can + * be happening. + */ + /* + * XXX we should be doing a dbuf_read, checking the return + * value and returning that up to our callers + */ + dbuf_will_dirty(db, tx); + + /* create the data buffer for the new block */ + buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type); + + /* copy old block data to the new block */ + obuf = db->db_buf; + bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); + /* zero the remainder */ + if (size > osize) + bzero((uint8_t *)buf->b_data + osize, size - osize); + + mutex_enter(&db->db_mtx); + dbuf_set_data(db, buf); + VERIFY(arc_buf_remove_ref(obuf, db) == 1); + db->db.db_size = size; + + if (db->db_level == 0) { + ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); + db->db_last_dirty->dt.dl.dr_data = buf; + } + mutex_exit(&db->db_mtx); + + dnode_willuse_space(db->db_dnode, size-osize, tx); +} + +dbuf_dirty_record_t * +dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dnode_t *dn = db->db_dnode; + objset_impl_t *os = dn->dn_objset; + dbuf_dirty_record_t **drp, *dr; + int drop_struct_lock = FALSE; + int txgoff = tx->tx_txg & TXG_MASK; + + ASSERT(tx->tx_txg != 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + DMU_TX_DIRTY_BUF(tx, db); + + /* + * Shouldn't dirty a regular buffer in syncing context. Private + * objects may be dirtied in syncing context, but only if they + * were already pre-dirtied in open context. + * XXX We may want to prohibit dirtying in syncing context even + * if they did pre-dirty. + */ + ASSERT(!dmu_tx_is_syncing(tx) || + BP_IS_HOLE(dn->dn_objset->os_rootbp) || + dn->dn_object == DMU_META_DNODE_OBJECT || + dn->dn_objset->os_dsl_dataset == NULL || + dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir)); + + /* + * We make this assert for private objects as well, but after we + * check if we're already dirty. They are allowed to re-dirty + * in syncing context. + */ + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); + + mutex_enter(&db->db_mtx); + /* + * XXX make this true for indirects too? The problem is that + * transactions created with dmu_tx_create_assigned() from + * syncing context don't bother holding ahead. + */ + ASSERT(db->db_level != 0 || + db->db_state == DB_CACHED || db->db_state == DB_FILL); + + mutex_enter(&dn->dn_mtx); + /* + * Don't set dirtyctx to SYNC if we're just modifying this as we + * initialize the objset. + */ + if (dn->dn_dirtyctx == DN_UNDIRTIED && + !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { + dn->dn_dirtyctx = + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); + ASSERT(dn->dn_dirtyctx_firstset == NULL); + dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); + } + mutex_exit(&dn->dn_mtx); + + /* + * If this buffer is already dirty, we're done. + */ + drp = &db->db_last_dirty; + ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || + db->db.db_object == DMU_META_DNODE_OBJECT); + while (*drp && (*drp)->dr_txg > tx->tx_txg) + drp = &(*drp)->dr_next; + if (*drp && (*drp)->dr_txg == tx->tx_txg) { + if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { + /* + * If this buffer has already been written out, + * we now need to reset its state. + */ + dbuf_unoverride(*drp); + if (db->db.db_object != DMU_META_DNODE_OBJECT) + arc_buf_thaw(db->db_buf); + } + mutex_exit(&db->db_mtx); + return (*drp); + } + + /* + * Only valid if not already dirty. + */ + ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); + + ASSERT3U(dn->dn_nlevels, >, db->db_level); + ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || + dn->dn_phys->dn_nlevels > db->db_level || + dn->dn_next_nlevels[txgoff] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); + + /* + * We should only be dirtying in syncing context if it's the + * mos, a spa os, or we're initializing the os. However, we are + * allowed to dirty in syncing context provided we already + * dirtied it in open context. Hence we must make this + * assertion only if we're not already dirty. + */ + ASSERT(!dmu_tx_is_syncing(tx) || + os->os_dsl_dataset == NULL || + !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || + !BP_IS_HOLE(os->os_rootbp)); + ASSERT(db->db.db_size != 0); + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + /* + * If this buffer is dirty in an old transaction group we need + * to make a copy of it so that the changes we make in this + * transaction group won't leak out when we sync the older txg. + */ + dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); + if (db->db_level == 0) { + void *data_old = db->db_buf; + + if (db->db_blkid == DB_BONUS_BLKID) { + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db.db_data; + } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { + /* + * Release the data buffer from the cache so that we + * can modify it without impacting possible other users + * of this cached data block. Note that indirect + * blocks and private objects are not released until the + * syncing state (since they are only modified then). + */ + arc_release(db->db_buf, db); + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db_buf; + } + ASSERT(data_old != NULL); + dr->dt.dl.dr_data = data_old; + } else { + mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); + list_create(&dr->dt.di.dr_children, + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + } + dr->dr_dbuf = db; + dr->dr_txg = tx->tx_txg; + dr->dr_next = *drp; + *drp = dr; + + /* + * We could have been freed_in_flight between the dbuf_noread + * and dbuf_dirty. We win, as though the dbuf_noread() had + * happened after the free. + */ + if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, db->db_blkid, 1, tx); + mutex_exit(&dn->dn_mtx); + db->db_freed_in_flight = FALSE; + } + + if (db->db_blkid != DB_BONUS_BLKID) { + /* + * Update the accounting. + */ + if (!dbuf_new_block(db) && db->db_blkptr) { + /* + * This is only a guess -- if the dbuf is dirty + * in a previous txg, we don't know how much + * space it will use on disk yet. We should + * really have the struct_rwlock to access + * db_blkptr, but since this is just a guess, + * it's OK if we get an odd answer. + */ + dnode_willuse_space(dn, + -bp_get_dasize(os->os_spa, db->db_blkptr), tx); + } + dnode_willuse_space(dn, db->db.db_size, tx); + } + + /* + * This buffer is now part of this txg + */ + dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); + db->db_dirtycnt += 1; + ASSERT3U(db->db_dirtycnt, <=, 3); + + mutex_exit(&db->db_mtx); + + if (db->db_blkid == DB_BONUS_BLKID) { + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + dnode_setdirty(dn, tx); + return (dr); + } + + if (db->db_level == 0) { + dnode_new_blkid(dn, db->db_blkid, tx); + ASSERT(dn->dn_maxblkid >= db->db_blkid); + } + + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + if (db->db_level+1 < dn->dn_nlevels) { + dmu_buf_impl_t *parent = db->db_parent; + dbuf_dirty_record_t *di; + int parent_held = FALSE; + + if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + parent = dbuf_hold_level(dn, db->db_level+1, + db->db_blkid >> epbs, FTAG); + parent_held = TRUE; + } + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); + ASSERT3U(db->db_level+1, ==, parent->db_level); + di = dbuf_dirty(parent, tx); + if (parent_held) + dbuf_rele(parent, FTAG); + + mutex_enter(&db->db_mtx); + /* possible race with dbuf_undirty() */ + if (db->db_last_dirty == dr || + dn->dn_object == DMU_META_DNODE_OBJECT) { + mutex_enter(&di->dt.di.dr_mtx); + ASSERT3U(di->dr_txg, ==, tx->tx_txg); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&di->dt.di.dr_children, dr); + mutex_exit(&di->dt.di.dr_mtx); + dr->dr_parent = di; + } + mutex_exit(&db->db_mtx); + } else { + ASSERT(db->db_level+1 == dn->dn_nlevels); + ASSERT(db->db_blkid < dn->dn_nblkptr); + ASSERT(db->db_parent == NULL || + db->db_parent == db->db_dnode->dn_dbuf); + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); + } + + dnode_setdirty(dn, tx); + return (dr); +} + +static int +dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dnode_t *dn = db->db_dnode; + uint64_t txg = tx->tx_txg; + dbuf_dirty_record_t *dr; + + ASSERT(txg != 0); + ASSERT(db->db_blkid != DB_BONUS_BLKID); + + mutex_enter(&db->db_mtx); + + /* + * If this buffer is not dirty, we're done. + */ + for (dr = db->db_last_dirty; dr; dr = dr->dr_next) + if (dr->dr_txg <= txg) + break; + if (dr == NULL || dr->dr_txg < txg) { + mutex_exit(&db->db_mtx); + return (0); + } + ASSERT(dr->dr_txg == txg); + + /* + * If this buffer is currently held, we cannot undirty + * it, since one of the current holders may be in the + * middle of an update. Note that users of dbuf_undirty() + * should not place a hold on the dbuf before the call. + */ + if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + mutex_exit(&db->db_mtx); + /* Make sure we don't toss this buffer at sync phase */ + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, db->db_blkid, 1, tx); + mutex_exit(&dn->dn_mtx); + return (0); + } + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + ASSERT(db->db.db_size != 0); + + /* XXX would be nice to fix up dn_towrite_space[] */ + + db->db_last_dirty = dr->dr_next; + + if (dr->dr_parent) { + mutex_enter(&dr->dr_parent->dt.di.dr_mtx); + list_remove(&dr->dr_parent->dt.di.dr_children, dr); + mutex_exit(&dr->dr_parent->dt.di.dr_mtx); + } else if (db->db_level+1 == dn->dn_nlevels) { + ASSERT3P(db->db_parent, ==, dn->dn_dbuf); + mutex_enter(&dn->dn_mtx); + list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); + mutex_exit(&dn->dn_mtx); + } + + if (db->db_level == 0) { + dbuf_unoverride(dr); + + ASSERT(db->db_buf != NULL); + ASSERT(dr->dt.dl.dr_data != NULL); + if (dr->dt.dl.dr_data != db->db_buf) + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); + } else { + ASSERT(db->db_buf != NULL); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + /* XXX - mutex and list destroy? */ + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + + if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { + arc_buf_t *buf = db->db_buf; + + ASSERT(arc_released(buf)); + dbuf_set_data(db, NULL); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + dbuf_evict(db); + return (1); + } + + mutex_exit(&db->db_mtx); + return (0); +} + +#pragma weak dmu_buf_will_dirty = dbuf_will_dirty +void +dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + int rf = DB_RF_MUST_SUCCEED; + + ASSERT(tx->tx_txg != 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + + if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + (void) dbuf_read(db, NULL, rf); + (void) dbuf_dirty(db, tx); +} + +void +dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(tx->tx_txg != 0); + ASSERT(db->db_level == 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || + dmu_tx_private_ok(tx)); + + dbuf_noread(db); + (void) dbuf_dirty(db, tx); +} + +#pragma weak dmu_buf_fill_done = dbuf_fill_done +/* ARGSUSED */ +void +dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); + + if (db->db_state == DB_FILL) { + if (db->db_level == 0 && db->db_freed_in_flight) { + ASSERT(db->db_blkid != DB_BONUS_BLKID); + /* we were freed while filling */ + /* XXX dbuf_undirty? */ + bzero(db->db.db_data, db->db.db_size); + db->db_freed_in_flight = FALSE; + } + db->db_state = DB_CACHED; + cv_broadcast(&db->db_changed); + } + mutex_exit(&db->db_mtx); +} + +/* + * "Clear" the contents of this dbuf. This will mark the dbuf + * EVICTING and clear *most* of its references. Unfortunetely, + * when we are not holding the dn_dbufs_mtx, we can't clear the + * entry in the dn_dbufs list. We have to wait until dbuf_destroy() + * in this case. For callers from the DMU we will usually see: + * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() + * For the arc callback, we will usually see: + * dbuf_do_evict()->dbuf_clear();dbuf_destroy() + * Sometimes, though, we will get a mix of these two: + * DMU: dbuf_clear()->arc_buf_evict() + * ARC: dbuf_do_evict()->dbuf_destroy() + */ +void +dbuf_clear(dmu_buf_impl_t *db) +{ + dnode_t *dn = db->db_dnode; + dmu_buf_impl_t *parent = db->db_parent; + dmu_buf_impl_t *dndb = dn->dn_dbuf; + int dbuf_gone = FALSE; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(refcount_is_zero(&db->db_holds)); + + dbuf_evict_user(db); + + if (db->db_state == DB_CACHED) { + ASSERT(db->db.db_data != NULL); + if (db->db_blkid == DB_BONUS_BLKID) + zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + db->db.db_data = NULL; + db->db_state = DB_UNCACHED; + } + + ASSERT3U(db->db_state, ==, DB_UNCACHED); + ASSERT(db->db_data_pending == NULL); + + db->db_state = DB_EVICTING; + db->db_blkptr = NULL; + + if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { + list_remove(&dn->dn_dbufs, db); + dnode_rele(dn, db); + } + + if (db->db_buf) + dbuf_gone = arc_buf_evict(db->db_buf); + + if (!dbuf_gone) + mutex_exit(&db->db_mtx); + + /* + * If this dbuf is referened from an indirect dbuf, + * decrement the ref count on the indirect dbuf. + */ + if (parent && parent != dndb) + dbuf_rele(parent, db); +} + +static int +dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, + dmu_buf_impl_t **parentp, blkptr_t **bpp) +{ + int nlevels, epbs; + + *parentp = NULL; + *bpp = NULL; + + ASSERT(blkid != DB_BONUS_BLKID); + + if (dn->dn_phys->dn_nlevels == 0) + nlevels = 1; + else + nlevels = dn->dn_phys->dn_nlevels; + + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + ASSERT3U(level * epbs, <, 64); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + if (level >= nlevels || + (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { + /* the buffer has no parent yet */ + return (ENOENT); + } else if (level < nlevels-1) { + /* this block is referenced from an indirect block */ + int err = dbuf_hold_impl(dn, level+1, + blkid >> epbs, fail_sparse, NULL, parentp); + if (err) + return (err); + err = dbuf_read(*parentp, NULL, + (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); + if (err) { + dbuf_rele(*parentp, NULL); + *parentp = NULL; + return (err); + } + *bpp = ((blkptr_t *)(*parentp)->db.db_data) + + (blkid & ((1ULL << epbs) - 1)); + return (0); + } else { + /* the block is referenced from the dnode */ + ASSERT3U(level, ==, nlevels-1); + ASSERT(dn->dn_phys->dn_nblkptr == 0 || + blkid < dn->dn_phys->dn_nblkptr); + if (dn->dn_dbuf) { + dbuf_add_ref(dn->dn_dbuf, NULL); + *parentp = dn->dn_dbuf; + } + *bpp = &dn->dn_phys->dn_blkptr[blkid]; + return (0); + } +} + +static dmu_buf_impl_t * +dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, + dmu_buf_impl_t *parent, blkptr_t *blkptr) +{ + objset_impl_t *os = dn->dn_objset; + dmu_buf_impl_t *db, *odb; + + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT(dn->dn_type != DMU_OT_NONE); + + db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + + db->db_objset = os; + db->db.db_object = dn->dn_object; + db->db_level = level; + db->db_blkid = blkid; + db->db_last_dirty = NULL; + db->db_dirtycnt = 0; + db->db_dnode = dn; + db->db_parent = parent; + db->db_blkptr = blkptr; + + db->db_user_ptr = NULL; + db->db_user_data_ptr_ptr = NULL; + db->db_evict_func = NULL; + db->db_immediate_evict = 0; + db->db_freed_in_flight = 0; + + if (blkid == DB_BONUS_BLKID) { + ASSERT3P(parent, ==, dn->dn_dbuf); + db->db.db_size = dn->dn_bonuslen; + db->db.db_offset = DB_BONUS_BLKID; + db->db_state = DB_UNCACHED; + /* the bonus dbuf is not placed in the hash table */ + return (db); + } else { + int blocksize = + db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; + db->db.db_size = blocksize; + db->db.db_offset = db->db_blkid * blocksize; + } + + /* + * Hold the dn_dbufs_mtx while we get the new dbuf + * in the hash table *and* added to the dbufs list. + * This prevents a possible deadlock with someone + * trying to look up this dbuf before its added to the + * dn_dbufs list. + */ + mutex_enter(&dn->dn_dbufs_mtx); + db->db_state = DB_EVICTING; + if ((odb = dbuf_hash_insert(db)) != NULL) { + /* someone else inserted it first */ + kmem_cache_free(dbuf_cache, db); + mutex_exit(&dn->dn_dbufs_mtx); + return (odb); + } + list_insert_head(&dn->dn_dbufs, db); + db->db_state = DB_UNCACHED; + mutex_exit(&dn->dn_dbufs_mtx); + + if (parent && parent != dn->dn_dbuf) + dbuf_add_ref(parent, db); + + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || + refcount_count(&dn->dn_holds) > 0); + (void) refcount_add(&dn->dn_holds, db); + + dprintf_dbuf(db, "db=%p\n", db); + + return (db); +} + +static int +dbuf_do_evict(void *private) +{ + arc_buf_t *buf = private; + dmu_buf_impl_t *db = buf->b_private; + + if (!MUTEX_HELD(&db->db_mtx)) + mutex_enter(&db->db_mtx); + + ASSERT(refcount_is_zero(&db->db_holds)); + + if (db->db_state != DB_EVICTING) { + ASSERT(db->db_state == DB_CACHED); + DBUF_VERIFY(db); + db->db_buf = NULL; + dbuf_evict(db); + } else { + mutex_exit(&db->db_mtx); + dbuf_destroy(db); + } + return (0); +} + +static void +dbuf_destroy(dmu_buf_impl_t *db) +{ + ASSERT(refcount_is_zero(&db->db_holds)); + + if (db->db_blkid != DB_BONUS_BLKID) { + dnode_t *dn = db->db_dnode; + + /* + * If this dbuf is still on the dn_dbufs list, + * remove it from that list. + */ + if (list_link_active(&db->db_link)) { + mutex_enter(&dn->dn_dbufs_mtx); + list_remove(&dn->dn_dbufs, db); + mutex_exit(&dn->dn_dbufs_mtx); + + dnode_rele(dn, db); + } + dbuf_hash_remove(db); + } + db->db_parent = NULL; + db->db_dnode = NULL; + db->db_buf = NULL; + + ASSERT(db->db.db_data == NULL); + ASSERT(db->db_hash_next == NULL); + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_data_pending == NULL); + + kmem_cache_free(dbuf_cache, db); +} + +void +dbuf_prefetch(dnode_t *dn, uint64_t blkid) +{ + dmu_buf_impl_t *db = NULL; + blkptr_t *bp = NULL; + + ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + + if (dnode_block_freed(dn, blkid)) + return; + + /* dbuf_find() returns with db_mtx held */ + if (db = dbuf_find(dn, 0, blkid)) { + if (refcount_count(&db->db_holds) > 0) { + /* + * This dbuf is active. We assume that it is + * already CACHED, or else about to be either + * read or filled. + */ + mutex_exit(&db->db_mtx); + return; + } + mutex_exit(&db->db_mtx); + db = NULL; + } + + if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { + if (bp && !BP_IS_HOLE(bp)) { + uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + zbookmark_t zb; + zb.zb_objset = dn->dn_objset->os_dsl_dataset ? + dn->dn_objset->os_dsl_dataset->ds_object : 0; + zb.zb_object = dn->dn_object; + zb.zb_level = 0; + zb.zb_blkid = blkid; + + (void) arc_read(NULL, dn->dn_objset->os_spa, bp, + dmu_ot[dn->dn_type].ot_byteswap, + NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &zb); + } + if (db) + dbuf_rele(db, NULL); + } +} + +/* + * Returns with db_holds incremented, and db_mtx not held. + * Note: dn_struct_rwlock must be held. + */ +int +dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, + void *tag, dmu_buf_impl_t **dbp) +{ + dmu_buf_impl_t *db, *parent = NULL; + + ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT3U(dn->dn_nlevels, >, level); + + *dbp = NULL; +top: + /* dbuf_find() returns with db_mtx held */ + db = dbuf_find(dn, level, blkid); + + if (db == NULL) { + blkptr_t *bp = NULL; + int err; + + ASSERT3P(parent, ==, NULL); + err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); + if (fail_sparse) { + if (err == 0 && bp && BP_IS_HOLE(bp)) + err = ENOENT; + if (err) { + if (parent) + dbuf_rele(parent, NULL); + return (err); + } + } + if (err && err != ENOENT) + return (err); + db = dbuf_create(dn, level, blkid, parent, bp); + } + + if (db->db_buf && refcount_is_zero(&db->db_holds)) { + arc_buf_add_ref(db->db_buf, db); + if (db->db_buf->b_data == NULL) { + dbuf_clear(db); + if (parent) { + dbuf_rele(parent, NULL); + parent = NULL; + } + goto top; + } + ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); + } + + ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); + + /* + * If this buffer is currently syncing out, and we are are + * still referencing it from db_data, we need to make a copy + * of it in case we decide we want to dirty it again in this txg. + */ + if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && + dn->dn_object != DMU_META_DNODE_OBJECT && + db->db_state == DB_CACHED && db->db_data_pending) { + dbuf_dirty_record_t *dr = db->db_data_pending; + + if (dr->dt.dl.dr_data == db->db_buf) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + + dbuf_set_data(db, + arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + db->db.db_size, db, type)); + bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, + db->db.db_size); + } + } + + (void) refcount_add(&db->db_holds, tag); + dbuf_update_data(db); + DBUF_VERIFY(db); + mutex_exit(&db->db_mtx); + + /* NOTE: we can't rele the parent until after we drop the db_mtx */ + if (parent) + dbuf_rele(parent, NULL); + + ASSERT3P(db->db_dnode, ==, dn); + ASSERT3U(db->db_blkid, ==, blkid); + ASSERT3U(db->db_level, ==, level); + *dbp = db; + + return (0); +} + +dmu_buf_impl_t * +dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) +{ + dmu_buf_impl_t *db; + int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); + return (err ? NULL : db); +} + +dmu_buf_impl_t * +dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) +{ + dmu_buf_impl_t *db; + int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); + return (err ? NULL : db); +} + +dmu_buf_impl_t * +dbuf_create_bonus(dnode_t *dn) +{ + dmu_buf_impl_t *db = dn->dn_bonus; + + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + ASSERT(dn->dn_bonus == NULL); + db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); + return (db); +} + +#pragma weak dmu_buf_add_ref = dbuf_add_ref +void +dbuf_add_ref(dmu_buf_impl_t *db, void *tag) +{ + int64_t holds = refcount_add(&db->db_holds, tag); + ASSERT(holds > 1); +} + +#pragma weak dmu_buf_rele = dbuf_rele +void +dbuf_rele(dmu_buf_impl_t *db, void *tag) +{ + int64_t holds; + + mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); + + holds = refcount_remove(&db->db_holds, tag); + ASSERT(holds >= 0); + + /* + * We can't freeze indirects if there is a possibility that they + * may be modified in the current syncing context. + */ + if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) + arc_buf_freeze(db->db_buf); + + if (holds == db->db_dirtycnt && + db->db_level == 0 && db->db_immediate_evict) + dbuf_evict_user(db); + + if (holds == 0) { + if (db->db_blkid == DB_BONUS_BLKID) { + mutex_exit(&db->db_mtx); + dnode_rele(db->db_dnode, db); + } else if (db->db_buf == NULL) { + /* + * This is a special case: we never associated this + * dbuf with any data allocated from the ARC. + */ + ASSERT3U(db->db_state, ==, DB_UNCACHED); + dbuf_evict(db); + } else if (arc_released(db->db_buf)) { + arc_buf_t *buf = db->db_buf; + /* + * This dbuf has anonymous data associated with it. + */ + dbuf_set_data(db, NULL); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + dbuf_evict(db); + } else { + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); + mutex_exit(&db->db_mtx); + } + } else { + mutex_exit(&db->db_mtx); + } +} + +#pragma weak dmu_buf_refcount = dbuf_refcount +uint64_t +dbuf_refcount(dmu_buf_impl_t *db) +{ + return (refcount_count(&db->db_holds)); +} + +void * +dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *evict_func) +{ + return (dmu_buf_update_user(db_fake, NULL, user_ptr, + user_data_ptr_ptr, evict_func)); +} + +void * +dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *evict_func) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + db->db_immediate_evict = TRUE; + return (dmu_buf_update_user(db_fake, NULL, user_ptr, + user_data_ptr_ptr, evict_func)); +} + +void * +dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, + void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(db->db_level == 0); + + ASSERT((user_ptr == NULL) == (evict_func == NULL)); + + mutex_enter(&db->db_mtx); + + if (db->db_user_ptr == old_user_ptr) { + db->db_user_ptr = user_ptr; + db->db_user_data_ptr_ptr = user_data_ptr_ptr; + db->db_evict_func = evict_func; + + dbuf_update_data(db); + } else { + old_user_ptr = db->db_user_ptr; + } + + mutex_exit(&db->db_mtx); + return (old_user_ptr); +} + +void * +dmu_buf_get_user(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(!refcount_is_zero(&db->db_holds)); + + return (db->db_user_ptr); +} + +static void +dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) +{ + /* ASSERT(dmu_tx_is_syncing(tx) */ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_blkptr != NULL) + return; + + if (db->db_level == dn->dn_phys->dn_nlevels-1) { + /* + * This buffer was allocated at a time when there was + * no available blkptrs from the dnode, or it was + * inappropriate to hook it in (i.e., nlevels mis-match). + */ + ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); + ASSERT(db->db_parent == NULL); + db->db_parent = dn->dn_dbuf; + db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; + DBUF_VERIFY(db); + } else { + dmu_buf_impl_t *parent = db->db_parent; + int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + + ASSERT(dn->dn_phys->dn_nlevels > 1); + if (parent == NULL) { + mutex_exit(&db->db_mtx); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + (void) dbuf_hold_impl(dn, db->db_level+1, + db->db_blkid >> epbs, FALSE, db, &parent); + rw_exit(&dn->dn_struct_rwlock); + mutex_enter(&db->db_mtx); + db->db_parent = parent; + } + db->db_blkptr = (blkptr_t *)parent->db.db_data + + (db->db_blkid & ((1ULL << epbs) - 1)); + DBUF_VERIFY(db); + } +} + +static void +dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn = db->db_dnode; + zio_t *zio; + + ASSERT(dmu_tx_is_syncing(tx)); + + dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); + + mutex_enter(&db->db_mtx); + + ASSERT(db->db_level > 0); + DBUF_VERIFY(db); + + if (db->db_buf == NULL) { + mutex_exit(&db->db_mtx); + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); + mutex_enter(&db->db_mtx); + } + ASSERT3U(db->db_state, ==, DB_CACHED); + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + ASSERT(db->db_buf != NULL); + + dbuf_check_blkptr(dn, db); + + db->db_data_pending = dr; + + arc_release(db->db_buf, db); + mutex_exit(&db->db_mtx); + + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4, + zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx); + + zio = dr->dr_zio; + mutex_enter(&dr->dt.di.dr_mtx); + dbuf_sync_list(&dr->dt.di.dr_children, tx); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + mutex_exit(&dr->dt.di.dr_mtx); + zio_nowait(zio); +} + +static void +dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + arc_buf_t **datap = &dr->dt.dl.dr_data; + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn = db->db_dnode; + objset_impl_t *os = dn->dn_objset; + uint64_t txg = tx->tx_txg; + int checksum, compress; + int blksz; + + ASSERT(dmu_tx_is_syncing(tx)); + + dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); + + mutex_enter(&db->db_mtx); + /* + * To be synced, we must be dirtied. But we + * might have been freed after the dirty. + */ + if (db->db_state == DB_UNCACHED) { + /* This buffer has been freed since it was dirtied */ + ASSERT(db->db.db_data == NULL); + } else if (db->db_state == DB_FILL) { + /* This buffer was freed and is now being re-filled */ + ASSERT(db->db.db_data != dr->dt.dl.dr_data); + } else { + ASSERT3U(db->db_state, ==, DB_CACHED); + } + DBUF_VERIFY(db); + + /* + * If this is a bonus buffer, simply copy the bonus data into the + * dnode. It will be written out when the dnode is synced (and it + * will be synced, since it must have been dirty for dbuf_sync to + * be called). + */ + if (db->db_blkid == DB_BONUS_BLKID) { + dbuf_dirty_record_t **drp; + /* + * Use dn_phys->dn_bonuslen since db.db_size is the length + * of the bonus buffer in the open transaction rather than + * the syncing transaction. + */ + ASSERT(*datap != NULL); + ASSERT3U(db->db_level, ==, 0); + ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); + bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); + if (*datap != db->db.db_data) + zio_buf_free(*datap, DN_MAX_BONUSLEN); + db->db_data_pending = NULL; + drp = &db->db_last_dirty; + while (*drp != dr) + drp = &(*drp)->dr_next; + ASSERT((*drp)->dr_next == NULL); + *drp = NULL; + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + mutex_exit(&db->db_mtx); + dbuf_rele(db, (void *)(uintptr_t)txg); + return; + } + + /* + * If this buffer is in the middle of an immdiate write, + * wait for the synchronous IO to complete. + */ + while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); + cv_wait(&db->db_changed, &db->db_mtx); + ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); + } + + dbuf_check_blkptr(dn, db); + + /* + * If this dbuf has already been written out via an immediate write, + * just complete the write by copying over the new block pointer and + * updating the accounting via the write-completion functions. + */ + if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + zio_t zio_fake; + + zio_fake.io_private = &db; + zio_fake.io_error = 0; + zio_fake.io_bp = db->db_blkptr; + zio_fake.io_bp_orig = *db->db_blkptr; + zio_fake.io_txg = txg; + + *db->db_blkptr = dr->dt.dl.dr_overridden_by; + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + db->db_data_pending = dr; + dr->dr_zio = &zio_fake; + mutex_exit(&db->db_mtx); + + if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg)) + dsl_dataset_block_kill(os->os_dsl_dataset, + &zio_fake.io_bp_orig, dn->dn_zio, tx); + + dbuf_write_ready(&zio_fake, db->db_buf, db); + dbuf_write_done(&zio_fake, db->db_buf, db); + + return; + } + + blksz = arc_buf_size(*datap); + + if (dn->dn_object != DMU_META_DNODE_OBJECT) { + /* + * If this buffer is currently "in use" (i.e., there are + * active holds and db_data still references it), then make + * a copy before we start the write so that any modifications + * from the open txg will not leak into this write. + * + * NOTE: this copy does not need to be made for objects only + * modified in the syncing context (e.g. DNONE_DNODE blocks). + */ + if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + *datap = arc_buf_alloc(os->os_spa, blksz, db, type); + bcopy(db->db.db_data, (*datap)->b_data, blksz); + } + } else { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. + */ + arc_release(db->db_buf, db); + } + + ASSERT(*datap != NULL); + db->db_data_pending = dr; + + mutex_exit(&db->db_mtx); + + /* + * Allow dnode settings to override objset settings, + * except for metadata checksums. + */ + if (dmu_ot[dn->dn_type].ot_metadata) { + checksum = os->os_md_checksum; + compress = zio_compress_select(dn->dn_compress, + os->os_md_compress); + } else { + checksum = zio_checksum_select(dn->dn_checksum, + os->os_checksum); + compress = zio_compress_select(dn->dn_compress, + os->os_compress); + } + + dbuf_write(dr, *datap, checksum, compress, tx); + + ASSERT(!list_link_active(&dr->dr_dirty_node)); + if (dn->dn_object == DMU_META_DNODE_OBJECT) + list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); + else + zio_nowait(dr->dr_zio); +} + +void +dbuf_sync_list(list_t *list, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr; + + while (dr = list_head(list)) { + if (dr->dr_zio != NULL) { + /* + * If we find an already initialized zio then we + * are processing the meta-dnode, and we have finished. + * The dbufs for all dnodes are put back on the list + * during processing, so that we can zio_wait() + * these IOs after initiating all child IOs. + */ + ASSERT3U(dr->dr_dbuf->db.db_object, ==, + DMU_META_DNODE_OBJECT); + break; + } + list_remove(list, dr); + if (dr->dr_dbuf->db_level > 0) + dbuf_sync_indirect(dr, tx); + else + dbuf_sync_leaf(dr, tx); + } +} + +static void +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, + int compress, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn = db->db_dnode; + objset_impl_t *os = dn->dn_objset; + dmu_buf_impl_t *parent = db->db_parent; + uint64_t txg = tx->tx_txg; + zbookmark_t zb; + zio_t *zio; + int zio_flags; + + if (parent != dn->dn_dbuf) { + ASSERT(parent && parent->db_data_pending); + ASSERT(db->db_level == parent->db_level-1); + ASSERT(arc_released(parent->db_buf)); + zio = parent->db_data_pending->dr_zio; + } else { + ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + zio = dn->dn_zio; + } + + ASSERT(db->db_level == 0 || data == db->db_buf); + ASSERT3U(db->db_blkptr->blk_birth, <=, txg); + ASSERT(zio); + + zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; + zb.zb_object = db->db.db_object; + zb.zb_level = db->db_level; + zb.zb_blkid = db->db_blkid; + + zio_flags = ZIO_FLAG_MUSTSUCCEED; + if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0) + zio_flags |= ZIO_FLAG_METADATA; + if (BP_IS_OLDER(db->db_blkptr, txg)) + dsl_dataset_block_kill( + os->os_dsl_dataset, db->db_blkptr, zio, tx); + + dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress, + dmu_get_replication_level(os, &zb, dn->dn_type), txg, + db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb); +} + +/* ARGSUSED */ +static void +dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + dnode_t *dn = db->db_dnode; + objset_impl_t *os = dn->dn_objset; + blkptr_t *bp_orig = &zio->io_bp_orig; + uint64_t fill = 0; + int old_size, new_size, i; + + dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", ""); + + old_size = bp_get_dasize(os->os_spa, bp_orig); + new_size = bp_get_dasize(os->os_spa, zio->io_bp); + + dnode_diduse_space(dn, new_size-old_size); + + if (BP_IS_HOLE(zio->io_bp)) { + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + if (bp_orig->blk_birth == tx->tx_txg) + dsl_dataset_block_kill(ds, bp_orig, NULL, tx); + ASSERT3U(db->db_blkptr->blk_fill, ==, 0); + return; + } + + mutex_enter(&db->db_mtx); + + if (db->db_level == 0) { + mutex_enter(&dn->dn_mtx); + if (db->db_blkid > dn->dn_phys->dn_maxblkid) + dn->dn_phys->dn_maxblkid = db->db_blkid; + mutex_exit(&dn->dn_mtx); + + if (dn->dn_type == DMU_OT_DNODE) { + dnode_phys_t *dnp = db->db.db_data; + for (i = db->db.db_size >> DNODE_SHIFT; i > 0; + i--, dnp++) { + if (dnp->dn_type != DMU_OT_NONE) + fill++; + } + } else { + fill = 1; + } + } else { + blkptr_t *bp = db->db.db_data; + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) { + if (BP_IS_HOLE(bp)) + continue; + ASSERT3U(BP_GET_LSIZE(bp), ==, + db->db_level == 1 ? dn->dn_datablksz : + (1<<dn->dn_phys->dn_indblkshift)); + fill += bp->blk_fill; + } + } + + db->db_blkptr->blk_fill = fill; + BP_SET_TYPE(db->db_blkptr, dn->dn_type); + BP_SET_LEVEL(db->db_blkptr, db->db_level); + + mutex_exit(&db->db_mtx); + + /* We must do this after we've set the bp's type and level */ + if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) { + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + if (bp_orig->blk_birth == tx->tx_txg) + dsl_dataset_block_kill(ds, bp_orig, NULL, tx); + dsl_dataset_block_born(ds, zio->io_bp, tx); + } +} + +/* ARGSUSED */ +static void +dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + uint64_t txg = zio->io_txg; + dbuf_dirty_record_t **drp, *dr; + + ASSERT3U(zio->io_error, ==, 0); + + mutex_enter(&db->db_mtx); + + drp = &db->db_last_dirty; + while (*drp != db->db_data_pending) + drp = &(*drp)->dr_next; + ASSERT(!list_link_active(&(*drp)->dr_dirty_node)); + ASSERT((*drp)->dr_txg == txg); + ASSERT((*drp)->dr_next == NULL); + dr = *drp; + *drp = NULL; + + if (db->db_level == 0) { + ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); + + if (dr->dt.dl.dr_data != db->db_buf) + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); + else if (!BP_IS_HOLE(db->db_blkptr)) + arc_set_callback(db->db_buf, dbuf_do_evict, db); + else + ASSERT(arc_released(db->db_buf)); + } else { + dnode_t *dn = db->db_dnode; + + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + if (!BP_IS_HOLE(db->db_blkptr)) { + int epbs = + dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, + db->db.db_size); + ASSERT3U(dn->dn_phys->dn_maxblkid + >> (db->db_level * epbs), >=, db->db_blkid); + arc_set_callback(db->db_buf, dbuf_do_evict, db); + } + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + + cv_broadcast(&db->db_changed); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + db->db_data_pending = NULL; + mutex_exit(&db->db_mtx); + + dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", ""); + + dbuf_rele(db, (void *)(uintptr_t)txg); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c new file mode 100644 index 0000000..d3be6b4 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -0,0 +1,1029 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_prop.h> +#include <sys/dmu_zfetch.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> + +const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { + { byteswap_uint8_array, TRUE, "unallocated" }, + { zap_byteswap, TRUE, "object directory" }, + { byteswap_uint64_array, TRUE, "object array" }, + { byteswap_uint8_array, TRUE, "packed nvlist" }, + { byteswap_uint64_array, TRUE, "packed nvlist size" }, + { byteswap_uint64_array, TRUE, "bplist" }, + { byteswap_uint64_array, TRUE, "bplist header" }, + { byteswap_uint64_array, TRUE, "SPA space map header" }, + { byteswap_uint64_array, TRUE, "SPA space map" }, + { byteswap_uint64_array, TRUE, "ZIL intent log" }, + { dnode_buf_byteswap, TRUE, "DMU dnode" }, + { dmu_objset_byteswap, TRUE, "DMU objset" }, + { byteswap_uint64_array, TRUE, "DSL directory" }, + { zap_byteswap, TRUE, "DSL directory child map"}, + { zap_byteswap, TRUE, "DSL dataset snap map" }, + { zap_byteswap, TRUE, "DSL props" }, + { byteswap_uint64_array, TRUE, "DSL dataset" }, + { zfs_znode_byteswap, TRUE, "ZFS znode" }, + { zfs_acl_byteswap, TRUE, "ZFS ACL" }, + { byteswap_uint8_array, FALSE, "ZFS plain file" }, + { zap_byteswap, TRUE, "ZFS directory" }, + { zap_byteswap, TRUE, "ZFS master node" }, + { zap_byteswap, TRUE, "ZFS delete queue" }, + { byteswap_uint8_array, FALSE, "zvol object" }, + { zap_byteswap, TRUE, "zvol prop" }, + { byteswap_uint8_array, FALSE, "other uint8[]" }, + { byteswap_uint64_array, FALSE, "other uint64[]" }, + { zap_byteswap, TRUE, "other ZAP" }, + { zap_byteswap, TRUE, "persistent error log" }, + { byteswap_uint8_array, TRUE, "SPA history" }, + { byteswap_uint64_array, TRUE, "SPA history offsets" }, + { zap_byteswap, TRUE, "Pool properties" }, +}; + +int +dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **dbp) +{ + dnode_t *dn; + uint64_t blkid; + dmu_buf_impl_t *db; + int err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + blkid = dbuf_whichblock(dn, offset); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold(dn, blkid, tag); + rw_exit(&dn->dn_struct_rwlock); + if (db == NULL) { + err = EIO; + } else { + err = dbuf_read(db, NULL, DB_RF_CANFAIL); + if (err) { + dbuf_rele(db, tag); + db = NULL; + } + } + + dnode_rele(dn, FTAG); + *dbp = &db->db; + return (err); +} + +int +dmu_bonus_max(void) +{ + return (DN_MAX_BONUSLEN); +} + +/* + * returns ENOENT, EIO, or 0. + */ +int +dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) +{ + dnode_t *dn; + int err, count; + dmu_buf_impl_t *db; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_bonus == NULL) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_bonus == NULL) + dn->dn_bonus = dbuf_create_bonus(dn); + } + db = dn->dn_bonus; + rw_exit(&dn->dn_struct_rwlock); + mutex_enter(&db->db_mtx); + count = refcount_add(&db->db_holds, tag); + mutex_exit(&db->db_mtx); + if (count == 1) + dnode_add_ref(dn, db); + dnode_rele(dn, FTAG); + + VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); + + *dbp = &db->db; + return (0); +} + +/* + * Note: longer-term, we should modify all of the dmu_buf_*() interfaces + * to take a held dnode rather than <os, object> -- the lookup is wasteful, + * and can induce severe lock contention when writing to several files + * whose dnodes are in the same block. + */ +static int +dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dmu_buf_t **dbp; + uint64_t blkid, nblks, i; + uint32_t flags; + int err; + zio_t *zio; + + ASSERT(length <= DMU_MAX_ACCESS); + + flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; + if (length > zfetch_array_rd_sz) + flags |= DB_RF_NOPREFETCH; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_datablkshift) { + int blkshift = dn->dn_datablkshift; + nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) - + P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; + } else { + if (offset + length > dn->dn_datablksz) { + zfs_panic_recover("zfs: accessing past end of object " + "%llx/%llx (size=%u access=%llu+%llu)", + (longlong_t)dn->dn_objset-> + os_dsl_dataset->ds_object, + (longlong_t)dn->dn_object, dn->dn_datablksz, + (longlong_t)offset, (longlong_t)length); + return (EIO); + } + nblks = 1; + } + dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); + + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE); + blkid = dbuf_whichblock(dn, offset); + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); + if (db == NULL) { + rw_exit(&dn->dn_struct_rwlock); + dmu_buf_rele_array(dbp, nblks, tag); + zio_nowait(zio); + return (EIO); + } + /* initiate async i/o */ + if (read) { + rw_exit(&dn->dn_struct_rwlock); + (void) dbuf_read(db, zio, flags); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + } + dbp[i] = &db->db; + } + rw_exit(&dn->dn_struct_rwlock); + + /* wait for async i/o */ + err = zio_wait(zio); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + + /* wait for other io to complete */ + if (read) { + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || + db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) + err = EIO; + mutex_exit(&db->db_mtx); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + } + } + + *numbufsp = nblks; + *dbpp = dbp; + return (0); +} + +static int +dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp); + + dnode_rele(dn, FTAG); + + return (err); +} + +int +dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + int err; + + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp); + + return (err); +} + +void +dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) +{ + int i; + dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; + + if (numbufs == 0) + return; + + for (i = 0; i < numbufs; i++) { + if (dbp[i]) + dbuf_rele(dbp[i], tag); + } + + kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); +} + +void +dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) +{ + dnode_t *dn; + uint64_t blkid; + int nblks, i, err; + + if (zfs_prefetch_disable) + return; + + if (len == 0) { /* they're interested in the bonus buffer */ + dn = os->os->os_meta_dnode; + + if (object == 0 || object >= DN_MAX_OBJECT) + return; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, blkid); + rw_exit(&dn->dn_struct_rwlock); + return; + } + + /* + * XXX - Note, if the dnode for the requested object is not + * already cached, we will do a *synchronous* read in the + * dnode_hold() call. The same is true for any indirects. + */ + err = dnode_hold(os->os, object, FTAG, &dn); + if (err != 0) + return; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_datablkshift) { + int blkshift = dn->dn_datablkshift; + nblks = (P2ROUNDUP(offset+len, 1<<blkshift) - + P2ALIGN(offset, 1<<blkshift)) >> blkshift; + } else { + nblks = (offset < dn->dn_datablksz); + } + + if (nblks != 0) { + blkid = dbuf_whichblock(dn, offset); + for (i = 0; i < nblks; i++) + dbuf_prefetch(dn, blkid+i); + } + + rw_exit(&dn->dn_struct_rwlock); + + dnode_rele(dn, FTAG); +} + +int +dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, dmu_tx_t *tx) +{ + dnode_t *dn; + int err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + ASSERT(offset < UINT64_MAX); + ASSERT(size == -1ULL || size <= UINT64_MAX - offset); + dnode_free_range(dn, offset, size, tx); + dnode_rele(dn, FTAG); + return (0); +} + +int +dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf) +{ + dnode_t *dn; + dmu_buf_t **dbp; + int numbufs, i, err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + + /* + * Deal with odd block sizes, where there can't be data past the first + * block. If we ever do the tail block optimization, we will need to + * handle that here as well. + */ + if (dn->dn_datablkshift == 0) { + int newsz = offset > dn->dn_datablksz ? 0 : + MIN(size, dn->dn_datablksz - offset); + bzero((char *)buf + newsz, size - newsz); + size = newsz; + } + + while (size > 0) { + uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); + int err; + + /* + * NB: we could do this block-at-a-time, but it's nice + * to be reading in parallel. + */ + err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, + TRUE, FTAG, &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + bcopy((char *)db->db_data + bufoff, buf, tocpy); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + } + dnode_rele(dn, FTAG); + return (0); +} + +void +dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + + if (size == 0) + return; + + VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + bcopy(buf, (char *)db->db_data + bufoff, tocpy); + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +#ifdef _KERNEL +int +dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) +{ + dmu_buf_t **dbp; + int numbufs, i, err; + + /* + * NB: we could do this block-at-a-time, but it's nice + * to be reading in parallel. + */ + err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, + &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = uio->uio_loffset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_READ, uio); + if (err) + break; + + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (err); +} + +int +dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, + dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + int err = 0; + + if (size == 0) + return (0); + + err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, + FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = uio->uio_loffset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + /* + * XXX uiomove could block forever (eg. nfs-backed + * pages). There needs to be a uiolockdown() function + * to lock the pages in memory, so that uiomove won't + * block. + */ + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_WRITE, uio); + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + if (err) + break; + + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} + +#ifndef __FreeBSD__ +int +dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + page_t *pp, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + int err; + + if (size == 0) + return (0); + + err = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy, copied, thiscpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + caddr_t va; + + ASSERT(size > 0); + ASSERT3U(db->db_size, >=, PAGESIZE); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + for (copied = 0; copied < tocpy; copied += PAGESIZE) { + ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); + thiscpy = MIN(PAGESIZE, tocpy - copied); + va = ppmapin(pp, PROT_READ, (caddr_t)-1); + bcopy(va, (char *)db->db_data + bufoff, thiscpy); + ppmapout(va); + pp = pp->p_next; + bufoff += PAGESIZE; + } + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + if (err) + break; + + offset += tocpy; + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} +#endif /* !__FreeBSD__ */ +#endif /* _KERNEL */ + +typedef struct { + dbuf_dirty_record_t *dr; + dmu_sync_cb_t *done; + void *arg; +} dmu_sync_arg_t; + +/* ARGSUSED */ +static void +dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) +{ + dmu_sync_arg_t *in = varg; + dbuf_dirty_record_t *dr = in->dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + dmu_sync_cb_t *done = in->done; + + if (!BP_IS_HOLE(zio->io_bp)) { + zio->io_bp->blk_fill = 1; + BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type); + BP_SET_LEVEL(zio->io_bp, 0); + } + + mutex_enter(&db->db_mtx); + ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); + dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ + dr->dt.dl.dr_override_state = DR_OVERRIDDEN; + cv_broadcast(&db->db_changed); + mutex_exit(&db->db_mtx); + + if (done) + done(&(db->db), in->arg); + + kmem_free(in, sizeof (dmu_sync_arg_t)); +} + +/* + * Intent log support: sync the block associated with db to disk. + * N.B. and XXX: the caller is responsible for making sure that the + * data isn't changing while dmu_sync() is writing it. + * + * Return values: + * + * EEXIST: this txg has already been synced, so there's nothing to to. + * The caller should not log the write. + * + * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. + * The caller should not log the write. + * + * EALREADY: this block is already in the process of being synced. + * The caller should track its progress (somehow). + * + * EINPROGRESS: the IO has been initiated. + * The caller should log this blkptr in the callback. + * + * 0: completed. Sets *bp to the blkptr just written. + * The caller should log this blkptr immediately. + */ +int +dmu_sync(zio_t *pio, dmu_buf_t *db_fake, + blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + objset_impl_t *os = db->db_objset; + dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; + tx_state_t *tx = &dp->dp_tx; + dbuf_dirty_record_t *dr; + dmu_sync_arg_t *in; + zbookmark_t zb; + zio_t *zio; + int zio_flags; + int err; + + ASSERT(BP_IS_HOLE(bp)); + ASSERT(txg != 0); + + + dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", + txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); + + /* + * XXX - would be nice if we could do this without suspending... + */ + txg_suspend(dp); + + /* + * If this txg already synced, there's nothing to do. + */ + if (txg <= tx->tx_synced_txg) { + txg_resume(dp); + /* + * If we're running ziltest, we need the blkptr regardless. + */ + if (txg > spa_freeze_txg(dp->dp_spa)) { + /* if db_blkptr == NULL, this was an empty write */ + if (db->db_blkptr) + *bp = *db->db_blkptr; /* structure assignment */ + return (0); + } + return (EEXIST); + } + + mutex_enter(&db->db_mtx); + + if (txg == tx->tx_syncing_txg) { + while (db->db_data_pending) { + /* + * IO is in-progress. Wait for it to finish. + * XXX - would be nice to be able to somehow "attach" + * this zio to the parent zio passed in. + */ + cv_wait(&db->db_changed, &db->db_mtx); + if (!db->db_data_pending && + db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) { + /* + * IO was compressed away + */ + *bp = *db->db_blkptr; /* structure assignment */ + mutex_exit(&db->db_mtx); + txg_resume(dp); + return (0); + } + ASSERT(db->db_data_pending || + (db->db_blkptr && db->db_blkptr->blk_birth == txg)); + } + + if (db->db_blkptr && db->db_blkptr->blk_birth == txg) { + /* + * IO is already completed. + */ + *bp = *db->db_blkptr; /* structure assignment */ + mutex_exit(&db->db_mtx); + txg_resume(dp); + return (0); + } + } + + dr = db->db_last_dirty; + while (dr && dr->dr_txg > txg) + dr = dr->dr_next; + if (dr == NULL || dr->dr_txg < txg) { + /* + * This dbuf isn't dirty, must have been free_range'd. + * There's no need to log writes to freed blocks, so we're done. + */ + mutex_exit(&db->db_mtx); + txg_resume(dp); + return (ENOENT); + } + + ASSERT(dr->dr_txg == txg); + if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { + /* + * We have already issued a sync write for this buffer. + */ + mutex_exit(&db->db_mtx); + txg_resume(dp); + return (EALREADY); + } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* + * This buffer has already been synced. It could not + * have been dirtied since, or we would have cleared the state. + */ + *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */ + mutex_exit(&db->db_mtx); + txg_resume(dp); + return (0); + } + + dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; + in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + in->dr = dr; + in->done = done; + in->arg = arg; + mutex_exit(&db->db_mtx); + txg_resume(dp); + + zb.zb_objset = os->os_dsl_dataset->ds_object; + zb.zb_object = db->db.db_object; + zb.zb_level = db->db_level; + zb.zb_blkid = db->db_blkid; + zio_flags = ZIO_FLAG_MUSTSUCCEED; + if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0) + zio_flags |= ZIO_FLAG_METADATA; + zio = arc_write(pio, os->os_spa, + zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum), + zio_compress_select(db->db_dnode->dn_compress, os->os_compress), + dmu_get_replication_level(os, &zb, db->db_dnode->dn_type), + txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in, + ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb); + + if (pio) { + zio_nowait(zio); + err = EINPROGRESS; + } else { + err = zio_wait(zio); + ASSERT(err == 0); + } + return (err); +} + +int +dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, + dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + err = dnode_set_blksz(dn, size, ibs, tx); + dnode_rele(dn, FTAG); + return (err); +} + +void +dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, + dmu_tx_t *tx) +{ + dnode_t *dn; + + /* XXX assumes dnode_hold will not get an i/o error */ + (void) dnode_hold(os->os, object, FTAG, &dn); + ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); + dn->dn_checksum = checksum; + dnode_setdirty(dn, tx); + dnode_rele(dn, FTAG); +} + +void +dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, + dmu_tx_t *tx) +{ + dnode_t *dn; + + /* XXX assumes dnode_hold will not get an i/o error */ + (void) dnode_hold(os->os, object, FTAG, &dn); + ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); + dn->dn_compress = compress; + dnode_setdirty(dn, tx); + dnode_rele(dn, FTAG); +} + +int +dmu_get_replication_level(objset_impl_t *os, + zbookmark_t *zb, dmu_object_type_t ot) +{ + int ncopies = os->os_copies; + + /* If it's the mos, it should have max copies set. */ + ASSERT(zb->zb_objset != 0 || + ncopies == spa_max_replication(os->os_spa)); + + if (dmu_ot[ot].ot_metadata || zb->zb_level != 0) + ncopies++; + return (MIN(ncopies, spa_max_replication(os->os_spa))); +} + +int +dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) +{ + dnode_t *dn; + int i, err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + /* + * Sync any current changes before + * we go trundling through the block pointers. + */ + for (i = 0; i < TXG_SIZE; i++) { + if (list_link_active(&dn->dn_dirty_link[i])) + break; + } + if (i != TXG_SIZE) { + dnode_rele(dn, FTAG); + txg_wait_synced(dmu_objset_pool(os), 0); + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + } + + err = dnode_next_offset(dn, hole, off, 1, 1, 0); + dnode_rele(dn, FTAG); + + return (err); +} + +void +dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +{ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + mutex_enter(&dn->dn_mtx); + + doi->doi_data_block_size = dn->dn_datablksz; + doi->doi_metadata_block_size = dn->dn_indblkshift ? + 1ULL << dn->dn_indblkshift : 0; + doi->doi_indirection = dn->dn_nlevels; + doi->doi_checksum = dn->dn_checksum; + doi->doi_compress = dn->dn_compress; + doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) + + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT; + doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; + doi->doi_type = dn->dn_type; + doi->doi_bonus_size = dn->dn_bonuslen; + doi->doi_bonus_type = dn->dn_bonustype; + + mutex_exit(&dn->dn_mtx); + rw_exit(&dn->dn_struct_rwlock); +} + +/* + * Get information on a DMU object. + * If doi is NULL, just indicates whether the object exists. + */ +int +dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) +{ + dnode_t *dn; + int err = dnode_hold(os->os, object, FTAG, &dn); + + if (err) + return (err); + + if (doi != NULL) + dmu_object_info_from_dnode(dn, doi); + + dnode_rele(dn, FTAG); + return (0); +} + +/* + * As above, but faster; can be used when you have a held dbuf in hand. + */ +void +dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) +{ + dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); +} + +/* + * Faster still when you only care about the size. + * This is specifically optimized for zfs_getattr(). + */ +void +dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) +{ + dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + + *blksize = dn->dn_datablksz; + /* add 1 for dnode space */ + *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> + SPA_MINBLOCKSHIFT) + 1; +} + +void +byteswap_uint64_array(void *vbuf, size_t size) +{ + uint64_t *buf = vbuf; + size_t count = size >> 3; + int i; + + ASSERT((size & 7) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_64(buf[i]); +} + +void +byteswap_uint32_array(void *vbuf, size_t size) +{ + uint32_t *buf = vbuf; + size_t count = size >> 2; + int i; + + ASSERT((size & 3) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_32(buf[i]); +} + +void +byteswap_uint16_array(void *vbuf, size_t size) +{ + uint16_t *buf = vbuf; + size_t count = size >> 1; + int i; + + ASSERT((size & 1) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_16(buf[i]); +} + +/* ARGSUSED */ +void +byteswap_uint8_array(void *vbuf, size_t size) +{ +} + +void +dmu_init(void) +{ + dbuf_init(); + dnode_init(); + arc_init(); +} + +void +dmu_fini(void) +{ + arc_fini(); + dnode_fini(); + dbuf_fini(); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c new file mode 100644 index 0000000..93168cc --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c @@ -0,0 +1,160 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dnode.h> + +uint64_t +dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + objset_impl_t *osi = os->os; + uint64_t object; + uint64_t L2_dnode_count = DNODES_PER_BLOCK << + (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); + dnode_t *dn = NULL; + int restarted = B_FALSE; + + mutex_enter(&osi->os_obj_lock); + for (;;) { + object = osi->os_obj_next; + /* + * Each time we polish off an L2 bp worth of dnodes + * (2^13 objects), move to another L2 bp that's still + * reasonably sparse (at most 1/4 full). Look from the + * beginning once, but after that keep looking from here. + * If we can't find one, just keep going from here. + */ + if (P2PHASE(object, L2_dnode_count) == 0) { + uint64_t offset = restarted ? object << DNODE_SHIFT : 0; + int error = dnode_next_offset(osi->os_meta_dnode, + B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0); + restarted = B_TRUE; + if (error == 0) + object = offset >> DNODE_SHIFT; + } + osi->os_obj_next = ++object; + + /* + * XXX We should check for an i/o error here and return + * up to our caller. Actually we should pre-read it in + * dmu_tx_assign(), but there is currently no mechanism + * to do so. + */ + (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, + FTAG, &dn); + if (dn) + break; + + if (dmu_object_next(os, &object, B_TRUE, 0) == 0) + osi->os_obj_next = object - 1; + } + + dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dnode_rele(dn, FTAG); + + mutex_exit(&osi->os_obj_lock); + + dmu_tx_add_new_object(tx, os, object); + return (object); +} + +int +dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) + return (EBADF); + + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn); + if (err) + return (err); + dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dnode_rele(dn, FTAG); + + dmu_tx_add_new_object(tx, os, object); + return (0); +} + +int +dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) + return (EBADF); + + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err) + return (err); + dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); + dnode_rele(dn, FTAG); + + return (0); +} + +int +dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err) + return (err); + + ASSERT(dn->dn_type != DMU_OT_NONE); + dnode_free(dn, tx); + dnode_rele(dn, FTAG); + + return (0); +} + +int +dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) +{ + uint64_t offset = (*objectp + 1) << DNODE_SHIFT; + int error; + + error = dnode_next_offset(os->os->os_meta_dnode, + hole, &offset, 0, DNODES_PER_BLOCK, txg); + + *objectp = offset >> DNODE_SHIFT; + + return (error); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c new file mode 100644 index 0000000..07f8c86 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -0,0 +1,1034 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/dnode.h> +#include <sys/dbuf.h> +#include <sys/zvol.h> +#include <sys/dmu_tx.h> +#include <sys/zio_checksum.h> +#include <sys/zap.h> +#include <sys/zil.h> +#include <sys/dmu_impl.h> + + +spa_t * +dmu_objset_spa(objset_t *os) +{ + return (os->os->os_spa); +} + +zilog_t * +dmu_objset_zil(objset_t *os) +{ + return (os->os->os_zil); +} + +dsl_pool_t * +dmu_objset_pool(objset_t *os) +{ + dsl_dataset_t *ds; + + if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir) + return (ds->ds_dir->dd_pool); + else + return (spa_get_dsl(os->os->os_spa)); +} + +dsl_dataset_t * +dmu_objset_ds(objset_t *os) +{ + return (os->os->os_dsl_dataset); +} + +dmu_objset_type_t +dmu_objset_type(objset_t *os) +{ + return (os->os->os_phys->os_type); +} + +void +dmu_objset_name(objset_t *os, char *buf) +{ + dsl_dataset_name(os->os->os_dsl_dataset, buf); +} + +uint64_t +dmu_objset_id(objset_t *os) +{ + dsl_dataset_t *ds = os->os->os_dsl_dataset; + + return (ds ? ds->ds_object : 0); +} + +static void +checksum_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval != ZIO_CHECKSUM_INHERIT); + + osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); +} + +static void +compression_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval != ZIO_COMPRESS_INHERIT); + + osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); +} + +static void +copies_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval > 0); + ASSERT(newval <= spa_max_replication(osi->os_spa)); + + osi->os_copies = newval; +} + +void +dmu_objset_byteswap(void *buf, size_t size) +{ + objset_phys_t *osp = buf; + + ASSERT(size == sizeof (objset_phys_t)); + dnode_byteswap(&osp->os_meta_dnode); + byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); + osp->os_type = BSWAP_64(osp->os_type); +} + +int +dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + objset_impl_t **osip) +{ + objset_impl_t *winner, *osi; + int i, err, checksum; + + osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP); + osi->os.os = osi; + osi->os_dsl_dataset = ds; + osi->os_spa = spa; + osi->os_rootbp = bp; + if (!BP_IS_HOLE(osi->os_rootbp)) { + uint32_t aflags = ARC_WAIT; + zbookmark_t zb; + zb.zb_objset = ds ? ds->ds_object : 0; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = 0; + + dprintf_bp(osi->os_rootbp, "reading %s", ""); + err = arc_read(NULL, spa, osi->os_rootbp, + dmu_ot[DMU_OT_OBJSET].ot_byteswap, + arc_getbuf_func, &osi->os_phys_buf, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); + if (err) { + kmem_free(osi, sizeof (objset_impl_t)); + return (err); + } + osi->os_phys = osi->os_phys_buf->b_data; + arc_release(osi->os_phys_buf, &osi->os_phys_buf); + } else { + osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t), + &osi->os_phys_buf, ARC_BUFC_METADATA); + osi->os_phys = osi->os_phys_buf->b_data; + bzero(osi->os_phys, sizeof (objset_phys_t)); + } + + /* + * Note: the changed_cb will be called once before the register + * func returns, thus changing the checksum/compression from the + * default (fletcher2/off). Snapshots don't need to know, and + * registering would complicate clone promotion. + */ + if (ds && ds->ds_phys->ds_num_children == 0) { + err = dsl_prop_register(ds, "checksum", + checksum_changed_cb, osi); + if (err == 0) + err = dsl_prop_register(ds, "compression", + compression_changed_cb, osi); + if (err == 0) + err = dsl_prop_register(ds, "copies", + copies_changed_cb, osi); + if (err) { + VERIFY(arc_buf_remove_ref(osi->os_phys_buf, + &osi->os_phys_buf) == 1); + kmem_free(osi, sizeof (objset_impl_t)); + return (err); + } + } else if (ds == NULL) { + /* It's the meta-objset. */ + osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4; + osi->os_compress = ZIO_COMPRESS_LZJB; + osi->os_copies = spa_max_replication(spa); + } + + osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header); + + /* + * Metadata always gets compressed and checksummed. + * If the data checksum is multi-bit correctable, and it's not + * a ZBT-style checksum, then it's suitable for metadata as well. + * Otherwise, the metadata checksum defaults to fletcher4. + */ + checksum = osi->os_checksum; + + if (zio_checksum_table[checksum].ci_correctable && + !zio_checksum_table[checksum].ci_zbt) + osi->os_md_checksum = checksum; + else + osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4; + osi->os_md_compress = ZIO_COMPRESS_LZJB; + + for (i = 0; i < TXG_SIZE; i++) { + list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[i])); + list_create(&osi->os_free_dnodes[i], sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[i])); + } + list_create(&osi->os_dnodes, sizeof (dnode_t), + offsetof(dnode_t, dn_link)); + list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + + mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); + + osi->os_meta_dnode = dnode_special_open(osi, + &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); + + if (ds != NULL) { + winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict); + if (winner) { + dmu_objset_evict(ds, osi); + osi = winner; + } + } + + *osip = osi; + return (0); +} + +/* called from zpl */ +int +dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, + objset_t **osp) +{ + dsl_dataset_t *ds; + int err; + objset_t *os; + objset_impl_t *osi; + + os = kmem_alloc(sizeof (objset_t), KM_SLEEP); + err = dsl_dataset_open(name, mode, os, &ds); + if (err) { + kmem_free(os, sizeof (objset_t)); + return (err); + } + + osi = dsl_dataset_get_user_ptr(ds); + if (osi == NULL) { + err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), + ds, &ds->ds_phys->ds_bp, &osi); + if (err) { + dsl_dataset_close(ds, mode, os); + kmem_free(os, sizeof (objset_t)); + return (err); + } + } + + os->os = osi; + os->os_mode = mode; + + if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) { + dmu_objset_close(os); + return (EINVAL); + } + *osp = os; + return (0); +} + +void +dmu_objset_close(objset_t *os) +{ + dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os); + kmem_free(os, sizeof (objset_t)); +} + +int +dmu_objset_evict_dbufs(objset_t *os, int try) +{ + objset_impl_t *osi = os->os; + dnode_t *dn; + + mutex_enter(&osi->os_lock); + + /* process the mdn last, since the other dnodes have holds on it */ + list_remove(&osi->os_dnodes, osi->os_meta_dnode); + list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode); + + /* + * Find the first dnode with holds. We have to do this dance + * because dnode_add_ref() only works if you already have a + * hold. If there are no holds then it has no dbufs so OK to + * skip. + */ + for (dn = list_head(&osi->os_dnodes); + dn && refcount_is_zero(&dn->dn_holds); + dn = list_next(&osi->os_dnodes, dn)) + continue; + if (dn) + dnode_add_ref(dn, FTAG); + + while (dn) { + dnode_t *next_dn = dn; + + do { + next_dn = list_next(&osi->os_dnodes, next_dn); + } while (next_dn && refcount_is_zero(&next_dn->dn_holds)); + if (next_dn) + dnode_add_ref(next_dn, FTAG); + + mutex_exit(&osi->os_lock); + if (dnode_evict_dbufs(dn, try)) { + dnode_rele(dn, FTAG); + if (next_dn) + dnode_rele(next_dn, FTAG); + return (1); + } + dnode_rele(dn, FTAG); + mutex_enter(&osi->os_lock); + dn = next_dn; + } + mutex_exit(&osi->os_lock); + return (0); +} + +void +dmu_objset_evict(dsl_dataset_t *ds, void *arg) +{ + objset_impl_t *osi = arg; + objset_t os; + int i; + + for (i = 0; i < TXG_SIZE; i++) { + ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL); + ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL); + } + + if (ds && ds->ds_phys->ds_num_children == 0) { + VERIFY(0 == dsl_prop_unregister(ds, "checksum", + checksum_changed_cb, osi)); + VERIFY(0 == dsl_prop_unregister(ds, "compression", + compression_changed_cb, osi)); + VERIFY(0 == dsl_prop_unregister(ds, "copies", + copies_changed_cb, osi)); + } + + /* + * We should need only a single pass over the dnode list, since + * nothing can be added to the list at this point. + */ + os.os = osi; + (void) dmu_objset_evict_dbufs(&os, 0); + + ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); + ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); + ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL); + + dnode_special_close(osi->os_meta_dnode); + zil_free(osi->os_zil); + + VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); + mutex_destroy(&osi->os_lock); + mutex_destroy(&osi->os_obj_lock); + kmem_free(osi, sizeof (objset_impl_t)); +} + +/* called from dsl for meta-objset */ +objset_impl_t * +dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + dmu_objset_type_t type, dmu_tx_t *tx) +{ + objset_impl_t *osi; + dnode_t *mdn; + + ASSERT(dmu_tx_is_syncing(tx)); + VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi)); + mdn = osi->os_meta_dnode; + + dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, + DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); + + /* + * We don't want to have to increase the meta-dnode's nlevels + * later, because then we could do it in quescing context while + * we are also accessing it in open context. + * + * This precaution is not necessary for the MOS (ds == NULL), + * because the MOS is only updated in syncing context. + * This is most fortunate: the MOS is the only objset that + * needs to be synced multiple times as spa_sync() iterates + * to convergence, so minimizing its dn_nlevels matters. + */ + if (ds != NULL) { + int levels = 1; + + /* + * Determine the number of levels necessary for the meta-dnode + * to contain DN_MAX_OBJECT dnodes. + */ + while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < + DN_MAX_OBJECT * sizeof (dnode_phys_t)) + levels++; + + mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = + mdn->dn_nlevels = levels; + } + + ASSERT(type != DMU_OST_NONE); + ASSERT(type != DMU_OST_ANY); + ASSERT(type < DMU_OST_NUMTYPES); + osi->os_phys->os_type = type; + + dsl_dataset_dirty(ds, tx); + + return (osi); +} + +struct oscarg { + void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx); + void *userarg; + dsl_dataset_t *clone_parent; + const char *lastname; + dmu_objset_type_t type; +}; + +/* ARGSUSED */ +static int +dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct oscarg *oa = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + int err; + uint64_t ddobj; + + err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, + oa->lastname, sizeof (uint64_t), 1, &ddobj); + if (err != ENOENT) + return (err ? err : EEXIST); + + if (oa->clone_parent != NULL) { + /* + * You can't clone across pools. + */ + if (oa->clone_parent->ds_dir->dd_pool != dd->dd_pool) + return (EXDEV); + + /* + * You can only clone snapshots, not the head datasets. + */ + if (oa->clone_parent->ds_phys->ds_num_children == 0) + return (EINVAL); + } + return (0); +} + +static void +dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct oscarg *oa = arg2; + dsl_dataset_t *ds; + blkptr_t *bp; + uint64_t dsobj; + + ASSERT(dmu_tx_is_syncing(tx)); + + dsobj = dsl_dataset_create_sync(dd, oa->lastname, + oa->clone_parent, tx); + + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL, + DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds)); + bp = dsl_dataset_get_blkptr(ds); + if (BP_IS_HOLE(bp)) { + objset_impl_t *osi; + + /* This is an empty dmu_objset; not a clone. */ + osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds), + ds, bp, oa->type, tx); + + if (oa->userfunc) + oa->userfunc(&osi->os, oa->userarg, tx); + } + dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG); +} + +int +dmu_objset_create(const char *name, dmu_objset_type_t type, + objset_t *clone_parent, + void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg) +{ + dsl_dir_t *pdd; + const char *tail; + int err = 0; + struct oscarg oa = { 0 }; + + ASSERT(strchr(name, '@') == NULL); + err = dsl_dir_open(name, FTAG, &pdd, &tail); + if (err) + return (err); + if (tail == NULL) { + dsl_dir_close(pdd, FTAG); + return (EEXIST); + } + + dprintf("name=%s\n", name); + + oa.userfunc = func; + oa.userarg = arg; + oa.lastname = tail; + oa.type = type; + if (clone_parent != NULL) { + /* + * You can't clone to a different type. + */ + if (clone_parent->os->os_phys->os_type != type) { + dsl_dir_close(pdd, FTAG); + return (EINVAL); + } + oa.clone_parent = clone_parent->os->os_dsl_dataset; + } + err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, + dmu_objset_create_sync, pdd, &oa, 5); + dsl_dir_close(pdd, FTAG); + return (err); +} + +int +dmu_objset_destroy(const char *name) +{ + objset_t *os; + int error; + + /* + * If it looks like we'll be able to destroy it, and there's + * an unplayed replay log sitting around, destroy the log. + * It would be nicer to do this in dsl_dataset_destroy_sync(), + * but the replay log objset is modified in open context. + */ + error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os); + if (error == 0) { + zil_destroy(dmu_objset_zil(os), B_FALSE); + dmu_objset_close(os); + } + + return (dsl_dataset_destroy(name)); +} + +int +dmu_objset_rollback(const char *name) +{ + int err; + objset_t *os; + + err = dmu_objset_open(name, DMU_OST_ANY, + DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os); + if (err == 0) { + err = zil_suspend(dmu_objset_zil(os)); + if (err == 0) + zil_resume(dmu_objset_zil(os)); + if (err == 0) { + /* XXX uncache everything? */ + err = dsl_dataset_rollback(os->os->os_dsl_dataset); + } + dmu_objset_close(os); + } + return (err); +} + +struct snaparg { + dsl_sync_task_group_t *dstg; + char *snapname; + char failed[MAXPATHLEN]; +}; + +static int +dmu_objset_snapshot_one(char *name, void *arg) +{ + struct snaparg *sn = arg; + objset_t *os; + dmu_objset_stats_t stat; + int err; + + (void) strcpy(sn->failed, name); + + err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_STANDARD, &os); + if (err != 0) + return (err); + + /* + * If the objset is in an inconsistent state, return busy. + */ + dmu_objset_fast_stat(os, &stat); + if (stat.dds_inconsistent) { + dmu_objset_close(os); + return (EBUSY); + } + + /* + * NB: we need to wait for all in-flight changes to get to disk, + * so that we snapshot those changes. zil_suspend does this as + * a side effect. + */ + err = zil_suspend(dmu_objset_zil(os)); + if (err == 0) { + dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check, + dsl_dataset_snapshot_sync, os, sn->snapname, 3); + } else { + dmu_objset_close(os); + } + + return (err); +} + +int +dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) +{ + dsl_sync_task_t *dst; + struct snaparg sn = { 0 }; + char *cp; + spa_t *spa; + int err; + + (void) strcpy(sn.failed, fsname); + + cp = strchr(fsname, '/'); + if (cp) { + *cp = '\0'; + err = spa_open(fsname, &spa, FTAG); + *cp = '/'; + } else { + err = spa_open(fsname, &spa, FTAG); + } + if (err) + return (err); + + sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + sn.snapname = snapname; + + if (recursive) { + err = dmu_objset_find(fsname, + dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN); + } else { + err = dmu_objset_snapshot_one(fsname, &sn); + } + + if (err) + goto out; + + err = dsl_sync_task_group_wait(sn.dstg); + + for (dst = list_head(&sn.dstg->dstg_tasks); dst; + dst = list_next(&sn.dstg->dstg_tasks, dst)) { + objset_t *os = dst->dst_arg1; + if (dst->dst_err) + dmu_objset_name(os, sn.failed); + zil_resume(dmu_objset_zil(os)); + dmu_objset_close(os); + } +out: + if (err) + (void) strcpy(fsname, sn.failed); + dsl_sync_task_group_destroy(sn.dstg); + spa_close(spa, FTAG); + return (err); +} + +static void +dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) +{ + dnode_t *dn; + + while (dn = list_head(list)) { + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); + ASSERT(dn->dn_dbuf->db_data_pending); + /* + * Initialize dn_zio outside dnode_sync() + * to accomodate meta-dnode + */ + dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; + ASSERT(dn->dn_zio); + + ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); + list_remove(list, dn); + dnode_sync(dn, tx); + } +} + +/* ARGSUSED */ +static void +ready(zio_t *zio, arc_buf_t *abuf, void *arg) +{ + objset_impl_t *os = arg; + blkptr_t *bp = os->os_rootbp; + dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; + int i; + + /* + * Update rootbp fill count. + */ + bp->blk_fill = 1; /* count the meta-dnode */ + for (i = 0; i < dnp->dn_nblkptr; i++) + bp->blk_fill += dnp->dn_blkptr[i].blk_fill; +} + +/* ARGSUSED */ +static void +killer(zio_t *zio, arc_buf_t *abuf, void *arg) +{ + objset_impl_t *os = arg; + + ASSERT3U(zio->io_error, ==, 0); + + BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET); + BP_SET_LEVEL(zio->io_bp, 0); + + if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), + BP_IDENTITY(&zio->io_bp_orig))) { + if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) + dsl_dataset_block_kill(os->os_dsl_dataset, + &zio->io_bp_orig, NULL, os->os_synctx); + dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp, + os->os_synctx); + } + arc_release(os->os_phys_buf, &os->os_phys_buf); +} + +/* called from dsl */ +void +dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) +{ + int txgoff; + zbookmark_t zb; + zio_t *zio; + list_t *list; + dbuf_dirty_record_t *dr; + int zio_flags; + + dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); + + ASSERT(dmu_tx_is_syncing(tx)); + /* XXX the write_done callback should really give us the tx... */ + os->os_synctx = tx; + + if (os->os_dsl_dataset == NULL) { + /* + * This is the MOS. If we have upgraded, + * spa_max_replication() could change, so reset + * os_copies here. + */ + os->os_copies = spa_max_replication(os->os_spa); + } + + /* + * Create the root block IO + */ + zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = 0; + zio_flags = ZIO_FLAG_MUSTSUCCEED; + if (dmu_ot[DMU_OT_OBJSET].ot_metadata || zb.zb_level != 0) + zio_flags |= ZIO_FLAG_METADATA; + if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) + dsl_dataset_block_kill(os->os_dsl_dataset, + os->os_rootbp, pio, tx); + zio = arc_write(pio, os->os_spa, os->os_md_checksum, + os->os_md_compress, + dmu_get_replication_level(os, &zb, DMU_OT_OBJSET), + tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os, + ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb); + + /* + * Sync meta-dnode - the parent IO for the sync is the root block + */ + os->os_meta_dnode->dn_zio = zio; + dnode_sync(os->os_meta_dnode, tx); + + txgoff = tx->tx_txg & TXG_MASK; + + dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx); + dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx); + + list = &os->os_meta_dnode->dn_dirty_records[txgoff]; + while (dr = list_head(list)) { + ASSERT(dr->dr_dbuf->db_level == 0); + list_remove(list, dr); + if (dr->dr_zio) + zio_nowait(dr->dr_zio); + } + /* + * Free intent log blocks up to this tx. + */ + zil_sync(os->os_zil, tx); + zio_nowait(zio); +} + +void +dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp) +{ + dsl_dataset_space(os->os->os_dsl_dataset, refdbytesp, availbytesp, + usedobjsp, availobjsp); +} + +uint64_t +dmu_objset_fsid_guid(objset_t *os) +{ + return (dsl_dataset_fsid_guid(os->os->os_dsl_dataset)); +} + +void +dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) +{ + stat->dds_type = os->os->os_phys->os_type; + if (os->os->os_dsl_dataset) + dsl_dataset_fast_stat(os->os->os_dsl_dataset, stat); +} + +void +dmu_objset_stats(objset_t *os, nvlist_t *nv) +{ + ASSERT(os->os->os_dsl_dataset || + os->os->os_phys->os_type == DMU_OST_META); + + if (os->os->os_dsl_dataset != NULL) + dsl_dataset_stats(os->os->os_dsl_dataset, nv); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, + os->os->os_phys->os_type); +} + +int +dmu_objset_is_snapshot(objset_t *os) +{ + if (os->os->os_dsl_dataset != NULL) + return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset)); + else + return (B_FALSE); +} + +int +dmu_snapshot_list_next(objset_t *os, int namelen, char *name, + uint64_t *idp, uint64_t *offp) +{ + dsl_dataset_t *ds = os->os->os_dsl_dataset; + zap_cursor_t cursor; + zap_attribute_t attr; + + if (ds->ds_phys->ds_snapnames_zapobj == 0) + return (ENOENT); + + zap_cursor_init_serialized(&cursor, + ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, *offp); + + if (zap_cursor_retrieve(&cursor, &attr) != 0) { + zap_cursor_fini(&cursor); + return (ENOENT); + } + + if (strlen(attr.za_name) + 1 > namelen) { + zap_cursor_fini(&cursor); + return (ENAMETOOLONG); + } + + (void) strcpy(name, attr.za_name); + if (idp) + *idp = attr.za_first_integer; + zap_cursor_advance(&cursor); + *offp = zap_cursor_serialize(&cursor); + zap_cursor_fini(&cursor); + + return (0); +} + +int +dmu_dir_list_next(objset_t *os, int namelen, char *name, + uint64_t *idp, uint64_t *offp) +{ + dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir; + zap_cursor_t cursor; + zap_attribute_t attr; + + /* there is no next dir on a snapshot! */ + if (os->os->os_dsl_dataset->ds_object != + dd->dd_phys->dd_head_dataset_obj) + return (ENOENT); + + zap_cursor_init_serialized(&cursor, + dd->dd_pool->dp_meta_objset, + dd->dd_phys->dd_child_dir_zapobj, *offp); + + if (zap_cursor_retrieve(&cursor, &attr) != 0) { + zap_cursor_fini(&cursor); + return (ENOENT); + } + + if (strlen(attr.za_name) + 1 > namelen) { + zap_cursor_fini(&cursor); + return (ENAMETOOLONG); + } + + (void) strcpy(name, attr.za_name); + if (idp) + *idp = attr.za_first_integer; + zap_cursor_advance(&cursor); + *offp = zap_cursor_serialize(&cursor); + zap_cursor_fini(&cursor); + + return (0); +} + +/* + * Find all objsets under name, and for each, call 'func(child_name, arg)'. + */ +int +dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) +{ + dsl_dir_t *dd; + objset_t *os; + uint64_t snapobj; + zap_cursor_t zc; + zap_attribute_t attr; + char *child; + int do_self, err; + + err = dsl_dir_open(name, FTAG, &dd, NULL); + if (err) + return (err); + + /* NB: the $MOS dir doesn't have a head dataset */ + do_self = (dd->dd_phys->dd_head_dataset_obj != 0); + + /* + * Iterate over all children. + */ + if (flags & DS_FIND_CHILDREN) { + for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, + dd->dd_phys->dd_child_dir_zapobj); + zap_cursor_retrieve(&zc, &attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT(attr.za_integer_length == sizeof (uint64_t)); + ASSERT(attr.za_num_integers == 1); + + /* + * No separating '/' because parent's name ends in /. + */ + child = kmem_alloc(MAXPATHLEN, KM_SLEEP); + /* XXX could probably just use name here */ + dsl_dir_name(dd, child); + (void) strcat(child, "/"); + (void) strcat(child, attr.za_name); + err = dmu_objset_find(child, func, arg, flags); + kmem_free(child, MAXPATHLEN); + if (err) + break; + } + zap_cursor_fini(&zc); + + if (err) { + dsl_dir_close(dd, FTAG); + return (err); + } + } + + /* + * Iterate over all snapshots. + */ + if ((flags & DS_FIND_SNAPSHOTS) && + dmu_objset_open(name, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) { + + snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj; + dmu_objset_close(os); + + for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj); + zap_cursor_retrieve(&zc, &attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT(attr.za_integer_length == sizeof (uint64_t)); + ASSERT(attr.za_num_integers == 1); + + child = kmem_alloc(MAXPATHLEN, KM_SLEEP); + /* XXX could probably just use name here */ + dsl_dir_name(dd, child); + (void) strcat(child, "@"); + (void) strcat(child, attr.za_name); + err = func(child, arg); + kmem_free(child, MAXPATHLEN); + if (err) + break; + } + zap_cursor_fini(&zc); + } + + dsl_dir_close(dd, FTAG); + + if (err) + return (err); + + /* + * Apply to self if appropriate. + */ + if (do_self) + err = func(name, arg); + return (err); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c new file mode 100644 index 0000000..46facc3 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -0,0 +1,1009 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> + +struct backuparg { + dmu_replay_record_t *drr; + kthread_t *td; + struct file *fp; + objset_t *os; + zio_cksum_t zc; + int err; +}; + +static int +dump_bytes(struct backuparg *ba, void *buf, int len) +{ + struct uio auio; + struct iovec aiov; + + ASSERT3U(len % 8, ==, 0); + + fletcher_4_incremental_native(buf, len, &ba->zc); + + aiov.iov_base = buf; + aiov.iov_len = len; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = len; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_offset = (off_t)-1; + auio.uio_td = ba->td; +#ifdef _KERNEL + if (ba->fp->f_type == DTYPE_VNODE) + bwillwrite(); + ba->err = fo_write(ba->fp, &auio, ba->td->td_ucred, 0, ba->td); +#else + fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); + ba->err = EOPNOTSUPP; +#endif + + return (ba->err); +} + +static int +dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, + uint64_t length) +{ + /* write a FREE record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_FREE; + ba->drr->drr_u.drr_free.drr_object = object; + ba->drr->drr_u.drr_free.drr_offset = offset; + ba->drr->drr_u.drr_free.drr_length = length; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + return (0); +} + +static int +dump_data(struct backuparg *ba, dmu_object_type_t type, + uint64_t object, uint64_t offset, int blksz, void *data) +{ + /* write a DATA record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_WRITE; + ba->drr->drr_u.drr_write.drr_object = object; + ba->drr->drr_u.drr_write.drr_type = type; + ba->drr->drr_u.drr_write.drr_offset = offset; + ba->drr->drr_u.drr_write.drr_length = blksz; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + if (dump_bytes(ba, data, blksz)) + return (EINTR); + return (0); +} + +static int +dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) +{ + /* write a FREEOBJECTS record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_FREEOBJECTS; + ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj; + ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + return (0); +} + +static int +dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) +{ + if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) + return (dump_freeobjects(ba, object, 1)); + + /* write an OBJECT record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_OBJECT; + ba->drr->drr_u.drr_object.drr_object = object; + ba->drr->drr_u.drr_object.drr_type = dnp->dn_type; + ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype; + ba->drr->drr_u.drr_object.drr_blksz = + dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen; + ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum; + ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + + if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8))) + return (EINTR); + + /* free anything past the end of the file */ + if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) + return (EINTR); + if (ba->err) + return (EINTR); + return (0); +} + +#define BP_SPAN(dnp, level) \ + (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) + +static int +backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) +{ + struct backuparg *ba = arg; + uint64_t object = bc->bc_bookmark.zb_object; + int level = bc->bc_bookmark.zb_level; + uint64_t blkid = bc->bc_bookmark.zb_blkid; + blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL; + dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; + void *data = bc->bc_data; + int err = 0; + + if (SIGPENDING(curthread)) + return (EINTR); + + ASSERT(data || bp == NULL); + + if (bp == NULL && object == 0) { + uint64_t span = BP_SPAN(bc->bc_dnode, level); + uint64_t dnobj = (blkid * span) >> DNODE_SHIFT; + err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); + } else if (bp == NULL) { + uint64_t span = BP_SPAN(bc->bc_dnode, level); + err = dump_free(ba, object, blkid * span, span); + } else if (data && level == 0 && type == DMU_OT_DNODE) { + dnode_phys_t *blk = data; + int i; + int blksz = BP_GET_LSIZE(bp); + + for (i = 0; i < blksz >> DNODE_SHIFT; i++) { + uint64_t dnobj = + (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; + err = dump_dnode(ba, dnobj, blk+i); + if (err) + break; + } + } else if (level == 0 && + type != DMU_OT_DNODE && type != DMU_OT_OBJSET) { + int blksz = BP_GET_LSIZE(bp); + if (data == NULL) { + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf; + zbookmark_t zb; + + zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object; + zb.zb_object = object; + zb.zb_level = level; + zb.zb_blkid = blkid; + (void) arc_read(NULL, spa, bp, + dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED, + &aflags, &zb); + + if (abuf) { + err = dump_data(ba, type, object, blkid * blksz, + blksz, abuf->b_data); + (void) arc_buf_remove_ref(abuf, &abuf); + } + } else { + err = dump_data(ba, type, object, blkid * blksz, + blksz, data); + } + } + + ASSERT(err == 0 || err == EINTR); + return (err); +} + +int +dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp) +{ + dsl_dataset_t *ds = tosnap->os->os_dsl_dataset; + dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL; + dmu_replay_record_t *drr; + struct backuparg ba; + int err; + + /* tosnap must be a snapshot */ + if (ds->ds_phys->ds_next_snap_obj == 0) + return (EINVAL); + + /* fromsnap must be an earlier snapshot from the same fs as tosnap */ + if (fromds && (ds->ds_dir != fromds->ds_dir || + fromds->ds_phys->ds_creation_txg >= + ds->ds_phys->ds_creation_txg)) + return (EXDEV); + + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); + drr->drr_type = DRR_BEGIN; + drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; + drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION; + drr->drr_u.drr_begin.drr_creation_time = + ds->ds_phys->ds_creation_time; + drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type; + drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; + if (fromds) + drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; + dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); + + ba.drr = drr; + ba.td = curthread; + ba.fp = fp; + ba.os = tosnap; + ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); + + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { + kmem_free(drr, sizeof (dmu_replay_record_t)); + return (ba.err); + } + + err = traverse_dsl_dataset(ds, + fromds ? fromds->ds_phys->ds_creation_txg : 0, + ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK, + backup_cb, &ba); + + if (err) { + if (err == EINTR && ba.err) + err = ba.err; + kmem_free(drr, sizeof (dmu_replay_record_t)); + return (err); + } + + bzero(drr, sizeof (dmu_replay_record_t)); + drr->drr_type = DRR_END; + drr->drr_u.drr_end.drr_checksum = ba.zc; + + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { + kmem_free(drr, sizeof (dmu_replay_record_t)); + return (ba.err); + } + + kmem_free(drr, sizeof (dmu_replay_record_t)); + + return (0); +} + +struct restorearg { + int err; + int byteswap; + kthread_t *td; + struct file *fp; + char *buf; + uint64_t voff; + int buflen; /* number of valid bytes in buf */ + int bufoff; /* next offset to read */ + int bufsize; /* amount of memory allocated for buf */ + zio_cksum_t zc; +}; + +/* ARGSUSED */ +static int +replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct drr_begin *drrb = arg2; + const char *snapname; + int err; + uint64_t val; + + /* must already be a snapshot of this fs */ + if (ds->ds_phys->ds_prev_snap_obj == 0) + return (ENODEV); + + /* most recent snapshot must match fromguid */ + if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) + return (ENODEV); + /* must not have any changes since most recent snapshot */ + if (ds->ds_phys->ds_bp.blk_birth > + ds->ds_prev->ds_phys->ds_creation_txg) + return (ETXTBSY); + + /* new snapshot name must not exist */ + snapname = strrchr(drrb->drr_toname, '@'); + if (snapname == NULL) + return (EEXIST); + + snapname++; + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); + + return (0); +} + +/* ARGSUSED */ +static void +replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; +} + +/* ARGSUSED */ +static int +replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct drr_begin *drrb = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + char *cp; + uint64_t val; + int err; + + cp = strchr(drrb->drr_toname, '@'); + *cp = '\0'; + err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, + strrchr(drrb->drr_toname, '/') + 1, + sizeof (uint64_t), 1, &val); + *cp = '@'; + + if (err != ENOENT) + return (err ? err : EEXIST); + + return (0); +} + +static void +replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct drr_begin *drrb = arg2; + char *cp; + dsl_dataset_t *ds; + uint64_t dsobj; + + cp = strchr(drrb->drr_toname, '@'); + *cp = '\0'; + dsobj = dsl_dataset_create_sync(dd, strrchr(drrb->drr_toname, '/') + 1, + NULL, tx); + *cp = '@'; + + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL, + DS_MODE_EXCLUSIVE, FTAG, &ds)); + + (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds), + ds, &ds->ds_phys->ds_bp, drrb->drr_type, tx); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); +} + +static int +replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + objset_t *os = arg1; + struct drr_begin *drrb = arg2; + char *snapname; + + /* XXX verify that drr_toname is in dd */ + + snapname = strchr(drrb->drr_toname, '@'); + if (snapname == NULL) + return (EINVAL); + snapname++; + + return (dsl_dataset_snapshot_check(os, snapname, tx)); +} + +static void +replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + objset_t *os = arg1; + struct drr_begin *drrb = arg2; + char *snapname; + dsl_dataset_t *ds, *hds; + + snapname = strchr(drrb->drr_toname, '@') + 1; + + dsl_dataset_snapshot_sync(os, snapname, tx); + + /* set snapshot's creation time and guid */ + hds = os->os->os_dsl_dataset; + VERIFY(0 == dsl_dataset_open_obj(hds->ds_dir->dd_pool, + hds->ds_phys->ds_prev_snap_obj, NULL, + DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT, + FTAG, &ds)); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_creation_time = drrb->drr_creation_time; + ds->ds_phys->ds_guid = drrb->drr_toguid; + ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + + dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG); + + dmu_buf_will_dirty(hds->ds_dbuf, tx); + hds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; +} + +static int +restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, int *resid) +{ + struct uio auio; + struct iovec aiov; + int error; + + aiov.iov_base = buf; + aiov.iov_len = len; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = len; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_offset = off; + auio.uio_td = ra->td; +#ifdef _KERNEL + error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); +#else + fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); + error = EOPNOTSUPP; +#endif + *resid = auio.uio_resid; + return (error); +} + +static void * +restore_read(struct restorearg *ra, int len) +{ + void *rv; + + /* some things will require 8-byte alignment, so everything must */ + ASSERT3U(len % 8, ==, 0); + + while (ra->buflen - ra->bufoff < len) { + int resid; + int leftover = ra->buflen - ra->bufoff; + + (void) memmove(ra->buf, ra->buf + ra->bufoff, leftover); + + ra->err = restore_bytes(ra, (caddr_t)ra->buf + leftover, + ra->bufsize - leftover, ra->voff, &resid); + + ra->voff += ra->bufsize - leftover - resid; + ra->buflen = ra->bufsize - resid; + ra->bufoff = 0; + if (resid == ra->bufsize - leftover) + ra->err = EINVAL; + if (ra->err) + return (NULL); + /* Could compute checksum here? */ + } + + ASSERT3U(ra->bufoff % 8, ==, 0); + ASSERT3U(ra->buflen - ra->bufoff, >=, len); + rv = ra->buf + ra->bufoff; + ra->bufoff += len; + if (ra->byteswap) + fletcher_4_incremental_byteswap(rv, len, &ra->zc); + else + fletcher_4_incremental_native(rv, len, &ra->zc); + return (rv); +} + +static void +backup_byteswap(dmu_replay_record_t *drr) +{ +#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) +#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) + drr->drr_type = BSWAP_32(drr->drr_type); + switch (drr->drr_type) { + case DRR_BEGIN: + DO64(drr_begin.drr_magic); + DO64(drr_begin.drr_version); + DO64(drr_begin.drr_creation_time); + DO32(drr_begin.drr_type); + DO64(drr_begin.drr_toguid); + DO64(drr_begin.drr_fromguid); + break; + case DRR_OBJECT: + DO64(drr_object.drr_object); + /* DO64(drr_object.drr_allocation_txg); */ + DO32(drr_object.drr_type); + DO32(drr_object.drr_bonustype); + DO32(drr_object.drr_blksz); + DO32(drr_object.drr_bonuslen); + break; + case DRR_FREEOBJECTS: + DO64(drr_freeobjects.drr_firstobj); + DO64(drr_freeobjects.drr_numobjs); + break; + case DRR_WRITE: + DO64(drr_write.drr_object); + DO32(drr_write.drr_type); + DO64(drr_write.drr_offset); + DO64(drr_write.drr_length); + break; + case DRR_FREE: + DO64(drr_free.drr_object); + DO64(drr_free.drr_offset); + DO64(drr_free.drr_length); + break; + case DRR_END: + DO64(drr_end.drr_checksum.zc_word[0]); + DO64(drr_end.drr_checksum.zc_word[1]); + DO64(drr_end.drr_checksum.zc_word[2]); + DO64(drr_end.drr_checksum.zc_word[3]); + break; + } +#undef DO64 +#undef DO32 +} + +static int +restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) +{ + int err; + dmu_tx_t *tx; + + err = dmu_object_info(os, drro->drr_object, NULL); + + if (err != 0 && err != ENOENT) + return (EINVAL); + + if (drro->drr_type == DMU_OT_NONE || + drro->drr_type >= DMU_OT_NUMTYPES || + drro->drr_bonustype >= DMU_OT_NUMTYPES || + drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS || + drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || + P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || + drro->drr_blksz < SPA_MINBLOCKSIZE || + drro->drr_blksz > SPA_MAXBLOCKSIZE || + drro->drr_bonuslen > DN_MAX_BONUSLEN) { + return (EINVAL); + } + + tx = dmu_tx_create(os); + + if (err == ENOENT) { + /* currently free, want to be allocated */ + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + err = dmu_object_claim(os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, tx); + } else { + /* currently allocated, want to be allocated */ + dmu_tx_hold_bonus(tx, drro->drr_object); + /* + * We may change blocksize, so need to + * hold_write + */ + dmu_tx_hold_write(tx, drro->drr_object, 0, 1); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + err = dmu_object_reclaim(os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, tx); + } + if (err) { + dmu_tx_commit(tx); + return (EINVAL); + } + + dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx); + dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); + + if (drro->drr_bonuslen) { + dmu_buf_t *db; + void *data; + VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + + ASSERT3U(db->db_size, ==, drro->drr_bonuslen); + data = restore_read(ra, P2ROUNDUP(db->db_size, 8)); + if (data == NULL) { + dmu_tx_commit(tx); + return (ra->err); + } + bcopy(data, db->db_data, db->db_size); + if (ra->byteswap) { + dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, + drro->drr_bonuslen); + } + dmu_buf_rele(db, FTAG); + } + dmu_tx_commit(tx); + return (0); +} + +/* ARGSUSED */ +static int +restore_freeobjects(struct restorearg *ra, objset_t *os, + struct drr_freeobjects *drrfo) +{ + uint64_t obj; + + if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) + return (EINVAL); + + for (obj = drrfo->drr_firstobj; + obj < drrfo->drr_firstobj + drrfo->drr_numobjs; + (void) dmu_object_next(os, &obj, FALSE, 0)) { + dmu_tx_t *tx; + int err; + + if (dmu_object_info(os, obj, NULL) != 0) + continue; + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, obj); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + err = dmu_object_free(os, obj, tx); + dmu_tx_commit(tx); + if (err && err != ENOENT) + return (EINVAL); + } + return (0); +} + +static int +restore_write(struct restorearg *ra, objset_t *os, + struct drr_write *drrw) +{ + dmu_tx_t *tx; + void *data; + int err; + + if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || + drrw->drr_type >= DMU_OT_NUMTYPES) + return (EINVAL); + + data = restore_read(ra, drrw->drr_length); + if (data == NULL) + return (ra->err); + + if (dmu_object_info(os, drrw->drr_object, NULL) != 0) + return (EINVAL); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, drrw->drr_object, + drrw->drr_offset, drrw->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + if (ra->byteswap) + dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); + dmu_write(os, drrw->drr_object, + drrw->drr_offset, drrw->drr_length, data, tx); + dmu_tx_commit(tx); + return (0); +} + +/* ARGSUSED */ +static int +restore_free(struct restorearg *ra, objset_t *os, + struct drr_free *drrf) +{ + dmu_tx_t *tx; + int err; + + if (drrf->drr_length != -1ULL && + drrf->drr_offset + drrf->drr_length < drrf->drr_offset) + return (EINVAL); + + if (dmu_object_info(os, drrf->drr_object, NULL) != 0) + return (EINVAL); + + tx = dmu_tx_create(os); + + dmu_tx_hold_free(tx, drrf->drr_object, + drrf->drr_offset, drrf->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + err = dmu_free_range(os, drrf->drr_object, + drrf->drr_offset, drrf->drr_length, tx); + dmu_tx_commit(tx); + return (err); +} + +int +dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, + boolean_t force, struct file *fp, uint64_t voffset) +{ + kthread_t *td = curthread; + struct restorearg ra; + dmu_replay_record_t *drr; + char *cp; + objset_t *os = NULL; + zio_cksum_t pzc; + + bzero(&ra, sizeof (ra)); + ra.td = td; + ra.fp = fp; + ra.voff = voffset; + ra.bufsize = 1<<20; + ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); + + if (drrb->drr_magic == DMU_BACKUP_MAGIC) { + ra.byteswap = FALSE; + } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { + ra.byteswap = TRUE; + } else { + ra.err = EINVAL; + goto out; + } + + /* + * NB: this assumes that struct drr_begin will be the largest in + * dmu_replay_record_t's drr_u, and thus we don't need to pad it + * with zeros to make it the same length as we wrote out. + */ + ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN; + ((dmu_replay_record_t *)ra.buf)->drr_pad = 0; + ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb; + if (ra.byteswap) { + fletcher_4_incremental_byteswap(ra.buf, + sizeof (dmu_replay_record_t), &ra.zc); + } else { + fletcher_4_incremental_native(ra.buf, + sizeof (dmu_replay_record_t), &ra.zc); + } + (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */ + + if (ra.byteswap) { + drrb->drr_magic = BSWAP_64(drrb->drr_magic); + drrb->drr_version = BSWAP_64(drrb->drr_version); + drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); + drrb->drr_type = BSWAP_32(drrb->drr_type); + drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); + drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); + } + + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + + if (drrb->drr_version != DMU_BACKUP_VERSION || + drrb->drr_type >= DMU_OST_NUMTYPES || + strchr(drrb->drr_toname, '@') == NULL) { + ra.err = EINVAL; + goto out; + } + + /* + * Process the begin in syncing context. + */ + if (drrb->drr_fromguid) { + /* incremental backup */ + dsl_dataset_t *ds = NULL; + + cp = strchr(tosnap, '@'); + *cp = '\0'; + ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds); + *cp = '@'; + if (ra.err) + goto out; + + /* + * Only do the rollback if the most recent snapshot + * matches the incremental source + */ + if (force) { + if (ds->ds_prev == NULL || + ds->ds_prev->ds_phys->ds_guid != + drrb->drr_fromguid) { + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + kmem_free(ra.buf, ra.bufsize); + return (ENODEV); + } + (void) dsl_dataset_rollback(ds); + } + ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool, + replay_incremental_check, replay_incremental_sync, + ds, drrb, 1); + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + } else { + /* full backup */ + dsl_dir_t *dd = NULL; + const char *tail; + + /* can't restore full backup into topmost fs, for now */ + if (strrchr(drrb->drr_toname, '/') == NULL) { + ra.err = EINVAL; + goto out; + } + + cp = strchr(tosnap, '@'); + *cp = '\0'; + ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail); + *cp = '@'; + if (ra.err) + goto out; + if (tail == NULL) { + ra.err = EEXIST; + goto out; + } + + ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check, + replay_full_sync, dd, drrb, 5); + dsl_dir_close(dd, FTAG); + } + if (ra.err) + goto out; + + /* + * Open the objset we are modifying. + */ + + cp = strchr(tosnap, '@'); + *cp = '\0'; + ra.err = dmu_objset_open(tosnap, DMU_OST_ANY, + DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os); + *cp = '@'; + ASSERT3U(ra.err, ==, 0); + + /* + * Read records and process them. + */ + pzc = ra.zc; + while (ra.err == 0 && + NULL != (drr = restore_read(&ra, sizeof (*drr)))) { + if (SIGPENDING(td)) { + ra.err = EINTR; + goto out; + } + + if (ra.byteswap) + backup_byteswap(drr); + + switch (drr->drr_type) { + case DRR_OBJECT: + { + /* + * We need to make a copy of the record header, + * because restore_{object,write} may need to + * restore_read(), which will invalidate drr. + */ + struct drr_object drro = drr->drr_u.drr_object; + ra.err = restore_object(&ra, os, &drro); + break; + } + case DRR_FREEOBJECTS: + { + struct drr_freeobjects drrfo = + drr->drr_u.drr_freeobjects; + ra.err = restore_freeobjects(&ra, os, &drrfo); + break; + } + case DRR_WRITE: + { + struct drr_write drrw = drr->drr_u.drr_write; + ra.err = restore_write(&ra, os, &drrw); + break; + } + case DRR_FREE: + { + struct drr_free drrf = drr->drr_u.drr_free; + ra.err = restore_free(&ra, os, &drrf); + break; + } + case DRR_END: + { + struct drr_end drre = drr->drr_u.drr_end; + /* + * We compare against the *previous* checksum + * value, because the stored checksum is of + * everything before the DRR_END record. + */ + if (drre.drr_checksum.zc_word[0] != 0 && + !ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pzc)) { + ra.err = ECKSUM; + goto out; + } + + ra.err = dsl_sync_task_do(dmu_objset_ds(os)-> + ds_dir->dd_pool, replay_end_check, replay_end_sync, + os, drrb, 3); + goto out; + } + default: + ra.err = EINVAL; + goto out; + } + pzc = ra.zc; + } + +out: + if (os) + dmu_objset_close(os); + + /* + * Make sure we don't rollback/destroy unless we actually + * processed the begin properly. 'os' will only be set if this + * is the case. + */ + if (ra.err && os && tosnap && strchr(tosnap, '@')) { + /* + * rollback or destroy what we created, so we don't + * leave it in the restoring state. + */ + dsl_dataset_t *ds; + int err; + + cp = strchr(tosnap, '@'); + *cp = '\0'; + err = dsl_dataset_open(tosnap, + DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, + FTAG, &ds); + if (err == 0) { + txg_wait_synced(ds->ds_dir->dd_pool, 0); + if (drrb->drr_fromguid) { + /* incremental: rollback to most recent snap */ + (void) dsl_dataset_rollback(ds); + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + } else { + /* full: destroy whole fs */ + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + (void) dsl_dataset_destroy(tosnap); + } + } + *cp = '@'; + } + + kmem_free(ra.buf, ra.bufsize); + if (sizep) + *sizep = ra.voff; + return (ra.err); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c new file mode 100644 index 0000000..3d2bc3e --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -0,0 +1,888 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dnode.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_impl.h> + +#define BP_SPAN_SHIFT(level, width) ((level) * (width)) + +#define BP_EQUAL(b1, b2) \ + (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \ + (b1)->blk_birth == (b2)->blk_birth) + +/* + * Compare two bookmarks. + * + * For ADVANCE_PRE, the visitation order is: + * + * objset 0, 1, 2, ..., ZB_MAXOBJSET. + * object 0, 1, 2, ..., ZB_MAXOBJECT. + * blkoff 0, 1, 2, ... + * level ZB_MAXLEVEL, ..., 2, 1, 0. + * + * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid + * ordering vector is: + * + * < objset, object, blkoff, -level > + * + * For ADVANCE_POST, the starting offsets aren't sequential but ending + * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are. + * The visitation order is: + * + * objset 1, 2, ..., ZB_MAXOBJSET, 0. + * object 1, 2, ..., ZB_MAXOBJECT, 0. + * blkoff 1, 2, ... + * level 0, 1, 2, ..., ZB_MAXLEVEL. + * + * and thus a valid ordering vector is: + * + * < objset - 1, object - 1, blkoff, level > + * + * Both orderings can be expressed as: + * + * < objset + bias, object + bias, blkoff, level ^ bias > + * + * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST) + * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift). + * + * Special case: an objset's osphys is represented as level -1 of object 0. + * It is always either the very first or very last block we visit in an objset. + * Therefore, if either bookmark's level is -1, level alone determines order. + */ +static int +compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp, + int advance) +{ + int bias = (advance & ADVANCE_PRE) ? 0 : -1; + uint64_t sblkoff, eblkoff; + int slevel, elevel, wshift; + + if (szb->zb_objset + bias < ezb->zb_objset + bias) + return (-1); + + if (szb->zb_objset + bias > ezb->zb_objset + bias) + return (1); + + slevel = szb->zb_level; + elevel = ezb->zb_level; + + if ((slevel | elevel) < 0) + return ((slevel ^ bias) - (elevel ^ bias)); + + if (szb->zb_object + bias < ezb->zb_object + bias) + return (-1); + + if (szb->zb_object + bias > ezb->zb_object + bias) + return (1); + + if (dnp == NULL) + return (0); + + wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; + + sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift); + eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift); + + if (sblkoff < eblkoff) + return (-1); + + if (sblkoff > eblkoff) + return (1); + + return ((elevel ^ bias) - (slevel ^ bias)); +} + +#define SET_BOOKMARK(zb, objset, object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +#define SET_BOOKMARK_LB(zb, level, blkid) \ +{ \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +static int +advance_objset(zseg_t *zseg, uint64_t objset, int advance) +{ + zbookmark_t *zb = &zseg->seg_start; + + if (advance & ADVANCE_PRE) { + if (objset >= ZB_MAXOBJSET) + return (ERANGE); + SET_BOOKMARK(zb, objset, 0, -1, 0); + } else { + if (objset >= ZB_MAXOBJSET) + objset = 0; + SET_BOOKMARK(zb, objset, 1, 0, 0); + } + + if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) + return (ERANGE); + + return (EAGAIN); +} + +static int +advance_object(zseg_t *zseg, uint64_t object, int advance) +{ + zbookmark_t *zb = &zseg->seg_start; + + if (advance & ADVANCE_PRE) { + if (object >= ZB_MAXOBJECT) { + SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0); + } else { + SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0); + } + } else { + if (zb->zb_object == 0) { + SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0); + } else { + if (object >= ZB_MAXOBJECT) + object = 0; + SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0); + } + } + + if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) + return (ERANGE); + + return (EAGAIN); +} + +static int +advance_from_osphys(zseg_t *zseg, int advance) +{ + zbookmark_t *zb = &zseg->seg_start; + + ASSERT(zb->zb_object == 0); + ASSERT(zb->zb_level == -1); + ASSERT(zb->zb_blkid == 0); + + if (advance & ADVANCE_PRE) { + SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0); + } else { + if (zb->zb_objset == 0) + return (ERANGE); + SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0); + } + + if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) + return (ERANGE); + + return (EAGAIN); +} + +static int +advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance) +{ + zbookmark_t *zb = &zseg->seg_start; + int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; + int maxlevel = dnp->dn_nlevels - 1; + int level = zb->zb_level; + uint64_t blkid = zb->zb_blkid; + + if (advance & ADVANCE_PRE) { + if (level > 0 && rc == 0) { + level--; + blkid <<= wshift; + } else { + blkid++; + + if ((blkid << BP_SPAN_SHIFT(level, wshift)) > + dnp->dn_maxblkid) + return (ERANGE); + + while (level < maxlevel) { + if (P2PHASE(blkid, 1ULL << wshift)) + break; + blkid >>= wshift; + level++; + } + } + } else { + if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) { + blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift); + level = 0; + } else { + blkid >>= wshift; + level++; + } + + while ((blkid << BP_SPAN_SHIFT(level, wshift)) > + dnp->dn_maxblkid) { + if (level == maxlevel) + return (ERANGE); + blkid >>= wshift; + level++; + } + } + SET_BOOKMARK_LB(zb, level, blkid); + + if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0) + return (ERANGE); + + return (EAGAIN); +} + +static int +traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc) +{ + /* + * Before we issue the callback, prune against maxtxg. + * + * We prune against mintxg before we get here because it's a big win. + * If a given block was born in txg 37, then we know that the entire + * subtree below that block must have been born in txg 37 or earlier. + * We can therefore lop off huge branches of the tree as we go. + * + * There's no corresponding optimization for maxtxg because knowing + * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's + * children. In fact, the copy-on-write design of ZFS ensures that + * top-level blocks will pretty much always be new. + * + * Therefore, in the name of simplicity we don't prune against + * maxtxg until the last possible moment -- that being right now. + */ + if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg) + return (0); + + /* + * Debugging: verify that the order we visit things agrees with the + * order defined by compare_bookmark(). We don't check this for + * log blocks because there's no defined ordering for them; they're + * always visited (or not) as part of visiting the objset_phys_t. + */ + if (bc->bc_errno == 0 && bc != &th->th_zil_cache) { + zbookmark_t *zb = &bc->bc_bookmark; + zbookmark_t *szb = &zseg->seg_start; + zbookmark_t *ezb = &zseg->seg_end; + zbookmark_t *lzb = &th->th_lastcb; + dnode_phys_t *dnp = bc->bc_dnode; + + ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0); + ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0); + ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 || + lzb->zb_level == ZB_NO_LEVEL); + *lzb = *zb; + } + + th->th_callbacks++; + return (th->th_func(bc, th->th_spa, th->th_arg)); +} + +static int +traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp, + dnode_phys_t *dnp) +{ + zbookmark_t *zb = &bc->bc_bookmark; + int error; + + th->th_hits++; + + bc->bc_dnode = dnp; + bc->bc_errno = 0; + + if (BP_EQUAL(&bc->bc_blkptr, bp)) + return (0); + + bc->bc_blkptr = *bp; + + if (bc->bc_data == NULL) + return (0); + + if (BP_IS_HOLE(bp)) { + ASSERT(th->th_advance & ADVANCE_HOLES); + return (0); + } + + if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) { + error = EIO; + } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) { + error = 0; + th->th_arc_hits++; + } else { + error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data, + BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, + th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb)); + + if (BP_SHOULD_BYTESWAP(bp) && error == 0) + (zb->zb_level > 0 ? byteswap_uint64_array : + dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data, + BP_GET_LSIZE(bp)); + th->th_reads++; + } + + if (error) { + bc->bc_errno = error; + error = traverse_callback(th, NULL, bc); + ASSERT(error == EAGAIN || error == EINTR || error == ERESTART); + bc->bc_blkptr.blk_birth = -1ULL; + } + + dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n", + bc - &th->th_cache[0][0], error, + zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); + + return (error); +} + +static int +find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth) +{ + zbookmark_t *zb = &zseg->seg_start; + traverse_blk_cache_t *bc; + blkptr_t *bp = dnp->dn_blkptr; + int i, first, level; + int nbp = dnp->dn_nblkptr; + int minlevel = zb->zb_level; + int maxlevel = dnp->dn_nlevels - 1; + int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; + int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift); + uint64_t blkid = zb->zb_blkid >> bp_shift; + int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE; + int rc; + + if (minlevel > maxlevel || blkid >= nbp) + return (ERANGE); + + for (level = maxlevel; level >= minlevel; level--) { + first = P2PHASE(blkid, 1ULL << wshift); + + for (i = first; i < nbp; i++) + if (bp[i].blk_birth > zseg->seg_mintxg || + BP_IS_HOLE(&bp[i]) && do_holes) + break; + + if (i != first) { + i--; + SET_BOOKMARK_LB(zb, level, blkid + (i - first)); + return (ENOTBLK); + } + + bc = &th->th_cache[depth][level]; + + SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object, + level, blkid); + + if (rc = traverse_read(th, bc, bp + i, dnp)) { + if (rc != EAGAIN) { + SET_BOOKMARK_LB(zb, level, blkid); + } + return (rc); + } + + if (BP_IS_HOLE(&bp[i])) { + SET_BOOKMARK_LB(zb, level, blkid); + th->th_lastcb.zb_level = ZB_NO_LEVEL; + return (0); + } + + nbp = 1 << wshift; + bp = bc->bc_data; + bp_shift -= wshift; + blkid = zb->zb_blkid >> bp_shift; + } + + return (0); +} + +static int +get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn, + uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth) +{ + zseg_t zseg; + zbookmark_t *zb = &zseg.seg_start; + uint64_t object = *objectp; + int i, rc; + + SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK); + SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID); + + zseg.seg_mintxg = txg; + zseg.seg_maxtxg = -1ULL; + + for (;;) { + rc = find_block(th, &zseg, mdn, depth); + + if (rc == EAGAIN || rc == EINTR || rc == ERANGE) + break; + + if (rc == 0 && zb->zb_level == 0) { + dnode_phys_t *dnp = th->th_cache[depth][0].bc_data; + for (i = 0; i < DNODES_PER_BLOCK; i++) { + object = (zb->zb_blkid * DNODES_PER_BLOCK) + i; + if (object >= *objectp && + dnp[i].dn_type != DMU_OT_NONE && + (type == -1 || dnp[i].dn_type == type)) { + *objectp = object; + *dnpp = &dnp[i]; + return (0); + } + } + } + + rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE); + + if (rc == ERANGE) + break; + } + + if (rc == ERANGE) + *objectp = ZB_MAXOBJECT; + + return (rc); +} + +/* ARGSUSED */ +static void +traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + traverse_handle_t *th = arg; + traverse_blk_cache_t *bc = &th->th_zil_cache; + zbookmark_t *zb = &bc->bc_bookmark; + zseg_t *zseg = list_head(&th->th_seglist); + + if (bp->blk_birth <= zseg->seg_mintxg) + return; + + if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) { + zb->zb_object = 0; + zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; + bc->bc_blkptr = *bp; + (void) traverse_callback(th, zseg, bc); + } +} + +/* ARGSUSED */ +static void +traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +{ + traverse_handle_t *th = arg; + traverse_blk_cache_t *bc = &th->th_zil_cache; + zbookmark_t *zb = &bc->bc_bookmark; + zseg_t *zseg = list_head(&th->th_seglist); + + if (lrc->lrc_txtype == TX_WRITE) { + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + + if (bp->blk_birth <= zseg->seg_mintxg) + return; + + if (claim_txg != 0 && bp->blk_birth >= claim_txg) { + zb->zb_object = lr->lr_foid; + zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); + bc->bc_blkptr = *bp; + (void) traverse_callback(th, zseg, bc); + } + } +} + +static void +traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc) +{ + spa_t *spa = th->th_spa; + dsl_pool_t *dp = spa_get_dsl(spa); + objset_phys_t *osphys = bc->bc_data; + zil_header_t *zh = &osphys->os_zil_header; + uint64_t claim_txg = zh->zh_claim_txg; + zilog_t *zilog; + + ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]); + ASSERT(bc->bc_bookmark.zb_level == -1); + + /* + * We only want to visit blocks that have been claimed but not yet + * replayed (or, in read-only mode, blocks that *would* be claimed). + */ + if (claim_txg == 0 && (spa_mode & FWRITE)) + return; + + th->th_zil_cache.bc_bookmark = bc->bc_bookmark; + + zilog = zil_alloc(dp->dp_meta_objset, zh); + + (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th, + claim_txg); + + zil_free(zilog); +} + +static int +traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) +{ + zbookmark_t *zb = &zseg->seg_start; + traverse_blk_cache_t *bc; + dnode_phys_t *dn, *dn_tmp; + int worklimit = 100; + int rc; + + dprintf("<%llu, %llu, %d, %llx>\n", + zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); + + bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1]; + dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; + + SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0); + + rc = traverse_read(th, bc, mosbp, dn); + + if (rc) /* If we get ERESTART, we've got nowhere left to go */ + return (rc == ERESTART ? EINTR : rc); + + ASSERT(dn->dn_nlevels < ZB_MAXLEVEL); + + if (zb->zb_objset != 0) { + uint64_t objset = zb->zb_objset; + dsl_dataset_phys_t *dsp; + + rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0, + DMU_OT_DSL_DATASET, ZB_MOS_CACHE); + + if (objset != zb->zb_objset) + rc = advance_objset(zseg, objset, th->th_advance); + + if (rc != 0) + return (rc); + + dsp = DN_BONUS(dn_tmp); + + bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]; + dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; + + SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0); + + /* + * If we're traversing an open snapshot, we know that it + * can't be deleted (because it's open) and it can't change + * (because it's a snapshot). Therefore, once we've gotten + * from the uberblock down to the snapshot's objset_phys_t, + * we no longer need to synchronize with spa_sync(); we're + * traversing a completely static block tree from here on. + */ + if (th->th_advance & ADVANCE_NOLOCK) { + ASSERT(th->th_locked); + rw_exit(spa_traverse_rwlock(th->th_spa)); + th->th_locked = 0; + } + + rc = traverse_read(th, bc, &dsp->ds_bp, dn); + + if (rc != 0) { + if (rc == ERESTART) + rc = advance_objset(zseg, zb->zb_objset + 1, + th->th_advance); + return (rc); + } + + if (th->th_advance & ADVANCE_PRUNE) + zseg->seg_mintxg = + MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg); + } + + if (zb->zb_level == -1) { + ASSERT(zb->zb_object == 0); + ASSERT(zb->zb_blkid == 0); + ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET); + + if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) { + rc = traverse_callback(th, zseg, bc); + if (rc) { + ASSERT(rc == EINTR); + return (rc); + } + if ((th->th_advance & ADVANCE_ZIL) && + zb->zb_objset != 0) + traverse_zil(th, bc); + } + + return (advance_from_osphys(zseg, th->th_advance)); + } + + if (zb->zb_object != 0) { + uint64_t object = zb->zb_object; + + rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp, + zseg->seg_mintxg, -1, ZB_MDN_CACHE); + + if (object != zb->zb_object) + rc = advance_object(zseg, object, th->th_advance); + + if (rc != 0) + return (rc); + + dn = dn_tmp; + } + + if (zb->zb_level == ZB_MAXLEVEL) + zb->zb_level = dn->dn_nlevels - 1; + + for (;;) { + rc = find_block(th, zseg, dn, ZB_DN_CACHE); + + if (rc == EAGAIN || rc == EINTR || rc == ERANGE) + break; + + if (rc == 0) { + bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level]; + ASSERT(bc->bc_dnode == dn); + ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth); + rc = traverse_callback(th, zseg, bc); + if (rc) { + ASSERT(rc == EINTR); + return (rc); + } + if (BP_IS_HOLE(&bc->bc_blkptr)) { + ASSERT(th->th_advance & ADVANCE_HOLES); + rc = ENOTBLK; + } + } + + rc = advance_block(zseg, dn, rc, th->th_advance); + + if (rc == ERANGE) + break; + + /* + * Give spa_sync() a chance to run. + */ + if (th->th_locked && spa_traverse_wanted(th->th_spa)) { + th->th_syncs++; + return (EAGAIN); + } + + if (--worklimit == 0) + return (EAGAIN); + } + + if (rc == ERANGE) + rc = advance_object(zseg, zb->zb_object + 1, th->th_advance); + + return (rc); +} + +/* + * It is the caller's responsibility to ensure that the dsl_dataset_t + * doesn't go away during traversal. + */ +int +traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance, + blkptr_cb_t func, void *arg) +{ + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + traverse_handle_t *th; + int err; + + th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED); + + traverse_add_objset(th, txg_start, -1ULL, ds->ds_object); + + while ((err = traverse_more(th)) == EAGAIN) + continue; + + traverse_fini(th); + return (err); +} + +int +traverse_more(traverse_handle_t *th) +{ + zseg_t *zseg = list_head(&th->th_seglist); + uint64_t save_txg; /* XXX won't be necessary with real itinerary */ + krwlock_t *rw = spa_traverse_rwlock(th->th_spa); + blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa); + int rc; + + if (zseg == NULL) + return (0); + + th->th_restarts++; + + save_txg = zseg->seg_mintxg; + + rw_enter(rw, RW_READER); + th->th_locked = 1; + + rc = traverse_segment(th, zseg, mosbp); + ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR); + + if (th->th_locked) + rw_exit(rw); + th->th_locked = 0; + + zseg->seg_mintxg = save_txg; + + if (rc == ERANGE) { + list_remove(&th->th_seglist, zseg); + kmem_free(zseg, sizeof (*zseg)); + return (EAGAIN); + } + + return (rc); +} + +/* + * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves + * are not included. The blocks covered by this segment will all have + * mintxg < birth < maxtxg. + */ +static void +traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, + uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid, + uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid) +{ + zseg_t *zseg; + + zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP); + + zseg->seg_mintxg = mintxg; + zseg->seg_maxtxg = maxtxg; + + zseg->seg_start.zb_objset = sobjset; + zseg->seg_start.zb_object = sobject; + zseg->seg_start.zb_level = slevel; + zseg->seg_start.zb_blkid = sblkid; + + zseg->seg_end.zb_objset = eobjset; + zseg->seg_end.zb_object = eobject; + zseg->seg_end.zb_level = elevel; + zseg->seg_end.zb_blkid = eblkid; + + list_insert_tail(&th->th_seglist, zseg); +} + +void +traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, + uint64_t objset, uint64_t object) +{ + if (th->th_advance & ADVANCE_PRE) + traverse_add_segment(th, mintxg, maxtxg, + objset, object, ZB_MAXLEVEL, 0, + objset, object, 0, ZB_MAXBLKID); + else + traverse_add_segment(th, mintxg, maxtxg, + objset, object, 0, 0, + objset, object, 0, ZB_MAXBLKID); +} + +void +traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, + uint64_t objset) +{ + if (th->th_advance & ADVANCE_PRE) + traverse_add_segment(th, mintxg, maxtxg, + objset, 0, -1, 0, + objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID); + else + traverse_add_segment(th, mintxg, maxtxg, + objset, 1, 0, 0, + objset, 0, -1, 0); +} + +void +traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg) +{ + if (th->th_advance & ADVANCE_PRE) + traverse_add_segment(th, mintxg, maxtxg, + 0, 0, -1, 0, + ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID); + else + traverse_add_segment(th, mintxg, maxtxg, + 1, 1, 0, 0, + 0, 0, -1, 0); +} + +traverse_handle_t * +traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance, + int zio_flags) +{ + traverse_handle_t *th; + int d, l; + + th = kmem_zalloc(sizeof (*th), KM_SLEEP); + + th->th_spa = spa; + th->th_func = func; + th->th_arg = arg; + th->th_advance = advance; + th->th_lastcb.zb_level = ZB_NO_LEVEL; + th->th_noread.zb_level = ZB_NO_LEVEL; + th->th_zio_flags = zio_flags; + + list_create(&th->th_seglist, sizeof (zseg_t), + offsetof(zseg_t, seg_node)); + + for (d = 0; d < ZB_DEPTH; d++) { + for (l = 0; l < ZB_MAXLEVEL; l++) { + if ((advance & ADVANCE_DATA) || + l != 0 || d != ZB_DN_CACHE) + th->th_cache[d][l].bc_data = + zio_buf_alloc(SPA_MAXBLOCKSIZE); + } + } + + return (th); +} + +void +traverse_fini(traverse_handle_t *th) +{ + int d, l; + zseg_t *zseg; + + for (d = 0; d < ZB_DEPTH; d++) + for (l = 0; l < ZB_MAXLEVEL; l++) + if (th->th_cache[d][l].bc_data != NULL) + zio_buf_free(th->th_cache[d][l].bc_data, + SPA_MAXBLOCKSIZE); + + while ((zseg = list_head(&th->th_seglist)) != NULL) { + list_remove(&th->th_seglist, zseg); + kmem_free(zseg, sizeof (*zseg)); + } + + list_destroy(&th->th_seglist); + + dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n", + th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks, + th->th_syncs, th->th_restarts); + + kmem_free(th, sizeof (*th)); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c new file mode 100644 index 0000000..13fd8d4 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -0,0 +1,992 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dbuf.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */ +#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */ +#include <sys/dsl_pool.h> +#include <sys/zap_impl.h> /* for fzap_default_block_shift */ +#include <sys/spa.h> +#include <sys/zfs_context.h> + +typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, + uint64_t arg1, uint64_t arg2); + + +dmu_tx_t * +dmu_tx_create_dd(dsl_dir_t *dd) +{ + dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); + tx->tx_dir = dd; + if (dd) + tx->tx_pool = dd->dd_pool; + list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), + offsetof(dmu_tx_hold_t, txh_node)); +#ifdef ZFS_DEBUG + refcount_create(&tx->tx_space_written); + refcount_create(&tx->tx_space_freed); +#endif + return (tx); +} + +dmu_tx_t * +dmu_tx_create(objset_t *os) +{ + dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir); + tx->tx_objset = os; + tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset); + return (tx); +} + +dmu_tx_t * +dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) +{ + dmu_tx_t *tx = dmu_tx_create_dd(NULL); + + ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); + tx->tx_pool = dp; + tx->tx_txg = txg; + tx->tx_anyobj = TRUE; + + return (tx); +} + +int +dmu_tx_is_syncing(dmu_tx_t *tx) +{ + return (tx->tx_anyobj); +} + +int +dmu_tx_private_ok(dmu_tx_t *tx) +{ + return (tx->tx_anyobj); +} + +static dmu_tx_hold_t * +dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, + enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) +{ + dmu_tx_hold_t *txh; + dnode_t *dn = NULL; + int err; + + if (object != DMU_NEW_OBJECT) { + err = dnode_hold(os->os, object, tx, &dn); + if (err) { + tx->tx_err = err; + return (NULL); + } + + if (err == 0 && tx->tx_txg != 0) { + mutex_enter(&dn->dn_mtx); + /* + * dn->dn_assigned_txg == tx->tx_txg doesn't pose a + * problem, but there's no way for it to happen (for + * now, at least). + */ + ASSERT(dn->dn_assigned_txg == 0); + dn->dn_assigned_txg = tx->tx_txg; + (void) refcount_add(&dn->dn_tx_holds, tx); + mutex_exit(&dn->dn_mtx); + } + } + + txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); + txh->txh_tx = tx; + txh->txh_dnode = dn; +#ifdef ZFS_DEBUG + txh->txh_type = type; + txh->txh_arg1 = arg1; + txh->txh_arg2 = arg2; +#endif + list_insert_tail(&tx->tx_holds, txh); + + return (txh); +} + +void +dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) +{ + /* + * If we're syncing, they can manipulate any object anyhow, and + * the hold on the dnode_t can cause problems. + */ + if (!dmu_tx_is_syncing(tx)) { + (void) dmu_tx_hold_object_impl(tx, os, + object, THT_NEWOBJECT, 0, 0); + } +} + +static int +dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) +{ + int err; + dmu_buf_impl_t *db; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold_level(dn, level, blkid, FTAG); + rw_exit(&dn->dn_struct_rwlock); + if (db == NULL) + return (EIO); + err = dbuf_read(db, zio, DB_RF_CANFAIL); + dbuf_rele(db, FTAG); + return (err); +} + +/* ARGSUSED */ +static void +dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + dnode_t *dn = txh->txh_dnode; + uint64_t start, end, i; + int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; + int err = 0; + + if (len == 0) + return; + + min_bs = SPA_MINBLOCKSHIFT; + max_bs = SPA_MAXBLOCKSHIFT; + min_ibs = DN_MIN_INDBLKSHIFT; + max_ibs = DN_MAX_INDBLKSHIFT; + + + /* + * For i/o error checking, read the first and last level-0 + * blocks (if they are not aligned), and all the level-1 blocks. + */ + + if (dn) { + if (dn->dn_maxblkid == 0) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err) + goto out; + } else { + zio_t *zio = zio_root(dn->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + + /* first level-0 block */ + start = off >> dn->dn_datablkshift; + if (P2PHASE(off, dn->dn_datablksz) || + len < dn->dn_datablksz) { + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err) + goto out; + } + + /* last level-0 block */ + end = (off+len-1) >> dn->dn_datablkshift; + if (end != start && + P2PHASE(off+len, dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(zio, dn, 0, end); + if (err) + goto out; + } + + /* level-1 blocks */ + if (dn->dn_nlevels > 1) { + start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; + end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (i = start+1; i < end; i++) { + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err) + goto out; + } + } + + err = zio_wait(zio); + if (err) + goto out; + } + } + + /* + * If there's more than one block, the blocksize can't change, + * so we can make a more precise estimate. Alternatively, + * if the dnode's ibs is larger than max_ibs, always use that. + * This ensures that if we reduce DN_MAX_INDBLKSHIFT, + * the code will still work correctly on existing pools. + */ + if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) { + min_ibs = max_ibs = dn->dn_indblkshift; + if (dn->dn_datablkshift != 0) + min_bs = max_bs = dn->dn_datablkshift; + } + + /* + * 'end' is the last thing we will access, not one past. + * This way we won't overflow when accessing the last byte. + */ + start = P2ALIGN(off, 1ULL << max_bs); + end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; + txh->txh_space_towrite += end - start + 1; + + start >>= min_bs; + end >>= min_bs; + + epbs = min_ibs - SPA_BLKPTRSHIFT; + + /* + * The object contains at most 2^(64 - min_bs) blocks, + * and each indirect level maps 2^epbs. + */ + for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { + start >>= epbs; + end >>= epbs; + /* + * If we increase the number of levels of indirection, + * we'll need new blkid=0 indirect blocks. If start == 0, + * we're already accounting for that blocks; and if end == 0, + * we can't increase the number of levels beyond that. + */ + if (start != 0 && end != 0) + txh->txh_space_towrite += 1ULL << max_ibs; + txh->txh_space_towrite += (end - start + 1) << max_ibs; + } + + ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS); + +out: + if (err) + txh->txh_tx->tx_err = err; +} + +static void +dmu_tx_count_dnode(dmu_tx_hold_t *txh) +{ + dnode_t *dn = txh->txh_dnode; + dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode; + uint64_t space = mdn->dn_datablksz + + ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); + + if (dn && dn->dn_dbuf->db_blkptr && + dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + dn->dn_dbuf->db_blkptr->blk_birth)) { + txh->txh_space_tooverwrite += space; + } else { + txh->txh_space_towrite += space; + } +} + +void +dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + ASSERT(len < DMU_MAX_ACCESS); + ASSERT(len == 0 || UINT64_MAX - off >= len - 1); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_WRITE, off, len); + if (txh == NULL) + return; + + dmu_tx_count_write(txh, off, len); + dmu_tx_count_dnode(txh); +} + +static void +dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + uint64_t blkid, nblks; + uint64_t space = 0; + dnode_t *dn = txh->txh_dnode; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + spa_t *spa = txh->txh_tx->tx_pool->dp_spa; + int dirty; + + /* + * We don't need to use any locking to check for dirtyness + * because it's OK if we get stale data -- the dnode may become + * dirty immediately after our check anyway. This is just a + * means to avoid the expensive count when we aren't sure we + * need it. We need to be able to deal with a dirty dnode. + */ + dirty = list_link_active(&dn->dn_dirty_link[0]) | + list_link_active(&dn->dn_dirty_link[1]) | + list_link_active(&dn->dn_dirty_link[2]) | + list_link_active(&dn->dn_dirty_link[3]); + if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0) + return; + + /* + * the struct_rwlock protects us against dn_phys->dn_nlevels + * changing, in case (against all odds) we manage to dirty & + * sync out the changes after we check for being dirty. + * also, dbuf_hold_impl() wants us to have the struct_rwlock. + * + * It's fine to use dn_datablkshift rather than the dn_phys + * equivalent because if it is changing, maxblkid==0 and we will + * bail. + */ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_phys->dn_maxblkid == 0) { + if (off == 0 && len >= dn->dn_datablksz) { + blkid = 0; + nblks = 1; + } else { + rw_exit(&dn->dn_struct_rwlock); + return; + } + } else { + blkid = off >> dn->dn_datablkshift; + nblks = (off + len) >> dn->dn_datablkshift; + + if (blkid >= dn->dn_phys->dn_maxblkid) { + rw_exit(&dn->dn_struct_rwlock); + return; + } + if (blkid + nblks > dn->dn_phys->dn_maxblkid) + nblks = dn->dn_phys->dn_maxblkid - blkid; + + /* don't bother after 128,000 blocks */ + nblks = MIN(nblks, 128*1024); + } + + if (dn->dn_phys->dn_nlevels == 1) { + int i; + for (i = 0; i < nblks; i++) { + blkptr_t *bp = dn->dn_phys->dn_blkptr; + ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr); + bp += blkid + i; + if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { + dprintf_bp(bp, "can free old%s", ""); + space += bp_get_dasize(spa, bp); + } + } + nblks = 0; + } + + while (nblks) { + dmu_buf_impl_t *dbuf; + int err, epbs, blkoff, tochk; + + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + blkoff = P2PHASE(blkid, 1<<epbs); + tochk = MIN((1<<epbs) - blkoff, nblks); + + err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf); + if (err == 0) { + int i; + blkptr_t *bp; + + err = dbuf_read(dbuf, NULL, + DB_RF_HAVESTRUCT | DB_RF_CANFAIL); + if (err != 0) { + txh->txh_tx->tx_err = err; + dbuf_rele(dbuf, FTAG); + break; + } + + bp = dbuf->db.db_data; + bp += blkoff; + + for (i = 0; i < tochk; i++) { + if (dsl_dataset_block_freeable(ds, + bp[i].blk_birth)) { + dprintf_bp(&bp[i], + "can free old%s", ""); + space += bp_get_dasize(spa, &bp[i]); + } + } + dbuf_rele(dbuf, FTAG); + } + if (err && err != ENOENT) { + txh->txh_tx->tx_err = err; + break; + } + + blkid += tochk; + nblks -= tochk; + } + rw_exit(&dn->dn_struct_rwlock); + + txh->txh_space_tofree += space; +} + +void +dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) +{ + dmu_tx_hold_t *txh; + dnode_t *dn; + uint64_t start, end, i; + int err, shift; + zio_t *zio; + + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_FREE, off, len); + if (txh == NULL) + return; + dn = txh->txh_dnode; + + /* first block */ + if (off != 0) + dmu_tx_count_write(txh, off, 1); + /* last block */ + if (len != DMU_OBJECT_END) + dmu_tx_count_write(txh, off+len, 1); + + if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) + return; + if (len == DMU_OBJECT_END) + len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; + + /* + * For i/o error checking, read the first and last level-0 + * blocks, and all the level-1 blocks. The above count_write's + * will take care of the level-0 blocks. + */ + if (dn->dn_nlevels > 1) { + shift = dn->dn_datablkshift + dn->dn_indblkshift - + SPA_BLKPTRSHIFT; + start = off >> shift; + end = dn->dn_datablkshift ? ((off+len) >> shift) : 0; + + zio = zio_root(tx->tx_pool->dp_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + for (i = start; i <= end; i++) { + uint64_t ibyte = i << shift; + err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0); + i = ibyte >> shift; + if (err == ESRCH) + break; + if (err) { + tx->tx_err = err; + return; + } + + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err) { + tx->tx_err = err; + return; + } + } + err = zio_wait(zio); + if (err) { + tx->tx_err = err; + return; + } + } + + dmu_tx_count_dnode(txh); + dmu_tx_count_free(txh, off, len); +} + +void +dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) +{ + dmu_tx_hold_t *txh; + dnode_t *dn; + uint64_t nblocks; + int epbs, err; + + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_ZAP, add, (uintptr_t)name); + if (txh == NULL) + return; + dn = txh->txh_dnode; + + dmu_tx_count_dnode(txh); + + if (dn == NULL) { + /* + * We will be able to fit a new object's entries into one leaf + * block. So there will be at most 2 blocks total, + * including the header block. + */ + dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift); + return; + } + + ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); + + if (dn->dn_maxblkid == 0 && !add) { + /* + * If there is only one block (i.e. this is a micro-zap) + * and we are not adding anything, the accounting is simple. + */ + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err) { + tx->tx_err = err; + return; + } + + /* + * Use max block size here, since we don't know how much + * the size will change between now and the dbuf dirty call. + */ + if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + dn->dn_phys->dn_blkptr[0].blk_birth)) + txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; + else + txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + return; + } + + if (dn->dn_maxblkid > 0 && name) { + /* + * access the name in this fat-zap so that we'll check + * for i/o errors to the leaf blocks, etc. + */ + err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name, + 8, 0, NULL); + if (err == EIO) { + tx->tx_err = err; + return; + } + } + + /* + * 3 blocks overwritten: target leaf, ptrtbl block, header block + * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks + */ + dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz, + (3 + add ? 3 : 0) << dn->dn_datablkshift); + + /* + * If the modified blocks are scattered to the four winds, + * we'll have to modify an indirect twig for each. + */ + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) + txh->txh_space_towrite += 3 << dn->dn_indblkshift; +} + +void +dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_BONUS, 0, 0); + if (txh) + dmu_tx_count_dnode(txh); +} + +void +dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) +{ + dmu_tx_hold_t *txh; + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + DMU_NEW_OBJECT, THT_SPACE, space, 0); + + txh->txh_space_towrite += space; +} + +int +dmu_tx_holds(dmu_tx_t *tx, uint64_t object) +{ + dmu_tx_hold_t *txh; + int holds = 0; + + /* + * By asserting that the tx is assigned, we're counting the + * number of dn_tx_holds, which is the same as the number of + * dn_holds. Otherwise, we'd be counting dn_holds, but + * dn_tx_holds could be 0. + */ + ASSERT(tx->tx_txg != 0); + + /* if (tx->tx_anyobj == TRUE) */ + /* return (0); */ + + for (txh = list_head(&tx->tx_holds); txh; + txh = list_next(&tx->tx_holds, txh)) { + if (txh->txh_dnode && txh->txh_dnode->dn_object == object) + holds++; + } + + return (holds); +} + +#ifdef ZFS_DEBUG +void +dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) +{ + dmu_tx_hold_t *txh; + int match_object = FALSE, match_offset = FALSE; + dnode_t *dn = db->db_dnode; + + ASSERT(tx->tx_txg != 0); + ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os); + ASSERT3U(dn->dn_object, ==, db->db.db_object); + + if (tx->tx_anyobj) + return; + + /* XXX No checking on the meta dnode for now */ + if (db->db.db_object == DMU_META_DNODE_OBJECT) + return; + + for (txh = list_head(&tx->tx_holds); txh; + txh = list_next(&tx->tx_holds, txh)) { + ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); + if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) + match_object = TRUE; + if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { + int datablkshift = dn->dn_datablkshift ? + dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + int shift = datablkshift + epbs * db->db_level; + uint64_t beginblk = shift >= 64 ? 0 : + (txh->txh_arg1 >> shift); + uint64_t endblk = shift >= 64 ? 0 : + ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); + uint64_t blkid = db->db_blkid; + + /* XXX txh_arg2 better not be zero... */ + + dprintf("found txh type %x beginblk=%llx endblk=%llx\n", + txh->txh_type, beginblk, endblk); + + switch (txh->txh_type) { + case THT_WRITE: + if (blkid >= beginblk && blkid <= endblk) + match_offset = TRUE; + /* + * We will let this hold work for the bonus + * buffer so that we don't need to hold it + * when creating a new object. + */ + if (blkid == DB_BONUS_BLKID) + match_offset = TRUE; + /* + * They might have to increase nlevels, + * thus dirtying the new TLIBs. Or the + * might have to change the block size, + * thus dirying the new lvl=0 blk=0. + */ + if (blkid == 0) + match_offset = TRUE; + break; + case THT_FREE: + if (blkid == beginblk && + (txh->txh_arg1 != 0 || + dn->dn_maxblkid == 0)) + match_offset = TRUE; + if (blkid == endblk && + txh->txh_arg2 != DMU_OBJECT_END) + match_offset = TRUE; + break; + case THT_BONUS: + if (blkid == DB_BONUS_BLKID) + match_offset = TRUE; + break; + case THT_ZAP: + match_offset = TRUE; + break; + case THT_NEWOBJECT: + match_object = TRUE; + break; + default: + ASSERT(!"bad txh_type"); + } + } + if (match_object && match_offset) + return; + } + panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", + (u_longlong_t)db->db.db_object, db->db_level, + (u_longlong_t)db->db_blkid); +} +#endif + +static int +dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) +{ + dmu_tx_hold_t *txh; + uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite; + + ASSERT3U(tx->tx_txg, ==, 0); + if (tx->tx_err) + return (tx->tx_err); + + tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); + tx->tx_needassign_txh = NULL; + + /* + * NB: No error returns are allowed after txg_hold_open, but + * before processing the dnode holds, due to the + * dmu_tx_unassign() logic. + */ + + towrite = tofree = tooverwrite = 0; + for (txh = list_head(&tx->tx_holds); txh; + txh = list_next(&tx->tx_holds, txh)) { + dnode_t *dn = txh->txh_dnode; + if (dn != NULL) { + mutex_enter(&dn->dn_mtx); + if (dn->dn_assigned_txg == tx->tx_txg - 1) { + mutex_exit(&dn->dn_mtx); + tx->tx_needassign_txh = txh; + return (ERESTART); + } + if (dn->dn_assigned_txg == 0) + dn->dn_assigned_txg = tx->tx_txg; + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + (void) refcount_add(&dn->dn_tx_holds, tx); + mutex_exit(&dn->dn_mtx); + } + towrite += txh->txh_space_towrite; + tofree += txh->txh_space_tofree; + tooverwrite += txh->txh_space_tooverwrite; + } + + /* + * NB: This check must be after we've held the dnodes, so that + * the dmu_tx_unassign() logic will work properly + */ + if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) + return (ERESTART); + + /* + * If a snapshot has been taken since we made our estimates, + * assume that we won't be able to free or overwrite anything. + */ + if (tx->tx_objset && + dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) > + tx->tx_lastsnap_txg) { + towrite += tooverwrite; + tooverwrite = tofree = 0; + } + + /* + * Convert logical size to worst-case allocated size. + */ + fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; + lsize = towrite + tooverwrite; + asize = spa_get_asize(tx->tx_pool->dp_spa, lsize); + +#ifdef ZFS_DEBUG + tx->tx_space_towrite = asize; + tx->tx_space_tofree = tofree; + tx->tx_space_tooverwrite = tooverwrite; +#endif + + if (tx->tx_dir && asize != 0) { + int err = dsl_dir_tempreserve_space(tx->tx_dir, + lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx); + if (err) + return (err); + } + + return (0); +} + +static void +dmu_tx_unassign(dmu_tx_t *tx) +{ + dmu_tx_hold_t *txh; + + if (tx->tx_txg == 0) + return; + + txg_rele_to_quiesce(&tx->tx_txgh); + + for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; + txh = list_next(&tx->tx_holds, txh)) { + dnode_t *dn = txh->txh_dnode; + + if (dn == NULL) + continue; + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + + if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { + dn->dn_assigned_txg = 0; + cv_broadcast(&dn->dn_notxholds); + } + mutex_exit(&dn->dn_mtx); + } + + txg_rele_to_sync(&tx->tx_txgh); + + tx->tx_lasttried_txg = tx->tx_txg; + tx->tx_txg = 0; +} + +/* + * Assign tx to a transaction group. txg_how can be one of: + * + * (1) TXG_WAIT. If the current open txg is full, waits until there's + * a new one. This should be used when you're not holding locks. + * If will only fail if we're truly out of space (or over quota). + * + * (2) TXG_NOWAIT. If we can't assign into the current open txg without + * blocking, returns immediately with ERESTART. This should be used + * whenever you're holding locks. On an ERESTART error, the caller + * should drop locks, do a dmu_tx_wait(tx), and try again. + * + * (3) A specific txg. Use this if you need to ensure that multiple + * transactions all sync in the same txg. Like TXG_NOWAIT, it + * returns ERESTART if it can't assign you into the requested txg. + */ +int +dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) +{ + int err; + + ASSERT(tx->tx_txg == 0); + ASSERT(txg_how != 0); + ASSERT(!dsl_pool_sync_context(tx->tx_pool)); + + while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { + dmu_tx_unassign(tx); + + if (err != ERESTART || txg_how != TXG_WAIT) + return (err); + + dmu_tx_wait(tx); + } + + txg_rele_to_quiesce(&tx->tx_txgh); + + return (0); +} + +void +dmu_tx_wait(dmu_tx_t *tx) +{ + ASSERT(tx->tx_txg == 0); + ASSERT(tx->tx_lasttried_txg != 0); + + if (tx->tx_needassign_txh) { + dnode_t *dn = tx->tx_needassign_txh->txh_dnode; + + mutex_enter(&dn->dn_mtx); + while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) + cv_wait(&dn->dn_notxholds, &dn->dn_mtx); + mutex_exit(&dn->dn_mtx); + tx->tx_needassign_txh = NULL; + } else { + txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); + } +} + +void +dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) +{ +#ifdef ZFS_DEBUG + if (tx->tx_dir == NULL || delta == 0) + return; + + if (delta > 0) { + ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=, + tx->tx_space_towrite); + (void) refcount_add_many(&tx->tx_space_written, delta, NULL); + } else { + (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL); + } +#endif +} + +void +dmu_tx_commit(dmu_tx_t *tx) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg != 0); + + while (txh = list_head(&tx->tx_holds)) { + dnode_t *dn = txh->txh_dnode; + + list_remove(&tx->tx_holds, txh); + kmem_free(txh, sizeof (dmu_tx_hold_t)); + if (dn == NULL) + continue; + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + + if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { + dn->dn_assigned_txg = 0; + cv_broadcast(&dn->dn_notxholds); + } + mutex_exit(&dn->dn_mtx); + dnode_rele(dn, tx); + } + + if (tx->tx_tempreserve_cookie) + dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); + + if (tx->tx_anyobj == FALSE) + txg_rele_to_sync(&tx->tx_txgh); +#ifdef ZFS_DEBUG + dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", + tx->tx_space_towrite, refcount_count(&tx->tx_space_written), + tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); + refcount_destroy_many(&tx->tx_space_written, + refcount_count(&tx->tx_space_written)); + refcount_destroy_many(&tx->tx_space_freed, + refcount_count(&tx->tx_space_freed)); +#endif + kmem_free(tx, sizeof (dmu_tx_t)); +} + +void +dmu_tx_abort(dmu_tx_t *tx) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + + while (txh = list_head(&tx->tx_holds)) { + dnode_t *dn = txh->txh_dnode; + + list_remove(&tx->tx_holds, txh); + kmem_free(txh, sizeof (dmu_tx_hold_t)); + if (dn != NULL) + dnode_rele(dn, tx); + } +#ifdef ZFS_DEBUG + refcount_destroy_many(&tx->tx_space_written, + refcount_count(&tx->tx_space_written)); + refcount_destroy_many(&tx->tx_space_freed, + refcount_count(&tx->tx_space_freed)); +#endif + kmem_free(tx, sizeof (dmu_tx_t)); +} + +uint64_t +dmu_tx_get_txg(dmu_tx_t *tx) +{ + ASSERT(tx->tx_txg != 0); + return (tx->tx_txg); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c new file mode 100644 index 0000000..78d625c --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c @@ -0,0 +1,655 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dnode.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_zfetch.h> +#include <sys/dmu.h> +#include <sys/dbuf.h> + +/* + * I'm against tune-ables, but these should probably exist as tweakable globals + * until we can get this working the way we want it to. + */ + +int zfs_prefetch_disable = 0; +SYSCTL_DECL(_vfs_zfs); +TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable); +SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN, + &zfs_prefetch_disable, 0, "Disable prefetch"); + +/* max # of streams per zfetch */ +uint32_t zfetch_max_streams = 8; +/* min time before stream reclaim */ +uint32_t zfetch_min_sec_reap = 2; +/* max number of blocks to fetch at a time */ +uint32_t zfetch_block_cap = 256; +/* number of bytes in a array_read at which we stop prefetching (1Mb) */ +uint64_t zfetch_array_rd_sz = 1024 * 1024; + +/* forward decls for static routines */ +static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); +static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); +static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t); +static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); +static int dmu_zfetch_find(zfetch_t *, zstream_t *, int); +static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); +static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); +static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); +static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *); + +/* + * Given a zfetch structure and a zstream structure, determine whether the + * blocks to be read are part of a co-linear pair of existing prefetch + * streams. If a set is found, coalesce the streams, removing one, and + * configure the prefetch so it looks for a strided access pattern. + * + * In other words: if we find two sequential access streams that are + * the same length and distance N appart, and this read is N from the + * last stream, then we are probably in a strided access pattern. So + * combine the two sequential streams into a single strided stream. + * + * If no co-linear streams are found, return NULL. + */ +static int +dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) +{ + zstream_t *z_walk; + zstream_t *z_comp; + + if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) + return (0); + + if (zh == NULL) { + rw_exit(&zf->zf_rwlock); + return (0); + } + + for (z_walk = list_head(&zf->zf_stream); z_walk; + z_walk = list_next(&zf->zf_stream, z_walk)) { + for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp; + z_comp = list_next(&zf->zf_stream, z_comp)) { + int64_t diff; + + if (z_walk->zst_len != z_walk->zst_stride || + z_comp->zst_len != z_comp->zst_stride) { + continue; + } + + diff = z_comp->zst_offset - z_walk->zst_offset; + if (z_comp->zst_offset + diff == zh->zst_offset) { + z_walk->zst_offset = zh->zst_offset; + z_walk->zst_direction = diff < 0 ? -1 : 1; + z_walk->zst_stride = + diff * z_walk->zst_direction; + z_walk->zst_ph_offset = + zh->zst_offset + z_walk->zst_stride; + dmu_zfetch_stream_remove(zf, z_comp); + mutex_destroy(&z_comp->zst_lock); + kmem_free(z_comp, sizeof (zstream_t)); + + dmu_zfetch_dofetch(zf, z_walk); + + rw_exit(&zf->zf_rwlock); + return (1); + } + + diff = z_walk->zst_offset - z_comp->zst_offset; + if (z_walk->zst_offset + diff == zh->zst_offset) { + z_walk->zst_offset = zh->zst_offset; + z_walk->zst_direction = diff < 0 ? -1 : 1; + z_walk->zst_stride = + diff * z_walk->zst_direction; + z_walk->zst_ph_offset = + zh->zst_offset + z_walk->zst_stride; + dmu_zfetch_stream_remove(zf, z_comp); + mutex_destroy(&z_comp->zst_lock); + kmem_free(z_comp, sizeof (zstream_t)); + + dmu_zfetch_dofetch(zf, z_walk); + + rw_exit(&zf->zf_rwlock); + return (1); + } + } + } + + rw_exit(&zf->zf_rwlock); + return (0); +} + +/* + * Given a zstream_t, determine the bounds of the prefetch. Then call the + * routine that actually prefetches the individual blocks. + */ +static void +dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs) +{ + uint64_t prefetch_tail; + uint64_t prefetch_limit; + uint64_t prefetch_ofst; + uint64_t prefetch_len; + uint64_t blocks_fetched; + + zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len); + zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap); + + prefetch_tail = MAX((int64_t)zs->zst_ph_offset, + (int64_t)(zs->zst_offset + zs->zst_stride)); + /* + * XXX: use a faster division method? + */ + prefetch_limit = zs->zst_offset + zs->zst_len + + (zs->zst_cap * zs->zst_stride) / zs->zst_len; + + while (prefetch_tail < prefetch_limit) { + prefetch_ofst = zs->zst_offset + zs->zst_direction * + (prefetch_tail - zs->zst_offset); + + prefetch_len = zs->zst_len; + + /* + * Don't prefetch beyond the end of the file, if working + * backwards. + */ + if ((zs->zst_direction == ZFETCH_BACKWARD) && + (prefetch_ofst > prefetch_tail)) { + prefetch_len += prefetch_ofst; + prefetch_ofst = 0; + } + + /* don't prefetch more than we're supposed to */ + if (prefetch_len > zs->zst_len) + break; + + blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode, + prefetch_ofst, zs->zst_len); + + prefetch_tail += zs->zst_stride; + /* stop if we've run out of stuff to prefetch */ + if (blocks_fetched < zs->zst_len) + break; + } + zs->zst_ph_offset = prefetch_tail; + zs->zst_last = lbolt; +} + +/* + * This takes a pointer to a zfetch structure and a dnode. It performs the + * necessary setup for the zfetch structure, grokking data from the + * associated dnode. + */ +void +dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) +{ + if (zf == NULL) { + return; + } + + zf->zf_dnode = dno; + zf->zf_stream_cnt = 0; + zf->zf_alloc_fail = 0; + + list_create(&zf->zf_stream, sizeof (zstream_t), + offsetof(zstream_t, zst_node)); + + rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); +} + +/* + * This function computes the actual size, in blocks, that can be prefetched, + * and fetches it. + */ +static uint64_t +dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) +{ + uint64_t fetchsz; + uint64_t i; + + fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); + + for (i = 0; i < fetchsz; i++) { + dbuf_prefetch(dn, blkid + i); + } + + return (fetchsz); +} + +/* + * this function returns the number of blocks that would be prefetched, based + * upon the supplied dnode, blockid, and nblks. This is used so that we can + * update streams in place, and then prefetch with their old value after the + * fact. This way, we can delay the prefetch, but subsequent accesses to the + * stream won't result in the same data being prefetched multiple times. + */ +static uint64_t +dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) +{ + uint64_t fetchsz; + + if (blkid > dn->dn_maxblkid) { + return (0); + } + + /* compute fetch size */ + if (blkid + nblks + 1 > dn->dn_maxblkid) { + fetchsz = (dn->dn_maxblkid - blkid) + 1; + ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid); + } else { + fetchsz = nblks; + } + + + return (fetchsz); +} + +/* + * given a zfetch and a zsearch structure, see if there is an associated zstream + * for this block read. If so, it starts a prefetch for the stream it + * located and returns true, otherwise it returns false + */ +static int +dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) +{ + zstream_t *zs; + int64_t diff; + int reset = !prefetched; + int rc = 0; + + if (zh == NULL) + return (0); + + /* + * XXX: This locking strategy is a bit coarse; however, it's impact has + * yet to be tested. If this turns out to be an issue, it can be + * modified in a number of different ways. + */ + + rw_enter(&zf->zf_rwlock, RW_READER); +top: + + for (zs = list_head(&zf->zf_stream); zs; + zs = list_next(&zf->zf_stream, zs)) { + + /* + * XXX - should this be an assert? + */ + if (zs->zst_len == 0) { + /* bogus stream */ + continue; + } + + /* + * We hit this case when we are in a strided prefetch stream: + * we will read "len" blocks before "striding". + */ + if (zh->zst_offset >= zs->zst_offset && + zh->zst_offset < zs->zst_offset + zs->zst_len) { + /* already fetched */ + rc = 1; + goto out; + } + + /* + * This is the forward sequential read case: we increment + * len by one each time we hit here, so we will enter this + * case on every read. + */ + if (zh->zst_offset == zs->zst_offset + zs->zst_len) { + + reset = !prefetched && zs->zst_len > 1; + + mutex_enter(&zs->zst_lock); + + if (zh->zst_offset != zs->zst_offset + zs->zst_len) { + mutex_exit(&zs->zst_lock); + goto top; + } + zs->zst_len += zh->zst_len; + diff = zs->zst_len - zfetch_block_cap; + if (diff > 0) { + zs->zst_offset += diff; + zs->zst_len = zs->zst_len > diff ? + zs->zst_len - diff : 0; + } + zs->zst_direction = ZFETCH_FORWARD; + + break; + + /* + * Same as above, but reading backwards through the file. + */ + } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) { + /* backwards sequential access */ + + reset = !prefetched && zs->zst_len > 1; + + mutex_enter(&zs->zst_lock); + + if (zh->zst_offset != zs->zst_offset - zh->zst_len) { + mutex_exit(&zs->zst_lock); + goto top; + } + + zs->zst_offset = zs->zst_offset > zh->zst_len ? + zs->zst_offset - zh->zst_len : 0; + zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ? + zs->zst_ph_offset - zh->zst_len : 0; + zs->zst_len += zh->zst_len; + + diff = zs->zst_len - zfetch_block_cap; + if (diff > 0) { + zs->zst_ph_offset = zs->zst_ph_offset > diff ? + zs->zst_ph_offset - diff : 0; + zs->zst_len = zs->zst_len > diff ? + zs->zst_len - diff : zs->zst_len; + } + zs->zst_direction = ZFETCH_BACKWARD; + + break; + + } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride < + zs->zst_len) && (zs->zst_len != zs->zst_stride)) { + /* strided forward access */ + + mutex_enter(&zs->zst_lock); + + if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >= + zs->zst_len) || (zs->zst_len == zs->zst_stride)) { + mutex_exit(&zs->zst_lock); + goto top; + } + + zs->zst_offset += zs->zst_stride; + zs->zst_direction = ZFETCH_FORWARD; + + break; + + } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride < + zs->zst_len) && (zs->zst_len != zs->zst_stride)) { + /* strided reverse access */ + + mutex_enter(&zs->zst_lock); + + if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >= + zs->zst_len) || (zs->zst_len == zs->zst_stride)) { + mutex_exit(&zs->zst_lock); + goto top; + } + + zs->zst_offset = zs->zst_offset > zs->zst_stride ? + zs->zst_offset - zs->zst_stride : 0; + zs->zst_ph_offset = (zs->zst_ph_offset > + (2 * zs->zst_stride)) ? + (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0; + zs->zst_direction = ZFETCH_BACKWARD; + + break; + } + } + + if (zs) { + if (reset) { + zstream_t *remove = zs; + + rc = 0; + mutex_exit(&zs->zst_lock); + rw_exit(&zf->zf_rwlock); + rw_enter(&zf->zf_rwlock, RW_WRITER); + /* + * Relocate the stream, in case someone removes + * it while we were acquiring the WRITER lock. + */ + for (zs = list_head(&zf->zf_stream); zs; + zs = list_next(&zf->zf_stream, zs)) { + if (zs == remove) { + dmu_zfetch_stream_remove(zf, zs); + mutex_destroy(&zs->zst_lock); + kmem_free(zs, sizeof (zstream_t)); + break; + } + } + } else { + rc = 1; + dmu_zfetch_dofetch(zf, zs); + mutex_exit(&zs->zst_lock); + } + } +out: + rw_exit(&zf->zf_rwlock); + return (rc); +} + +/* + * Clean-up state associated with a zfetch structure. This frees allocated + * structure members, empties the zf_stream tree, and generally makes things + * nice. This doesn't free the zfetch_t itself, that's left to the caller. + */ +void +dmu_zfetch_rele(zfetch_t *zf) +{ + zstream_t *zs; + zstream_t *zs_next; + + ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); + + for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) { + zs_next = list_next(&zf->zf_stream, zs); + + list_remove(&zf->zf_stream, zs); + mutex_destroy(&zs->zst_lock); + kmem_free(zs, sizeof (zstream_t)); + } + list_destroy(&zf->zf_stream); + rw_destroy(&zf->zf_rwlock); + + zf->zf_dnode = NULL; +} + +/* + * Given a zfetch and zstream structure, insert the zstream structure into the + * AVL tree contained within the zfetch structure. Peform the appropriate + * book-keeping. It is possible that another thread has inserted a stream which + * matches one that we are about to insert, so we must be sure to check for this + * case. If one is found, return failure, and let the caller cleanup the + * duplicates. + */ +static int +dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs) +{ + zstream_t *zs_walk; + zstream_t *zs_next; + + ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + + for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) { + zs_next = list_next(&zf->zf_stream, zs_walk); + + if (dmu_zfetch_streams_equal(zs_walk, zs)) { + return (0); + } + } + + list_insert_head(&zf->zf_stream, zs); + zf->zf_stream_cnt++; + + return (1); +} + + +/* + * Walk the list of zstreams in the given zfetch, find an old one (by time), and + * reclaim it for use by the caller. + */ +static zstream_t * +dmu_zfetch_stream_reclaim(zfetch_t *zf) +{ + zstream_t *zs; + + if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) + return (0); + + for (zs = list_head(&zf->zf_stream); zs; + zs = list_next(&zf->zf_stream, zs)) { + + if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap) + break; + } + + if (zs) { + dmu_zfetch_stream_remove(zf, zs); + mutex_destroy(&zs->zst_lock); + bzero(zs, sizeof (zstream_t)); + } else { + zf->zf_alloc_fail++; + } + rw_exit(&zf->zf_rwlock); + + return (zs); +} + +/* + * Given a zfetch and zstream structure, remove the zstream structure from its + * container in the zfetch structure. Perform the appropriate book-keeping. + */ +static void +dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) +{ + ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + + list_remove(&zf->zf_stream, zs); + zf->zf_stream_cnt--; +} + +static int +dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2) +{ + if (zs1->zst_offset != zs2->zst_offset) + return (0); + + if (zs1->zst_len != zs2->zst_len) + return (0); + + if (zs1->zst_stride != zs2->zst_stride) + return (0); + + if (zs1->zst_ph_offset != zs2->zst_ph_offset) + return (0); + + if (zs1->zst_cap != zs2->zst_cap) + return (0); + + if (zs1->zst_direction != zs2->zst_direction) + return (0); + + return (1); +} + +/* + * This is the prefetch entry point. It calls all of the other dmu_zfetch + * routines to create, delete, find, or operate upon prefetch streams. + */ +void +dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) +{ + zstream_t zst; + zstream_t *newstream; + int fetched; + int inserted; + unsigned int blkshft; + uint64_t blksz; + + if (zfs_prefetch_disable) + return; + + /* files that aren't ln2 blocksz are only one block -- nothing to do */ + if (!zf->zf_dnode->dn_datablkshift) + return; + + /* convert offset and size, into blockid and nblocks */ + blkshft = zf->zf_dnode->dn_datablkshift; + blksz = (1 << blkshft); + + bzero(&zst, sizeof (zstream_t)); + zst.zst_offset = offset >> blkshft; + zst.zst_len = (P2ROUNDUP(offset + size, blksz) - + P2ALIGN(offset, blksz)) >> blkshft; + + fetched = dmu_zfetch_find(zf, &zst, prefetched); + if (!fetched) { + fetched = dmu_zfetch_colinear(zf, &zst); + } + + if (!fetched) { + newstream = dmu_zfetch_stream_reclaim(zf); + + /* + * we still couldn't find a stream, drop the lock, and allocate + * one if possible. Otherwise, give up and go home. + */ + if (newstream == NULL) { + uint64_t maxblocks; + uint32_t max_streams; + uint32_t cur_streams; + + cur_streams = zf->zf_stream_cnt; + maxblocks = zf->zf_dnode->dn_maxblkid; + + max_streams = MIN(zfetch_max_streams, + (maxblocks / zfetch_block_cap)); + if (max_streams == 0) { + max_streams++; + } + + if (cur_streams >= max_streams) { + return; + } + + newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP); + } + + newstream->zst_offset = zst.zst_offset; + newstream->zst_len = zst.zst_len; + newstream->zst_stride = zst.zst_len; + newstream->zst_ph_offset = zst.zst_len + zst.zst_offset; + newstream->zst_cap = zst.zst_len; + newstream->zst_direction = ZFETCH_FORWARD; + newstream->zst_last = lbolt; + + mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL); + + rw_enter(&zf->zf_rwlock, RW_WRITER); + inserted = dmu_zfetch_stream_insert(zf, newstream); + rw_exit(&zf->zf_rwlock); + + if (!inserted) { + mutex_destroy(&newstream->zst_lock); + kmem_free(newstream, sizeof (zstream_t)); + } + } +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c new file mode 100644 index 0000000..65bb518 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -0,0 +1,1370 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_zfetch.h> + +static int free_range_compar(const void *node1, const void *node2); + +static kmem_cache_t *dnode_cache; + +static dnode_phys_t dnode_phys_zero; + +int zfs_default_bs = SPA_MINBLOCKSHIFT; +int zfs_default_ibs = DN_MAX_INDBLKSHIFT; + +/* ARGSUSED */ +static int +dnode_cons(void *arg, void *unused, int kmflag) +{ + int i; + dnode_t *dn = arg; + bzero(dn, sizeof (dnode_t)); + + cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); + rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); + refcount_create(&dn->dn_holds); + refcount_create(&dn->dn_tx_holds); + + for (i = 0; i < TXG_SIZE; i++) { + avl_create(&dn->dn_ranges[i], free_range_compar, + sizeof (free_range_t), + offsetof(struct free_range, fr_node)); + list_create(&dn->dn_dirty_records[i], + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + } + + list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + + return (0); +} + +/* ARGSUSED */ +static void +dnode_dest(void *arg, void *unused) +{ + int i; + dnode_t *dn = arg; + + cv_destroy(&dn->dn_notxholds); + rw_destroy(&dn->dn_struct_rwlock); + mutex_destroy(&dn->dn_mtx); + mutex_destroy(&dn->dn_dbufs_mtx); + refcount_destroy(&dn->dn_holds); + refcount_destroy(&dn->dn_tx_holds); + + for (i = 0; i < TXG_SIZE; i++) { + avl_destroy(&dn->dn_ranges[i]); + list_destroy(&dn->dn_dirty_records[i]); + } + + list_destroy(&dn->dn_dbufs); +} + +void +dnode_init(void) +{ + dnode_cache = kmem_cache_create("dnode_t", + sizeof (dnode_t), + 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); +} + +void +dnode_fini(void) +{ + kmem_cache_destroy(dnode_cache); +} + + +#ifdef ZFS_DEBUG +void +dnode_verify(dnode_t *dn) +{ + int drop_struct_lock = FALSE; + + ASSERT(dn->dn_phys); + ASSERT(dn->dn_objset); + + ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + + if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) + return; + + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { + int i; + ASSERT3U(dn->dn_indblkshift, >=, 0); + ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); + if (dn->dn_datablkshift) { + ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); + ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); + ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz); + } + ASSERT3U(dn->dn_nlevels, <=, 30); + ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES); + ASSERT3U(dn->dn_nblkptr, >=, 1); + ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); + ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT3U(dn->dn_datablksz, ==, + dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); + ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); + ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + + dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); + for (i = 0; i < TXG_SIZE; i++) { + ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); + } + } + if (dn->dn_phys->dn_type != DMU_OT_NONE) + ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL); + if (dn->dn_dbuf != NULL) { + ASSERT3P(dn->dn_phys, ==, + (dnode_phys_t *)dn->dn_dbuf->db.db_data + + (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); + } + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); +} +#endif + +void +dnode_byteswap(dnode_phys_t *dnp) +{ + uint64_t *buf64 = (void*)&dnp->dn_blkptr; + int i; + + if (dnp->dn_type == DMU_OT_NONE) { + bzero(dnp, sizeof (dnode_phys_t)); + return; + } + + dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); + dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); + dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); + dnp->dn_used = BSWAP_64(dnp->dn_used); + + /* + * dn_nblkptr is only one byte, so it's OK to read it in either + * byte order. We can't read dn_bouslen. + */ + ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); + ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); + for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) + buf64[i] = BSWAP_64(buf64[i]); + + /* + * OK to check dn_bonuslen for zero, because it won't matter if + * we have the wrong byte order. This is necessary because the + * dnode dnode is smaller than a regular dnode. + */ + if (dnp->dn_bonuslen != 0) { + /* + * Note that the bonus length calculated here may be + * longer than the actual bonus buffer. This is because + * we always put the bonus buffer after the last block + * pointer (instead of packing it against the end of the + * dnode buffer). + */ + int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); + size_t len = DN_MAX_BONUSLEN - off; + ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES); + dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len); + } +} + +void +dnode_buf_byteswap(void *vbuf, size_t size) +{ + dnode_phys_t *buf = vbuf; + int i; + + ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT)); + ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0); + + size >>= DNODE_SHIFT; + for (i = 0; i < size; i++) { + dnode_byteswap(buf); + buf++; + } +} + +static int +free_range_compar(const void *node1, const void *node2) +{ + const free_range_t *rp1 = node1; + const free_range_t *rp2 = node2; + + if (rp1->fr_blkid < rp2->fr_blkid) + return (-1); + else if (rp1->fr_blkid > rp2->fr_blkid) + return (1); + else return (0); +} + +static void +dnode_setdblksz(dnode_t *dn, int size) +{ + ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(size, >=, SPA_MINBLOCKSIZE); + ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, + 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); + dn->dn_datablksz = size; + dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; + dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0; +} + +static dnode_t * +dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, + uint64_t object) +{ + dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); + (void) dnode_cons(dn, NULL, 0); /* XXX */ + + dn->dn_objset = os; + dn->dn_object = object; + dn->dn_dbuf = db; + dn->dn_phys = dnp; + + if (dnp->dn_datablkszsec) + dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + dn->dn_indblkshift = dnp->dn_indblkshift; + dn->dn_nlevels = dnp->dn_nlevels; + dn->dn_type = dnp->dn_type; + dn->dn_nblkptr = dnp->dn_nblkptr; + dn->dn_checksum = dnp->dn_checksum; + dn->dn_compress = dnp->dn_compress; + dn->dn_bonustype = dnp->dn_bonustype; + dn->dn_bonuslen = dnp->dn_bonuslen; + dn->dn_maxblkid = dnp->dn_maxblkid; + + dmu_zfetch_init(&dn->dn_zfetch, dn); + + ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + mutex_enter(&os->os_lock); + list_insert_head(&os->os_dnodes, dn); + mutex_exit(&os->os_lock); + + return (dn); +} + +static void +dnode_destroy(dnode_t *dn) +{ + objset_impl_t *os = dn->dn_objset; + +#ifdef ZFS_DEBUG + int i; + + for (i = 0; i < TXG_SIZE; i++) { + ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + ASSERT(NULL == list_head(&dn->dn_dirty_records[i])); + ASSERT(0 == avl_numnodes(&dn->dn_ranges[i])); + } + ASSERT(NULL == list_head(&dn->dn_dbufs)); +#endif + + mutex_enter(&os->os_lock); + list_remove(&os->os_dnodes, dn); + mutex_exit(&os->os_lock); + + if (dn->dn_dirtyctx_firstset) { + kmem_free(dn->dn_dirtyctx_firstset, 1); + dn->dn_dirtyctx_firstset = NULL; + } + dmu_zfetch_rele(&dn->dn_zfetch); + if (dn->dn_bonus) { + mutex_enter(&dn->dn_bonus->db_mtx); + dbuf_evict(dn->dn_bonus); + dn->dn_bonus = NULL; + } + kmem_cache_free(dnode_cache, dn); +} + +void +dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + int i; + + if (blocksize == 0) + blocksize = 1 << zfs_default_bs; + else if (blocksize > SPA_MAXBLOCKSIZE) + blocksize = SPA_MAXBLOCKSIZE; + else + blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); + + if (ibs == 0) + ibs = zfs_default_ibs; + + ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); + + dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset, + dn->dn_object, tx->tx_txg, blocksize, ibs); + + ASSERT(dn->dn_type == DMU_OT_NONE); + ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); + ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); + ASSERT(ot != DMU_OT_NONE); + ASSERT3U(ot, <, DMU_OT_NUMTYPES); + ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype != DMU_OT_NONE && bonuslen != 0)); + ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT(dn->dn_type == DMU_OT_NONE); + ASSERT3U(dn->dn_maxblkid, ==, 0); + ASSERT3U(dn->dn_allocated_txg, ==, 0); + ASSERT3U(dn->dn_assigned_txg, ==, 0); + ASSERT(refcount_is_zero(&dn->dn_tx_holds)); + ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); + ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + + for (i = 0; i < TXG_SIZE; i++) { + ASSERT3U(dn->dn_next_nlevels[i], ==, 0); + ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); + ASSERT3U(dn->dn_next_blksz[i], ==, 0); + ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); + ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0); + } + + dn->dn_type = ot; + dnode_setdblksz(dn, blocksize); + dn->dn_indblkshift = ibs; + dn->dn_nlevels = 1; + dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + dn->dn_bonustype = bonustype; + dn->dn_bonuslen = bonuslen; + dn->dn_checksum = ZIO_CHECKSUM_INHERIT; + dn->dn_compress = ZIO_COMPRESS_INHERIT; + dn->dn_dirtyctx = 0; + + dn->dn_free_txg = 0; + if (dn->dn_dirtyctx_firstset) { + kmem_free(dn->dn_dirtyctx_firstset, 1); + dn->dn_dirtyctx_firstset = NULL; + } + + dn->dn_allocated_txg = tx->tx_txg; + + dnode_setdirty(dn, tx); + dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; + dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; +} + +void +dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + int i; + dmu_buf_impl_t *db = NULL; + + ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); + ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0); + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + ASSERT(tx->tx_txg != 0); + ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype != DMU_OT_NONE && bonuslen != 0)); + ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); + + for (i = 0; i < TXG_SIZE; i++) + ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + + /* clean up any unreferenced dbufs */ + (void) dnode_evict_dbufs(dn, 0); + ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + + /* + * XXX I should really have a generation number to tell if we + * need to do this... + */ + if (blocksize != dn->dn_datablksz || + dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) { + /* free all old data */ + dnode_free_range(dn, 0, -1ULL, tx); + } + + /* change blocksize */ + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (blocksize != dn->dn_datablksz && + (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || + list_head(&dn->dn_dbufs) != NULL)) { + db = dbuf_hold(dn, 0, FTAG); + dbuf_new_size(db, blocksize, tx); + } + dnode_setdblksz(dn, blocksize); + dnode_setdirty(dn, tx); + dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; + rw_exit(&dn->dn_struct_rwlock); + if (db) { + dbuf_rele(db, FTAG); + db = NULL; + } + + /* change type */ + dn->dn_type = ot; + + if (dn->dn_bonuslen != bonuslen) { + /* change bonus size */ + if (bonuslen == 0) + bonuslen = 1; /* XXX */ + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_bonus == NULL) + dn->dn_bonus = dbuf_create_bonus(dn); + db = dn->dn_bonus; + rw_exit(&dn->dn_struct_rwlock); + if (refcount_add(&db->db_holds, FTAG) == 1) + dnode_add_ref(dn, db); + VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); + mutex_enter(&db->db_mtx); + ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); + ASSERT(db->db.db_data != NULL); + db->db.db_size = bonuslen; + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + } + + /* change bonus size and type */ + mutex_enter(&dn->dn_mtx); + dn->dn_bonustype = bonustype; + dn->dn_bonuslen = bonuslen; + dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + dn->dn_checksum = ZIO_CHECKSUM_INHERIT; + dn->dn_compress = ZIO_COMPRESS_INHERIT; + ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); + + /* + * NB: we have to do the dbuf_rele after we've changed the + * dn_bonuslen, for the sake of dbuf_verify(). + */ + if (db) + dbuf_rele(db, FTAG); + + dn->dn_allocated_txg = tx->tx_txg; + mutex_exit(&dn->dn_mtx); +} + +void +dnode_special_close(dnode_t *dn) +{ + /* + * Wait for final references to the dnode to clear. This can + * only happen if the arc is asyncronously evicting state that + * has a hold on this dnode while we are trying to evict this + * dnode. + */ + while (refcount_count(&dn->dn_holds) > 0) + delay(1); + dnode_destroy(dn); +} + +dnode_t * +dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object) +{ + dnode_t *dn = dnode_create(os, dnp, NULL, object); + DNODE_VERIFY(dn); + return (dn); +} + +static void +dnode_buf_pageout(dmu_buf_t *db, void *arg) +{ + dnode_t **children_dnodes = arg; + int i; + int epb = db->db_size >> DNODE_SHIFT; + + for (i = 0; i < epb; i++) { + dnode_t *dn = children_dnodes[i]; + int n; + + if (dn == NULL) + continue; +#ifdef ZFS_DEBUG + /* + * If there are holds on this dnode, then there should + * be holds on the dnode's containing dbuf as well; thus + * it wouldn't be eligable for eviction and this function + * would not have been called. + */ + ASSERT(refcount_is_zero(&dn->dn_holds)); + ASSERT(list_head(&dn->dn_dbufs) == NULL); + ASSERT(refcount_is_zero(&dn->dn_tx_holds)); + + for (n = 0; n < TXG_SIZE; n++) + ASSERT(!list_link_active(&dn->dn_dirty_link[n])); +#endif + children_dnodes[i] = NULL; + dnode_destroy(dn); + } + kmem_free(children_dnodes, epb * sizeof (dnode_t *)); +} + +/* + * errors: + * EINVAL - invalid object number. + * EIO - i/o error. + * succeeds even for free dnodes. + */ +int +dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, + void *tag, dnode_t **dnp) +{ + int epb, idx, err; + int drop_struct_lock = FALSE; + int type; + uint64_t blk; + dnode_t *mdn, *dn; + dmu_buf_impl_t *db; + dnode_t **children_dnodes; + + if (object == 0 || object >= DN_MAX_OBJECT) + return (EINVAL); + + mdn = os->os_meta_dnode; + + DNODE_VERIFY(mdn); + + if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { + rw_enter(&mdn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); + + db = dbuf_hold(mdn, blk, FTAG); + if (drop_struct_lock) + rw_exit(&mdn->dn_struct_rwlock); + if (db == NULL) + return (EIO); + err = dbuf_read(db, NULL, DB_RF_CANFAIL); + if (err) { + dbuf_rele(db, FTAG); + return (err); + } + + ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT); + epb = db->db.db_size >> DNODE_SHIFT; + + idx = object & (epb-1); + + children_dnodes = dmu_buf_get_user(&db->db); + if (children_dnodes == NULL) { + dnode_t **winner; + children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *), + KM_SLEEP); + if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL, + dnode_buf_pageout)) { + kmem_free(children_dnodes, epb * sizeof (dnode_t *)); + children_dnodes = winner; + } + } + + if ((dn = children_dnodes[idx]) == NULL) { + dnode_t *winner; + dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx, + db, object); + winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn); + if (winner != NULL) { + dnode_destroy(dn); + dn = winner; + } + } + + mutex_enter(&dn->dn_mtx); + type = dn->dn_type; + if (dn->dn_free_txg || + ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || + ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) { + mutex_exit(&dn->dn_mtx); + dbuf_rele(db, FTAG); + return (type == DMU_OT_NONE ? ENOENT : EEXIST); + } + mutex_exit(&dn->dn_mtx); + + if (refcount_add(&dn->dn_holds, tag) == 1) + dbuf_add_ref(db, dn); + + DNODE_VERIFY(dn); + ASSERT3P(dn->dn_dbuf, ==, db); + ASSERT3U(dn->dn_object, ==, object); + dbuf_rele(db, FTAG); + + *dnp = dn; + return (0); +} + +/* + * Return held dnode if the object is allocated, NULL if not. + */ +int +dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp) +{ + return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); +} + +void +dnode_add_ref(dnode_t *dn, void *tag) +{ + ASSERT(refcount_count(&dn->dn_holds) > 0); + (void) refcount_add(&dn->dn_holds, tag); +} + +void +dnode_rele(dnode_t *dn, void *tag) +{ + uint64_t refs; + + refs = refcount_remove(&dn->dn_holds, tag); + /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ + if (refs == 0 && dn->dn_dbuf) + dbuf_rele(dn->dn_dbuf, dn); +} + +void +dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) +{ + objset_impl_t *os = dn->dn_objset; + uint64_t txg = tx->tx_txg; + + if (dn->dn_object == DMU_META_DNODE_OBJECT) + return; + + DNODE_VERIFY(dn); + +#ifdef ZFS_DEBUG + mutex_enter(&dn->dn_mtx); + ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); + /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */ + mutex_exit(&dn->dn_mtx); +#endif + + mutex_enter(&os->os_lock); + + /* + * If we are already marked dirty, we're done. + */ + if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { + mutex_exit(&os->os_lock); + return; + } + + ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); + ASSERT(dn->dn_datablksz != 0); + ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0); + + dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", + dn->dn_object, txg); + + if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) { + list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn); + } else { + list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn); + } + + mutex_exit(&os->os_lock); + + /* + * The dnode maintains a hold on its containing dbuf as + * long as there are holds on it. Each instantiated child + * dbuf maintaines a hold on the dnode. When the last child + * drops its hold, the dnode will drop its hold on the + * containing dbuf. We add a "dirty hold" here so that the + * dnode will hang around after we finish processing its + * children. + */ + dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg); + + (void) dbuf_dirty(dn->dn_dbuf, tx); + + dsl_dataset_dirty(os->os_dsl_dataset, tx); +} + +void +dnode_free(dnode_t *dn, dmu_tx_t *tx) +{ + int txgoff = tx->tx_txg & TXG_MASK; + + dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg); + + /* we should be the only holder... hopefully */ + /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */ + + mutex_enter(&dn->dn_mtx); + if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { + mutex_exit(&dn->dn_mtx); + return; + } + dn->dn_free_txg = tx->tx_txg; + mutex_exit(&dn->dn_mtx); + + /* + * If the dnode is already dirty, it needs to be moved from + * the dirty list to the free list. + */ + mutex_enter(&dn->dn_objset->os_lock); + if (list_link_active(&dn->dn_dirty_link[txgoff])) { + list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn); + list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn); + mutex_exit(&dn->dn_objset->os_lock); + } else { + mutex_exit(&dn->dn_objset->os_lock); + dnode_setdirty(dn, tx); + } +} + +/* + * Try to change the block size for the indicated dnode. This can only + * succeed if there are no blocks allocated or dirty beyond first block + */ +int +dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db, *db_next; + int have_db0 = FALSE; + + if (size == 0) + size = SPA_MINBLOCKSIZE; + if (size > SPA_MAXBLOCKSIZE) + size = SPA_MAXBLOCKSIZE; + else + size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); + + if (ibs == dn->dn_indblkshift) + ibs = 0; + + if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) + return (0); + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + + /* Check for any allocated blocks beyond the first */ + if (dn->dn_phys->dn_maxblkid != 0) + goto fail; + + mutex_enter(&dn->dn_dbufs_mtx); + for (db = list_head(&dn->dn_dbufs); db; db = db_next) { + db_next = list_next(&dn->dn_dbufs, db); + + if (db->db_blkid == 0) { + have_db0 = TRUE; + } else if (db->db_blkid != DB_BONUS_BLKID) { + mutex_exit(&dn->dn_dbufs_mtx); + goto fail; + } + } + mutex_exit(&dn->dn_dbufs_mtx); + + if (ibs && dn->dn_nlevels != 1) + goto fail; + + db = NULL; + if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) { + /* obtain the old block */ + db = dbuf_hold(dn, 0, FTAG); + dbuf_new_size(db, size, tx); + } + + dnode_setdblksz(dn, size); + dnode_setdirty(dn, tx); + dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; + if (ibs) { + dn->dn_indblkshift = ibs; + dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; + } + + if (db) + dbuf_rele(db, FTAG); + + rw_exit(&dn->dn_struct_rwlock); + return (0); + +fail: + rw_exit(&dn->dn_struct_rwlock); + return (ENOTSUP); +} + +void +dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) +{ + uint64_t txgoff = tx->tx_txg & TXG_MASK; + int drop_struct_lock = FALSE; + int epbs, new_nlevels; + uint64_t sz; + + ASSERT(blkid != DB_BONUS_BLKID); + + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + drop_struct_lock = TRUE; + } + + if (blkid <= dn->dn_maxblkid) + goto out; + + dn->dn_maxblkid = blkid; + + /* + * Compute the number of levels necessary to support the new maxblkid. + */ + new_nlevels = 1; + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (sz = dn->dn_nblkptr; + sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) + new_nlevels++; + + if (new_nlevels > dn->dn_nlevels) { + int old_nlevels = dn->dn_nlevels; + dmu_buf_impl_t *db; + list_t *list; + dbuf_dirty_record_t *new, *dr, *dr_next; + + dn->dn_nlevels = new_nlevels; + + ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); + dn->dn_next_nlevels[txgoff] = new_nlevels; + + /* dirty the left indirects */ + db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); + new = dbuf_dirty(db, tx); + dbuf_rele(db, FTAG); + + /* transfer the dirty records to the new indirect */ + mutex_enter(&dn->dn_mtx); + mutex_enter(&new->dt.di.dr_mtx); + list = &dn->dn_dirty_records[txgoff]; + for (dr = list_head(list); dr; dr = dr_next) { + dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); + if (dr->dr_dbuf->db_level != new_nlevels-1 && + dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) { + ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); + list_remove(&dn->dn_dirty_records[txgoff], dr); + list_insert_tail(&new->dt.di.dr_children, dr); + dr->dr_parent = new; + } + } + mutex_exit(&new->dt.di.dr_mtx); + mutex_exit(&dn->dn_mtx); + } + +out: + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); +} + +void +dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +{ + avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK]; + avl_index_t where; + free_range_t *rp; + free_range_t rp_tofind; + uint64_t endblk = blkid + nblks; + + ASSERT(MUTEX_HELD(&dn->dn_mtx)); + ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */ + + dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", + blkid, nblks, tx->tx_txg); + rp_tofind.fr_blkid = blkid; + rp = avl_find(tree, &rp_tofind, &where); + if (rp == NULL) + rp = avl_nearest(tree, where, AVL_BEFORE); + if (rp == NULL) + rp = avl_nearest(tree, where, AVL_AFTER); + + while (rp && (rp->fr_blkid <= blkid + nblks)) { + uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks; + free_range_t *nrp = AVL_NEXT(tree, rp); + + if (blkid <= rp->fr_blkid && endblk >= fr_endblk) { + /* clear this entire range */ + avl_remove(tree, rp); + kmem_free(rp, sizeof (free_range_t)); + } else if (blkid <= rp->fr_blkid && + endblk > rp->fr_blkid && endblk < fr_endblk) { + /* clear the beginning of this range */ + rp->fr_blkid = endblk; + rp->fr_nblks = fr_endblk - endblk; + } else if (blkid > rp->fr_blkid && blkid < fr_endblk && + endblk >= fr_endblk) { + /* clear the end of this range */ + rp->fr_nblks = blkid - rp->fr_blkid; + } else if (blkid > rp->fr_blkid && endblk < fr_endblk) { + /* clear a chunk out of this range */ + free_range_t *new_rp = + kmem_alloc(sizeof (free_range_t), KM_SLEEP); + + new_rp->fr_blkid = endblk; + new_rp->fr_nblks = fr_endblk - endblk; + avl_insert_here(tree, new_rp, rp, AVL_AFTER); + rp->fr_nblks = blkid - rp->fr_blkid; + } + /* there may be no overlap */ + rp = nrp; + } +} + +void +dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + uint64_t blkoff, blkid, nblks; + int blksz, head; + int trunc = FALSE; + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + blksz = dn->dn_datablksz; + + /* If the range is past the end of the file, this is a no-op */ + if (off >= blksz * (dn->dn_maxblkid+1)) + goto out; + if (len == -1ULL) { + len = UINT64_MAX - off; + trunc = TRUE; + } + + /* + * First, block align the region to free: + */ + if (ISP2(blksz)) { + head = P2NPHASE(off, blksz); + blkoff = P2PHASE(off, blksz); + } else { + ASSERT(dn->dn_maxblkid == 0); + if (off == 0 && len >= blksz) { + /* Freeing the whole block; don't do any head. */ + head = 0; + } else { + /* Freeing part of the block. */ + head = blksz - off; + ASSERT3U(head, >, 0); + } + blkoff = off; + } + /* zero out any partial block data at the start of the range */ + if (head) { + ASSERT3U(blkoff + head, ==, blksz); + if (len < head) + head = len; + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, + FTAG, &db) == 0) { + caddr_t data; + + /* don't dirty if it isn't on disk and isn't dirty */ + if (db->db_last_dirty || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { + rw_exit(&dn->dn_struct_rwlock); + dbuf_will_dirty(db, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + data = db->db.db_data; + bzero(data + blkoff, head); + } + dbuf_rele(db, FTAG); + } + off += head; + len -= head; + } + + /* If the range was less than one block, we're done */ + if (len == 0 || off >= blksz * (dn->dn_maxblkid+1)) + goto out; + + if (!ISP2(blksz)) { + /* + * They are freeing the whole block of a + * non-power-of-two blocksize file. Skip all the messy + * math. + */ + ASSERT3U(off, ==, 0); + ASSERT3U(len, >=, blksz); + blkid = 0; + nblks = 1; + } else { + int tail; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + int blkshift = dn->dn_datablkshift; + + /* If the remaining range is past end of file, we're done */ + if (off > dn->dn_maxblkid << blkshift) + goto out; + + if (off + len == UINT64_MAX) + tail = 0; + else + tail = P2PHASE(len, blksz); + + ASSERT3U(P2PHASE(off, blksz), ==, 0); + /* zero out any partial block data at the end of the range */ + if (tail) { + if (len < tail) + tail = len; + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), + TRUE, FTAG, &db) == 0) { + /* don't dirty if not on disk and not dirty */ + if (db->db_last_dirty || + (db->db_blkptr && + !BP_IS_HOLE(db->db_blkptr))) { + rw_exit(&dn->dn_struct_rwlock); + dbuf_will_dirty(db, tx); + rw_enter(&dn->dn_struct_rwlock, + RW_WRITER); + bzero(db->db.db_data, tail); + } + dbuf_rele(db, FTAG); + } + len -= tail; + } + /* If the range did not include a full block, we are done */ + if (len == 0) + goto out; + + /* dirty the left indirects */ + if (dn->dn_nlevels > 1 && off != 0) { + db = dbuf_hold_level(dn, 1, + (off - head) >> (blkshift + epbs), FTAG); + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } + + /* dirty the right indirects */ + if (dn->dn_nlevels > 1 && !trunc) { + db = dbuf_hold_level(dn, 1, + (off + len + tail - 1) >> (blkshift + epbs), FTAG); + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } + + /* + * Finally, add this range to the dnode range list, we + * will finish up this free operation in the syncing phase. + */ + ASSERT(IS_P2ALIGNED(off, 1<<blkshift)); + ASSERT(off + len == UINT64_MAX || + IS_P2ALIGNED(len, 1<<blkshift)); + blkid = off >> blkshift; + nblks = len >> blkshift; + + if (trunc) + dn->dn_maxblkid = (blkid ? blkid - 1 : 0); + } + + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, blkid, nblks, tx); + { + free_range_t *rp, *found; + avl_index_t where; + avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK]; + + /* Add new range to dn_ranges */ + rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP); + rp->fr_blkid = blkid; + rp->fr_nblks = nblks; + found = avl_find(tree, rp, &where); + ASSERT(found == NULL); + avl_insert(tree, rp, where); + dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", + blkid, nblks, tx->tx_txg); + } + mutex_exit(&dn->dn_mtx); + + dbuf_free_range(dn, blkid, nblks, tx); + dnode_setdirty(dn, tx); +out: + rw_exit(&dn->dn_struct_rwlock); +} + +/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ +uint64_t +dnode_block_freed(dnode_t *dn, uint64_t blkid) +{ + free_range_t range_tofind; + void *dp = spa_get_dsl(dn->dn_objset->os_spa); + int i; + + if (blkid == DB_BONUS_BLKID) + return (FALSE); + + /* + * If we're in the process of opening the pool, dp will not be + * set yet, but there shouldn't be anything dirty. + */ + if (dp == NULL) + return (FALSE); + + if (dn->dn_free_txg) + return (TRUE); + + /* + * If dn_datablkshift is not set, then there's only a single + * block, in which case there will never be a free range so it + * won't matter. + */ + range_tofind.fr_blkid = blkid; + mutex_enter(&dn->dn_mtx); + for (i = 0; i < TXG_SIZE; i++) { + free_range_t *range_found; + avl_index_t idx; + + range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx); + if (range_found) { + ASSERT(range_found->fr_nblks > 0); + break; + } + range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE); + if (range_found && + range_found->fr_blkid + range_found->fr_nblks > blkid) + break; + } + mutex_exit(&dn->dn_mtx); + return (i < TXG_SIZE); +} + +/* call from syncing context when we actually write/free space for this dnode */ +void +dnode_diduse_space(dnode_t *dn, int64_t delta) +{ + uint64_t space; + dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n", + dn, dn->dn_phys, + (u_longlong_t)dn->dn_phys->dn_used, + (longlong_t)delta); + + mutex_enter(&dn->dn_mtx); + space = DN_USED_BYTES(dn->dn_phys); + if (delta > 0) { + ASSERT3U(space + delta, >=, space); /* no overflow */ + } else { + ASSERT3U(space, >=, -delta); /* no underflow */ + } + space += delta; + if (spa_version(dn->dn_objset->os_spa) < ZFS_VERSION_DNODE_BYTES) { + ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); + ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0); + dn->dn_phys->dn_used = space >> DEV_BSHIFT; + } else { + dn->dn_phys->dn_used = space; + dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES; + } + mutex_exit(&dn->dn_mtx); +} + +/* + * Call when we think we're going to write/free space in open context. + * Be conservative (ie. OK to write less than this or free more than + * this, but don't write more or free less). + */ +void +dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) +{ + objset_impl_t *os = dn->dn_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; + + if (space > 0) + space = spa_get_asize(os->os_spa, space); + + if (ds) + dsl_dir_willuse_space(ds->ds_dir, space, tx); + + dmu_tx_willuse_space(tx, space); +} + +static int +dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, + int lvl, uint64_t blkfill, uint64_t txg) +{ + dmu_buf_impl_t *db = NULL; + void *data = NULL; + uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t epb = 1ULL << epbs; + uint64_t minfill, maxfill; + int i, error, span; + + dprintf("probing object %llu offset %llx level %d of %u\n", + dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); + + if (lvl == dn->dn_phys->dn_nlevels) { + error = 0; + epb = dn->dn_phys->dn_nblkptr; + data = dn->dn_phys->dn_blkptr; + } else { + uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); + error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); + if (error) { + if (error == ENOENT) + return (hole ? 0 : ESRCH); + return (error); + } + error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); + if (error) { + dbuf_rele(db, FTAG); + return (error); + } + data = db->db.db_data; + } + + if (db && txg && + (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) { + error = ESRCH; + } else if (lvl == 0) { + dnode_phys_t *dnp = data; + span = DNODE_SHIFT; + ASSERT(dn->dn_type == DMU_OT_DNODE); + + for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) { + boolean_t newcontents = B_TRUE; + if (txg) { + int j; + newcontents = B_FALSE; + for (j = 0; j < dnp[i].dn_nblkptr; j++) { + if (dnp[i].dn_blkptr[j].blk_birth > txg) + newcontents = B_TRUE; + } + } + if (!dnp[i].dn_type == hole && newcontents) + break; + *offset += 1ULL << span; + } + if (i == blkfill) + error = ESRCH; + } else { + blkptr_t *bp = data; + span = (lvl - 1) * epbs + dn->dn_datablkshift; + minfill = 0; + maxfill = blkfill << ((lvl - 1) * epbs); + + if (hole) + maxfill--; + else + minfill++; + + for (i = (*offset >> span) & ((1ULL << epbs) - 1); + i < epb; i++) { + if (bp[i].blk_fill >= minfill && + bp[i].blk_fill <= maxfill && + bp[i].blk_birth > txg) + break; + *offset += 1ULL << span; + } + if (i >= epb) + error = ESRCH; + } + + if (db) + dbuf_rele(db, FTAG); + + return (error); +} + +/* + * Find the next hole, data, or sparse region at or after *offset. + * The value 'blkfill' tells us how many items we expect to find + * in an L0 data block; this value is 1 for normal objects, + * DNODES_PER_BLOCK for the meta dnode, and some fraction of + * DNODES_PER_BLOCK when searching for sparse regions thereof. + * + * Examples: + * + * dnode_next_offset(dn, hole, offset, 1, 1, 0); + * Finds the next hole/data in a file. + * Used in dmu_offset_next(). + * + * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg); + * Finds the next free/allocated dnode an objset's meta-dnode. + * Only finds objects that have new contents since txg (ie. + * bonus buffer changes and content removal are ignored). + * Used in dmu_object_next(). + * + * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0); + * Finds the next L2 meta-dnode bp that's at most 1/4 full. + * Used in dmu_object_alloc(). + */ +int +dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset, + int minlvl, uint64_t blkfill, uint64_t txg) +{ + int lvl, maxlvl; + int error = 0; + uint64_t initial_offset = *offset; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + if (dn->dn_phys->dn_nlevels == 0) { + rw_exit(&dn->dn_struct_rwlock); + return (ESRCH); + } + + if (dn->dn_datablkshift == 0) { + if (*offset < dn->dn_datablksz) { + if (hole) + *offset = dn->dn_datablksz; + } else { + error = ESRCH; + } + rw_exit(&dn->dn_struct_rwlock); + return (error); + } + + maxlvl = dn->dn_phys->dn_nlevels; + + for (lvl = minlvl; lvl <= maxlvl; lvl++) { + error = dnode_next_offset_level(dn, + hole, offset, lvl, blkfill, txg); + if (error != ESRCH) + break; + } + + while (--lvl >= minlvl && error == 0) { + error = dnode_next_offset_level(dn, + hole, offset, lvl, blkfill, txg); + } + + rw_exit(&dn->dn_struct_rwlock); + + if (error == 0 && initial_offset > *offset) + error = ESRCH; + + return (error); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c new file mode 100644 index 0000000..08f60e8 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -0,0 +1,621 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/spa.h> + +static void +dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + int txgoff = tx->tx_txg & TXG_MASK; + int nblkptr = dn->dn_phys->dn_nblkptr; + int old_toplvl = dn->dn_phys->dn_nlevels - 1; + int new_level = dn->dn_next_nlevels[txgoff]; + int i; + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + + /* this dnode can't be paged out because it's dirty */ + ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); + + db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); + ASSERT(db != NULL); + + dn->dn_phys->dn_nlevels = new_level; + dprintf("os=%p obj=%llu, increase to %d\n", + dn->dn_objset, dn->dn_object, + dn->dn_phys->dn_nlevels); + + /* check for existing blkptrs in the dnode */ + for (i = 0; i < nblkptr; i++) + if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i])) + break; + if (i != nblkptr) { + /* transfer dnode's block pointers to new indirect block */ + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); + ASSERT(db->db.db_data); + ASSERT(arc_released(db->db_buf)); + ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); + bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, + sizeof (blkptr_t) * nblkptr); + arc_buf_freeze(db->db_buf); + } + + /* set dbuf's parent pointers to new indirect buf */ + for (i = 0; i < nblkptr; i++) { + dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i); + + if (child == NULL) + continue; + ASSERT3P(child->db_dnode, ==, dn); + if (child->db_parent && child->db_parent != dn->dn_dbuf) { + ASSERT(child->db_parent->db_level == db->db_level); + ASSERT(child->db_blkptr != + &dn->dn_phys->dn_blkptr[child->db_blkid]); + mutex_exit(&child->db_mtx); + continue; + } + ASSERT(child->db_parent == NULL || + child->db_parent == dn->dn_dbuf); + + child->db_parent = db; + dbuf_add_ref(db, child); + if (db->db.db_data) + child->db_blkptr = (blkptr_t *)db->db.db_data + i; + else + child->db_blkptr = NULL; + dprintf_dbuf_bp(child, child->db_blkptr, + "changed db_blkptr to new indirect %s", ""); + + mutex_exit(&child->db_mtx); + } + + bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); + + dbuf_rele(db, FTAG); + + rw_exit(&dn->dn_struct_rwlock); +} + +static void +free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) +{ + objset_impl_t *os = dn->dn_objset; + uint64_t bytesfreed = 0; + int i; + + dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num); + + for (i = 0; i < num; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + + bytesfreed += bp_get_dasize(os->os_spa, bp); + ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); + dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx); + bzero(bp, sizeof (blkptr_t)); + } + dnode_diduse_space(dn, -bytesfreed); +} + +#ifdef ZFS_DEBUG +static void +free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) +{ + int off, num; + int i, err, epbs; + uint64_t txg = tx->tx_txg; + + epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + off = start - (db->db_blkid * 1<<epbs); + num = end - start + 1; + + ASSERT3U(off, >=, 0); + ASSERT3U(num, >=, 0); + ASSERT3U(db->db_level, >, 0); + ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift); + ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); + ASSERT(db->db_blkptr != NULL); + + for (i = off; i < off+num; i++) { + uint64_t *buf; + dmu_buf_impl_t *child; + dbuf_dirty_record_t *dr; + int j; + + ASSERT(db->db_level == 1); + + rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(db->db_dnode, db->db_level-1, + (db->db_blkid << epbs) + i, TRUE, FTAG, &child); + rw_exit(&db->db_dnode->dn_struct_rwlock); + if (err == ENOENT) + continue; + ASSERT(err == 0); + ASSERT(child->db_level == 0); + dr = child->db_last_dirty; + while (dr && dr->dr_txg > txg) + dr = dr->dr_next; + ASSERT(dr == NULL || dr->dr_txg == txg); + + /* data_old better be zeroed */ + if (dr) { + buf = dr->dt.dl.dr_data->b_data; + for (j = 0; j < child->db.db_size >> 3; j++) { + if (buf[j] != 0) { + panic("freed data not zero: " + "child=%p i=%d off=%d num=%d\n", + child, i, off, num); + } + } + } + + /* + * db_data better be zeroed unless it's dirty in a + * future txg. + */ + mutex_enter(&child->db_mtx); + buf = child->db.db_data; + if (buf != NULL && child->db_state != DB_FILL && + child->db_last_dirty == NULL) { + for (j = 0; j < child->db.db_size >> 3; j++) { + if (buf[j] != 0) { + panic("freed data not zero: " + "child=%p i=%d off=%d num=%d\n", + child, i, off, num); + } + } + } + mutex_exit(&child->db_mtx); + + dbuf_rele(child, FTAG); + } +} +#endif + +static int +free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, + dmu_tx_t *tx) +{ + dnode_t *dn = db->db_dnode; + blkptr_t *bp; + dmu_buf_impl_t *subdb; + uint64_t start, end, dbstart, dbend, i; + int epbs, shift, err; + int all = TRUE; + + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); + arc_release(db->db_buf, db); + bp = (blkptr_t *)db->db.db_data; + + epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + shift = (db->db_level - 1) * epbs; + dbstart = db->db_blkid << epbs; + start = blkid >> shift; + if (dbstart < start) { + bp += start - dbstart; + all = FALSE; + } else { + start = dbstart; + } + dbend = ((db->db_blkid + 1) << epbs) - 1; + end = (blkid + nblks - 1) >> shift; + if (dbend <= end) + end = dbend; + else if (all) + all = trunc; + ASSERT3U(start, <=, end); + + if (db->db_level == 1) { + FREE_VERIFY(db, start, end, tx); + free_blocks(dn, bp, end-start+1, tx); + arc_buf_freeze(db->db_buf); + ASSERT(all || db->db_last_dirty); + return (all); + } + + for (i = start; i <= end; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb); + ASSERT3U(err, ==, 0); + rw_exit(&dn->dn_struct_rwlock); + + if (free_children(subdb, blkid, nblks, trunc, tx)) { + ASSERT3P(subdb->db_blkptr, ==, bp); + free_blocks(dn, bp, 1, tx); + } else { + all = FALSE; + } + dbuf_rele(subdb, FTAG); + } + arc_buf_freeze(db->db_buf); +#ifdef ZFS_DEBUG + bp -= (end-start)+1; + for (i = start; i <= end; i++, bp++) { + if (i == start && blkid != 0) + continue; + else if (i == end && !trunc) + continue; + ASSERT3U(bp->blk_birth, ==, 0); + } +#endif + ASSERT(all || db->db_last_dirty); + return (all); +} + +/* + * free_range: Traverse the indicated range of the provided file + * and "free" all the blocks contained there. + */ +static void +dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +{ + blkptr_t *bp = dn->dn_phys->dn_blkptr; + dmu_buf_impl_t *db; + int trunc, start, end, shift, i, err; + int dnlevel = dn->dn_phys->dn_nlevels; + + if (blkid > dn->dn_phys->dn_maxblkid) + return; + + ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX); + trunc = blkid + nblks > dn->dn_phys->dn_maxblkid; + if (trunc) + nblks = dn->dn_phys->dn_maxblkid - blkid + 1; + + /* There are no indirect blocks in the object */ + if (dnlevel == 1) { + if (blkid >= dn->dn_phys->dn_nblkptr) { + /* this range was never made persistent */ + return; + } + ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); + free_blocks(dn, bp + blkid, nblks, tx); + if (trunc) { + uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * + (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); + dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); + ASSERT(off < dn->dn_phys->dn_maxblkid || + dn->dn_phys->dn_maxblkid == 0 || + dnode_next_offset(dn, FALSE, &off, + 1, 1, 0) != 0); + } + return; + } + + shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); + start = blkid >> shift; + ASSERT(start < dn->dn_phys->dn_nblkptr); + end = (blkid + nblks - 1) >> shift; + bp += start; + for (i = start; i <= end; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db); + ASSERT3U(err, ==, 0); + rw_exit(&dn->dn_struct_rwlock); + + if (free_children(db, blkid, nblks, trunc, tx)) { + ASSERT3P(db->db_blkptr, ==, bp); + free_blocks(dn, bp, 1, tx); + } + dbuf_rele(db, FTAG); + } + if (trunc) { + uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * + (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); + dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); + ASSERT(off < dn->dn_phys->dn_maxblkid || + dn->dn_phys->dn_maxblkid == 0 || + dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0); + } +} + +/* + * Try to kick all the dnodes dbufs out of the cache... + */ +int +dnode_evict_dbufs(dnode_t *dn, int try) +{ + int progress; + int pass = 0; + + do { + dmu_buf_impl_t *db, marker; + int evicting = FALSE; + + progress = FALSE; + mutex_enter(&dn->dn_dbufs_mtx); + list_insert_tail(&dn->dn_dbufs, &marker); + db = list_head(&dn->dn_dbufs); + for (; db != ▮ db = list_head(&dn->dn_dbufs)) { + list_remove(&dn->dn_dbufs, db); + list_insert_tail(&dn->dn_dbufs, db); + + mutex_enter(&db->db_mtx); + if (db->db_state == DB_EVICTING) { + progress = TRUE; + evicting = TRUE; + mutex_exit(&db->db_mtx); + } else if (refcount_is_zero(&db->db_holds)) { + progress = TRUE; + ASSERT(!arc_released(db->db_buf)); + dbuf_clear(db); /* exits db_mtx for us */ + } else { + mutex_exit(&db->db_mtx); + } + + } + list_remove(&dn->dn_dbufs, &marker); + /* + * NB: we need to drop dn_dbufs_mtx between passes so + * that any DB_EVICTING dbufs can make progress. + * Ideally, we would have some cv we could wait on, but + * since we don't, just wait a bit to give the other + * thread a chance to run. + */ + mutex_exit(&dn->dn_dbufs_mtx); + if (evicting) + delay(1); + pass++; + ASSERT(pass < 100); /* sanity check */ + } while (progress); + + /* + * This function works fine even if it can't evict everything. + * If were only asked to try to evict everything then + * return an error if we can't. Otherwise panic as the caller + * expects total eviction. + */ + if (list_head(&dn->dn_dbufs) != NULL) { + if (try) { + return (1); + } else { + panic("dangling dbufs (dn=%p, dbuf=%p)\n", + dn, list_head(&dn->dn_dbufs)); + } + } + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { + mutex_enter(&dn->dn_bonus->db_mtx); + dbuf_evict(dn->dn_bonus); + dn->dn_bonus = NULL; + } + rw_exit(&dn->dn_struct_rwlock); + return (0); +} + +static void +dnode_undirty_dbufs(list_t *list) +{ + dbuf_dirty_record_t *dr; + + while (dr = list_head(list)) { + dmu_buf_impl_t *db = dr->dr_dbuf; + uint64_t txg = dr->dr_txg; + + mutex_enter(&db->db_mtx); + /* XXX - use dbuf_undirty()? */ + list_remove(list, dr); + ASSERT(db->db_last_dirty == dr); + db->db_last_dirty = NULL; + db->db_dirtycnt -= 1; + if (db->db_level == 0) { + ASSERT(db->db_blkid == DB_BONUS_BLKID || + dr->dt.dl.dr_data == db->db_buf); + dbuf_unoverride(dr); + mutex_exit(&db->db_mtx); + } else { + mutex_exit(&db->db_mtx); + dnode_undirty_dbufs(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + dbuf_rele(db, (void *)(uintptr_t)txg); + } +} + +static void +dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) +{ + int txgoff = tx->tx_txg & TXG_MASK; + + ASSERT(dmu_tx_is_syncing(tx)); + + dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); + (void) dnode_evict_dbufs(dn, 0); + ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + + /* + * XXX - It would be nice to assert this, but we may still + * have residual holds from async evictions from the arc... + * + * zfs_obj_to_path() also depends on this being + * commented out. + * + * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); + */ + + /* Undirty next bits */ + dn->dn_next_nlevels[txgoff] = 0; + dn->dn_next_indblkshift[txgoff] = 0; + dn->dn_next_blksz[txgoff] = 0; + + /* free up all the blocks in the file. */ + dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx); + ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0); + + /* ASSERT(blkptrs are zero); */ + ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); + ASSERT(dn->dn_type != DMU_OT_NONE); + + ASSERT(dn->dn_free_txg > 0); + if (dn->dn_allocated_txg != dn->dn_free_txg) + dbuf_will_dirty(dn->dn_dbuf, tx); + bzero(dn->dn_phys, sizeof (dnode_phys_t)); + + mutex_enter(&dn->dn_mtx); + dn->dn_type = DMU_OT_NONE; + dn->dn_maxblkid = 0; + dn->dn_allocated_txg = 0; + mutex_exit(&dn->dn_mtx); + + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); + + dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); + /* + * Now that we've released our hold, the dnode may + * be evicted, so we musn't access it. + */ +} + +/* + * Write out the dnode's dirty buffers. + * + * NOTE: The dnode is kept in memory by being dirty. Once the + * dirty bit is cleared, it may be evicted. Beware of this! + */ +void +dnode_sync(dnode_t *dn, dmu_tx_t *tx) +{ + free_range_t *rp; + dnode_phys_t *dnp = dn->dn_phys; + int txgoff = tx->tx_txg & TXG_MASK; + list_t *list = &dn->dn_dirty_records[txgoff]; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); + DNODE_VERIFY(dn); + + ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); + + mutex_enter(&dn->dn_mtx); + if (dn->dn_allocated_txg == tx->tx_txg) { + /* The dnode is newly allocated or reallocated */ + if (dnp->dn_type == DMU_OT_NONE) { + /* this is a first alloc, not a realloc */ + /* XXX shouldn't the phys already be zeroed? */ + bzero(dnp, DNODE_CORE_SIZE); + dnp->dn_nlevels = 1; + } + + if (dn->dn_nblkptr > dnp->dn_nblkptr) { + /* zero the new blkptrs we are gaining */ + bzero(dnp->dn_blkptr + dnp->dn_nblkptr, + sizeof (blkptr_t) * + (dn->dn_nblkptr - dnp->dn_nblkptr)); + } + dnp->dn_type = dn->dn_type; + dnp->dn_bonustype = dn->dn_bonustype; + dnp->dn_bonuslen = dn->dn_bonuslen; + dnp->dn_nblkptr = dn->dn_nblkptr; + } + + ASSERT(dnp->dn_nlevels > 1 || + BP_IS_HOLE(&dnp->dn_blkptr[0]) || + BP_GET_LSIZE(&dnp->dn_blkptr[0]) == + dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + + if (dn->dn_next_blksz[txgoff]) { + ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], + SPA_MINBLOCKSIZE) == 0); + ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || + list_head(list) != NULL || + dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == + dnp->dn_datablkszsec); + dnp->dn_datablkszsec = + dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; + dn->dn_next_blksz[txgoff] = 0; + } + + if (dn->dn_next_indblkshift[txgoff]) { + ASSERT(dnp->dn_nlevels == 1); + dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; + dn->dn_next_indblkshift[txgoff] = 0; + } + + /* + * Just take the live (open-context) values for checksum and compress. + * Strictly speaking it's a future leak, but nothing bad happens if we + * start using the new checksum or compress algorithm a little early. + */ + dnp->dn_checksum = dn->dn_checksum; + dnp->dn_compress = dn->dn_compress; + + mutex_exit(&dn->dn_mtx); + + /* process all the "freed" ranges in the file */ + if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) { + for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL; + rp = AVL_PREV(&dn->dn_ranges[txgoff], rp)) + dnode_sync_free_range(dn, + rp->fr_blkid, rp->fr_nblks, tx); + } + mutex_enter(&dn->dn_mtx); + for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) { + free_range_t *last = rp; + rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp); + avl_remove(&dn->dn_ranges[txgoff], last); + kmem_free(last, sizeof (free_range_t)); + } + mutex_exit(&dn->dn_mtx); + + if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) { + dnode_sync_free(dn, tx); + return; + } + + if (dn->dn_next_nlevels[txgoff]) { + dnode_increase_indirection(dn, tx); + dn->dn_next_nlevels[txgoff] = 0; + } + + dbuf_sync_list(list, tx); + + if (dn->dn_object != DMU_META_DNODE_OBJECT) { + ASSERT3P(list_head(list), ==, NULL); + dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); + } + + /* + * Although we have dropped our reference to the dnode, it + * can't be evicted until its written, and we haven't yet + * initiated the IO for the dnode's dbuf. + */ +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c new file mode 100644 index 0000000..a9707a0 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -0,0 +1,1889 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/dmu_traverse.h> +#include <sys/dmu_tx.h> +#include <sys/arc.h> +#include <sys/zio.h> +#include <sys/zap.h> +#include <sys/unique.h> +#include <sys/zfs_context.h> + +static dsl_checkfunc_t dsl_dataset_destroy_begin_check; +static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; +static dsl_checkfunc_t dsl_dataset_rollback_check; +static dsl_syncfunc_t dsl_dataset_rollback_sync; +static dsl_checkfunc_t dsl_dataset_destroy_check; +static dsl_syncfunc_t dsl_dataset_destroy_sync; + +#define DS_REF_MAX (1ULL << 62) + +#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE + +/* + * We use weighted reference counts to express the various forms of exclusion + * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open + * is DS_REF_MAX, and a PRIMARY open is little more than half of an EXCLUSIVE. + * This makes the exclusion logic simple: the total refcnt for all opens cannot + * exceed DS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their + * weight (DS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume + * just over half of the refcnt space, so there can't be more than one, but it + * can peacefully coexist with any number of STANDARD opens. + */ +static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = { + 0, /* DS_MODE_NONE - invalid */ + 1, /* DS_MODE_STANDARD - unlimited number */ + (DS_REF_MAX >> 1) + 1, /* DS_MODE_PRIMARY - only one of these */ + DS_REF_MAX /* DS_MODE_EXCLUSIVE - no other opens */ +}; + + +void +dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) +{ + int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); + + dprintf_bp(bp, "born, ds=%p\n", ds); + + ASSERT(dmu_tx_is_syncing(tx)); + /* It could have been compressed away to nothing */ + if (BP_IS_HOLE(bp)) + return; + ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); + ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES); + if (ds == NULL) { + /* + * Account for the meta-objset space in its placeholder + * dsl_dir. + */ + ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ + dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, + used, compressed, uncompressed, tx); + dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); + return; + } + dmu_buf_will_dirty(ds->ds_dbuf, tx); + mutex_enter(&ds->ds_lock); + ds->ds_phys->ds_used_bytes += used; + ds->ds_phys->ds_compressed_bytes += compressed; + ds->ds_phys->ds_uncompressed_bytes += uncompressed; + ds->ds_phys->ds_unique_bytes += used; + mutex_exit(&ds->ds_lock); + dsl_dir_diduse_space(ds->ds_dir, + used, compressed, uncompressed, tx); +} + +void +dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, + dmu_tx_t *tx) +{ + int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); + + ASSERT(dmu_tx_is_syncing(tx)); + /* No block pointer => nothing to free */ + if (BP_IS_HOLE(bp)) + return; + + ASSERT(used > 0); + if (ds == NULL) { + int err; + /* + * Account for the meta-objset space in its placeholder + * dataset. + */ + err = arc_free(pio, tx->tx_pool->dp_spa, + tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT); + ASSERT(err == 0); + + dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, + -used, -compressed, -uncompressed, tx); + dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); + return; + } + ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { + int err; + + dprintf_bp(bp, "freeing: %s", ""); + err = arc_free(pio, tx->tx_pool->dp_spa, + tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT); + ASSERT(err == 0); + + mutex_enter(&ds->ds_lock); + /* XXX unique_bytes is not accurate for head datasets */ + /* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */ + ds->ds_phys->ds_unique_bytes -= used; + mutex_exit(&ds->ds_lock); + dsl_dir_diduse_space(ds->ds_dir, + -used, -compressed, -uncompressed, tx); + } else { + dprintf_bp(bp, "putting on dead list: %s", ""); + VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); + /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ + if (ds->ds_phys->ds_prev_snap_obj != 0) { + ASSERT3U(ds->ds_prev->ds_object, ==, + ds->ds_phys->ds_prev_snap_obj); + ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); + if (ds->ds_prev->ds_phys->ds_next_snap_obj == + ds->ds_object && bp->blk_birth > + ds->ds_prev->ds_phys->ds_prev_snap_txg) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + mutex_enter(&ds->ds_prev->ds_lock); + ds->ds_prev->ds_phys->ds_unique_bytes += + used; + mutex_exit(&ds->ds_prev->ds_lock); + } + } + } + mutex_enter(&ds->ds_lock); + ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); + ds->ds_phys->ds_used_bytes -= used; + ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); + ds->ds_phys->ds_compressed_bytes -= compressed; + ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); + ds->ds_phys->ds_uncompressed_bytes -= uncompressed; + mutex_exit(&ds->ds_lock); +} + +uint64_t +dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) +{ + uint64_t trysnap = 0; + + if (ds == NULL) + return (0); + /* + * The snapshot creation could fail, but that would cause an + * incorrect FALSE return, which would only result in an + * overestimation of the amount of space that an operation would + * consume, which is OK. + * + * There's also a small window where we could miss a pending + * snapshot, because we could set the sync task in the quiescing + * phase. So this should only be used as a guess. + */ + if (ds->ds_trysnap_txg > + spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) + trysnap = ds->ds_trysnap_txg; + return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); +} + +int +dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth) +{ + return (blk_birth > dsl_dataset_prev_snap_txg(ds)); +} + +/* ARGSUSED */ +static void +dsl_dataset_evict(dmu_buf_t *db, void *dsv) +{ + dsl_dataset_t *ds = dsv; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + /* open_refcount == DS_REF_MAX when deleting */ + ASSERT(ds->ds_open_refcount == 0 || + ds->ds_open_refcount == DS_REF_MAX); + + dprintf_ds(ds, "evicting %s\n", ""); + + unique_remove(ds->ds_phys->ds_fsid_guid); + + if (ds->ds_user_ptr != NULL) + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + + if (ds->ds_prev) { + dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds); + ds->ds_prev = NULL; + } + + bplist_close(&ds->ds_deadlist); + dsl_dir_close(ds->ds_dir, ds); + + if (list_link_active(&ds->ds_synced_link)) + list_remove(&dp->dp_synced_objsets, ds); + + mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_deadlist.bpl_lock); + + kmem_free(ds, sizeof (dsl_dataset_t)); +} + +static int +dsl_dataset_get_snapname(dsl_dataset_t *ds) +{ + dsl_dataset_phys_t *headphys; + int err; + dmu_buf_t *headdbuf; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + + if (ds->ds_snapname[0]) + return (0); + if (ds->ds_phys->ds_next_snap_obj == 0) + return (0); + + err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, + FTAG, &headdbuf); + if (err) + return (err); + headphys = headdbuf->db_data; + err = zap_value_search(dp->dp_meta_objset, + headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname); + dmu_buf_rele(headdbuf, FTAG); + return (err); +} + +int +dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, + int mode, void *tag, dsl_dataset_t **dsp) +{ + uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)]; + objset_t *mos = dp->dp_meta_objset; + dmu_buf_t *dbuf; + dsl_dataset_t *ds; + int err; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || + dsl_pool_sync_context(dp)); + + err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); + if (err) + return (err); + ds = dmu_buf_get_user(dbuf); + if (ds == NULL) { + dsl_dataset_t *winner; + + ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); + ds->ds_dbuf = dbuf; + ds->ds_object = dsobj; + ds->ds_phys = dbuf->db_data; + + mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT, + NULL); + + err = bplist_open(&ds->ds_deadlist, + mos, ds->ds_phys->ds_deadlist_obj); + if (err == 0) { + err = dsl_dir_open_obj(dp, + ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); + } + if (err) { + /* + * we don't really need to close the blist if we + * just opened it. + */ + mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_deadlist.bpl_lock); + kmem_free(ds, sizeof (dsl_dataset_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } + + if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) { + ds->ds_snapname[0] = '\0'; + if (ds->ds_phys->ds_prev_snap_obj) { + err = dsl_dataset_open_obj(dp, + ds->ds_phys->ds_prev_snap_obj, NULL, + DS_MODE_NONE, ds, &ds->ds_prev); + } + } else { + if (snapname) { +#ifdef ZFS_DEBUG + dsl_dataset_phys_t *headphys; + dmu_buf_t *headdbuf; + err = dmu_bonus_hold(mos, + ds->ds_dir->dd_phys->dd_head_dataset_obj, + FTAG, &headdbuf); + if (err == 0) { + headphys = headdbuf->db_data; + uint64_t foundobj; + err = zap_lookup(dp->dp_meta_objset, + headphys->ds_snapnames_zapobj, + snapname, sizeof (foundobj), 1, + &foundobj); + ASSERT3U(foundobj, ==, dsobj); + dmu_buf_rele(headdbuf, FTAG); + } +#endif + (void) strcat(ds->ds_snapname, snapname); + } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { + err = dsl_dataset_get_snapname(ds); + } + } + + if (err == 0) { + winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, + dsl_dataset_evict); + } + if (err || winner) { + bplist_close(&ds->ds_deadlist); + if (ds->ds_prev) { + dsl_dataset_close(ds->ds_prev, + DS_MODE_NONE, ds); + } + dsl_dir_close(ds->ds_dir, ds); + mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_deadlist.bpl_lock); + kmem_free(ds, sizeof (dsl_dataset_t)); + if (err) { + dmu_buf_rele(dbuf, tag); + return (err); + } + ds = winner; + } else { + uint64_t new = + unique_insert(ds->ds_phys->ds_fsid_guid); + if (new != ds->ds_phys->ds_fsid_guid) { + /* XXX it won't necessarily be synced... */ + ds->ds_phys->ds_fsid_guid = new; + } + } + } + ASSERT3P(ds->ds_dbuf, ==, dbuf); + ASSERT3P(ds->ds_phys, ==, dbuf->db_data); + + mutex_enter(&ds->ds_lock); + if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY && + (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) && + !DS_MODE_IS_INCONSISTENT(mode)) || + (ds->ds_open_refcount + weight > DS_REF_MAX)) { + mutex_exit(&ds->ds_lock); + dsl_dataset_close(ds, DS_MODE_NONE, tag); + return (EBUSY); + } + ds->ds_open_refcount += weight; + mutex_exit(&ds->ds_lock); + + *dsp = ds; + return (0); +} + +int +dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, + void *tag, dsl_dataset_t **dsp) +{ + dsl_dir_t *dd; + dsl_pool_t *dp; + const char *tail; + uint64_t obj; + dsl_dataset_t *ds = NULL; + int err = 0; + + err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail); + if (err) + return (err); + + dp = dd->dd_pool; + obj = dd->dd_phys->dd_head_dataset_obj; + rw_enter(&dp->dp_config_rwlock, RW_READER); + if (obj == 0) { + /* A dataset with no associated objset */ + err = ENOENT; + goto out; + } + + if (tail != NULL) { + objset_t *mos = dp->dp_meta_objset; + + err = dsl_dataset_open_obj(dp, obj, NULL, + DS_MODE_NONE, tag, &ds); + if (err) + goto out; + obj = ds->ds_phys->ds_snapnames_zapobj; + dsl_dataset_close(ds, DS_MODE_NONE, tag); + ds = NULL; + + if (tail[0] != '@') { + err = ENOENT; + goto out; + } + tail++; + + /* Look for a snapshot */ + if (!DS_MODE_IS_READONLY(mode)) { + err = EROFS; + goto out; + } + dprintf("looking for snapshot '%s'\n", tail); + err = zap_lookup(mos, obj, tail, 8, 1, &obj); + if (err) + goto out; + } + err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds); + +out: + rw_exit(&dp->dp_config_rwlock); + dsl_dir_close(dd, FTAG); + + ASSERT3U((err == 0), ==, (ds != NULL)); + /* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */ + + *dsp = ds; + return (err); +} + +int +dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp) +{ + return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp)); +} + +void +dsl_dataset_name(dsl_dataset_t *ds, char *name) +{ + if (ds == NULL) { + (void) strcpy(name, "mos"); + } else { + dsl_dir_name(ds->ds_dir, name); + VERIFY(0 == dsl_dataset_get_snapname(ds)); + if (ds->ds_snapname[0]) { + (void) strcat(name, "@"); + if (!MUTEX_HELD(&ds->ds_lock)) { + /* + * We use a "recursive" mutex so that we + * can call dprintf_ds() with ds_lock held. + */ + mutex_enter(&ds->ds_lock); + (void) strcat(name, ds->ds_snapname); + mutex_exit(&ds->ds_lock); + } else { + (void) strcat(name, ds->ds_snapname); + } + } + } +} + +void +dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag) +{ + uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)]; + mutex_enter(&ds->ds_lock); + ASSERT3U(ds->ds_open_refcount, >=, weight); + ds->ds_open_refcount -= weight; + dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n", + mode, ds->ds_open_refcount); + mutex_exit(&ds->ds_lock); + + dmu_buf_rele(ds->ds_dbuf, tag); +} + +void +dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx) +{ + objset_t *mos = dp->dp_meta_objset; + dmu_buf_t *dbuf; + dsl_dataset_phys_t *dsphys; + dsl_dataset_t *ds; + uint64_t dsobj; + dsl_dir_t *dd; + + dsl_dir_create_root(mos, ddobjp, tx); + VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd)); + + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, + DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); + VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + dsphys = dbuf->db_data; + dsphys->ds_dir_obj = dd->dd_object; + dsphys->ds_fsid_guid = unique_create(); + unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ + (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, + sizeof (dsphys->ds_guid)); + dsphys->ds_snapnames_zapobj = + zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); + dsphys->ds_creation_time = gethrestime_sec(); + dsphys->ds_creation_txg = tx->tx_txg; + dsphys->ds_deadlist_obj = + bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + dmu_buf_rele(dbuf, FTAG); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dd->dd_phys->dd_head_dataset_obj = dsobj; + dsl_dir_close(dd, FTAG); + + VERIFY(0 == + dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds)); + (void) dmu_objset_create_impl(dp->dp_spa, ds, + &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx); + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); +} + +uint64_t +dsl_dataset_create_sync(dsl_dir_t *pdd, + const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx) +{ + dsl_pool_t *dp = pdd->dd_pool; + dmu_buf_t *dbuf; + dsl_dataset_phys_t *dsphys; + uint64_t dsobj, ddobj; + objset_t *mos = dp->dp_meta_objset; + dsl_dir_t *dd; + + ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp); + ASSERT(clone_parent == NULL || + clone_parent->ds_phys->ds_num_children > 0); + ASSERT(lastname[0] != '@'); + ASSERT(dmu_tx_is_syncing(tx)); + + ddobj = dsl_dir_create_sync(pdd, lastname, tx); + VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); + + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, + DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); + VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + dsphys = dbuf->db_data; + dsphys->ds_dir_obj = dd->dd_object; + dsphys->ds_fsid_guid = unique_create(); + unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ + (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, + sizeof (dsphys->ds_guid)); + dsphys->ds_snapnames_zapobj = + zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); + dsphys->ds_creation_time = gethrestime_sec(); + dsphys->ds_creation_txg = tx->tx_txg; + dsphys->ds_deadlist_obj = + bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + if (clone_parent) { + dsphys->ds_prev_snap_obj = clone_parent->ds_object; + dsphys->ds_prev_snap_txg = + clone_parent->ds_phys->ds_creation_txg; + dsphys->ds_used_bytes = + clone_parent->ds_phys->ds_used_bytes; + dsphys->ds_compressed_bytes = + clone_parent->ds_phys->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = + clone_parent->ds_phys->ds_uncompressed_bytes; + dsphys->ds_bp = clone_parent->ds_phys->ds_bp; + + dmu_buf_will_dirty(clone_parent->ds_dbuf, tx); + clone_parent->ds_phys->ds_num_children++; + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object; + } + dmu_buf_rele(dbuf, FTAG); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dd->dd_phys->dd_head_dataset_obj = dsobj; + dsl_dir_close(dd, FTAG); + + return (dsobj); +} + +struct destroyarg { + dsl_sync_task_group_t *dstg; + char *snapname; + void *tag; + char *failed; +}; + +static int +dsl_snapshot_destroy_one(char *name, void *arg) +{ + struct destroyarg *da = arg; + dsl_dataset_t *ds; + char *cp; + int err; + + (void) strcat(name, "@"); + (void) strcat(name, da->snapname); + err = dsl_dataset_open(name, + DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, + da->tag, &ds); + cp = strchr(name, '@'); + *cp = '\0'; + if (err == ENOENT) + return (0); + if (err) { + (void) strcpy(da->failed, name); + return (err); + } + + dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, ds, da->tag, 0); + return (0); +} + +/* + * Destroy 'snapname' in all descendants of 'fsname'. + */ +#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy +int +dsl_snapshots_destroy(char *fsname, char *snapname) +{ + int err; + struct destroyarg da; + dsl_sync_task_t *dst; + spa_t *spa; + char *cp; + + cp = strchr(fsname, '/'); + if (cp) { + *cp = '\0'; + err = spa_open(fsname, &spa, FTAG); + *cp = '/'; + } else { + err = spa_open(fsname, &spa, FTAG); + } + if (err) + return (err); + da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + da.snapname = snapname; + da.tag = FTAG; + da.failed = fsname; + + err = dmu_objset_find(fsname, + dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); + + if (err == 0) + err = dsl_sync_task_group_wait(da.dstg); + + for (dst = list_head(&da.dstg->dstg_tasks); dst; + dst = list_next(&da.dstg->dstg_tasks, dst)) { + dsl_dataset_t *ds = dst->dst_arg1; + if (dst->dst_err) { + dsl_dataset_name(ds, fsname); + cp = strchr(fsname, '@'); + *cp = '\0'; + } + /* + * If it was successful, destroy_sync would have + * closed the ds + */ + if (err) + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + } + + dsl_sync_task_group_destroy(da.dstg); + spa_close(spa, FTAG); + return (err); +} + +int +dsl_dataset_destroy(const char *name) +{ + int err; + dsl_sync_task_group_t *dstg; + objset_t *os; + dsl_dataset_t *ds; + dsl_dir_t *dd; + uint64_t obj; + + if (strchr(name, '@')) { + /* Destroying a snapshot is simpler */ + err = dsl_dataset_open(name, + DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, + FTAG, &ds); + if (err) + return (err); + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_destroy_check, dsl_dataset_destroy_sync, + ds, FTAG, 0); + if (err) + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + return (err); + } + + err = dmu_objset_open(name, DMU_OST_ANY, + DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os); + if (err) + return (err); + ds = os->os->os_dsl_dataset; + dd = ds->ds_dir; + + /* + * Check for errors and mark this ds as inconsistent, in + * case we crash while freeing the objects. + */ + err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, + dsl_dataset_destroy_begin_sync, ds, NULL, 0); + if (err) { + dmu_objset_close(os); + return (err); + } + + /* + * remove the objects in open context, so that we won't + * have too much to do in syncing context. + */ + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, + ds->ds_phys->ds_prev_snap_txg)) { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END); + dmu_tx_hold_bonus(tx, obj); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + /* + * Perhaps there is not enough disk + * space. Just deal with it from + * dsl_dataset_destroy_sync(). + */ + dmu_tx_abort(tx); + continue; + } + VERIFY(0 == dmu_object_free(os, obj, tx)); + dmu_tx_commit(tx); + } + /* Make sure it's not dirty before we finish destroying it. */ + txg_wait_synced(dd->dd_pool, 0); + + dmu_objset_close(os); + if (err != ESRCH) + return (err); + + err = dsl_dataset_open(name, + DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, + FTAG, &ds); + if (err) + return (err); + + err = dsl_dir_open(name, FTAG, &dd, NULL); + if (err) { + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + return (err); + } + + /* + * Blow away the dsl_dir + head dataset. + */ + dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); + dsl_sync_task_create(dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, ds, FTAG, 0); + dsl_sync_task_create(dstg, dsl_dir_destroy_check, + dsl_dir_destroy_sync, dd, FTAG, 0); + err = dsl_sync_task_group_wait(dstg); + dsl_sync_task_group_destroy(dstg); + /* if it is successful, *destroy_sync will close the ds+dd */ + if (err) { + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + dsl_dir_close(dd, FTAG); + } + return (err); +} + +int +dsl_dataset_rollback(dsl_dataset_t *ds) +{ + ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX); + return (dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_rollback_check, dsl_dataset_rollback_sync, + ds, NULL, 0)); +} + +void * +dsl_dataset_set_user_ptr(dsl_dataset_t *ds, + void *p, dsl_dataset_evict_func_t func) +{ + void *old; + + mutex_enter(&ds->ds_lock); + old = ds->ds_user_ptr; + if (old == NULL) { + ds->ds_user_ptr = p; + ds->ds_user_evict_func = func; + } + mutex_exit(&ds->ds_lock); + return (old); +} + +void * +dsl_dataset_get_user_ptr(dsl_dataset_t *ds) +{ + return (ds->ds_user_ptr); +} + + +blkptr_t * +dsl_dataset_get_blkptr(dsl_dataset_t *ds) +{ + return (&ds->ds_phys->ds_bp); +} + +void +dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + /* If it's the meta-objset, set dp_meta_rootbp */ + if (ds == NULL) { + tx->tx_pool->dp_meta_rootbp = *bp; + } else { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_bp = *bp; + } +} + +spa_t * +dsl_dataset_get_spa(dsl_dataset_t *ds) +{ + return (ds->ds_dir->dd_pool->dp_spa); +} + +void +dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp; + + if (ds == NULL) /* this is the meta-objset */ + return; + + ASSERT(ds->ds_user_ptr != NULL); + + if (ds->ds_phys->ds_next_snap_obj != 0) + panic("dirtying snapshot!"); + + dp = ds->ds_dir->dd_pool; + + if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(ds->ds_dbuf, ds); + } +} + +struct killarg { + uint64_t *usedp; + uint64_t *compressedp; + uint64_t *uncompressedp; + zio_t *zio; + dmu_tx_t *tx; +}; + +static int +kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg) +{ + struct killarg *ka = arg; + blkptr_t *bp = &bc->bc_blkptr; + + ASSERT3U(bc->bc_errno, ==, 0); + + /* + * Since this callback is not called concurrently, no lock is + * needed on the accounting values. + */ + *ka->usedp += bp_get_dasize(spa, bp); + *ka->compressedp += BP_GET_PSIZE(bp); + *ka->uncompressedp += BP_GET_UCSIZE(bp); + /* XXX check for EIO? */ + (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL, + ARC_NOWAIT); + return (0); +} + +/* ARGSUSED */ +static int +dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + + /* + * There must be a previous snapshot. I suppose we could roll + * it back to being empty (and re-initialize the upper (ZPL) + * layer). But for now there's no way to do this via the user + * interface. + */ + if (ds->ds_phys->ds_prev_snap_txg == 0) + return (EINVAL); + + /* + * This must not be a snapshot. + */ + if (ds->ds_phys->ds_next_snap_obj != 0) + return (EINVAL); + + /* + * If we made changes this txg, traverse_dsl_dataset won't find + * them. Try again. + */ + if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) + return (EAGAIN); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + /* Zero out the deadlist. */ + bplist_close(&ds->ds_deadlist); + bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); + ds->ds_phys->ds_deadlist_obj = + bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, + ds->ds_phys->ds_deadlist_obj)); + + { + /* Free blkptrs that we gave birth to */ + zio_t *zio; + uint64_t used = 0, compressed = 0, uncompressed = 0; + struct killarg ka; + + zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED); + ka.usedp = &used; + ka.compressedp = &compressed; + ka.uncompressedp = &uncompressed; + ka.zio = zio; + ka.tx = tx; + (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, + ADVANCE_POST, kill_blkptr, &ka); + (void) zio_wait(zio); + + dsl_dir_diduse_space(ds->ds_dir, + -used, -compressed, -uncompressed, tx); + } + + /* Change our contents to that of the prev snapshot */ + ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj); + ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; + ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes; + ds->ds_phys->ds_compressed_bytes = + ds->ds_prev->ds_phys->ds_compressed_bytes; + ds->ds_phys->ds_uncompressed_bytes = + ds->ds_prev->ds_phys->ds_uncompressed_bytes; + ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags; + ds->ds_phys->ds_unique_bytes = 0; + + if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ds->ds_prev->ds_phys->ds_unique_bytes = 0; + } +} + +/* ARGSUSED */ +static int +dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + + /* + * Can't delete a head dataset if there are snapshots of it. + * (Except if the only snapshots are from the branch we cloned + * from.) + */ + if (ds->ds_prev != NULL && + ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) + return (EINVAL); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + + /* Mark it as inconsistent on-disk, in case we crash */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; +} + +/* ARGSUSED */ +static int +dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + + /* Can't delete a branch point. */ + if (ds->ds_phys->ds_num_children > 1) + return (EEXIST); + + /* + * Can't delete a head dataset if there are snapshots of it. + * (Except if the only snapshots are from the branch we cloned + * from.) + */ + if (ds->ds_prev != NULL && + ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) + return (EINVAL); + + /* + * If we made changes this txg, traverse_dsl_dataset won't find + * them. Try again. + */ + if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) + return (EAGAIN); + + /* XXX we should do some i/o error checking... */ + return (0); +} + +static void +dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t used = 0, compressed = 0, uncompressed = 0; + zio_t *zio; + int err; + int after_branch_point = FALSE; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + dsl_dataset_t *ds_prev = NULL; + uint64_t obj; + + ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX); + ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + ASSERT(ds->ds_prev == NULL || + ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); + ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + + ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); + + obj = ds->ds_object; + + if (ds->ds_phys->ds_prev_snap_obj != 0) { + if (ds->ds_prev) { + ds_prev = ds->ds_prev; + } else { + VERIFY(0 == dsl_dataset_open_obj(dp, + ds->ds_phys->ds_prev_snap_obj, NULL, + DS_MODE_NONE, FTAG, &ds_prev)); + } + after_branch_point = + (ds_prev->ds_phys->ds_next_snap_obj != obj); + + dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); + if (after_branch_point && + ds->ds_phys->ds_next_snap_obj == 0) { + /* This clone is toast. */ + ASSERT(ds_prev->ds_phys->ds_num_children > 1); + ds_prev->ds_phys->ds_num_children--; + } else if (!after_branch_point) { + ds_prev->ds_phys->ds_next_snap_obj = + ds->ds_phys->ds_next_snap_obj; + } + } + + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + + if (ds->ds_phys->ds_next_snap_obj != 0) { + blkptr_t bp; + dsl_dataset_t *ds_next; + uint64_t itor = 0; + + spa_scrub_restart(dp->dp_spa, tx->tx_txg); + + VERIFY(0 == dsl_dataset_open_obj(dp, + ds->ds_phys->ds_next_snap_obj, NULL, + DS_MODE_NONE, FTAG, &ds_next)); + ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); + + dmu_buf_will_dirty(ds_next->ds_dbuf, tx); + ds_next->ds_phys->ds_prev_snap_obj = + ds->ds_phys->ds_prev_snap_obj; + ds_next->ds_phys->ds_prev_snap_txg = + ds->ds_phys->ds_prev_snap_txg; + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, + ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); + + /* + * Transfer to our deadlist (which will become next's + * new deadlist) any entries from next's current + * deadlist which were born before prev, and free the + * other entries. + * + * XXX we're doing this long task with the config lock held + */ + while (bplist_iterate(&ds_next->ds_deadlist, &itor, + &bp) == 0) { + if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) { + VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, + &bp, tx)); + if (ds_prev && !after_branch_point && + bp.blk_birth > + ds_prev->ds_phys->ds_prev_snap_txg) { + ds_prev->ds_phys->ds_unique_bytes += + bp_get_dasize(dp->dp_spa, &bp); + } + } else { + used += bp_get_dasize(dp->dp_spa, &bp); + compressed += BP_GET_PSIZE(&bp); + uncompressed += BP_GET_UCSIZE(&bp); + /* XXX check return value? */ + (void) arc_free(zio, dp->dp_spa, tx->tx_txg, + &bp, NULL, NULL, ARC_NOWAIT); + } + } + + /* free next's deadlist */ + bplist_close(&ds_next->ds_deadlist); + bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx); + + /* set next's deadlist to our deadlist */ + ds_next->ds_phys->ds_deadlist_obj = + ds->ds_phys->ds_deadlist_obj; + VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos, + ds_next->ds_phys->ds_deadlist_obj)); + ds->ds_phys->ds_deadlist_obj = 0; + + if (ds_next->ds_phys->ds_next_snap_obj != 0) { + /* + * Update next's unique to include blocks which + * were previously shared by only this snapshot + * and it. Those blocks will be born after the + * prev snap and before this snap, and will have + * died after the next snap and before the one + * after that (ie. be on the snap after next's + * deadlist). + * + * XXX we're doing this long task with the + * config lock held + */ + dsl_dataset_t *ds_after_next; + + VERIFY(0 == dsl_dataset_open_obj(dp, + ds_next->ds_phys->ds_next_snap_obj, NULL, + DS_MODE_NONE, FTAG, &ds_after_next)); + itor = 0; + while (bplist_iterate(&ds_after_next->ds_deadlist, + &itor, &bp) == 0) { + if (bp.blk_birth > + ds->ds_phys->ds_prev_snap_txg && + bp.blk_birth <= + ds->ds_phys->ds_creation_txg) { + ds_next->ds_phys->ds_unique_bytes += + bp_get_dasize(dp->dp_spa, &bp); + } + } + + dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG); + ASSERT3P(ds_next->ds_prev, ==, NULL); + } else { + /* + * It would be nice to update the head dataset's + * unique. To do so we would have to traverse + * it for blocks born after ds_prev, which is + * pretty expensive just to maintain something + * for debugging purposes. + */ + ASSERT3P(ds_next->ds_prev, ==, ds); + dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE, + ds_next); + if (ds_prev) { + VERIFY(0 == dsl_dataset_open_obj(dp, + ds->ds_phys->ds_prev_snap_obj, NULL, + DS_MODE_NONE, ds_next, &ds_next->ds_prev)); + } else { + ds_next->ds_prev = NULL; + } + } + dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG); + + /* + * NB: unique_bytes is not accurate for head objsets + * because we don't update it when we delete the most + * recent snapshot -- see above comment. + */ + ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); + } else { + /* + * There's no next snapshot, so this is a head dataset. + * Destroy the deadlist. Unless it's a clone, the + * deadlist should be empty. (If it's a clone, it's + * safe to ignore the deadlist contents.) + */ + struct killarg ka; + + ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist)); + bplist_close(&ds->ds_deadlist); + bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); + ds->ds_phys->ds_deadlist_obj = 0; + + /* + * Free everything that we point to (that's born after + * the previous snapshot, if we are a clone) + * + * XXX we're doing this long task with the config lock held + */ + ka.usedp = &used; + ka.compressedp = &compressed; + ka.uncompressedp = &uncompressed; + ka.zio = zio; + ka.tx = tx; + err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, + ADVANCE_POST, kill_blkptr, &ka); + ASSERT3U(err, ==, 0); + } + + err = zio_wait(zio); + ASSERT3U(err, ==, 0); + + dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx); + + if (ds->ds_phys->ds_snapnames_zapobj) { + err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); + ASSERT(err == 0); + } + + if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { + /* Erase the link in the dataset */ + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; + /* + * dsl_dir_sync_destroy() called us, they'll destroy + * the dataset. + */ + } else { + /* remove from snapshot namespace */ + dsl_dataset_t *ds_head; + VERIFY(0 == dsl_dataset_open_obj(dp, + ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL, + DS_MODE_NONE, FTAG, &ds_head)); + VERIFY(0 == dsl_dataset_get_snapname(ds)); +#ifdef ZFS_DEBUG + { + uint64_t val; + err = zap_lookup(mos, + ds_head->ds_phys->ds_snapnames_zapobj, + ds->ds_snapname, 8, 1, &val); + ASSERT3U(err, ==, 0); + ASSERT3U(val, ==, obj); + } +#endif + err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj, + ds->ds_snapname, tx); + ASSERT(err == 0); + dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG); + } + + if (ds_prev && ds->ds_prev != ds_prev) + dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG); + + spa_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag); + VERIFY(0 == dmu_object_free(mos, obj, tx)); + +} + +/* ARGSUSED */ +int +dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + objset_t *os = arg1; + dsl_dataset_t *ds = os->os->os_dsl_dataset; + const char *snapname = arg2; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + int err; + uint64_t value; + + /* + * We don't allow multiple snapshots of the same txg. If there + * is already one, try again. + */ + if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) + return (EAGAIN); + + /* + * Check for conflicting name snapshot name. + */ + err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj, + snapname, 8, 1, &value); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); + + ds->ds_trysnap_txg = tx->tx_txg; + return (0); +} + +void +dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + objset_t *os = arg1; + dsl_dataset_t *ds = os->os->os_dsl_dataset; + const char *snapname = arg2; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dmu_buf_t *dbuf; + dsl_dataset_phys_t *dsphys; + uint64_t dsobj; + objset_t *mos = dp->dp_meta_objset; + int err; + + spa_scrub_restart(dp->dp_spa, tx->tx_txg); + ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); + + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, + DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); + VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + dsphys = dbuf->db_data; + dsphys->ds_dir_obj = ds->ds_dir->dd_object; + dsphys->ds_fsid_guid = unique_create(); + unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ + (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, + sizeof (dsphys->ds_guid)); + dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; + dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; + dsphys->ds_next_snap_obj = ds->ds_object; + dsphys->ds_num_children = 1; + dsphys->ds_creation_time = gethrestime_sec(); + dsphys->ds_creation_txg = tx->tx_txg; + dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; + dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; + dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; + dsphys->ds_flags = ds->ds_phys->ds_flags; + dsphys->ds_bp = ds->ds_phys->ds_bp; + dmu_buf_rele(dbuf, FTAG); + + ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); + if (ds->ds_prev) { + ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == + ds->ds_object || + ds->ds_prev->ds_phys->ds_num_children > 1); + if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, + ds->ds_prev->ds_phys->ds_creation_txg); + ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; + } + } + + bplist_close(&ds->ds_deadlist); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg); + ds->ds_phys->ds_prev_snap_obj = dsobj; + ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg; + ds->ds_phys->ds_unique_bytes = 0; + ds->ds_phys->ds_deadlist_obj = + bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, + ds->ds_phys->ds_deadlist_obj)); + + dprintf("snap '%s' -> obj %llu\n", snapname, dsobj); + err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, + snapname, 8, 1, &dsobj, tx); + ASSERT(err == 0); + + if (ds->ds_prev) + dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds); + VERIFY(0 == dsl_dataset_open_obj(dp, + ds->ds_phys->ds_prev_snap_obj, snapname, + DS_MODE_NONE, ds, &ds->ds_prev)); +} + +void +dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(ds->ds_user_ptr != NULL); + ASSERT(ds->ds_phys->ds_next_snap_obj == 0); + + dsl_dir_dirty(ds->ds_dir, tx); + dmu_objset_sync(ds->ds_user_ptr, zio, tx); + /* Unneeded? bplist_close(&ds->ds_deadlist); */ +} + +void +dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) +{ + dsl_dir_stats(ds->ds_dir, nv); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, + ds->ds_phys->ds_creation_time); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, + ds->ds_phys->ds_creation_txg); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, + ds->ds_phys->ds_used_bytes); + + if (ds->ds_phys->ds_next_snap_obj) { + /* + * This is a snapshot; override the dd's space used with + * our unique space and compression ratio. + */ + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, + ds->ds_phys->ds_unique_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, + ds->ds_phys->ds_compressed_bytes == 0 ? 100 : + (ds->ds_phys->ds_uncompressed_bytes * 100 / + ds->ds_phys->ds_compressed_bytes)); + } +} + +void +dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) +{ + stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; + stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; + if (ds->ds_phys->ds_next_snap_obj) { + stat->dds_is_snapshot = B_TRUE; + stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; + } + + /* clone origin is really a dsl_dir thing... */ + if (ds->ds_dir->dd_phys->dd_clone_parent_obj) { + dsl_dataset_t *ods; + + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool, + ds->ds_dir->dd_phys->dd_clone_parent_obj, + NULL, DS_MODE_NONE, FTAG, &ods)); + dsl_dataset_name(ods, stat->dds_clone_of); + dsl_dataset_close(ods, DS_MODE_NONE, FTAG); + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + } +} + +uint64_t +dsl_dataset_fsid_guid(dsl_dataset_t *ds) +{ + return (ds->ds_phys->ds_fsid_guid); +} + +void +dsl_dataset_space(dsl_dataset_t *ds, + uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp) +{ + *refdbytesp = ds->ds_phys->ds_used_bytes; + *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); + *usedobjsp = ds->ds_phys->ds_bp.blk_fill; + *availobjsp = DN_MAX_OBJECT - *usedobjsp; +} + +/* ARGSUSED */ +static int +dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + char *newsnapname = arg2; + dsl_dir_t *dd = ds->ds_dir; + objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_dataset_t *hds; + uint64_t val; + int err; + + err = dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds); + if (err) + return (err); + + /* new name better not be in use */ + err = zap_lookup(mos, hds->ds_phys->ds_snapnames_zapobj, + newsnapname, 8, 1, &val); + dsl_dataset_close(hds, DS_MODE_NONE, FTAG); + + if (err == 0) + err = EEXIST; + else if (err == ENOENT) + err = 0; + return (err); +} + +static void +dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + char *newsnapname = arg2; + dsl_dir_t *dd = ds->ds_dir; + objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_dataset_t *hds; + int err; + + ASSERT(ds->ds_phys->ds_next_snap_obj != 0); + + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds)); + + VERIFY(0 == dsl_dataset_get_snapname(ds)); + err = zap_remove(mos, hds->ds_phys->ds_snapnames_zapobj, + ds->ds_snapname, tx); + ASSERT3U(err, ==, 0); + mutex_enter(&ds->ds_lock); + (void) strcpy(ds->ds_snapname, newsnapname); + mutex_exit(&ds->ds_lock); + err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, + ds->ds_snapname, 8, 1, &ds->ds_object, tx); + ASSERT3U(err, ==, 0); + + dsl_dataset_close(hds, DS_MODE_NONE, FTAG); +} + +#pragma weak dmu_objset_rename = dsl_dataset_rename +int +dsl_dataset_rename(const char *oldname, const char *newname) +{ + dsl_dir_t *dd; + dsl_dataset_t *ds; + const char *tail; + int err; + + err = dsl_dir_open(oldname, FTAG, &dd, &tail); + if (err) + return (err); + if (tail == NULL) { + err = dsl_dir_rename(dd, newname); + dsl_dir_close(dd, FTAG); + return (err); + } + if (tail[0] != '@') { + /* the name ended in a nonexistant component */ + dsl_dir_close(dd, FTAG); + return (ENOENT); + } + + dsl_dir_close(dd, FTAG); + + /* new name must be snapshot in same filesystem */ + tail = strchr(newname, '@'); + if (tail == NULL) + return (EINVAL); + tail++; + if (strncmp(oldname, newname, tail - newname) != 0) + return (EXDEV); + + err = dsl_dataset_open(oldname, + DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds); + if (err) + return (err); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_snapshot_rename_check, + dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); + + dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG); + + return (err); +} + +struct promotearg { + uint64_t used, comp, uncomp, unique; + uint64_t newnext_obj, snapnames_obj; +}; + +static int +dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *hds = arg1; + struct promotearg *pa = arg2; + dsl_dir_t *dd = hds->ds_dir; + dsl_pool_t *dp = hds->ds_dir->dd_pool; + dsl_dir_t *pdd = NULL; + dsl_dataset_t *ds = NULL; + dsl_dataset_t *pivot_ds = NULL; + dsl_dataset_t *newnext_ds = NULL; + int err; + char *name = NULL; + uint64_t itor = 0; + blkptr_t bp; + + bzero(pa, sizeof (*pa)); + + /* Check that it is a clone */ + if (dd->dd_phys->dd_clone_parent_obj == 0) + return (EINVAL); + + /* Since this is so expensive, don't do the preliminary check */ + if (!dmu_tx_is_syncing(tx)) + return (0); + + if (err = dsl_dataset_open_obj(dp, + dd->dd_phys->dd_clone_parent_obj, + NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds)) + goto out; + pdd = pivot_ds->ds_dir; + + { + dsl_dataset_t *phds; + if (err = dsl_dataset_open_obj(dd->dd_pool, + pdd->dd_phys->dd_head_dataset_obj, + NULL, DS_MODE_NONE, FTAG, &phds)) + goto out; + pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj; + dsl_dataset_close(phds, DS_MODE_NONE, FTAG); + } + + if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) { + err = EXDEV; + goto out; + } + + /* find pivot point's new next ds */ + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object, + NULL, DS_MODE_NONE, FTAG, &newnext_ds)); + while (newnext_ds->ds_phys->ds_prev_snap_obj != pivot_ds->ds_object) { + dsl_dataset_t *prev; + + if (err = dsl_dataset_open_obj(dd->dd_pool, + newnext_ds->ds_phys->ds_prev_snap_obj, + NULL, DS_MODE_NONE, FTAG, &prev)) + goto out; + dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG); + newnext_ds = prev; + } + pa->newnext_obj = newnext_ds->ds_object; + + /* compute pivot point's new unique space */ + while ((err = bplist_iterate(&newnext_ds->ds_deadlist, + &itor, &bp)) == 0) { + if (bp.blk_birth > pivot_ds->ds_phys->ds_prev_snap_txg) + pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp); + } + if (err != ENOENT) + goto out; + + /* Walk the snapshots that we are moving */ + name = kmem_alloc(MAXPATHLEN, KM_SLEEP); + ds = pivot_ds; + /* CONSTCOND */ + while (TRUE) { + uint64_t val, dlused, dlcomp, dluncomp; + dsl_dataset_t *prev; + + /* Check that the snapshot name does not conflict */ + dsl_dataset_name(ds, name); + err = zap_lookup(dd->dd_pool->dp_meta_objset, + hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, + 8, 1, &val); + if (err != ENOENT) { + if (err == 0) + err = EEXIST; + goto out; + } + + /* + * compute space to transfer. Each snapshot gave birth to: + * (my used) - (prev's used) + (deadlist's used) + */ + pa->used += ds->ds_phys->ds_used_bytes; + pa->comp += ds->ds_phys->ds_compressed_bytes; + pa->uncomp += ds->ds_phys->ds_uncompressed_bytes; + + /* If we reach the first snapshot, we're done. */ + if (ds->ds_phys->ds_prev_snap_obj == 0) + break; + + if (err = bplist_space(&ds->ds_deadlist, + &dlused, &dlcomp, &dluncomp)) + goto out; + if (err = dsl_dataset_open_obj(dd->dd_pool, + ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE, + FTAG, &prev)) + goto out; + pa->used += dlused - prev->ds_phys->ds_used_bytes; + pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes; + pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes; + + /* + * We could be a clone of a clone. If we reach our + * parent's branch point, we're done. + */ + if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { + dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG); + break; + } + if (ds != pivot_ds) + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + ds = prev; + } + + /* Check that there is enough space here */ + err = dsl_dir_transfer_possible(pdd, dd, pa->used); + +out: + if (ds && ds != pivot_ds) + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + if (pivot_ds) + dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG); + if (newnext_ds) + dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG); + if (name) + kmem_free(name, MAXPATHLEN); + return (err); +} + +static void +dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *hds = arg1; + struct promotearg *pa = arg2; + dsl_dir_t *dd = hds->ds_dir; + dsl_pool_t *dp = hds->ds_dir->dd_pool; + dsl_dir_t *pdd = NULL; + dsl_dataset_t *ds, *pivot_ds; + char *name; + + ASSERT(dd->dd_phys->dd_clone_parent_obj != 0); + ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); + + VERIFY(0 == dsl_dataset_open_obj(dp, + dd->dd_phys->dd_clone_parent_obj, + NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds)); + /* + * We need to explicitly open pdd, since pivot_ds's pdd will be + * changing. + */ + VERIFY(0 == dsl_dir_open_obj(dp, pivot_ds->ds_dir->dd_object, + NULL, FTAG, &pdd)); + + /* move snapshots to this dir */ + name = kmem_alloc(MAXPATHLEN, KM_SLEEP); + ds = pivot_ds; + /* CONSTCOND */ + while (TRUE) { + dsl_dataset_t *prev; + + /* move snap name entry */ + dsl_dataset_name(ds, name); + VERIFY(0 == zap_remove(dp->dp_meta_objset, + pa->snapnames_obj, ds->ds_snapname, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, + hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, + 8, 1, &ds->ds_object, tx)); + + /* change containing dsl_dir */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ASSERT3U(ds->ds_phys->ds_dir_obj, ==, pdd->dd_object); + ds->ds_phys->ds_dir_obj = dd->dd_object; + ASSERT3P(ds->ds_dir, ==, pdd); + dsl_dir_close(ds->ds_dir, ds); + VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, + NULL, ds, &ds->ds_dir)); + + ASSERT3U(dsl_prop_numcb(ds), ==, 0); + + if (ds->ds_phys->ds_prev_snap_obj == 0) + break; + + VERIFY(0 == dsl_dataset_open_obj(dp, + ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE, + FTAG, &prev)); + + if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { + dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG); + break; + } + if (ds != pivot_ds) + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + ds = prev; + } + if (ds != pivot_ds) + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + + /* change pivot point's next snap */ + dmu_buf_will_dirty(pivot_ds->ds_dbuf, tx); + pivot_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj; + + /* change clone_parent-age */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + ASSERT3U(dd->dd_phys->dd_clone_parent_obj, ==, pivot_ds->ds_object); + dd->dd_phys->dd_clone_parent_obj = pdd->dd_phys->dd_clone_parent_obj; + dmu_buf_will_dirty(pdd->dd_dbuf, tx); + pdd->dd_phys->dd_clone_parent_obj = pivot_ds->ds_object; + + /* change space accounting */ + dsl_dir_diduse_space(pdd, -pa->used, -pa->comp, -pa->uncomp, tx); + dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx); + pivot_ds->ds_phys->ds_unique_bytes = pa->unique; + + dsl_dir_close(pdd, FTAG); + dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG); + kmem_free(name, MAXPATHLEN); +} + +int +dsl_dataset_promote(const char *name) +{ + dsl_dataset_t *ds; + int err; + dmu_object_info_t doi; + struct promotearg pa; + + err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds); + if (err) + return (err); + + err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, &doi); + if (err) { + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + return (err); + } + + /* + * Add in 128x the snapnames zapobj size, since we will be moving + * a bunch of snapnames to the promoted ds, and dirtying their + * bonus buffers. + */ + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_promote_check, + dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks); + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + return (err); +} + +/* + * Given a pool name and a dataset object number in that pool, + * return the name of that dataset. + */ +int +dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) +{ + spa_t *spa; + dsl_pool_t *dp; + dsl_dataset_t *ds = NULL; + int error; + + if ((error = spa_open(pname, &spa, FTAG)) != 0) + return (error); + dp = spa_get_dsl(spa); + rw_enter(&dp->dp_config_rwlock, RW_READER); + if ((error = dsl_dataset_open_obj(dp, obj, + NULL, DS_MODE_NONE, FTAG, &ds)) != 0) { + rw_exit(&dp->dp_config_rwlock); + spa_close(spa, FTAG); + return (error); + } + dsl_dataset_name(ds, buf); + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + rw_exit(&dp->dp_config_rwlock); + spa_close(spa, FTAG); + + return (0); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c new file mode 100644 index 0000000..97779a2 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c @@ -0,0 +1,1192 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/spa.h> +#include <sys/zap.h> +#include <sys/zio.h> +#include <sys/arc.h> +#include "zfs_namecheck.h" + +static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd); +static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx); + + +/* ARGSUSED */ +static void +dsl_dir_evict(dmu_buf_t *db, void *arg) +{ + dsl_dir_t *dd = arg; + dsl_pool_t *dp = dd->dd_pool; + int t; + + for (t = 0; t < TXG_SIZE; t++) { + ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); + ASSERT(dd->dd_tempreserved[t] == 0); + ASSERT(dd->dd_space_towrite[t] == 0); + } + + ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes); + + if (dd->dd_parent) + dsl_dir_close(dd->dd_parent, dd); + + spa_close(dd->dd_pool->dp_spa, dd); + + /* + * The props callback list should be empty since they hold the + * dir open. + */ + list_destroy(&dd->dd_prop_cbs); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); +} + +int +dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, + const char *tail, void *tag, dsl_dir_t **ddp) +{ + dmu_buf_t *dbuf; + dsl_dir_t *dd; + int err; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || + dsl_pool_sync_context(dp)); + + err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); + if (err) + return (err); + dd = dmu_buf_get_user(dbuf); +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(dbuf, &doi); + ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR); + } +#endif + /* XXX assert bonus buffer size is correct */ + if (dd == NULL) { + dsl_dir_t *winner; + int err; + + dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); + dd->dd_object = ddobj; + dd->dd_dbuf = dbuf; + dd->dd_pool = dp; + dd->dd_phys = dbuf->db_data; + dd->dd_used_bytes = dd->dd_phys->dd_used_bytes; + mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); + + list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), + offsetof(dsl_prop_cb_record_t, cbr_node)); + + if (dd->dd_phys->dd_parent_obj) { + err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, + NULL, dd, &dd->dd_parent); + if (err) { + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } + if (tail) { +#ifdef ZFS_DEBUG + uint64_t foundobj; + + err = zap_lookup(dp->dp_meta_objset, + dd->dd_parent->dd_phys-> + dd_child_dir_zapobj, + tail, sizeof (foundobj), 1, &foundobj); + ASSERT(err || foundobj == ddobj); +#endif + (void) strcpy(dd->dd_myname, tail); + } else { + err = zap_value_search(dp->dp_meta_objset, + dd->dd_parent->dd_phys-> + dd_child_dir_zapobj, + ddobj, dd->dd_myname); + } + if (err) { + dsl_dir_close(dd->dd_parent, dd); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } + } else { + (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); + } + + winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, + dsl_dir_evict); + if (winner) { + if (dd->dd_parent) + dsl_dir_close(dd->dd_parent, dd); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); + dd = winner; + } else { + spa_open_ref(dp->dp_spa, dd); + } + } + + /* + * The dsl_dir_t has both open-to-close and instantiate-to-evict + * holds on the spa. We need the open-to-close holds because + * otherwise the spa_refcnt wouldn't change when we open a + * dir which the spa also has open, so we could incorrectly + * think it was OK to unload/export/destroy the pool. We need + * the instantiate-to-evict hold because the dsl_dir_t has a + * pointer to the dd_pool, which has a pointer to the spa_t. + */ + spa_open_ref(dp->dp_spa, tag); + ASSERT3P(dd->dd_pool, ==, dp); + ASSERT3U(dd->dd_object, ==, ddobj); + ASSERT3P(dd->dd_dbuf, ==, dbuf); + *ddp = dd; + return (0); +} + +void +dsl_dir_close(dsl_dir_t *dd, void *tag) +{ + dprintf_dd(dd, "%s\n", ""); + spa_close(dd->dd_pool->dp_spa, tag); + dmu_buf_rele(dd->dd_dbuf, tag); +} + +/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ +void +dsl_dir_name(dsl_dir_t *dd, char *buf) +{ + if (dd->dd_parent) { + dsl_dir_name(dd->dd_parent, buf); + (void) strcat(buf, "/"); + } else { + buf[0] = '\0'; + } + if (!MUTEX_HELD(&dd->dd_lock)) { + /* + * recursive mutex so that we can use + * dprintf_dd() with dd_lock held + */ + mutex_enter(&dd->dd_lock); + (void) strcat(buf, dd->dd_myname); + mutex_exit(&dd->dd_lock); + } else { + (void) strcat(buf, dd->dd_myname); + } +} + +int +dsl_dir_is_private(dsl_dir_t *dd) +{ + int rv = FALSE; + + if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent)) + rv = TRUE; + if (dataset_name_hidden(dd->dd_myname)) + rv = TRUE; + return (rv); +} + + +static int +getcomponent(const char *path, char *component, const char **nextp) +{ + char *p; + if (path == NULL) + return (ENOENT); + /* This would be a good place to reserve some namespace... */ + p = strpbrk(path, "/@"); + if (p && (p[1] == '/' || p[1] == '@')) { + /* two separators in a row */ + return (EINVAL); + } + if (p == NULL || p == path) { + /* + * if the first thing is an @ or /, it had better be an + * @ and it had better not have any more ats or slashes, + * and it had better have something after the @. + */ + if (p != NULL && + (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) + return (EINVAL); + if (strlen(path) >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strcpy(component, path); + p = NULL; + } else if (p[0] == '/') { + if (p-path >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strncpy(component, path, p - path); + component[p-path] = '\0'; + p++; + } else if (p[0] == '@') { + /* + * if the next separator is an @, there better not be + * any more slashes. + */ + if (strchr(path, '/')) + return (EINVAL); + if (p-path >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strncpy(component, path, p - path); + component[p-path] = '\0'; + } else { + ASSERT(!"invalid p"); + } + *nextp = p; + return (0); +} + +/* + * same as dsl_open_dir, ignore the first component of name and use the + * spa instead + */ +int +dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, + dsl_dir_t **ddp, const char **tailp) +{ + char buf[MAXNAMELEN]; + const char *next, *nextnext = NULL; + int err; + dsl_dir_t *dd; + dsl_pool_t *dp; + uint64_t ddobj; + int openedspa = FALSE; + + dprintf("%s\n", name); + + err = getcomponent(name, buf, &next); + if (err) + return (err); + if (spa == NULL) { + err = spa_open(buf, &spa, FTAG); + if (err) { + dprintf("spa_open(%s) failed\n", buf); + return (err); + } + openedspa = TRUE; + + /* XXX this assertion belongs in spa_open */ + ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa))); + } + + dp = spa_get_dsl(spa); + + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); + if (err) { + rw_exit(&dp->dp_config_rwlock); + if (openedspa) + spa_close(spa, FTAG); + return (err); + } + + while (next != NULL) { + dsl_dir_t *child_ds; + err = getcomponent(next, buf, &nextnext); + if (err) + break; + ASSERT(next[0] != '\0'); + if (next[0] == '@') + break; + dprintf("looking up %s in obj%lld\n", + buf, dd->dd_phys->dd_child_dir_zapobj); + + err = zap_lookup(dp->dp_meta_objset, + dd->dd_phys->dd_child_dir_zapobj, + buf, sizeof (ddobj), 1, &ddobj); + if (err) { + if (err == ENOENT) + err = 0; + break; + } + + err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds); + if (err) + break; + dsl_dir_close(dd, tag); + dd = child_ds; + next = nextnext; + } + rw_exit(&dp->dp_config_rwlock); + + if (err) { + dsl_dir_close(dd, tag); + if (openedspa) + spa_close(spa, FTAG); + return (err); + } + + /* + * It's an error if there's more than one component left, or + * tailp==NULL and there's any component left. + */ + if (next != NULL && + (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { + /* bad path name */ + dsl_dir_close(dd, tag); + dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); + err = ENOENT; + } + if (tailp) + *tailp = next; + if (openedspa) + spa_close(spa, FTAG); + *ddp = dd; + return (err); +} + +/* + * Return the dsl_dir_t, and possibly the last component which couldn't + * be found in *tail. Return NULL if the path is bogus, or if + * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' + * means that the last component is a snapshot. + */ +int +dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) +{ + return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp)); +} + +uint64_t +dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx) +{ + objset_t *mos = pds->dd_pool->dp_meta_objset; + uint64_t ddobj; + dsl_dir_phys_t *dsphys; + dmu_buf_t *dbuf; + + ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, + DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); + VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, + name, sizeof (uint64_t), 1, &ddobj, tx)); + VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + dsphys = dbuf->db_data; + + dsphys->dd_creation_time = gethrestime_sec(); + dsphys->dd_parent_obj = pds->dd_object; + dsphys->dd_props_zapobj = zap_create(mos, + DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); + dsphys->dd_child_dir_zapobj = zap_create(mos, + DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); + dmu_buf_rele(dbuf, FTAG); + + return (ddobj); +} + +/* ARGSUSED */ +int +dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + dsl_pool_t *dp = dd->dd_pool; + objset_t *mos = dp->dp_meta_objset; + int err; + uint64_t count; + + /* + * There should be exactly two holds, both from + * dsl_dataset_destroy: one on the dd directory, and one on its + * head ds. Otherwise, someone is trying to lookup something + * inside this dir while we want to destroy it. The + * config_rwlock ensures that nobody else opens it after we + * check. + */ + if (dmu_buf_refcount(dd->dd_dbuf) > 2) + return (EBUSY); + + err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count); + if (err) + return (err); + if (count != 0) + return (EEXIST); + + return (0); +} + +void +dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t val, obj; + + ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); + ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); + + /* Remove our reservation. */ + val = 0; + dsl_dir_set_reservation_sync(dd, &val, tx); + ASSERT3U(dd->dd_used_bytes, ==, 0); + ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); + + VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); + VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); + VERIFY(0 == zap_remove(mos, + dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); + + obj = dd->dd_object; + dsl_dir_close(dd, tag); + VERIFY(0 == dmu_object_free(mos, obj, tx)); +} + +void +dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx) +{ + dsl_dir_phys_t *dsp; + dmu_buf_t *dbuf; + int error; + + *ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, + DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); + + error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, + sizeof (uint64_t), 1, ddobjp, tx); + ASSERT3U(error, ==, 0); + + VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + dsp = dbuf->db_data; + + dsp->dd_creation_time = gethrestime_sec(); + dsp->dd_props_zapobj = zap_create(mos, + DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); + dsp->dd_child_dir_zapobj = zap_create(mos, + DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); + + dmu_buf_rele(dbuf, FTAG); +} + +void +dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) +{ + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, + dsl_dir_space_available(dd, NULL, 0, TRUE)); + + mutex_enter(&dd->dd_lock); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_used_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, + dd->dd_phys->dd_quota); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, + dd->dd_phys->dd_reserved); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, + dd->dd_phys->dd_compressed_bytes == 0 ? 100 : + (dd->dd_phys->dd_uncompressed_bytes * 100 / + dd->dd_phys->dd_compressed_bytes)); + mutex_exit(&dd->dd_lock); + + if (dd->dd_phys->dd_clone_parent_obj) { + dsl_dataset_t *ds; + char buf[MAXNAMELEN]; + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_clone_parent_obj, + NULL, DS_MODE_NONE, FTAG, &ds)); + dsl_dataset_name(ds, buf); + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + rw_exit(&dd->dd_pool->dp_config_rwlock); + + dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); + } +} + +void +dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dd->dd_pool; + + ASSERT(dd->dd_phys); + + if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(dd->dd_dbuf, dd); + } +} + +static int64_t +parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) +{ + uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved); + uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved); + return (new_accounted - old_accounted); +} + +void +dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + mutex_enter(&dd->dd_lock); + ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0); + dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, + dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); + dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; + dd->dd_phys->dd_used_bytes = dd->dd_used_bytes; + mutex_exit(&dd->dd_lock); + + /* release the hold from dsl_dir_dirty */ + dmu_buf_rele(dd->dd_dbuf, dd); +} + +static uint64_t +dsl_dir_estimated_space(dsl_dir_t *dd) +{ + int64_t space; + int i; + + ASSERT(MUTEX_HELD(&dd->dd_lock)); + + space = dd->dd_phys->dd_used_bytes; + ASSERT(space >= 0); + for (i = 0; i < TXG_SIZE; i++) { + space += dd->dd_space_towrite[i&TXG_MASK]; + ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); + } + return (space); +} + +/* + * How much space would dd have available if ancestor had delta applied + * to it? If ondiskonly is set, we're only interested in what's + * on-disk, not estimated pending changes. + */ +uint64_t +dsl_dir_space_available(dsl_dir_t *dd, + dsl_dir_t *ancestor, int64_t delta, int ondiskonly) +{ + uint64_t parentspace, myspace, quota, used; + + /* + * If there are no restrictions otherwise, assume we have + * unlimited space available. + */ + quota = UINT64_MAX; + parentspace = UINT64_MAX; + + if (dd->dd_parent != NULL) { + parentspace = dsl_dir_space_available(dd->dd_parent, + ancestor, delta, ondiskonly); + } + + mutex_enter(&dd->dd_lock); + if (dd->dd_phys->dd_quota != 0) + quota = dd->dd_phys->dd_quota; + if (ondiskonly) { + used = dd->dd_used_bytes; + } else { + used = dsl_dir_estimated_space(dd); + } + if (dd == ancestor) + used += delta; + + if (dd->dd_parent == NULL) { + uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); + quota = MIN(quota, poolsize); + } + + if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) { + /* + * We have some space reserved, in addition to what our + * parent gave us. + */ + parentspace += dd->dd_phys->dd_reserved - used; + } + + if (used > quota) { + /* over quota */ + myspace = 0; + + /* + * While it's OK to be a little over quota, if + * we think we are using more space than there + * is in the pool (which is already 1.6% more than + * dsl_pool_adjustedsize()), something is very + * wrong. + */ + ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa)); + } else { + /* + * the lesser of the space provided by our parent and + * the space left in our quota + */ + myspace = MIN(parentspace, quota - used); + } + + mutex_exit(&dd->dd_lock); + + return (myspace); +} + +struct tempreserve { + list_node_t tr_node; + dsl_dir_t *tr_ds; + uint64_t tr_size; +}; + +/* + * Reserve space in this dsl_dir, to be used in this tx's txg. + * After the space has been dirtied (and thus + * dsl_dir_willuse_space() has been called), the reservation should + * be canceled, using dsl_dir_tempreserve_clear(). + */ +static int +dsl_dir_tempreserve_impl(dsl_dir_t *dd, + uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx) +{ + uint64_t txg = tx->tx_txg; + uint64_t est_used, quota, parent_rsrv; + int edquot = EDQUOT; + int txgidx = txg & TXG_MASK; + int i; + struct tempreserve *tr; + + ASSERT3U(txg, !=, 0); + ASSERT3S(asize, >=, 0); + + mutex_enter(&dd->dd_lock); + /* + * Check against the dsl_dir's quota. We don't add in the delta + * when checking for over-quota because they get one free hit. + */ + est_used = dsl_dir_estimated_space(dd); + for (i = 0; i < TXG_SIZE; i++) + est_used += dd->dd_tempreserved[i]; + + quota = UINT64_MAX; + + if (dd->dd_phys->dd_quota) + quota = dd->dd_phys->dd_quota; + + /* + * If this transaction will result in a net free of space, we want + * to let it through, but we have to be careful: the space that it + * frees won't become available until *after* this txg syncs. + * Therefore, to ensure that it's possible to remove files from + * a full pool without inducing transient overcommits, we throttle + * netfree transactions against a quota that is slightly larger, + * but still within the pool's allocation slop. In cases where + * we're very close to full, this will allow a steady trickle of + * removes to get through. + */ + if (dd->dd_parent == NULL) { + uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); + if (poolsize < quota) { + quota = poolsize; + edquot = ENOSPC; + } + } else if (netfree) { + quota = UINT64_MAX; + } + + /* + * If they are requesting more space, and our current estimate + * is over quota. They get to try again unless the actual + * on-disk is over quota and there are no pending changes (which + * may free up space for us). + */ + if (asize > 0 && est_used > quota) { + if (dd->dd_space_towrite[txg & TXG_MASK] != 0 || + dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 || + dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 || + dd->dd_used_bytes < quota) + edquot = ERESTART; + dprintf_dd(dd, "failing: used=%lluK est_used = %lluK " + "quota=%lluK tr=%lluK err=%d\n", + dd->dd_used_bytes>>10, est_used>>10, + quota>>10, asize>>10, edquot); + mutex_exit(&dd->dd_lock); + return (edquot); + } + + /* We need to up our estimated delta before dropping dd_lock */ + dd->dd_tempreserved[txgidx] += asize; + + parent_rsrv = parent_delta(dd, est_used, asize); + mutex_exit(&dd->dd_lock); + + tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_ds = dd; + tr->tr_size = asize; + list_insert_tail(tr_list, tr); + + /* see if it's OK with our parent */ + if (dd->dd_parent && parent_rsrv) { + return (dsl_dir_tempreserve_impl(dd->dd_parent, + parent_rsrv, netfree, tr_list, tx)); + } else { + return (0); + } +} + +/* + * Reserve space in this dsl_dir, to be used in this tx's txg. + * After the space has been dirtied (and thus + * dsl_dir_willuse_space() has been called), the reservation should + * be canceled, using dsl_dir_tempreserve_clear(). + */ +int +dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, + uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx) +{ + int err = 0; + list_t *tr_list; + + tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(tr_list, sizeof (struct tempreserve), + offsetof(struct tempreserve, tr_node)); + ASSERT3S(asize, >=, 0); + ASSERT3S(fsize, >=, 0); + + err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, + tr_list, tx); + + if (err == 0) { + struct tempreserve *tr; + + err = arc_tempreserve_space(lsize); + if (err == 0) { + tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_ds = NULL; + tr->tr_size = lsize; + list_insert_tail(tr_list, tr); + } + } + + if (err) + dsl_dir_tempreserve_clear(tr_list, tx); + else + *tr_cookiep = tr_list; + return (err); +} + +/* + * Clear a temporary reservation that we previously made with + * dsl_dir_tempreserve_space(). + */ +void +dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) +{ + int txgidx = tx->tx_txg & TXG_MASK; + list_t *tr_list = tr_cookie; + struct tempreserve *tr; + + ASSERT3U(tx->tx_txg, !=, 0); + + while (tr = list_head(tr_list)) { + if (tr->tr_ds == NULL) { + arc_tempreserve_clear(tr->tr_size); + } else { + mutex_enter(&tr->tr_ds->dd_lock); + ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, + tr->tr_size); + tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; + mutex_exit(&tr->tr_ds->dd_lock); + } + list_remove(tr_list, tr); + kmem_free(tr, sizeof (struct tempreserve)); + } + + kmem_free(tr_list, sizeof (list_t)); +} + +/* + * Call in open context when we think we're going to write/free space, + * eg. when dirtying data. Be conservative (ie. OK to write less than + * this or free more than this, but don't write more or free less). + */ +void +dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) +{ + int64_t parent_space; + uint64_t est_used; + + mutex_enter(&dd->dd_lock); + if (space > 0) + dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; + + est_used = dsl_dir_estimated_space(dd); + parent_space = parent_delta(dd, est_used, space); + mutex_exit(&dd->dd_lock); + + /* Make sure that we clean up dd_space_to* */ + dsl_dir_dirty(dd, tx); + + /* XXX this is potentially expensive and unnecessary... */ + if (parent_space && dd->dd_parent) + dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); +} + +/* call from syncing context when we actually write/free space for this dd */ +void +dsl_dir_diduse_space(dsl_dir_t *dd, + int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) +{ + int64_t accounted_delta; + + ASSERT(dmu_tx_is_syncing(tx)); + + dsl_dir_dirty(dd, tx); + + mutex_enter(&dd->dd_lock); + accounted_delta = parent_delta(dd, dd->dd_used_bytes, used); + ASSERT(used >= 0 || dd->dd_used_bytes >= -used); + ASSERT(compressed >= 0 || + dd->dd_phys->dd_compressed_bytes >= -compressed); + ASSERT(uncompressed >= 0 || + dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); + dd->dd_used_bytes += used; + dd->dd_phys->dd_uncompressed_bytes += uncompressed; + dd->dd_phys->dd_compressed_bytes += compressed; + mutex_exit(&dd->dd_lock); + + if (dd->dd_parent != NULL) { + dsl_dir_diduse_space(dd->dd_parent, + accounted_delta, compressed, uncompressed, tx); + } +} + +/* ARGSUSED */ +static int +dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + uint64_t *quotap = arg2; + uint64_t new_quota = *quotap; + int err = 0; + uint64_t towrite; + + if (new_quota == 0) + return (0); + + mutex_enter(&dd->dd_lock); + /* + * If we are doing the preliminary check in open context, and + * there are pending changes, then don't fail it, since the + * pending changes could under-estimat the amount of space to be + * freed up. + */ + towrite = dd->dd_space_towrite[0] + dd->dd_space_towrite[1] + + dd->dd_space_towrite[2] + dd->dd_space_towrite[3]; + if ((dmu_tx_is_syncing(tx) || towrite == 0) && + (new_quota < dd->dd_phys->dd_reserved || + new_quota < dsl_dir_estimated_space(dd))) { + err = ENOSPC; + } + mutex_exit(&dd->dd_lock); + return (err); +} + +static void +dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + uint64_t *quotap = arg2; + uint64_t new_quota = *quotap; + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + mutex_enter(&dd->dd_lock); + dd->dd_phys->dd_quota = new_quota; + mutex_exit(&dd->dd_lock); +} + +int +dsl_dir_set_quota(const char *ddname, uint64_t quota) +{ + dsl_dir_t *dd; + int err; + + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) + return (err); + /* + * If someone removes a file, then tries to set the quota, we + * want to make sure the file freeing takes effect. + */ + txg_wait_open(dd->dd_pool, 0); + + err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, + dsl_dir_set_quota_sync, dd, "a, 0); + dsl_dir_close(dd, FTAG); + return (err); +} + +/* ARGSUSED */ +static int +dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + uint64_t *reservationp = arg2; + uint64_t new_reservation = *reservationp; + uint64_t used, avail; + int64_t delta; + + if (new_reservation > INT64_MAX) + return (EOVERFLOW); + + /* + * If we are doing the preliminary check in open context, the + * space estimates may be inaccurate. + */ + if (!dmu_tx_is_syncing(tx)) + return (0); + + mutex_enter(&dd->dd_lock); + used = dd->dd_used_bytes; + delta = MAX(used, new_reservation) - + MAX(used, dd->dd_phys->dd_reserved); + mutex_exit(&dd->dd_lock); + + if (dd->dd_parent) { + avail = dsl_dir_space_available(dd->dd_parent, + NULL, 0, FALSE); + } else { + avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; + } + + if (delta > 0 && delta > avail) + return (ENOSPC); + if (delta > 0 && dd->dd_phys->dd_quota > 0 && + new_reservation > dd->dd_phys->dd_quota) + return (ENOSPC); + return (0); +} + +static void +dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + uint64_t *reservationp = arg2; + uint64_t new_reservation = *reservationp; + uint64_t used; + int64_t delta; + + mutex_enter(&dd->dd_lock); + used = dd->dd_used_bytes; + delta = MAX(used, new_reservation) - + MAX(used, dd->dd_phys->dd_reserved); + mutex_exit(&dd->dd_lock); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dd->dd_phys->dd_reserved = new_reservation; + + if (dd->dd_parent != NULL) { + /* Roll up this additional usage into our ancestors */ + dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx); + } +} + +int +dsl_dir_set_reservation(const char *ddname, uint64_t reservation) +{ + dsl_dir_t *dd; + int err; + + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) + return (err); + err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check, + dsl_dir_set_reservation_sync, dd, &reservation, 0); + dsl_dir_close(dd, FTAG); + return (err); +} + +static dsl_dir_t * +closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) +{ + for (; ds1; ds1 = ds1->dd_parent) { + dsl_dir_t *dd; + for (dd = ds2; dd; dd = dd->dd_parent) { + if (ds1 == dd) + return (dd); + } + } + return (NULL); +} + +/* + * If delta is applied to dd, how much of that delta would be applied to + * ancestor? Syncing context only. + */ +static int64_t +would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) +{ + if (dd == ancestor) + return (delta); + + mutex_enter(&dd->dd_lock); + delta = parent_delta(dd, dd->dd_used_bytes, delta); + mutex_exit(&dd->dd_lock); + return (would_change(dd->dd_parent, delta, ancestor)); +} + +struct renamearg { + dsl_dir_t *newparent; + const char *mynewname; +}; + +/* ARGSUSED */ +static int +dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct renamearg *ra = arg2; + dsl_pool_t *dp = dd->dd_pool; + objset_t *mos = dp->dp_meta_objset; + int err; + uint64_t val; + + /* There should be 2 references: the open and the dirty */ + if (dmu_buf_refcount(dd->dd_dbuf) > 2) + return (EBUSY); + + /* check for existing name */ + err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, + ra->mynewname, 8, 1, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); + + if (ra->newparent != dd->dd_parent) { + /* is there enough space? */ + uint64_t myspace = + MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved); + + /* no rename into our descendant */ + if (closest_common_ancestor(dd, ra->newparent) == dd) + return (EINVAL); + + if (err = dsl_dir_transfer_possible(dd->dd_parent, + ra->newparent, myspace)) + return (err); + } + + return (0); +} + +static void +dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct renamearg *ra = arg2; + dsl_pool_t *dp = dd->dd_pool; + objset_t *mos = dp->dp_meta_objset; + int err; + + ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2); + + if (ra->newparent != dd->dd_parent) { + uint64_t myspace = + MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved); + + dsl_dir_diduse_space(dd->dd_parent, -myspace, + -dd->dd_phys->dd_compressed_bytes, + -dd->dd_phys->dd_uncompressed_bytes, tx); + dsl_dir_diduse_space(ra->newparent, myspace, + dd->dd_phys->dd_compressed_bytes, + dd->dd_phys->dd_uncompressed_bytes, tx); + } + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + /* remove from old parent zapobj */ + err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, + dd->dd_myname, tx); + ASSERT3U(err, ==, 0); + + (void) strcpy(dd->dd_myname, ra->mynewname); + dsl_dir_close(dd->dd_parent, dd); + dd->dd_phys->dd_parent_obj = ra->newparent->dd_object; + VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, + ra->newparent->dd_object, NULL, dd, &dd->dd_parent)); + + /* add to new parent zapobj */ + err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, + dd->dd_myname, 8, 1, &dd->dd_object, tx); + ASSERT3U(err, ==, 0); +} + +int +dsl_dir_rename(dsl_dir_t *dd, const char *newname) +{ + struct renamearg ra; + int err; + + /* new parent should exist */ + err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname); + if (err) + return (err); + + /* can't rename to different pool */ + if (dd->dd_pool != ra.newparent->dd_pool) { + err = ENXIO; + goto out; + } + + /* new name should not already exist */ + if (ra.mynewname == NULL) { + err = EEXIST; + goto out; + } + + + err = dsl_sync_task_do(dd->dd_pool, + dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3); + +out: + dsl_dir_close(ra.newparent, FTAG); + return (err); +} + +int +dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) +{ + dsl_dir_t *ancestor; + int64_t adelta; + uint64_t avail; + + ancestor = closest_common_ancestor(sdd, tdd); + adelta = would_change(sdd, -space, ancestor); + avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); + if (avail < space) + return (ENOSPC); + + return (0); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c new file mode 100644 index 0000000..7046254 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c @@ -0,0 +1,255 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dsl_pool.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_synctask.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/arc.h> +#include <sys/zap.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/fs/zfs.h> + +static int +dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp) +{ + uint64_t obj; + int err; + + err = zap_lookup(dp->dp_meta_objset, + dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, + MOS_DIR_NAME, sizeof (obj), 1, &obj); + if (err) + return (err); + + return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp)); +} + +static dsl_pool_t * +dsl_pool_open_impl(spa_t *spa, uint64_t txg) +{ + dsl_pool_t *dp; + blkptr_t *bp = spa_get_rootblkptr(spa); + + dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); + dp->dp_spa = spa; + dp->dp_meta_rootbp = *bp; + rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); + txg_init(dp, txg); + + txg_list_create(&dp->dp_dirty_datasets, + offsetof(dsl_dataset_t, ds_dirty_link)); + txg_list_create(&dp->dp_dirty_dirs, + offsetof(dsl_dir_t, dd_dirty_link)); + txg_list_create(&dp->dp_sync_tasks, + offsetof(dsl_sync_task_group_t, dstg_node)); + list_create(&dp->dp_synced_objsets, sizeof (dsl_dataset_t), + offsetof(dsl_dataset_t, ds_synced_link)); + + return (dp); +} + +int +dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) +{ + int err; + dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); + objset_impl_t *osi; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi); + if (err) + goto out; + dp->dp_meta_objset = &osi->os; + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, + &dp->dp_root_dir_obj); + if (err) + goto out; + + err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, + NULL, dp, &dp->dp_root_dir); + if (err) + goto out; + + err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir); + if (err) + goto out; + +out: + rw_exit(&dp->dp_config_rwlock); + if (err) + dsl_pool_close(dp); + else + *dpp = dp; + + return (err); +} + +void +dsl_pool_close(dsl_pool_t *dp) +{ + /* drop our reference from dsl_pool_open() */ + if (dp->dp_mos_dir) + dsl_dir_close(dp->dp_mos_dir, dp); + if (dp->dp_root_dir) + dsl_dir_close(dp->dp_root_dir, dp); + + /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ + if (dp->dp_meta_objset) + dmu_objset_evict(NULL, dp->dp_meta_objset->os); + + txg_list_destroy(&dp->dp_dirty_datasets); + txg_list_destroy(&dp->dp_dirty_dirs); + list_destroy(&dp->dp_synced_objsets); + + arc_flush(); + txg_fini(dp); + rw_destroy(&dp->dp_config_rwlock); + kmem_free(dp, sizeof (dsl_pool_t)); +} + +dsl_pool_t * +dsl_pool_create(spa_t *spa, uint64_t txg) +{ + int err; + dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); + dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); + dp->dp_meta_objset = &dmu_objset_create_impl(spa, + NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os; + + /* create the pool directory */ + err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); + ASSERT3U(err, ==, 0); + + /* create and open the root dir */ + dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx); + VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, + NULL, dp, &dp->dp_root_dir)); + + /* create and open the meta-objset dir */ + (void) dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx); + VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir)); + + dmu_tx_commit(tx); + + return (dp); +} + +void +dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) +{ + zio_t *zio; + dmu_tx_t *tx; + dsl_dir_t *dd; + dsl_dataset_t *ds; + dsl_sync_task_group_t *dstg; + objset_impl_t *mosi = dp->dp_meta_objset->os; + int err; + + tx = dmu_tx_create_assigned(dp, txg); + + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { + if (!list_link_active(&ds->ds_synced_link)) + list_insert_tail(&dp->dp_synced_objsets, ds); + else + dmu_buf_rele(ds->ds_dbuf, ds); + dsl_dataset_sync(ds, zio, tx); + } + err = zio_wait(zio); + ASSERT(err == 0); + + while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) + dsl_sync_task_group_sync(dstg, tx); + while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) + dsl_dir_sync(dd, tx); + + if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL || + list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) { + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + dmu_objset_sync(mosi, zio, tx); + err = zio_wait(zio); + ASSERT(err == 0); + dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); + spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); + } + + dmu_tx_commit(tx); +} + +void +dsl_pool_zil_clean(dsl_pool_t *dp) +{ + dsl_dataset_t *ds; + + while (ds = list_head(&dp->dp_synced_objsets)) { + list_remove(&dp->dp_synced_objsets, ds); + ASSERT(ds->ds_user_ptr != NULL); + zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil); + dmu_buf_rele(ds->ds_dbuf, ds); + } +} + +/* + * TRUE if the current thread is the tx_sync_thread or if we + * are being called from SPA context during pool initialization. + */ +int +dsl_pool_sync_context(dsl_pool_t *dp) +{ + return (curthread == dp->dp_tx.tx_sync_thread || + spa_get_dsl(dp->dp_spa) == NULL); +} + +uint64_t +dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) +{ + uint64_t space, resv; + + /* + * Reserve about 1.6% (1/64), or at least 32MB, for allocation + * efficiency. + * XXX The intent log is not accounted for, so it must fit + * within this slop. + * + * If we're trying to assess whether it's OK to do a free, + * cut the reservation in half to allow forward progress + * (e.g. make it possible to rm(1) files from a full pool). + */ + space = spa_get_dspace(dp->dp_spa); + resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1); + if (netfree) + resv >>= 1; + + return (space - resv); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c new file mode 100644 index 0000000..2fff66d --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c @@ -0,0 +1,501 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/spa.h> +#include <sys/zio_checksum.h> /* for the default checksum value */ +#include <sys/zap.h> +#include <sys/fs/zfs.h> + +#include "zfs_prop.h" + +static int +dodefault(const char *propname, int intsz, int numint, void *buf) +{ + zfs_prop_t prop; + + if ((prop = zfs_name_to_prop(propname)) == ZFS_PROP_INVAL || + zfs_prop_readonly(prop)) + return (ENOENT); + + if (zfs_prop_get_type(prop) == prop_type_string) { + if (intsz != 1) + return (EOVERFLOW); + (void) strncpy(buf, zfs_prop_default_string(prop), numint); + } else { + if (intsz != 8 || numint < 1) + return (EOVERFLOW); + + *(uint64_t *)buf = zfs_prop_default_numeric(prop); + } + + return (0); +} + +static int +dsl_prop_get_impl(dsl_dir_t *dd, const char *propname, + int intsz, int numint, void *buf, char *setpoint) +{ + int err = ENOENT; + zfs_prop_t prop; + + if (setpoint) + setpoint[0] = '\0'; + + prop = zfs_name_to_prop(propname); + + /* + * Note: dd may be NULL, therefore we shouldn't dereference it + * ouside this loop. + */ + for (; dd != NULL; dd = dd->dd_parent) { + objset_t *mos = dd->dd_pool->dp_meta_objset; + ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); + err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, + propname, intsz, numint, buf); + if (err != ENOENT) { + if (setpoint) + dsl_dir_name(dd, setpoint); + break; + } + + /* + * Break out of this loop for non-inheritable properties. + */ + if (prop != ZFS_PROP_INVAL && + !zfs_prop_inheritable(prop)) + break; + } + if (err == ENOENT) + err = dodefault(propname, intsz, numint, buf); + + return (err); +} + +/* + * Register interest in the named property. We'll call the callback + * once to notify it of the current property value, and again each time + * the property changes, until this callback is unregistered. + * + * Return 0 on success, errno if the prop is not an integer value. + */ +int +dsl_prop_register(dsl_dataset_t *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg) +{ + dsl_dir_t *dd = ds->ds_dir; + uint64_t value; + dsl_prop_cb_record_t *cbr; + int err; + int need_rwlock; + + need_rwlock = !RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock); + if (need_rwlock) + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + + err = dsl_prop_get_impl(dd, propname, 8, 1, &value, NULL); + if (err != 0) { + rw_exit(&dd->dd_pool->dp_config_rwlock); + return (err); + } + + cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP); + cbr->cbr_ds = ds; + cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP); + (void) strcpy((char *)cbr->cbr_propname, propname); + cbr->cbr_func = callback; + cbr->cbr_arg = cbarg; + mutex_enter(&dd->dd_lock); + list_insert_head(&dd->dd_prop_cbs, cbr); + mutex_exit(&dd->dd_lock); + + cbr->cbr_func(cbr->cbr_arg, value); + + VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object, + NULL, cbr, &dd)); + if (need_rwlock) + rw_exit(&dd->dd_pool->dp_config_rwlock); + /* Leave dataset open until this callback is unregistered */ + return (0); +} + +int +dsl_prop_get_ds(dsl_dir_t *dd, const char *propname, + int intsz, int numints, void *buf, char *setpoint) +{ + int err; + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + err = dsl_prop_get_impl(dd, propname, intsz, numints, buf, setpoint); + rw_exit(&dd->dd_pool->dp_config_rwlock); + + return (err); +} + +int +dsl_prop_get(const char *ddname, const char *propname, + int intsz, int numints, void *buf, char *setpoint) +{ + dsl_dir_t *dd; + const char *tail; + int err; + + err = dsl_dir_open(ddname, FTAG, &dd, &tail); + if (err) + return (err); + if (tail && tail[0] != '@') { + dsl_dir_close(dd, FTAG); + return (ENOENT); + } + + err = dsl_prop_get_ds(dd, propname, intsz, numints, buf, setpoint); + + dsl_dir_close(dd, FTAG); + return (err); +} + +/* + * Get the current property value. It may have changed by the time this + * function returns, so it is NOT safe to follow up with + * dsl_prop_register() and assume that the value has not changed in + * between. + * + * Return 0 on success, ENOENT if ddname is invalid. + */ +int +dsl_prop_get_integer(const char *ddname, const char *propname, + uint64_t *valuep, char *setpoint) +{ + return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); +} + +/* + * Unregister this callback. Return 0 on success, ENOENT if ddname is + * invalid, ENOMSG if no matching callback registered. + */ +int +dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_cb_record_t *cbr; + + mutex_enter(&dd->dd_lock); + for (cbr = list_head(&dd->dd_prop_cbs); + cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { + if (cbr->cbr_ds == ds && + cbr->cbr_func == callback && + cbr->cbr_arg == cbarg && + strcmp(cbr->cbr_propname, propname) == 0) + break; + } + + if (cbr == NULL) { + mutex_exit(&dd->dd_lock); + return (ENOMSG); + } + + list_remove(&dd->dd_prop_cbs, cbr); + mutex_exit(&dd->dd_lock); + kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1); + kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); + + /* Clean up from dsl_prop_register */ + dsl_dir_close(dd, cbr); + return (0); +} + +/* + * Return the number of callbacks that are registered for this dataset. + */ +int +dsl_prop_numcb(dsl_dataset_t *ds) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_cb_record_t *cbr; + int num = 0; + + mutex_enter(&dd->dd_lock); + for (cbr = list_head(&dd->dd_prop_cbs); + cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { + if (cbr->cbr_ds == ds) + num++; + } + mutex_exit(&dd->dd_lock); + + return (num); +} + +static void +dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, + const char *propname, uint64_t value, int first) +{ + dsl_dir_t *dd; + dsl_prop_cb_record_t *cbr; + objset_t *mos = dp->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + int err; + + ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); + err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); + if (err) + return; + + if (!first) { + /* + * If the prop is set here, then this change is not + * being inherited here or below; stop the recursion. + */ + err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, + 8, 1, &value); + if (err == 0) { + dsl_dir_close(dd, FTAG); + return; + } + ASSERT3U(err, ==, ENOENT); + } + + mutex_enter(&dd->dd_lock); + for (cbr = list_head(&dd->dd_prop_cbs); + cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { + if (strcmp(cbr->cbr_propname, propname) == 0) { + cbr->cbr_func(cbr->cbr_arg, value); + } + } + mutex_exit(&dd->dd_lock); + + for (zap_cursor_init(&zc, mos, + dd->dd_phys->dd_child_dir_zapobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + /* XXX recursion could blow stack; esp. za! */ + dsl_prop_changed_notify(dp, za.za_first_integer, + propname, value, FALSE); + } + zap_cursor_fini(&zc); + dsl_dir_close(dd, FTAG); +} + +struct prop_set_arg { + const char *name; + int intsz; + int numints; + const void *buf; +}; + + +static void +dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct prop_set_arg *psa = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t zapobj = dd->dd_phys->dd_props_zapobj; + uint64_t intval; + int isint; + + isint = (dodefault(psa->name, 8, 1, &intval) == 0); + + if (psa->numints == 0) { + int err = zap_remove(mos, zapobj, psa->name, tx); + ASSERT(err == 0 || err == ENOENT); + if (isint) { + VERIFY(0 == dsl_prop_get_impl(dd->dd_parent, + psa->name, 8, 1, &intval, NULL)); + } + } else { + VERIFY(0 == zap_update(mos, zapobj, psa->name, + psa->intsz, psa->numints, psa->buf, tx)); + if (isint) + intval = *(uint64_t *)psa->buf; + } + + if (isint) { + dsl_prop_changed_notify(dd->dd_pool, + dd->dd_object, psa->name, intval, TRUE); + } +} + +int +dsl_prop_set_dd(dsl_dir_t *dd, const char *propname, + int intsz, int numints, const void *buf) +{ + struct prop_set_arg psa; + + psa.name = propname; + psa.intsz = intsz; + psa.numints = numints; + psa.buf = buf; + + return (dsl_sync_task_do(dd->dd_pool, + NULL, dsl_prop_set_sync, dd, &psa, 2)); +} + +int +dsl_prop_set(const char *ddname, const char *propname, + int intsz, int numints, const void *buf) +{ + dsl_dir_t *dd; + int err; + + /* + * We must do these checks before we get to the syncfunc, since + * it can't fail. + */ + if (strlen(propname) >= ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + if (intsz * numints >= ZAP_MAXVALUELEN) + return (E2BIG); + + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) + return (err); + err = dsl_prop_set_dd(dd, propname, intsz, numints, buf); + dsl_dir_close(dd, FTAG); + return (err); +} + +/* + * Iterate over all properties for this dataset and return them in an nvlist. + */ +int +dsl_prop_get_all(objset_t *os, nvlist_t **nvp) +{ + dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dir_t *dd = ds->ds_dir; + int err = 0; + dsl_pool_t *dp; + objset_t *mos; + + if (dsl_dataset_is_snapshot(ds)) { + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + return (0); + } + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + dp = dd->dd_pool; + mos = dp->dp_meta_objset; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + for (; dd != NULL; dd = dd->dd_parent) { + char setpoint[MAXNAMELEN]; + zap_cursor_t zc; + zap_attribute_t za; + + dsl_dir_name(dd, setpoint); + + for (zap_cursor_init(&zc, mos, dd->dd_phys->dd_props_zapobj); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + nvlist_t *propval; + zfs_prop_t prop; + /* + * Skip non-inheritable properties. + */ + if ((prop = zfs_name_to_prop(za.za_name)) != + ZFS_PROP_INVAL && !zfs_prop_inheritable(prop) && + dd != ds->ds_dir) + continue; + + if (nvlist_lookup_nvlist(*nvp, za.za_name, + &propval) == 0) + continue; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + if (za.za_integer_length == 1) { + /* + * String property + */ + char *tmp = kmem_alloc(za.za_num_integers, + KM_SLEEP); + err = zap_lookup(mos, + dd->dd_phys->dd_props_zapobj, + za.za_name, 1, za.za_num_integers, + tmp); + if (err != 0) { + kmem_free(tmp, za.za_num_integers); + break; + } + VERIFY(nvlist_add_string(propval, + ZFS_PROP_VALUE, tmp) == 0); + kmem_free(tmp, za.za_num_integers); + } else { + /* + * Integer property + */ + ASSERT(za.za_integer_length == 8); + (void) nvlist_add_uint64(propval, + ZFS_PROP_VALUE, za.za_first_integer); + } + + VERIFY(nvlist_add_string(propval, + ZFS_PROP_SOURCE, setpoint) == 0); + VERIFY(nvlist_add_nvlist(*nvp, za.za_name, + propval) == 0); + nvlist_free(propval); + } + zap_cursor_fini(&zc); + + if (err != ENOENT) + break; + err = 0; + } + rw_exit(&dp->dp_config_rwlock); + + return (err); +} + +void +dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) +{ + nvlist_t *propval; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(propval, ZFS_PROP_VALUE, value) == 0); + VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); + nvlist_free(propval); +} + +void +dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) +{ + nvlist_t *propval; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(propval, ZFS_PROP_VALUE, value) == 0); + VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); + nvlist_free(propval); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c new file mode 100644 index 0000000..17deb56 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c @@ -0,0 +1,196 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_synctask.h> + +#define DST_AVG_BLKSHIFT 14 + +/* ARGSUSED */ +static int +dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx) +{ + return (0); +} + +dsl_sync_task_group_t * +dsl_sync_task_group_create(dsl_pool_t *dp) +{ + dsl_sync_task_group_t *dstg; + + dstg = kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP); + list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), + offsetof(dsl_sync_task_t, dst_node)); + dstg->dstg_pool = dp; + + return (dstg); +} + +void +dsl_sync_task_create(dsl_sync_task_group_t *dstg, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified) +{ + dsl_sync_task_t *dst; + + if (checkfunc == NULL) + checkfunc = dsl_null_checkfunc; + dst = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP); + dst->dst_checkfunc = checkfunc; + dst->dst_syncfunc = syncfunc; + dst->dst_arg1 = arg1; + dst->dst_arg2 = arg2; + list_insert_tail(&dstg->dstg_tasks, dst); + + dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT; +} + +int +dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg) +{ + dmu_tx_t *tx; + uint64_t txg; + dsl_sync_task_t *dst; + +top: + tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir); + VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + + txg = dmu_tx_get_txg(tx); + + /* Do a preliminary error check. */ + dstg->dstg_err = 0; + rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER); + for (dst = list_head(&dstg->dstg_tasks); dst; + dst = list_next(&dstg->dstg_tasks, dst)) { +#ifdef ZFS_DEBUG + /* + * Only check half the time, otherwise, the sync-context + * check will almost never fail. + */ + if (spa_get_random(2) == 0) + continue; +#endif + dst->dst_err = + dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); + if (dst->dst_err) + dstg->dstg_err = dst->dst_err; + } + rw_exit(&dstg->dstg_pool->dp_config_rwlock); + + if (dstg->dstg_err) { + dmu_tx_commit(tx); + return (dstg->dstg_err); + } + + VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg)); + + dmu_tx_commit(tx); + + txg_wait_synced(dstg->dstg_pool, txg); + + if (dstg->dstg_err == EAGAIN) + goto top; + + return (dstg->dstg_err); +} + +void +dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg) +{ + dsl_sync_task_t *dst; + + while (dst = list_head(&dstg->dstg_tasks)) { + list_remove(&dstg->dstg_tasks, dst); + kmem_free(dst, sizeof (dsl_sync_task_t)); + } + kmem_free(dstg, sizeof (dsl_sync_task_group_t)); +} + +void +dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) +{ + dsl_sync_task_t *dst; + void *tr_cookie; + + ASSERT3U(dstg->dstg_err, ==, 0); + + /* + * Check for sufficient space. + */ + dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir, + dstg->dstg_space, dstg->dstg_space * 3, 0, &tr_cookie, tx); + /* don't bother trying again */ + if (dstg->dstg_err == ERESTART) + dstg->dstg_err = EAGAIN; + if (dstg->dstg_err) + return; + + /* + * Check for errors by calling checkfuncs. + */ + rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER); + for (dst = list_head(&dstg->dstg_tasks); dst; + dst = list_next(&dstg->dstg_tasks, dst)) { + dst->dst_err = + dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); + if (dst->dst_err) + dstg->dstg_err = dst->dst_err; + } + + if (dstg->dstg_err == 0) { + /* + * Execute sync tasks. + */ + for (dst = list_head(&dstg->dstg_tasks); dst; + dst = list_next(&dstg->dstg_tasks, dst)) { + dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx); + } + } + rw_exit(&dstg->dstg_pool->dp_config_rwlock); + + dsl_dir_tempreserve_clear(tr_cookie, tx); +} + +int +dsl_sync_task_do(dsl_pool_t *dp, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified) +{ + dsl_sync_task_group_t *dstg; + int err; + + dstg = dsl_sync_task_group_create(dp); + dsl_sync_task_create(dstg, checkfunc, syncfunc, + arg1, arg2, blocks_modified); + err = dsl_sync_task_group_wait(dstg); + dsl_sync_task_group_destroy(dstg); + return (err); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c b/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c new file mode 100644 index 0000000..edda3c9 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c @@ -0,0 +1,145 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/byteorder.h> +#include <sys/spa.h> + +void +fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = ip + (size / sizeof (uint64_t)); + uint64_t a0, b0, a1, b1; + + for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 += ip[0]; + a1 += ip[1]; + b0 += a0; + b1 += a1; + } + + ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); +} + +void +fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = ip + (size / sizeof (uint64_t)); + uint64_t a0, b0, a1, b1; + + for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 += BSWAP_64(ip[0]); + a1 += BSWAP_64(ip[1]); + b0 += a0; + b1 += a1; + } + + ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); +} + +void +fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + for (a = b = c = d = 0; ip < ipend; ip++) { + a += ip[0]; + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + for (a = b = c = d = 0; ip < ipend; ip++) { + a += BSWAP_32(ip[0]); + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_incremental_native(const void *buf, uint64_t size, + zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + a = zcp->zc_word[0]; + b = zcp->zc_word[1]; + c = zcp->zc_word[2]; + d = zcp->zc_word[3]; + + for (; ip < ipend; ip++) { + a += ip[0]; + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_incremental_byteswap(const void *buf, uint64_t size, + zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + a = zcp->zc_word[0]; + b = zcp->zc_word[1]; + c = zcp->zc_word[2]; + d = zcp->zc_word[3]; + + for (; ip < ipend; ip++) { + a += BSWAP_32(ip[0]); + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c b/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c new file mode 100644 index 0000000..b257d4a --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c @@ -0,0 +1,69 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/debug.h> +#include <sys/types.h> +#include <sys/zmod.h> + +#ifdef _KERNEL +#include <sys/systm.h> +#else +#include <strings.h> +#endif + +size_t +gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + size_t dstlen = d_len; + + ASSERT(d_len <= s_len); + + if (z_compress_level(d_start, &dstlen, s_start, s_len, n) != Z_OK) { + if (d_len != s_len) + return (s_len); + + bcopy(s_start, d_start, s_len); + return (s_len); + } + + return (dstlen); +} + +/*ARGSUSED*/ +int +gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + size_t dstlen = d_len; + + ASSERT(d_len >= s_len); + + if (z_uncompress(d_start, &dstlen, s_start, s_len) != Z_OK) + return (-1); + + return (0); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c b/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c new file mode 100644 index 0000000..a88b85c --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c @@ -0,0 +1,129 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * We keep our own copy of this algorithm for 2 main reasons: + * 1. If we didn't, anyone modifying common/os/compress.c would + * directly break our on disk format + * 2. Our version of lzjb does not have a number of checks that the + * common/os version needs and uses + * In particular, we are adding the "feature" that compress() can + * take a destination buffer size and return -1 if the data will not + * compress to d_len or less. + */ + +#include <sys/zfs_context.h> +#include <sys/types.h> + +#define MATCH_BITS 6 +#define MATCH_MIN 3 +#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1)) +#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) +#define LEMPEL_SIZE 256 + +/*ARGSUSED*/ +size_t +lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + uchar_t *src = s_start; + uchar_t *dst = d_start; + uchar_t *cpy, *copymap; + int copymask = 1 << (NBBY - 1); + int mlen, offset; + uint16_t *hp; + uint16_t lempel[LEMPEL_SIZE]; /* uninitialized; see above */ + + while (src < (uchar_t *)s_start + s_len) { + if ((copymask <<= 1) == (1 << NBBY)) { + if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) { + if (d_len != s_len) + return (s_len); + mlen = s_len; + for (src = s_start, dst = d_start; mlen; mlen--) + *dst++ = *src++; + return (s_len); + } + copymask = 1; + copymap = dst; + *dst++ = 0; + } + if (src > (uchar_t *)s_start + s_len - MATCH_MAX) { + *dst++ = *src++; + continue; + } + hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) & + (LEMPEL_SIZE - 1)]; + offset = (intptr_t)(src - *hp) & OFFSET_MASK; + *hp = (uint16_t)(uintptr_t)src; + cpy = src - offset; + if (cpy >= (uchar_t *)s_start && cpy != src && + src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) { + *copymap |= copymask; + for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++) + if (src[mlen] != cpy[mlen]) + break; + *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | + (offset >> NBBY); + *dst++ = (uchar_t)offset; + src += mlen; + } else { + *dst++ = *src++; + } + } + return (dst - (uchar_t *)d_start); +} + +/*ARGSUSED*/ +int +lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + uchar_t *src = s_start; + uchar_t *dst = d_start; + uchar_t *d_end = (uchar_t *)d_start + d_len; + uchar_t *cpy, copymap; + int copymask = 1 << (NBBY - 1); + + while (dst < d_end) { + if ((copymask <<= 1) == (1 << NBBY)) { + copymask = 1; + copymap = *src++; + } + if (copymap & copymask) { + int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN; + int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK; + src += 2; + if ((cpy = dst - offset) < (uchar_t *)d_start) + return (-1); + while (--mlen >= 0 && dst < d_end) + *dst++ = *cpy++; + } else { + *dst++ = *src++; + } + } + return (0); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c new file mode 100644 index 0000000..0dba134 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -0,0 +1,1023 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/space_map.h> +#include <sys/metaslab_impl.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> + +uint64_t metaslab_aliquot = 512ULL << 10; + +/* + * ========================================================================== + * Metaslab classes + * ========================================================================== + */ +metaslab_class_t * +metaslab_class_create(void) +{ + metaslab_class_t *mc; + + mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); + + mc->mc_rotor = NULL; + + return (mc); +} + +void +metaslab_class_destroy(metaslab_class_t *mc) +{ + metaslab_group_t *mg; + + while ((mg = mc->mc_rotor) != NULL) { + metaslab_class_remove(mc, mg); + metaslab_group_destroy(mg); + } + + kmem_free(mc, sizeof (metaslab_class_t)); +} + +void +metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg) +{ + metaslab_group_t *mgprev, *mgnext; + + ASSERT(mg->mg_class == NULL); + + if ((mgprev = mc->mc_rotor) == NULL) { + mg->mg_prev = mg; + mg->mg_next = mg; + } else { + mgnext = mgprev->mg_next; + mg->mg_prev = mgprev; + mg->mg_next = mgnext; + mgprev->mg_next = mg; + mgnext->mg_prev = mg; + } + mc->mc_rotor = mg; + mg->mg_class = mc; +} + +void +metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg) +{ + metaslab_group_t *mgprev, *mgnext; + + ASSERT(mg->mg_class == mc); + + mgprev = mg->mg_prev; + mgnext = mg->mg_next; + + if (mg == mgnext) { + mc->mc_rotor = NULL; + } else { + mc->mc_rotor = mgnext; + mgprev->mg_next = mgnext; + mgnext->mg_prev = mgprev; + } + + mg->mg_prev = NULL; + mg->mg_next = NULL; + mg->mg_class = NULL; +} + +/* + * ========================================================================== + * Metaslab groups + * ========================================================================== + */ +static int +metaslab_compare(const void *x1, const void *x2) +{ + const metaslab_t *m1 = x1; + const metaslab_t *m2 = x2; + + if (m1->ms_weight < m2->ms_weight) + return (1); + if (m1->ms_weight > m2->ms_weight) + return (-1); + + /* + * If the weights are identical, use the offset to force uniqueness. + */ + if (m1->ms_map.sm_start < m2->ms_map.sm_start) + return (-1); + if (m1->ms_map.sm_start > m2->ms_map.sm_start) + return (1); + + ASSERT3P(m1, ==, m2); + + return (0); +} + +metaslab_group_t * +metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) +{ + metaslab_group_t *mg; + + mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); + mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&mg->mg_metaslab_tree, metaslab_compare, + sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); + mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children); + mg->mg_vd = vd; + metaslab_class_add(mc, mg); + + return (mg); +} + +void +metaslab_group_destroy(metaslab_group_t *mg) +{ + avl_destroy(&mg->mg_metaslab_tree); + mutex_destroy(&mg->mg_lock); + kmem_free(mg, sizeof (metaslab_group_t)); +} + +static void +metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) +{ + mutex_enter(&mg->mg_lock); + ASSERT(msp->ms_group == NULL); + msp->ms_group = mg; + msp->ms_weight = 0; + avl_add(&mg->mg_metaslab_tree, msp); + mutex_exit(&mg->mg_lock); +} + +static void +metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) +{ + mutex_enter(&mg->mg_lock); + ASSERT(msp->ms_group == mg); + avl_remove(&mg->mg_metaslab_tree, msp); + msp->ms_group = NULL; + mutex_exit(&mg->mg_lock); +} + +static void +metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) +{ + /* + * Although in principle the weight can be any value, in + * practice we do not use values in the range [1, 510]. + */ + ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + mutex_enter(&mg->mg_lock); + ASSERT(msp->ms_group == mg); + avl_remove(&mg->mg_metaslab_tree, msp); + msp->ms_weight = weight; + avl_add(&mg->mg_metaslab_tree, msp); + mutex_exit(&mg->mg_lock); +} + +/* + * ========================================================================== + * The first-fit block allocator + * ========================================================================== + */ +static void +metaslab_ff_load(space_map_t *sm) +{ + ASSERT(sm->sm_ppd == NULL); + sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); +} + +static void +metaslab_ff_unload(space_map_t *sm) +{ + kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); + sm->sm_ppd = NULL; +} + +static uint64_t +metaslab_ff_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + space_seg_t *ss, ssearch; + avl_index_t where; + + ssearch.ss_start = *cursor; + ssearch.ss_end = *cursor + size; + + ss = avl_find(t, &ssearch, &where); + if (ss == NULL) + ss = avl_nearest(t, where, AVL_AFTER); + + while (ss != NULL) { + uint64_t offset = P2ROUNDUP(ss->ss_start, align); + + if (offset + size <= ss->ss_end) { + *cursor = offset + size; + return (offset); + } + ss = AVL_NEXT(t, ss); + } + + /* + * If we know we've searched the whole map (*cursor == 0), give up. + * Otherwise, reset the cursor to the beginning and try again. + */ + if (*cursor == 0) + return (-1ULL); + + *cursor = 0; + return (metaslab_ff_alloc(sm, size)); +} + +/* ARGSUSED */ +static void +metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size) +{ + /* No need to update cursor */ +} + +/* ARGSUSED */ +static void +metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size) +{ + /* No need to update cursor */ +} + +static space_map_ops_t metaslab_ff_ops = { + metaslab_ff_load, + metaslab_ff_unload, + metaslab_ff_alloc, + metaslab_ff_claim, + metaslab_ff_free +}; + +/* + * ========================================================================== + * Metaslabs + * ========================================================================== + */ +metaslab_t * +metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, + uint64_t start, uint64_t size, uint64_t txg) +{ + vdev_t *vd = mg->mg_vd; + metaslab_t *msp; + + msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); + mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); + + msp->ms_smo_syncing = *smo; + + /* + * We create the main space map here, but we don't create the + * allocmaps and freemaps until metaslab_sync_done(). This serves + * two purposes: it allows metaslab_sync_done() to detect the + * addition of new space; and for debugging, it ensures that we'd + * data fault on any attempt to use this metaslab before it's ready. + */ + space_map_create(&msp->ms_map, start, size, + vd->vdev_ashift, &msp->ms_lock); + + metaslab_group_add(mg, msp); + + /* + * If we're opening an existing pool (txg == 0) or creating + * a new one (txg == TXG_INITIAL), all space is available now. + * If we're adding space to an existing pool, the new space + * does not become available until after this txg has synced. + */ + if (txg <= TXG_INITIAL) + metaslab_sync_done(msp, 0); + + if (txg != 0) { + /* + * The vdev is dirty, but the metaslab isn't -- it just needs + * to have metaslab_sync_done() invoked from vdev_sync_done(). + * [We could just dirty the metaslab, but that would cause us + * to allocate a space map object for it, which is wasteful + * and would mess up the locality logic in metaslab_weight().] + */ + ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa)); + vdev_dirty(vd, 0, NULL, txg); + vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg)); + } + + return (msp); +} + +void +metaslab_fini(metaslab_t *msp) +{ + metaslab_group_t *mg = msp->ms_group; + int t; + + vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, + -msp->ms_smo.smo_alloc); + + metaslab_group_remove(mg, msp); + + mutex_enter(&msp->ms_lock); + + space_map_unload(&msp->ms_map); + space_map_destroy(&msp->ms_map); + + for (t = 0; t < TXG_SIZE; t++) { + space_map_destroy(&msp->ms_allocmap[t]); + space_map_destroy(&msp->ms_freemap[t]); + } + + mutex_exit(&msp->ms_lock); + mutex_destroy(&msp->ms_lock); + + kmem_free(msp, sizeof (metaslab_t)); +} + +#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) +#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) +#define METASLAB_ACTIVE_MASK \ + (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) +#define METASLAB_SMO_BONUS_MULTIPLIER 2 + +static uint64_t +metaslab_weight(metaslab_t *msp) +{ + metaslab_group_t *mg = msp->ms_group; + space_map_t *sm = &msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo; + vdev_t *vd = mg->mg_vd; + uint64_t weight, space; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + /* + * The baseline weight is the metaslab's free space. + */ + space = sm->sm_size - smo->smo_alloc; + weight = space; + + /* + * Modern disks have uniform bit density and constant angular velocity. + * Therefore, the outer recording zones are faster (higher bandwidth) + * than the inner zones by the ratio of outer to inner track diameter, + * which is typically around 2:1. We account for this by assigning + * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). + * In effect, this means that we'll select the metaslab with the most + * free bandwidth rather than simply the one with the most free space. + */ + weight = 2 * weight - + ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; + ASSERT(weight >= space && weight <= 2 * space); + + /* + * For locality, assign higher weight to metaslabs we've used before. + */ + if (smo->smo_object != 0) + weight *= METASLAB_SMO_BONUS_MULTIPLIER; + ASSERT(weight >= space && + weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space); + + /* + * If this metaslab is one we're actively using, adjust its weight to + * make it preferable to any inactive metaslab so we'll polish it off. + */ + weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + + return (weight); +} + +static int +metaslab_activate(metaslab_t *msp, uint64_t activation_weight) +{ + space_map_t *sm = &msp->ms_map; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { + int error = space_map_load(sm, &metaslab_ff_ops, + SM_FREE, &msp->ms_smo, + msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); + if (error) { + metaslab_group_sort(msp->ms_group, msp, 0); + return (error); + } + metaslab_group_sort(msp->ms_group, msp, + msp->ms_weight | activation_weight); + } + ASSERT(sm->sm_loaded); + ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); + + return (0); +} + +static void +metaslab_passivate(metaslab_t *msp, uint64_t size) +{ + /* + * If size < SPA_MINBLOCKSIZE, then we will not allocate from + * this metaslab again. In that case, it had better be empty, + * or we would be leaving space on the table. + */ + ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); + metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); + ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); +} + +/* + * Write a metaslab to disk in the context of the specified transaction group. + */ +void +metaslab_sync(metaslab_t *msp, uint64_t txg) +{ + vdev_t *vd = msp->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; + space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; + space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *sm = &msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo_syncing; + dmu_buf_t *db; + dmu_tx_t *tx; + int t; + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + + /* + * The only state that can actually be changing concurrently with + * metaslab_sync() is the metaslab's ms_map. No other thread can + * be modifying this txg's allocmap, freemap, freed_map, or smo. + * Therefore, we only hold ms_lock to satify space_map ASSERTs. + * We drop it whenever we call into the DMU, because the DMU + * can call down to us (e.g. via zio_free()) at any time. + */ + mutex_enter(&msp->ms_lock); + + if (smo->smo_object == 0) { + ASSERT(smo->smo_objsize == 0); + ASSERT(smo->smo_alloc == 0); + mutex_exit(&msp->ms_lock); + smo->smo_object = dmu_object_alloc(mos, + DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, + DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); + ASSERT(smo->smo_object != 0); + dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * + (sm->sm_start >> vd->vdev_ms_shift), + sizeof (uint64_t), &smo->smo_object, tx); + mutex_enter(&msp->ms_lock); + } + + space_map_walk(freemap, space_map_add, freed_map); + + if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= + 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { + /* + * The in-core space map representation is twice as compact + * as the on-disk one, so it's time to condense the latter + * by generating a pure allocmap from first principles. + * + * This metaslab is 100% allocated, + * minus the content of the in-core map (sm), + * minus what's been freed this txg (freed_map), + * minus allocations from txgs in the future + * (because they haven't been committed yet). + */ + space_map_vacate(allocmap, NULL, NULL); + space_map_vacate(freemap, NULL, NULL); + + space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); + + space_map_walk(sm, space_map_remove, allocmap); + space_map_walk(freed_map, space_map_remove, allocmap); + + for (t = 1; t < TXG_CONCURRENT_STATES; t++) + space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], + space_map_remove, allocmap); + + mutex_exit(&msp->ms_lock); + space_map_truncate(smo, mos, tx); + mutex_enter(&msp->ms_lock); + } + + space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); + space_map_sync(freemap, SM_FREE, smo, mos, tx); + + mutex_exit(&msp->ms_lock); + + VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + ASSERT3U(db->db_size, ==, sizeof (*smo)); + bcopy(smo, db->db_data, db->db_size); + dmu_buf_rele(db, FTAG); + + dmu_tx_commit(tx); +} + +/* + * Called after a transaction group has completely synced to mark + * all of the metaslab's free space as usable. + */ +void +metaslab_sync_done(metaslab_t *msp, uint64_t txg) +{ + space_map_obj_t *smo = &msp->ms_smo; + space_map_obj_t *smosync = &msp->ms_smo_syncing; + space_map_t *sm = &msp->ms_map; + space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; + int t; + + mutex_enter(&msp->ms_lock); + + /* + * If this metaslab is just becoming available, initialize its + * allocmaps and freemaps and add its capacity to the vdev. + */ + if (freed_map->sm_size == 0) { + for (t = 0; t < TXG_SIZE; t++) { + space_map_create(&msp->ms_allocmap[t], sm->sm_start, + sm->sm_size, sm->sm_shift, sm->sm_lock); + space_map_create(&msp->ms_freemap[t], sm->sm_start, + sm->sm_size, sm->sm_shift, sm->sm_lock); + } + vdev_space_update(vd, sm->sm_size, 0); + } + + vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc); + + ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); + ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); + + /* + * If there's a space_map_load() in progress, wait for it to complete + * so that we have a consistent view of the in-core space map. + * Then, add everything we freed in this txg to the map. + */ + space_map_load_wait(sm); + space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm); + + *smo = *smosync; + + /* + * If the map is loaded but no longer active, evict it as soon as all + * future allocations have synced. (If we unloaded it now and then + * loaded a moment later, the map wouldn't reflect those allocations.) + */ + if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { + int evictable = 1; + + for (t = 1; t < TXG_CONCURRENT_STATES; t++) + if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) + evictable = 0; + + if (evictable) + space_map_unload(sm); + } + + metaslab_group_sort(mg, msp, metaslab_weight(msp)); + + mutex_exit(&msp->ms_lock); +} + +static uint64_t +metaslab_distance(metaslab_t *msp, dva_t *dva) +{ + uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; + uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; + uint64_t start = msp->ms_map.sm_start >> ms_shift; + + if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) + return (1ULL << 63); + + if (offset < start) + return ((start - offset) << ms_shift); + if (offset > start) + return ((offset - start) << ms_shift); + return (0); +} + +static uint64_t +metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, + uint64_t min_distance, dva_t *dva, int d) +{ + metaslab_t *msp = NULL; + uint64_t offset = -1ULL; + avl_tree_t *t = &mg->mg_metaslab_tree; + uint64_t activation_weight; + uint64_t target_distance; + int i; + + activation_weight = METASLAB_WEIGHT_PRIMARY; + for (i = 0; i < d; i++) + if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) + activation_weight = METASLAB_WEIGHT_SECONDARY; + + for (;;) { + mutex_enter(&mg->mg_lock); + for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { + if (msp->ms_weight < size) { + mutex_exit(&mg->mg_lock); + return (-1ULL); + } + + if (activation_weight == METASLAB_WEIGHT_PRIMARY) + break; + + target_distance = min_distance + + (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); + + for (i = 0; i < d; i++) + if (metaslab_distance(msp, &dva[i]) < + target_distance) + break; + if (i == d) + break; + } + mutex_exit(&mg->mg_lock); + if (msp == NULL) + return (-1ULL); + + mutex_enter(&msp->ms_lock); + + /* + * Ensure that the metaslab we have selected is still + * capable of handling our request. It's possible that + * another thread may have changed the weight while we + * were blocked on the metaslab lock. + */ + if (msp->ms_weight < size) { + mutex_exit(&msp->ms_lock); + continue; + } + + if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && + activation_weight == METASLAB_WEIGHT_PRIMARY) { + metaslab_passivate(msp, + msp->ms_weight & ~METASLAB_ACTIVE_MASK); + mutex_exit(&msp->ms_lock); + continue; + } + + if (metaslab_activate(msp, activation_weight) != 0) { + mutex_exit(&msp->ms_lock); + continue; + } + + if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) + break; + + metaslab_passivate(msp, size - 1); + + mutex_exit(&msp->ms_lock); + } + + if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); + + space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + + mutex_exit(&msp->ms_lock); + + return (offset); +} + +/* + * Allocate a block for the specified i/o. + */ +static int +metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, + dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid) +{ + metaslab_group_t *mg, *rotor; + metaslab_class_t *mc; + vdev_t *vd; + int dshift = 3; + int all_zero; + uint64_t offset = -1ULL; + uint64_t asize; + uint64_t distance; + + ASSERT(!DVA_IS_VALID(&dva[d])); + + mc = spa_metaslab_class_select(spa); + + /* + * Start at the rotor and loop through all mgs until we find something. + * Note that there's no locking on mc_rotor or mc_allocated because + * nothing actually breaks if we miss a few updates -- we just won't + * allocate quite as evenly. It all balances out over time. + * + * If we are doing ditto or log blocks, try to spread them across + * consecutive vdevs. If we're forced to reuse a vdev before we've + * allocated all of our ditto blocks, then try and spread them out on + * that vdev as much as possible. If it turns out to not be possible, + * gradually lower our standards until anything becomes acceptable. + * Also, allocating on consecutive vdevs (as opposed to random vdevs) + * gives us hope of containing our fault domains to something we're + * able to reason about. Otherwise, any two top-level vdev failures + * will guarantee the loss of data. With consecutive allocation, + * only two adjacent top-level vdev failures will result in data loss. + * + * If we are doing gang blocks (hintdva is non-NULL), try to keep + * ourselves on the same vdev as our gang block header. That + * way, we can hope for locality in vdev_cache, plus it makes our + * fault domains something tractable. + */ + if (hintdva) { + vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); + if (hintdva_avoid) + mg = vd->vdev_mg->mg_next; + else + mg = vd->vdev_mg; + } else if (d != 0) { + vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); + mg = vd->vdev_mg->mg_next; + } else { + mg = mc->mc_rotor; + } + rotor = mg; + +top: + all_zero = B_TRUE; + do { + vd = mg->mg_vd; + + distance = vd->vdev_asize >> dshift; + if (distance <= (1ULL << vd->vdev_ms_shift)) + distance = 0; + else + all_zero = B_FALSE; + + asize = vdev_psize_to_asize(vd, psize); + ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); + + offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); + if (offset != -1ULL) { + /* + * If we've just selected this metaslab group, + * figure out whether the corresponding vdev is + * over- or under-used relative to the pool, + * and set an allocation bias to even it out. + */ + if (mc->mc_allocated == 0) { + vdev_stat_t *vs = &vd->vdev_stat; + uint64_t alloc, space; + int64_t vu, su; + + alloc = spa_get_alloc(spa); + space = spa_get_space(spa); + + /* + * Determine percent used in units of 0..1024. + * (This is just to avoid floating point.) + */ + vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); + su = (alloc << 10) / (space + 1); + + /* + * Bias by at most +/- 25% of the aliquot. + */ + mg->mg_bias = ((su - vu) * + (int64_t)mg->mg_aliquot) / (1024 * 4); + } + + if (atomic_add_64_nv(&mc->mc_allocated, asize) >= + mg->mg_aliquot + mg->mg_bias) { + mc->mc_rotor = mg->mg_next; + mc->mc_allocated = 0; + } + + DVA_SET_VDEV(&dva[d], vd->vdev_id); + DVA_SET_OFFSET(&dva[d], offset); + DVA_SET_GANG(&dva[d], 0); + DVA_SET_ASIZE(&dva[d], asize); + + return (0); + } + mc->mc_rotor = mg->mg_next; + mc->mc_allocated = 0; + } while ((mg = mg->mg_next) != rotor); + + if (!all_zero) { + dshift++; + ASSERT(dshift < 64); + goto top; + } + + bzero(&dva[d], sizeof (dva_t)); + + return (ENOSPC); +} + +/* + * Free the block represented by DVA in the context of the specified + * transaction group. + */ +static void +metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) +{ + uint64_t vdev = DVA_GET_VDEV(dva); + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + vdev_t *vd; + metaslab_t *msp; + + ASSERT(DVA_IS_VALID(dva)); + + if (txg > spa_freeze_txg(spa)) + return; + + if ((vd = vdev_lookup_top(spa, vdev)) == NULL || + (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { + cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", + (u_longlong_t)vdev, (u_longlong_t)offset); + ASSERT(0); + return; + } + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + if (DVA_GET_GANG(dva)) + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + + mutex_enter(&msp->ms_lock); + + if (now) { + space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], + offset, size); + space_map_free(&msp->ms_map, offset, size); + } else { + if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) + vdev_dirty(vd, VDD_METASLAB, msp, txg); + space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); + + /* + * verify that this region is actually allocated in + * either a ms_allocmap or the ms_map + */ + if (msp->ms_map.sm_loaded) { + boolean_t allocd = B_FALSE; + int i; + + if (!space_map_contains(&msp->ms_map, offset, size)) { + allocd = B_TRUE; + } else { + for (i = 0; i < TXG_CONCURRENT_STATES; i++) { + space_map_t *sm = &msp->ms_allocmap + [(txg - i) & TXG_MASK]; + if (space_map_contains(sm, + offset, size)) { + allocd = B_TRUE; + break; + } + } + } + + if (!allocd) { + zfs_panic_recover("freeing free segment " + "(vdev=%llu offset=%llx size=%llx)", + (longlong_t)vdev, (longlong_t)offset, + (longlong_t)size); + } + } + + + } + + mutex_exit(&msp->ms_lock); +} + +/* + * Intent log support: upon opening the pool after a crash, notify the SPA + * of blocks that the intent log has allocated for immediate write, but + * which are still considered free by the SPA because the last transaction + * group didn't commit yet. + */ +static int +metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +{ + uint64_t vdev = DVA_GET_VDEV(dva); + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + vdev_t *vd; + metaslab_t *msp; + int error; + + ASSERT(DVA_IS_VALID(dva)); + + if ((vd = vdev_lookup_top(spa, vdev)) == NULL || + (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) + return (ENXIO); + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + if (DVA_GET_GANG(dva)) + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + + mutex_enter(&msp->ms_lock); + + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + if (error) { + mutex_exit(&msp->ms_lock); + return (error); + } + + if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + vdev_dirty(vd, VDD_METASLAB, msp, txg); + + space_map_claim(&msp->ms_map, offset, size); + space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + + mutex_exit(&msp->ms_lock); + + return (0); +} + +int +metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas, + uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid) +{ + dva_t *dva = bp->blk_dva; + dva_t *hintdva = hintbp->blk_dva; + int d; + int error = 0; + + ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); + ASSERT(BP_GET_NDVAS(bp) == 0); + ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); + + for (d = 0; d < ndvas; d++) { + error = metaslab_alloc_dva(spa, psize, dva, d, hintdva, + txg, hintbp_avoid); + if (error) { + for (d--; d >= 0; d--) { + metaslab_free_dva(spa, &dva[d], txg, B_TRUE); + bzero(&dva[d], sizeof (dva_t)); + } + return (error); + } + } + ASSERT(error == 0); + ASSERT(BP_GET_NDVAS(bp) == ndvas); + + return (0); +} + +void +metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + int d; + + ASSERT(!BP_IS_HOLE(bp)); + + for (d = 0; d < ndvas; d++) + metaslab_free_dva(spa, &dva[d], txg, now); +} + +int +metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + int d, error; + int last_error = 0; + + ASSERT(!BP_IS_HOLE(bp)); + + for (d = 0; d < ndvas; d++) + if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) + last_error = error; + + return (last_error); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c new file mode 100644 index 0000000..411ed46 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c @@ -0,0 +1,194 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +#if defined(DEBUG) || !defined(_KERNEL) + +#ifdef _KERNEL +int reference_tracking_enable = FALSE; /* runs out of memory too easily */ +#else +int reference_tracking_enable = TRUE; +#endif +int reference_history = 4; /* tunable */ + +static kmem_cache_t *reference_cache; +static kmem_cache_t *reference_history_cache; + +void +refcount_init(void) +{ + reference_cache = kmem_cache_create("reference_cache", + sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + reference_history_cache = kmem_cache_create("reference_history_cache", + sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +refcount_fini(void) +{ + kmem_cache_destroy(reference_cache); + kmem_cache_destroy(reference_history_cache); +} + +void +refcount_create(refcount_t *rc) +{ + list_create(&rc->rc_list, sizeof (reference_t), + offsetof(reference_t, ref_link)); + list_create(&rc->rc_removed, sizeof (reference_t), + offsetof(reference_t, ref_link)); + mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); +} + +void +refcount_destroy_many(refcount_t *rc, uint64_t number) +{ + reference_t *ref; + + ASSERT(rc->rc_count == number); + while (ref = list_head(&rc->rc_list)) { + list_remove(&rc->rc_list, ref); + kmem_cache_free(reference_cache, ref); + } + list_destroy(&rc->rc_list); + + while (ref = list_head(&rc->rc_removed)) { + list_remove(&rc->rc_removed, ref); + kmem_cache_free(reference_history_cache, ref->ref_removed); + kmem_cache_free(reference_cache, ref); + } + list_destroy(&rc->rc_removed); + mutex_destroy(&rc->rc_mtx); +} + +void +refcount_destroy(refcount_t *rc) +{ + refcount_destroy_many(rc, 0); +} + +int +refcount_is_zero(refcount_t *rc) +{ + ASSERT(rc->rc_count >= 0); + return (rc->rc_count == 0); +} + +int64_t +refcount_count(refcount_t *rc) +{ + ASSERT(rc->rc_count >= 0); + return (rc->rc_count); +} + +int64_t +refcount_add_many(refcount_t *rc, uint64_t number, void *holder) +{ + reference_t *ref; + int64_t count; + + if (reference_tracking_enable) { + ref = kmem_cache_alloc(reference_cache, KM_SLEEP); + ref->ref_holder = holder; + ref->ref_number = number; + } + mutex_enter(&rc->rc_mtx); + ASSERT(rc->rc_count >= 0); + if (reference_tracking_enable) + list_insert_head(&rc->rc_list, ref); + rc->rc_count += number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + + return (count); +} + +int64_t +refcount_add(refcount_t *rc, void *holder) +{ + return (refcount_add_many(rc, 1, holder)); +} + +int64_t +refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) +{ + reference_t *ref; + int64_t count; + + mutex_enter(&rc->rc_mtx); + ASSERT(rc->rc_count >= number); + + if (!reference_tracking_enable) { + rc->rc_count -= number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + return (count); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder && ref->ref_number == number) { + list_remove(&rc->rc_list, ref); + if (reference_history > 0) { + ref->ref_removed = + kmem_cache_alloc(reference_history_cache, + KM_SLEEP); + list_insert_head(&rc->rc_removed, ref); + rc->rc_removed_count++; + if (rc->rc_removed_count >= reference_history) { + ref = list_tail(&rc->rc_removed); + list_remove(&rc->rc_removed, ref); + kmem_cache_free(reference_history_cache, + ref->ref_removed); + kmem_cache_free(reference_cache, ref); + rc->rc_removed_count--; + } + } else { + kmem_cache_free(reference_cache, ref); + } + rc->rc_count -= number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + return (count); + } + } + panic("No such hold %p on refcount %llx", holder, + (u_longlong_t)(uintptr_t)rc); + return (-1); +} + +int64_t +refcount_remove(refcount_t *rc, void *holder) +{ + return (refcount_remove_many(rc, 1, holder)); +} + +#endif diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c new file mode 100644 index 0000000..ce5c261 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c @@ -0,0 +1,131 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> + +/* + * SHA-256 checksum, as specified in FIPS 180-2, available at: + * http://csrc.nist.gov/cryptval + * + * This is a very compact implementation of SHA-256. + * It is designed to be simple and portable, not to be fast. + */ + +/* + * The literal definitions according to FIPS180-2 would be: + * + * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) + * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) + * + * We use logical equivalents which require one less op. + */ +#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y)))) +#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s))) +#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22)) +#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25)) +#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3)) +#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10)) + +static const uint32_t SHA256_K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +static void +SHA256Transform(uint32_t *H, const uint8_t *cp) +{ + uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64]; + + for (t = 0; t < 16; t++, cp += 4) + W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3]; + + for (t = 16; t < 64; t++) + W[t] = sigma1(W[t - 2]) + W[t - 7] + + sigma0(W[t - 15]) + W[t - 16]; + + a = H[0]; b = H[1]; c = H[2]; d = H[3]; + e = H[4]; f = H[5]; g = H[6]; h = H[7]; + + for (t = 0; t < 64; t++) { + T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t]; + T2 = SIGMA0(a) + Maj(a, b, c); + h = g; g = f; f = e; e = d + T1; + d = c; c = b; b = a; a = T1 + T2; + } + + H[0] += a; H[1] += b; H[2] += c; H[3] += d; + H[4] += e; H[5] += f; H[6] += g; H[7] += h; +} + +void +zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; + uint8_t pad[128]; + int padsize = size & 63; + int i; + + for (i = 0; i < size - padsize; i += 64) + SHA256Transform(H, (uint8_t *)buf + i); + + for (i = 0; i < padsize; i++) + pad[i] = ((uint8_t *)buf)[i]; + + for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++) + pad[padsize] = 0; + + for (i = 0; i < 8; i++) + pad[padsize++] = (size << 3) >> (56 - 8 * i); + + for (i = 0; i < padsize; i += 64) + SHA256Transform(H, pad + i); + + ZIO_SET_CHECKSUM(zcp, + (uint64_t)H[0] << 32 | H[1], + (uint64_t)H[2] << 32 | H[3], + (uint64_t)H[4] << 32 | H[5], + (uint64_t)H[6] << 32 | H[7]); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c new file mode 100644 index 0000000..c218f72 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -0,0 +1,3265 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This file contains all the routines used when modifying on-disk SPA state. + * This includes opening, importing, destroying, exporting a pool, and syncing a + * pool. + */ + +#include <sys/zfs_context.h> +#include <sys/fm/fs/zfs.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/zap.h> +#include <sys/zil.h> +#include <sys/vdev_impl.h> +#include <sys/metaslab.h> +#include <sys/uberblock_impl.h> +#include <sys/txg.h> +#include <sys/avl.h> +#include <sys/dmu_traverse.h> +#include <sys/dmu_objset.h> +#include <sys/unique.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/fs/zfs.h> +#include <sys/callb.h> + +int zio_taskq_threads = 8; + +/* + * ========================================================================== + * SPA state manipulation (open/create/destroy/import/export) + * ========================================================================== + */ + +static int +spa_error_entry_compare(const void *a, const void *b) +{ + spa_error_entry_t *sa = (spa_error_entry_t *)a; + spa_error_entry_t *sb = (spa_error_entry_t *)b; + int ret; + + ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, + sizeof (zbookmark_t)); + + if (ret < 0) + return (-1); + else if (ret > 0) + return (1); + else + return (0); +} + +/* + * Utility function which retrieves copies of the current logs and + * re-initializes them in the process. + */ +void +spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) +{ + ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); + + bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); + bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); + + avl_create(&spa->spa_errlist_scrub, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_last, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); +} + +/* + * Activate an uninitialized pool. + */ +static void +spa_activate(spa_t *spa) +{ + int t; + + ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + + spa->spa_state = POOL_STATE_ACTIVE; + + spa->spa_normal_class = metaslab_class_create(); + + for (t = 0; t < ZIO_TYPES; t++) { + spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", + zio_taskq_threads, maxclsyspri, 50, INT_MAX, + TASKQ_PREPOPULATE); + spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", + zio_taskq_threads, maxclsyspri, 50, INT_MAX, + TASKQ_PREPOPULATE); + } + + rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); + + mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); + + list_create(&spa->spa_dirty_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_dirty_node)); + + txg_list_create(&spa->spa_vdev_txg_list, + offsetof(struct vdev, vdev_txg_node)); + + avl_create(&spa->spa_errlist_scrub, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_last, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); +} + +/* + * Opposite of spa_activate(). + */ +static void +spa_deactivate(spa_t *spa) +{ + int t; + + ASSERT(spa->spa_sync_on == B_FALSE); + ASSERT(spa->spa_dsl_pool == NULL); + ASSERT(spa->spa_root_vdev == NULL); + + ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); + + txg_list_destroy(&spa->spa_vdev_txg_list); + + list_destroy(&spa->spa_dirty_list); + + for (t = 0; t < ZIO_TYPES; t++) { + taskq_destroy(spa->spa_zio_issue_taskq[t]); + taskq_destroy(spa->spa_zio_intr_taskq[t]); + spa->spa_zio_issue_taskq[t] = NULL; + spa->spa_zio_intr_taskq[t] = NULL; + } + + metaslab_class_destroy(spa->spa_normal_class); + spa->spa_normal_class = NULL; + + /* + * If this was part of an import or the open otherwise failed, we may + * still have errors left in the queues. Empty them just in case. + */ + spa_errlog_drain(spa); + + avl_destroy(&spa->spa_errlist_scrub); + avl_destroy(&spa->spa_errlist_last); + + rw_destroy(&spa->spa_traverse_lock); + mutex_destroy(&spa->spa_uberblock_lock); + mutex_destroy(&spa->spa_errlog_lock); + mutex_destroy(&spa->spa_errlist_lock); + mutex_destroy(&spa->spa_config_lock.scl_lock); + cv_destroy(&spa->spa_config_lock.scl_cv); + mutex_destroy(&spa->spa_sync_bplist.bpl_lock); + mutex_destroy(&spa->spa_history_lock); + mutex_destroy(&spa->spa_props_lock); + + spa->spa_state = POOL_STATE_UNINITIALIZED; +} + +/* + * Verify a pool configuration, and construct the vdev tree appropriately. This + * will create all the necessary vdevs in the appropriate layout, with each vdev + * in the CLOSED state. This will prep the pool before open/creation/import. + * All vdev validation is done by the vdev_alloc() routine. + */ +static int +spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, + uint_t id, int atype) +{ + nvlist_t **child; + uint_t c, children; + int error; + + if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) + return (error); + + if ((*vdp)->vdev_ops->vdev_op_leaf) + return (0); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + vdev_free(*vdp); + *vdp = NULL; + return (EINVAL); + } + + for (c = 0; c < children; c++) { + vdev_t *vd; + if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, + atype)) != 0) { + vdev_free(*vdp); + *vdp = NULL; + return (error); + } + } + + ASSERT(*vdp != NULL); + + return (0); +} + +/* + * Opposite of spa_load(). + */ +static void +spa_unload(spa_t *spa) +{ + int i; + + /* + * Stop async tasks. + */ + spa_async_suspend(spa); + + /* + * Stop syncing. + */ + if (spa->spa_sync_on) { + txg_sync_stop(spa->spa_dsl_pool); + spa->spa_sync_on = B_FALSE; + } + + /* + * Wait for any outstanding prefetch I/O to complete. + */ + spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_exit(spa, FTAG); + + /* + * Close the dsl pool. + */ + if (spa->spa_dsl_pool) { + dsl_pool_close(spa->spa_dsl_pool); + spa->spa_dsl_pool = NULL; + } + + /* + * Close all vdevs. + */ + if (spa->spa_root_vdev) + vdev_free(spa->spa_root_vdev); + ASSERT(spa->spa_root_vdev == NULL); + + for (i = 0; i < spa->spa_nspares; i++) + vdev_free(spa->spa_spares[i]); + if (spa->spa_spares) { + kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); + spa->spa_spares = NULL; + } + if (spa->spa_sparelist) { + nvlist_free(spa->spa_sparelist); + spa->spa_sparelist = NULL; + } + + spa->spa_async_suspended = 0; +} + +/* + * Load (or re-load) the current list of vdevs describing the active spares for + * this pool. When this is called, we have some form of basic information in + * 'spa_sparelist'. We parse this into vdevs, try to open them, and then + * re-generate a more complete list including status information. + */ +static void +spa_load_spares(spa_t *spa) +{ + nvlist_t **spares; + uint_t nspares; + int i; + vdev_t *vd, *tvd; + + /* + * First, close and free any existing spare vdevs. + */ + for (i = 0; i < spa->spa_nspares; i++) { + vd = spa->spa_spares[i]; + + /* Undo the call to spa_activate() below */ + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && + tvd->vdev_isspare) + spa_spare_remove(tvd); + vdev_close(vd); + vdev_free(vd); + } + + if (spa->spa_spares) + kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); + + if (spa->spa_sparelist == NULL) + nspares = 0; + else + VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + + spa->spa_nspares = (int)nspares; + spa->spa_spares = NULL; + + if (nspares == 0) + return; + + /* + * Construct the array of vdevs, opening them to get status in the + * process. For each spare, there is potentially two different vdev_t + * structures associated with it: one in the list of spares (used only + * for basic validation purposes) and one in the active vdev + * configuration (if it's spared in). During this phase we open and + * validate each vdev on the spare list. If the vdev also exists in the + * active configuration, then we also mark this vdev as an active spare. + */ + spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); + for (i = 0; i < spa->spa_nspares; i++) { + VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, + VDEV_ALLOC_SPARE) == 0); + ASSERT(vd != NULL); + + spa->spa_spares[i] = vd; + + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { + if (!tvd->vdev_isspare) + spa_spare_add(tvd); + + /* + * We only mark the spare active if we were successfully + * able to load the vdev. Otherwise, importing a pool + * with a bad active spare would result in strange + * behavior, because multiple pool would think the spare + * is actively in use. + * + * There is a vulnerability here to an equally bizarre + * circumstance, where a dead active spare is later + * brought back to life (onlined or otherwise). Given + * the rarity of this scenario, and the extra complexity + * it adds, we ignore the possibility. + */ + if (!vdev_is_dead(tvd)) + spa_spare_activate(tvd); + } + + if (vdev_open(vd) != 0) + continue; + + vd->vdev_top = vd; + (void) vdev_validate_spare(vd); + } + + /* + * Recompute the stashed list of spares, with status information + * this time. + */ + VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, + DATA_TYPE_NVLIST_ARRAY) == 0); + + spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); + for (i = 0; i < spa->spa_nspares; i++) + spares[i] = vdev_config_generate(spa, spa->spa_spares[i], + B_TRUE, B_TRUE); + VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, + spares, spa->spa_nspares) == 0); + for (i = 0; i < spa->spa_nspares; i++) + nvlist_free(spares[i]); + kmem_free(spares, spa->spa_nspares * sizeof (void *)); +} + +static int +load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) +{ + dmu_buf_t *db; + char *packed = NULL; + size_t nvsize = 0; + int error; + *value = NULL; + + VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); + nvsize = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + packed = kmem_alloc(nvsize, KM_SLEEP); + error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); + if (error == 0) + error = nvlist_unpack(packed, nvsize, value, 0); + kmem_free(packed, nvsize); + + return (error); +} + +/* + * Load an existing storage pool, using the pool's builtin spa_config as a + * source of configuration information. + */ +static int +spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) +{ + int error = 0; + nvlist_t *nvroot = NULL; + vdev_t *rvd; + uberblock_t *ub = &spa->spa_uberblock; + uint64_t config_cache_txg = spa->spa_config_txg; + uint64_t pool_guid; + uint64_t version; + zio_t *zio; + + spa->spa_load_state = state; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || + nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { + error = EINVAL; + goto out; + } + + /* + * Versioning wasn't explicitly added to the label until later, so if + * it's not present treat it as the initial version. + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) + version = ZFS_VERSION_INITIAL; + + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &spa->spa_config_txg); + + if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0)) { + error = EEXIST; + goto out; + } + + spa->spa_load_guid = pool_guid; + + /* + * Parse the configuration into a vdev tree. We explicitly set the + * value that will be returned by spa_version() since parsing the + * configuration requires knowing the version number. + */ + spa_config_enter(spa, RW_WRITER, FTAG); + spa->spa_ubsync.ub_version = version; + error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); + spa_config_exit(spa, FTAG); + + if (error != 0) + goto out; + + ASSERT(spa->spa_root_vdev == rvd); + ASSERT(spa_guid(spa) == pool_guid); + + /* + * Try to open all vdevs, loading each label in the process. + */ + if (vdev_open(rvd) != 0) { + error = ENXIO; + goto out; + } + + /* + * Validate the labels for all leaf vdevs. We need to grab the config + * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD + * flag. + */ + spa_config_enter(spa, RW_READER, FTAG); + error = vdev_validate(rvd); + spa_config_exit(spa, FTAG); + + if (error != 0) { + error = EBADF; + goto out; + } + + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { + error = ENXIO; + goto out; + } + + /* + * Find the best uberblock. + */ + bzero(ub, sizeof (uberblock_t)); + + zio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + vdev_uberblock_load(zio, rvd, ub); + error = zio_wait(zio); + + /* + * If we weren't able to find a single valid uberblock, return failure. + */ + if (ub->ub_txg == 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = ENXIO; + goto out; + } + + /* + * If the pool is newer than the code, we can't open it. + */ + if (ub->ub_version > ZFS_VERSION) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_VERSION_NEWER); + error = ENOTSUP; + goto out; + } + + /* + * If the vdev guid sum doesn't match the uberblock, we have an + * incomplete configuration. + */ + if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_GUID_SUM); + error = ENXIO; + goto out; + } + + /* + * Initialize internal SPA structures. + */ + spa->spa_state = POOL_STATE_ACTIVE; + spa->spa_ubsync = spa->spa_uberblock; + spa->spa_first_txg = spa_last_synced_txg(spa) + 1; + error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); + if (error) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + goto out; + } + spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; + + if (zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, + sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + if (!mosconfig) { + nvlist_t *newconfig; + + if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + spa_config_set(spa, newconfig); + spa_unload(spa); + spa_deactivate(spa); + spa_activate(spa); + + return (spa_load(spa, newconfig, state, B_TRUE)); + } + + if (zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, + sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + /* + * Load the bit that tells us to use the new accounting function + * (raid-z deflation). If we have an older pool, this will not + * be present. + */ + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + /* + * Load the persistent error log. If we have an older pool, this will + * not be present. + */ + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, + sizeof (uint64_t), 1, &spa->spa_errlog_last); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, + sizeof (uint64_t), 1, &spa->spa_errlog_scrub); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + /* + * Load the history object. If we have an older pool, this + * will not be present. + */ + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, + sizeof (uint64_t), 1, &spa->spa_history); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + /* + * Load any hot spares for this pool. + */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + if (error == 0) { + ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); + if (load_nvlist(spa, spa->spa_spares_object, + &spa->spa_sparelist) != 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + spa_config_enter(spa, RW_WRITER, FTAG); + spa_load_spares(spa); + spa_config_exit(spa, FTAG); + } + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); + + if (error && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + if (error == 0) { + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZFS_PROP_BOOTFS), + sizeof (uint64_t), 1, &spa->spa_bootfs); + } + + /* + * Load the vdev state for all toplevel vdevs. + */ + vdev_load(rvd); + + /* + * Propagate the leaf DTLs we just loaded all the way up the tree. + */ + spa_config_enter(spa, RW_WRITER, FTAG); + vdev_dtl_reassess(rvd, 0, 0, B_FALSE); + spa_config_exit(spa, FTAG); + + /* + * Check the state of the root vdev. If it can't be opened, it + * indicates one or more toplevel vdevs are faulted. + */ + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { + error = ENXIO; + goto out; + } + + if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { + dmu_tx_t *tx; + int need_update = B_FALSE; + int c; + + /* + * Claim log blocks that haven't been committed yet. + * This must all happen in a single txg. + */ + tx = dmu_tx_create_assigned(spa_get_dsl(spa), + spa_first_txg(spa)); + (void) dmu_objset_find(spa->spa_name, + zil_claim, tx, DS_FIND_CHILDREN); + dmu_tx_commit(tx); + + spa->spa_sync_on = B_TRUE; + txg_sync_start(spa->spa_dsl_pool); + + /* + * Wait for all claims to sync. + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + + /* + * If the config cache is stale, or we have uninitialized + * metaslabs (see spa_vdev_add()), then update the config. + */ + if (config_cache_txg != spa->spa_config_txg || + state == SPA_LOAD_IMPORT) + need_update = B_TRUE; + + for (c = 0; c < rvd->vdev_children; c++) + if (rvd->vdev_child[c]->vdev_ms_array == 0) + need_update = B_TRUE; + + /* + * Update the config cache asychronously in case we're the + * root pool, in which case the config cache isn't writable yet. + */ + if (need_update) + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } + + error = 0; +out: + if (error && error != EBADF) + zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); + spa->spa_load_state = SPA_LOAD_NONE; + spa->spa_ena = 0; + + return (error); +} + +/* + * Pool Open/Import + * + * The import case is identical to an open except that the configuration is sent + * down from userland, instead of grabbed from the configuration cache. For the + * case of an open, the pool configuration will exist in the + * POOL_STATE_UNITIALIZED state. + * + * The stats information (gen/count/ustats) is used to gather vdev statistics at + * the same time open the pool, without having to keep around the spa_t in some + * ambiguous state. + */ +static int +spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) +{ + spa_t *spa; + int error; + int loaded = B_FALSE; + int locked = B_FALSE; + + *spapp = NULL; + + /* + * As disgusting as this is, we need to support recursive calls to this + * function because dsl_dir_open() is called during spa_load(), and ends + * up calling spa_open() again. The real fix is to figure out how to + * avoid dsl_dir_open() calling this in the first place. + */ + if (mutex_owner(&spa_namespace_lock) != curthread) { + mutex_enter(&spa_namespace_lock); + locked = B_TRUE; + } + + if ((spa = spa_lookup(pool)) == NULL) { + if (locked) + mutex_exit(&spa_namespace_lock); + return (ENOENT); + } + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { + + spa_activate(spa); + + error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); + + if (error == EBADF) { + /* + * If vdev_validate() returns failure (indicated by + * EBADF), it indicates that one of the vdevs indicates + * that the pool has been exported or destroyed. If + * this is the case, the config cache is out of sync and + * we should remove the pool from the namespace. + */ + zfs_post_ok(spa, NULL); + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + spa_config_sync(); + if (locked) + mutex_exit(&spa_namespace_lock); + return (ENOENT); + } + + if (error) { + /* + * We can't open the pool, but we still have useful + * information: the state of each vdev after the + * attempted vdev_open(). Return this to the user. + */ + if (config != NULL && spa->spa_root_vdev != NULL) { + spa_config_enter(spa, RW_READER, FTAG); + *config = spa_config_generate(spa, NULL, -1ULL, + B_TRUE); + spa_config_exit(spa, FTAG); + } + spa_unload(spa); + spa_deactivate(spa); + spa->spa_last_open_failed = B_TRUE; + if (locked) + mutex_exit(&spa_namespace_lock); + *spapp = NULL; + return (error); + } else { + zfs_post_ok(spa, NULL); + spa->spa_last_open_failed = B_FALSE; + } + + loaded = B_TRUE; + } + + spa_open_ref(spa, tag); + if (locked) + mutex_exit(&spa_namespace_lock); + + *spapp = spa; + + if (config != NULL) { + spa_config_enter(spa, RW_READER, FTAG); + *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + spa_config_exit(spa, FTAG); + } + + /* + * If we just loaded the pool, resilver anything that's out of date. + */ + if (loaded && (spa_mode & FWRITE)) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + + return (0); +} + +int +spa_open(const char *name, spa_t **spapp, void *tag) +{ + return (spa_open_common(name, spapp, tag, NULL)); +} + +/* + * Lookup the given spa_t, incrementing the inject count in the process, + * preventing it from being exported or destroyed. + */ +spa_t * +spa_inject_addref(char *name) +{ + spa_t *spa; + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(name)) == NULL) { + mutex_exit(&spa_namespace_lock); + return (NULL); + } + spa->spa_inject_ref++; + mutex_exit(&spa_namespace_lock); + + return (spa); +} + +void +spa_inject_delref(spa_t *spa) +{ + mutex_enter(&spa_namespace_lock); + spa->spa_inject_ref--; + mutex_exit(&spa_namespace_lock); +} + +static void +spa_add_spares(spa_t *spa, nvlist_t *config) +{ + nvlist_t **spares; + uint_t i, nspares; + nvlist_t *nvroot; + uint64_t guid; + vdev_stat_t *vs; + uint_t vsc; + uint64_t pool; + + if (spa->spa_nspares == 0) + return; + + VERIFY(nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + if (nspares != 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + + /* + * Go through and find any spares which have since been + * repurposed as an active spare. If this is the case, update + * their status appropriately. + */ + for (i = 0; i < nspares; i++) { + VERIFY(nvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID, &guid) == 0); + if (spa_spare_exists(guid, &pool) && pool != 0ULL) { + VERIFY(nvlist_lookup_uint64_array( + spares[i], ZPOOL_CONFIG_STATS, + (uint64_t **)&vs, &vsc) == 0); + vs->vs_state = VDEV_STATE_CANT_OPEN; + vs->vs_aux = VDEV_AUX_SPARED; + } + } + } +} + +int +spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) +{ + int error; + spa_t *spa; + + *config = NULL; + error = spa_open_common(name, &spa, FTAG, config); + + if (spa && *config != NULL) { + VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, + spa_get_errlog_size(spa)) == 0); + + spa_add_spares(spa, *config); + } + + /* + * We want to get the alternate root even for faulted pools, so we cheat + * and call spa_lookup() directly. + */ + if (altroot) { + if (spa == NULL) { + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(name); + if (spa) + spa_altroot(spa, altroot, buflen); + else + altroot[0] = '\0'; + spa = NULL; + mutex_exit(&spa_namespace_lock); + } else { + spa_altroot(spa, altroot, buflen); + } + } + + if (spa != NULL) + spa_close(spa, FTAG); + + return (error); +} + +/* + * Validate that the 'spares' array is well formed. We must have an array of + * nvlists, each which describes a valid leaf vdev. If this is an import (mode + * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long + * as they are well-formed. + */ +static int +spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) +{ + nvlist_t **spares; + uint_t i, nspares; + vdev_t *vd; + int error; + + /* + * It's acceptable to have no spares specified. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) != 0) + return (0); + + if (nspares == 0) + return (EINVAL); + + /* + * Make sure the pool is formatted with a version that supports hot + * spares. + */ + if (spa_version(spa) < ZFS_VERSION_SPARES) + return (ENOTSUP); + + /* + * Set the pending spare list so we correctly handle device in-use + * checking. + */ + spa->spa_pending_spares = spares; + spa->spa_pending_nspares = nspares; + + for (i = 0; i < nspares; i++) { + if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, + mode)) != 0) + goto out; + + if (!vd->vdev_ops->vdev_op_leaf) { + vdev_free(vd); + error = EINVAL; + goto out; + } + + vd->vdev_top = vd; + + if ((error = vdev_open(vd)) == 0 && + (error = vdev_label_init(vd, crtxg, + VDEV_LABEL_SPARE)) == 0) { + VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + } + + vdev_free(vd); + + if (error && mode != VDEV_ALLOC_SPARE) + goto out; + else + error = 0; + } + +out: + spa->spa_pending_spares = NULL; + spa->spa_pending_nspares = 0; + return (error); +} + +/* + * Pool Creation + */ +int +spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) +{ + spa_t *spa; + vdev_t *rvd; + dsl_pool_t *dp; + dmu_tx_t *tx; + int c, error = 0; + uint64_t txg = TXG_INITIAL; + nvlist_t **spares; + uint_t nspares; + + /* + * If this pool already exists, return failure. + */ + mutex_enter(&spa_namespace_lock); + if (spa_lookup(pool) != NULL) { + mutex_exit(&spa_namespace_lock); + return (EEXIST); + } + + /* + * Allocate a new spa_t structure. + */ + spa = spa_add(pool, altroot); + spa_activate(spa); + + spa->spa_uberblock.ub_txg = txg - 1; + spa->spa_uberblock.ub_version = ZFS_VERSION; + spa->spa_ubsync = spa->spa_uberblock; + + /* + * Create the root vdev. + */ + spa_config_enter(spa, RW_WRITER, FTAG); + + error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); + + ASSERT(error != 0 || rvd != NULL); + ASSERT(error != 0 || spa->spa_root_vdev == rvd); + + if (error == 0 && rvd->vdev_children == 0) + error = EINVAL; + + if (error == 0 && + (error = vdev_create(rvd, txg, B_FALSE)) == 0 && + (error = spa_validate_spares(spa, nvroot, txg, + VDEV_ALLOC_ADD)) == 0) { + for (c = 0; c < rvd->vdev_children; c++) + vdev_init(rvd->vdev_child[c], txg); + vdev_config_dirty(rvd); + } + + spa_config_exit(spa, FTAG); + + if (error != 0) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + /* + * Get the list of spares, if specified. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa_config_enter(spa, RW_WRITER, FTAG); + spa_load_spares(spa); + spa_config_exit(spa, FTAG); + spa->spa_sync_spares = B_TRUE; + } + + spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); + spa->spa_meta_objset = dp->dp_meta_objset; + + tx = dmu_tx_create_assigned(dp, txg); + + /* + * Create the pool config object. + */ + spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_PACKED_NVLIST, 1 << 14, + DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); + + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, + sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { + cmn_err(CE_PANIC, "failed to add pool config"); + } + + /* Newly created pools are always deflated. */ + spa->spa_deflate = TRUE; + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { + cmn_err(CE_PANIC, "failed to add deflate"); + } + + /* + * Create the deferred-free bplist object. Turn off compression + * because sync-to-convergence takes longer if the blocksize + * keeps changing. + */ + spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, + 1 << 14, tx); + dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, + ZIO_COMPRESS_OFF, tx); + + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, + sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { + cmn_err(CE_PANIC, "failed to add bplist"); + } + + /* + * Create the pool's history object. + */ + spa_history_create_obj(spa, tx); + + dmu_tx_commit(tx); + + spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); + spa->spa_sync_on = B_TRUE; + txg_sync_start(spa->spa_dsl_pool); + + /* + * We explicitly wait for the first transaction to complete so that our + * bean counters are appropriately updated. + */ + txg_wait_synced(spa->spa_dsl_pool, txg); + + spa_config_sync(); + + mutex_exit(&spa_namespace_lock); + + return (0); +} + +/* + * Import the given pool into the system. We set up the necessary spa_t and + * then call spa_load() to do the dirty work. + */ +int +spa_import(const char *pool, nvlist_t *config, const char *altroot) +{ + spa_t *spa; + int error; + nvlist_t *nvroot; + nvlist_t **spares; + uint_t nspares; + + if (!(spa_mode & FWRITE)) + return (EROFS); + + /* + * If a pool with this name exists, return failure. + */ + mutex_enter(&spa_namespace_lock); + if (spa_lookup(pool) != NULL) { + mutex_exit(&spa_namespace_lock); + return (EEXIST); + } + + /* + * Create and initialize the spa structure. + */ + spa = spa_add(pool, altroot); + spa_activate(spa); + + /* + * Pass off the heavy lifting to spa_load(). + * Pass TRUE for mosconfig because the user-supplied config + * is actually the one to trust when doing an import. + */ + error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); + + spa_config_enter(spa, RW_WRITER, FTAG); + /* + * Toss any existing sparelist, as it doesn't have any validity anymore, + * and conflicts with spa_has_spare(). + */ + if (spa->spa_sparelist) { + nvlist_free(spa->spa_sparelist); + spa->spa_sparelist = NULL; + spa_load_spares(spa); + } + + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (error == 0) + error = spa_validate_spares(spa, nvroot, -1ULL, + VDEV_ALLOC_SPARE); + spa_config_exit(spa, FTAG); + + if (error != 0) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + /* + * Override any spares as specified by the user, as these may have + * correct device names/devids, etc. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + if (spa->spa_sparelist) + VERIFY(nvlist_remove(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_sparelist, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa_config_enter(spa, RW_WRITER, FTAG); + spa_load_spares(spa); + spa_config_exit(spa, FTAG); + spa->spa_sync_spares = B_TRUE; + } + + /* + * Update the config cache to include the newly-imported pool. + */ + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + + mutex_exit(&spa_namespace_lock); + + /* + * Resilver anything that's out of date. + */ + if (spa_mode & FWRITE) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + + return (0); +} + +/* + * This (illegal) pool name is used when temporarily importing a spa_t in order + * to get the vdev stats associated with the imported devices. + */ +#define TRYIMPORT_NAME "$import" + +nvlist_t * +spa_tryimport(nvlist_t *tryconfig) +{ + nvlist_t *config = NULL; + char *poolname; + spa_t *spa; + uint64_t state; + + if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) + return (NULL); + + if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) + return (NULL); + + /* + * Create and initialize the spa structure. + */ + mutex_enter(&spa_namespace_lock); + spa = spa_add(TRYIMPORT_NAME, NULL); + spa_activate(spa); + + /* + * Pass off the heavy lifting to spa_load(). + * Pass TRUE for mosconfig because the user-supplied config + * is actually the one to trust when doing an import. + */ + (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); + + /* + * If 'tryconfig' was at least parsable, return the current config. + */ + if (spa->spa_root_vdev != NULL) { + spa_config_enter(spa, RW_READER, FTAG); + config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + spa_config_exit(spa, FTAG); + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, + poolname) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + state) == 0); + + /* + * Add the list of hot spares. + */ + spa_add_spares(spa, config); + } + + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + + return (config); +} + +/* + * Pool export/destroy + * + * The act of destroying or exporting a pool is very simple. We make sure there + * is no more pending I/O and any references to the pool are gone. Then, we + * update the pool state and sync all the labels to disk, removing the + * configuration from the cache afterwards. + */ +static int +spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) +{ + spa_t *spa; + + if (oldconfig) + *oldconfig = NULL; + + if (!(spa_mode & FWRITE)) + return (EROFS); + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(pool)) == NULL) { + mutex_exit(&spa_namespace_lock); + return (ENOENT); + } + + /* + * Put a hold on the pool, drop the namespace lock, stop async tasks, + * reacquire the namespace lock, and see if we can export. + */ + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + spa_async_suspend(spa); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + + /* + * The pool will be in core if it's openable, + * in which case we can modify its state. + */ + if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { + /* + * Objsets may be open only because they're dirty, so we + * have to force it to sync before checking spa_refcnt. + */ + spa_scrub_suspend(spa); + txg_wait_synced(spa->spa_dsl_pool, 0); + + /* + * A pool cannot be exported or destroyed if there are active + * references. If we are resetting a pool, allow references by + * fault injection handlers. + */ + if (!spa_refcount_zero(spa) || + (spa->spa_inject_ref != 0 && + new_state != POOL_STATE_UNINITIALIZED)) { + spa_scrub_resume(spa); + spa_async_resume(spa); + mutex_exit(&spa_namespace_lock); + return (EBUSY); + } + + spa_scrub_resume(spa); + VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); + + /* + * We want this to be reflected on every label, + * so mark them all dirty. spa_unload() will do the + * final sync that pushes these changes out. + */ + if (new_state != POOL_STATE_UNINITIALIZED) { + spa_config_enter(spa, RW_WRITER, FTAG); + spa->spa_state = new_state; + spa->spa_final_txg = spa_last_synced_txg(spa) + 1; + vdev_config_dirty(spa->spa_root_vdev); + spa_config_exit(spa, FTAG); + } + } + + if (spa->spa_state != POOL_STATE_UNINITIALIZED) { + spa_unload(spa); + spa_deactivate(spa); + } + + if (oldconfig && spa->spa_config) + VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); + + if (new_state != POOL_STATE_UNINITIALIZED) { + spa_remove(spa); + spa_config_sync(); + } + mutex_exit(&spa_namespace_lock); + + return (0); +} + +/* + * Destroy a storage pool. + */ +int +spa_destroy(char *pool) +{ + return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); +} + +/* + * Export a storage pool. + */ +int +spa_export(char *pool, nvlist_t **oldconfig) +{ + return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); +} + +/* + * Similar to spa_export(), this unloads the spa_t without actually removing it + * from the namespace in any way. + */ +int +spa_reset(char *pool) +{ + return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); +} + + +/* + * ========================================================================== + * Device manipulation + * ========================================================================== + */ + +/* + * Add capacity to a storage pool. + */ +int +spa_vdev_add(spa_t *spa, nvlist_t *nvroot) +{ + uint64_t txg; + int c, error; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd, *tvd; + nvlist_t **spares; + uint_t i, nspares; + + txg = spa_vdev_enter(spa); + + if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, + VDEV_ALLOC_ADD)) != 0) + return (spa_vdev_exit(spa, NULL, txg, error)); + + spa->spa_pending_vdev = vd; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) != 0) + nspares = 0; + + if (vd->vdev_children == 0 && nspares == 0) { + spa->spa_pending_vdev = NULL; + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + } + + if (vd->vdev_children != 0) { + if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { + spa->spa_pending_vdev = NULL; + return (spa_vdev_exit(spa, vd, txg, error)); + } + } + + /* + * We must validate the spares after checking the children. Otherwise, + * vdev_inuse() will blindly overwrite the spare. + */ + if ((error = spa_validate_spares(spa, nvroot, txg, + VDEV_ALLOC_ADD)) != 0) { + spa->spa_pending_vdev = NULL; + return (spa_vdev_exit(spa, vd, txg, error)); + } + + spa->spa_pending_vdev = NULL; + + /* + * Transfer each new top-level vdev from vd to rvd. + */ + for (c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + vdev_remove_child(vd, tvd); + tvd->vdev_id = rvd->vdev_children; + vdev_add_child(rvd, tvd); + vdev_config_dirty(tvd); + } + + if (nspares != 0) { + if (spa->spa_sparelist != NULL) { + nvlist_t **oldspares; + uint_t oldnspares; + nvlist_t **newspares; + + VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); + + newspares = kmem_alloc(sizeof (void *) * + (nspares + oldnspares), KM_SLEEP); + for (i = 0; i < oldnspares; i++) + VERIFY(nvlist_dup(oldspares[i], + &newspares[i], KM_SLEEP) == 0); + for (i = 0; i < nspares; i++) + VERIFY(nvlist_dup(spares[i], + &newspares[i + oldnspares], + KM_SLEEP) == 0); + + VERIFY(nvlist_remove(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); + + VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, newspares, + nspares + oldnspares) == 0); + for (i = 0; i < oldnspares + nspares; i++) + nvlist_free(newspares[i]); + kmem_free(newspares, (oldnspares + nspares) * + sizeof (void *)); + } else { + VERIFY(nvlist_alloc(&spa->spa_sparelist, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + } + + spa_load_spares(spa); + spa->spa_sync_spares = B_TRUE; + } + + /* + * We have to be careful when adding new vdevs to an existing pool. + * If other threads start allocating from these vdevs before we + * sync the config cache, and we lose power, then upon reboot we may + * fail to open the pool because there are DVAs that the config cache + * can't translate. Therefore, we first add the vdevs without + * initializing metaslabs; sync the config cache (via spa_vdev_exit()); + * and then let spa_config_update() initialize the new metaslabs. + * + * spa_load() checks for added-but-not-initialized vdevs, so that + * if we lose power at any point in this sequence, the remaining + * steps will be completed the next time we load the pool. + */ + (void) spa_vdev_exit(spa, vd, txg, 0); + + mutex_enter(&spa_namespace_lock); + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + mutex_exit(&spa_namespace_lock); + + return (0); +} + +/* + * Attach a device to a mirror. The arguments are the path to any device + * in the mirror, and the nvroot for the new device. If the path specifies + * a device that is not mirrored, we automatically insert the mirror vdev. + * + * If 'replacing' is specified, the new device is intended to replace the + * existing device; in this case the two devices are made into their own + * mirror using the 'replacing' vdev, which is functionally idendical to + * the mirror vdev (it actually reuses all the same ops) but has a few + * extra rules: you can't attach to it after it's been created, and upon + * completion of resilvering, the first disk (the one being replaced) + * is automatically detached. + */ +int +spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) +{ + uint64_t txg, open_txg; + int error; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; + vdev_ops_t *pvops; + + txg = spa_vdev_enter(spa); + + oldvd = vdev_lookup_by_guid(rvd, guid); + + if (oldvd == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + if (!oldvd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + pvd = oldvd->vdev_parent; + + if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, + VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) + return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); + + newvd = newrootvd->vdev_child[0]; + + if (!newvd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); + + if ((error = vdev_create(newrootvd, txg, replacing)) != 0) + return (spa_vdev_exit(spa, newrootvd, txg, error)); + + if (!replacing) { + /* + * For attach, the only allowable parent is a mirror or the root + * vdev. + */ + if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_root_ops) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + pvops = &vdev_mirror_ops; + } else { + /* + * Active hot spares can only be replaced by inactive hot + * spares. + */ + if (pvd->vdev_ops == &vdev_spare_ops && + pvd->vdev_child[1] == oldvd && + !spa_has_spare(spa, newvd->vdev_guid)) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + /* + * If the source is a hot spare, and the parent isn't already a + * spare, then we want to create a new hot spare. Otherwise, we + * want to create a replacing vdev. The user is not allowed to + * attach to a spared vdev child unless the 'isspare' state is + * the same (spare replaces spare, non-spare replaces + * non-spare). + */ + if (pvd->vdev_ops == &vdev_replacing_ops) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + else if (pvd->vdev_ops == &vdev_spare_ops && + newvd->vdev_isspare != oldvd->vdev_isspare) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + else if (pvd->vdev_ops != &vdev_spare_ops && + newvd->vdev_isspare) + pvops = &vdev_spare_ops; + else + pvops = &vdev_replacing_ops; + } + + /* + * Compare the new device size with the replaceable/attachable + * device size. + */ + if (newvd->vdev_psize < vdev_get_rsize(oldvd)) + return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); + + /* + * The new device cannot have a higher alignment requirement + * than the top-level vdev. + */ + if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) + return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); + + /* + * If this is an in-place replacement, update oldvd's path and devid + * to make it distinguishable from newvd, and unopenable from now on. + */ + if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { + spa_strfree(oldvd->vdev_path); + oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + KM_SLEEP); + (void) sprintf(oldvd->vdev_path, "%s/%s", + newvd->vdev_path, "old"); + if (oldvd->vdev_devid != NULL) { + spa_strfree(oldvd->vdev_devid); + oldvd->vdev_devid = NULL; + } + } + + /* + * If the parent is not a mirror, or if we're replacing, insert the new + * mirror/replacing/spare vdev above oldvd. + */ + if (pvd->vdev_ops != pvops) + pvd = vdev_add_parent(oldvd, pvops); + + ASSERT(pvd->vdev_top->vdev_parent == rvd); + ASSERT(pvd->vdev_ops == pvops); + ASSERT(oldvd->vdev_parent == pvd); + + /* + * Extract the new device from its root and add it to pvd. + */ + vdev_remove_child(newrootvd, newvd); + newvd->vdev_id = pvd->vdev_children; + vdev_add_child(pvd, newvd); + + /* + * If newvd is smaller than oldvd, but larger than its rsize, + * the addition of newvd may have decreased our parent's asize. + */ + pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); + + tvd = newvd->vdev_top; + ASSERT(pvd->vdev_top == tvd); + ASSERT(tvd->vdev_parent == rvd); + + vdev_config_dirty(tvd); + + /* + * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate + * upward when spa_vdev_exit() calls vdev_dtl_reassess(). + */ + open_txg = txg + TXG_CONCURRENT_STATES - 1; + + mutex_enter(&newvd->vdev_dtl_lock); + space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, + open_txg - TXG_INITIAL + 1); + mutex_exit(&newvd->vdev_dtl_lock); + + if (newvd->vdev_isspare) + spa_spare_activate(newvd); + + /* + * Mark newvd's DTL dirty in this txg. + */ + vdev_dirty(tvd, VDD_DTL, newvd, txg); + + (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); + + /* + * Kick off a resilver to update newvd. + */ + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + + return (0); +} + +/* + * Detach a device from a mirror or replacing vdev. + * If 'replace_done' is specified, only detach if the parent + * is a replacing vdev. + */ +int +spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) +{ + uint64_t txg; + int c, t, error; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd, *pvd, *cvd, *tvd; + boolean_t unspare = B_FALSE; + uint64_t unspare_guid; + + txg = spa_vdev_enter(spa); + + vd = vdev_lookup_by_guid(rvd, guid); + + if (vd == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + pvd = vd->vdev_parent; + + /* + * If replace_done is specified, only remove this device if it's + * the first child of a replacing vdev. For the 'spare' vdev, either + * disk can be removed. + */ + if (replace_done) { + if (pvd->vdev_ops == &vdev_replacing_ops) { + if (vd->vdev_id != 0) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } else if (pvd->vdev_ops != &vdev_spare_ops) { + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } + } + + ASSERT(pvd->vdev_ops != &vdev_spare_ops || + spa_version(spa) >= ZFS_VERSION_SPARES); + + /* + * Only mirror, replacing, and spare vdevs support detach. + */ + if (pvd->vdev_ops != &vdev_replacing_ops && + pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_spare_ops) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + /* + * If there's only one replica, you can't detach it. + */ + if (pvd->vdev_children <= 1) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + + /* + * If all siblings have non-empty DTLs, this device may have the only + * valid copy of the data, which means we cannot safely detach it. + * + * XXX -- as in the vdev_offline() case, we really want a more + * precise DTL check. + */ + for (c = 0; c < pvd->vdev_children; c++) { + uint64_t dirty; + + cvd = pvd->vdev_child[c]; + if (cvd == vd) + continue; + if (vdev_is_dead(cvd)) + continue; + mutex_enter(&cvd->vdev_dtl_lock); + dirty = cvd->vdev_dtl_map.sm_space | + cvd->vdev_dtl_scrub.sm_space; + mutex_exit(&cvd->vdev_dtl_lock); + if (!dirty) + break; + } + + /* + * If we are a replacing or spare vdev, then we can always detach the + * latter child, as that is how one cancels the operation. + */ + if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && + c == pvd->vdev_children) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + + /* + * If we are detaching the original disk from a spare, then it implies + * that the spare should become a real disk, and be removed from the + * active spare list for the pool. + */ + if (pvd->vdev_ops == &vdev_spare_ops && + vd->vdev_id == 0) + unspare = B_TRUE; + + /* + * Erase the disk labels so the disk can be used for other things. + * This must be done after all other error cases are handled, + * but before we disembowel vd (so we can still do I/O to it). + * But if we can't do it, don't treat the error as fatal -- + * it may be that the unwritability of the disk is the reason + * it's being detached! + */ + error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + + /* + * Remove vd from its parent and compact the parent's children. + */ + vdev_remove_child(pvd, vd); + vdev_compact_children(pvd); + + /* + * Remember one of the remaining children so we can get tvd below. + */ + cvd = pvd->vdev_child[0]; + + /* + * If we need to remove the remaining child from the list of hot spares, + * do it now, marking the vdev as no longer a spare in the process. We + * must do this before vdev_remove_parent(), because that can change the + * GUID if it creates a new toplevel GUID. + */ + if (unspare) { + ASSERT(cvd->vdev_isspare); + spa_spare_remove(cvd); + unspare_guid = cvd->vdev_guid; + } + + /* + * If the parent mirror/replacing vdev only has one child, + * the parent is no longer needed. Remove it from the tree. + */ + if (pvd->vdev_children == 1) + vdev_remove_parent(cvd); + + /* + * We don't set tvd until now because the parent we just removed + * may have been the previous top-level vdev. + */ + tvd = cvd->vdev_top; + ASSERT(tvd->vdev_parent == rvd); + + /* + * Reevaluate the parent vdev state. + */ + vdev_propagate_state(cvd->vdev_parent); + + /* + * If the device we just detached was smaller than the others, it may be + * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() + * can't fail because the existing metaslabs are already in core, so + * there's nothing to read from disk. + */ + VERIFY(vdev_metaslab_init(tvd, txg) == 0); + + vdev_config_dirty(tvd); + + /* + * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that + * vd->vdev_detached is set and free vd's DTL object in syncing context. + * But first make sure we're not on any *other* txg's DTL list, to + * prevent vd from being accessed after it's freed. + */ + for (t = 0; t < TXG_SIZE; t++) + (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); + vd->vdev_detached = B_TRUE; + vdev_dirty(tvd, VDD_DTL, vd, txg); + + error = spa_vdev_exit(spa, vd, txg, 0); + + /* + * If this was the removal of the original device in a hot spare vdev, + * then we want to go through and remove the device from the hot spare + * list of every other pool. + */ + if (unspare) { + spa = NULL; + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (spa->spa_state != POOL_STATE_ACTIVE) + continue; + + (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + } + mutex_exit(&spa_namespace_lock); + } + + return (error); +} + +/* + * Remove a device from the pool. Currently, this supports removing only hot + * spares. + */ +int +spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) +{ + vdev_t *vd; + nvlist_t **spares, *nv, **newspares; + uint_t i, j, nspares; + int ret = 0; + + spa_config_enter(spa, RW_WRITER, FTAG); + + vd = spa_lookup_by_guid(spa, guid); + + nv = NULL; + if (spa->spa_spares != NULL && + nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + for (i = 0; i < nspares; i++) { + uint64_t theguid; + + VERIFY(nvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID, &theguid) == 0); + if (theguid == guid) { + nv = spares[i]; + break; + } + } + } + + /* + * We only support removing a hot spare, and only if it's not currently + * in use in this pool. + */ + if (nv == NULL && vd == NULL) { + ret = ENOENT; + goto out; + } + + if (nv == NULL && vd != NULL) { + ret = ENOTSUP; + goto out; + } + + if (!unspare && nv != NULL && vd != NULL) { + ret = EBUSY; + goto out; + } + + if (nspares == 1) { + newspares = NULL; + } else { + newspares = kmem_alloc((nspares - 1) * sizeof (void *), + KM_SLEEP); + for (i = 0, j = 0; i < nspares; i++) { + if (spares[i] != nv) + VERIFY(nvlist_dup(spares[i], + &newspares[j++], KM_SLEEP) == 0); + } + } + + VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, + DATA_TYPE_NVLIST_ARRAY) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, + newspares, nspares - 1) == 0); + for (i = 0; i < nspares - 1; i++) + nvlist_free(newspares[i]); + kmem_free(newspares, (nspares - 1) * sizeof (void *)); + spa_load_spares(spa); + spa->spa_sync_spares = B_TRUE; + +out: + spa_config_exit(spa, FTAG); + + return (ret); +} + +/* + * Find any device that's done replacing, so we can detach it. + */ +static vdev_t * +spa_vdev_replace_done_hunt(vdev_t *vd) +{ + vdev_t *newvd, *oldvd; + int c; + + for (c = 0; c < vd->vdev_children; c++) { + oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); + if (oldvd != NULL) + return (oldvd); + } + + if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { + oldvd = vd->vdev_child[0]; + newvd = vd->vdev_child[1]; + + mutex_enter(&newvd->vdev_dtl_lock); + if (newvd->vdev_dtl_map.sm_space == 0 && + newvd->vdev_dtl_scrub.sm_space == 0) { + mutex_exit(&newvd->vdev_dtl_lock); + return (oldvd); + } + mutex_exit(&newvd->vdev_dtl_lock); + } + + return (NULL); +} + +static void +spa_vdev_replace_done(spa_t *spa) +{ + vdev_t *vd; + vdev_t *pvd; + uint64_t guid; + uint64_t pguid = 0; + + spa_config_enter(spa, RW_READER, FTAG); + + while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { + guid = vd->vdev_guid; + /* + * If we have just finished replacing a hot spared device, then + * we need to detach the parent's first child (the original hot + * spare) as well. + */ + pvd = vd->vdev_parent; + if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && + pvd->vdev_id == 0) { + ASSERT(pvd->vdev_ops == &vdev_replacing_ops); + ASSERT(pvd->vdev_parent->vdev_children == 2); + pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; + } + spa_config_exit(spa, FTAG); + if (spa_vdev_detach(spa, guid, B_TRUE) != 0) + return; + if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) + return; + spa_config_enter(spa, RW_READER, FTAG); + } + + spa_config_exit(spa, FTAG); +} + +/* + * Update the stored path for this vdev. Dirty the vdev configuration, relying + * on spa_vdev_enter/exit() to synchronize the labels and cache. + */ +int +spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) +{ + vdev_t *rvd, *vd; + uint64_t txg; + + rvd = spa->spa_root_vdev; + + txg = spa_vdev_enter(spa); + + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { + /* + * Determine if this is a reference to a hot spare. In that + * case, update the path as stored in the spare list. + */ + nvlist_t **spares; + uint_t i, nspares; + if (spa->spa_sparelist != NULL) { + VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + for (i = 0; i < nspares; i++) { + uint64_t theguid; + VERIFY(nvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID, &theguid) == 0); + if (theguid == guid) + break; + } + + if (i == nspares) + return (spa_vdev_exit(spa, NULL, txg, ENOENT)); + + VERIFY(nvlist_add_string(spares[i], + ZPOOL_CONFIG_PATH, newpath) == 0); + spa_load_spares(spa); + spa->spa_sync_spares = B_TRUE; + return (spa_vdev_exit(spa, NULL, txg, 0)); + } else { + return (spa_vdev_exit(spa, NULL, txg, ENOENT)); + } + } + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + spa_strfree(vd->vdev_path); + vd->vdev_path = spa_strdup(newpath); + + vdev_config_dirty(vd->vdev_top); + + return (spa_vdev_exit(spa, NULL, txg, 0)); +} + +/* + * ========================================================================== + * SPA Scrubbing + * ========================================================================== + */ + +static void +spa_scrub_io_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; + spa->spa_scrub_errors++; + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_scrub_errors++; + mutex_exit(&vd->vdev_stat_lock); + } + + if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) + cv_broadcast(&spa->spa_scrub_io_cv); + + ASSERT(spa->spa_scrub_inflight >= 0); + + mutex_exit(&spa->spa_scrub_lock); +} + +static void +spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, + zbookmark_t *zb) +{ + size_t size = BP_GET_LSIZE(bp); + void *data; + + mutex_enter(&spa->spa_scrub_lock); + /* + * Do not give too much work to vdev(s). + */ + while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + } + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); + + data = zio_data_buf_alloc(size); + + if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) + flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ + + flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; + + zio_nowait(zio_read(NULL, spa, bp, data, size, + spa_scrub_io_done, NULL, priority, flags, zb)); +} + +/* ARGSUSED */ +static int +spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) +{ + blkptr_t *bp = &bc->bc_blkptr; + vdev_t *vd = spa->spa_root_vdev; + dva_t *dva = bp->blk_dva; + int needs_resilver = B_FALSE; + int d; + + if (bc->bc_errno) { + /* + * We can't scrub this block, but we can continue to scrub + * the rest of the pool. Note the error and move along. + */ + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_errors++; + mutex_exit(&spa->spa_scrub_lock); + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_scrub_errors++; + mutex_exit(&vd->vdev_stat_lock); + + return (ERESTART); + } + + ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); + + for (d = 0; d < BP_GET_NDVAS(bp); d++) { + vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); + + ASSERT(vd != NULL); + + /* + * Keep track of how much data we've examined so that + * zpool(1M) status can make useful progress reports. + */ + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); + mutex_exit(&vd->vdev_stat_lock); + + if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { + if (DVA_GET_GANG(&dva[d])) { + /* + * Gang members may be spread across multiple + * vdevs, so the best we can do is look at the + * pool-wide DTL. + * XXX -- it would be better to change our + * allocation policy to ensure that this can't + * happen. + */ + vd = spa->spa_root_vdev; + } + if (vdev_dtl_contains(&vd->vdev_dtl_map, + bp->blk_birth, 1)) + needs_resilver = B_TRUE; + } + } + + if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) + spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, + ZIO_FLAG_SCRUB, &bc->bc_bookmark); + else if (needs_resilver) + spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, + ZIO_FLAG_RESILVER, &bc->bc_bookmark); + + return (0); +} + +static void +spa_scrub_thread(void *arg) +{ + spa_t *spa = arg; + callb_cpr_t cprinfo; + traverse_handle_t *th = spa->spa_scrub_th; + vdev_t *rvd = spa->spa_root_vdev; + pool_scrub_type_t scrub_type = spa->spa_scrub_type; + int error = 0; + boolean_t complete; + + CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); + + /* + * If we're restarting due to a snapshot create/delete, + * wait for that to complete. + */ + txg_wait_synced(spa_get_dsl(spa), 0); + + dprintf("start %s mintxg=%llu maxtxg=%llu\n", + scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", + spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); + + spa_config_enter(spa, RW_WRITER, FTAG); + vdev_reopen(rvd); /* purge all vdev caches */ + vdev_config_dirty(rvd); /* rewrite all disk labels */ + vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); + spa_config_exit(spa, FTAG); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_errors = 0; + spa->spa_scrub_active = 1; + ASSERT(spa->spa_scrub_inflight == 0); + + while (!spa->spa_scrub_stop) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + while (spa->spa_scrub_suspended) { + spa->spa_scrub_active = 0; + cv_broadcast(&spa->spa_scrub_cv); + cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); + spa->spa_scrub_active = 1; + } + CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); + + if (spa->spa_scrub_restart_txg != 0) + break; + + mutex_exit(&spa->spa_scrub_lock); + error = traverse_more(th); + mutex_enter(&spa->spa_scrub_lock); + if (error != EAGAIN) + break; + } + + while (spa->spa_scrub_inflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + + spa->spa_scrub_active = 0; + cv_broadcast(&spa->spa_scrub_cv); + + mutex_exit(&spa->spa_scrub_lock); + + spa_config_enter(spa, RW_WRITER, FTAG); + + mutex_enter(&spa->spa_scrub_lock); + + /* + * Note: we check spa_scrub_restart_txg under both spa_scrub_lock + * AND the spa config lock to synchronize with any config changes + * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). + */ + if (spa->spa_scrub_restart_txg != 0) + error = ERESTART; + + if (spa->spa_scrub_stop) + error = EINTR; + + /* + * Even if there were uncorrectable errors, we consider the scrub + * completed. The downside is that if there is a transient error during + * a resilver, we won't resilver the data properly to the target. But + * if the damage is permanent (more likely) we will resilver forever, + * which isn't really acceptable. Since there is enough information for + * the user to know what has failed and why, this seems like a more + * tractable approach. + */ + complete = (error == 0); + + dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", + scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", + spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", + error, spa->spa_scrub_errors, spa->spa_scrub_stop); + + mutex_exit(&spa->spa_scrub_lock); + + /* + * If the scrub/resilver completed, update all DTLs to reflect this. + * Whether it succeeded or not, vacate all temporary scrub DTLs. + */ + vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, + complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); + vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); + spa_errlog_rotate(spa); + + spa_config_exit(spa, FTAG); + + mutex_enter(&spa->spa_scrub_lock); + + /* + * We may have finished replacing a device. + * Let the async thread assess this and handle the detach. + */ + spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); + + /* + * If we were told to restart, our final act is to start a new scrub. + */ + if (error == ERESTART) + spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? + SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); + + spa->spa_scrub_type = POOL_SCRUB_NONE; + spa->spa_scrub_active = 0; + spa->spa_scrub_thread = NULL; + cv_broadcast(&spa->spa_scrub_cv); + CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ + thread_exit(); +} + +void +spa_scrub_suspend(spa_t *spa) +{ + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_suspended++; + while (spa->spa_scrub_active) { + cv_broadcast(&spa->spa_scrub_cv); + cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); + } + while (spa->spa_scrub_inflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + mutex_exit(&spa->spa_scrub_lock); +} + +void +spa_scrub_resume(spa_t *spa) +{ + mutex_enter(&spa->spa_scrub_lock); + ASSERT(spa->spa_scrub_suspended != 0); + if (--spa->spa_scrub_suspended == 0) + cv_broadcast(&spa->spa_scrub_cv); + mutex_exit(&spa->spa_scrub_lock); +} + +void +spa_scrub_restart(spa_t *spa, uint64_t txg) +{ + /* + * Something happened (e.g. snapshot create/delete) that means + * we must restart any in-progress scrubs. The itinerary will + * fix this properly. + */ + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_restart_txg = txg; + mutex_exit(&spa->spa_scrub_lock); +} + +int +spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) +{ + space_seg_t *ss; + uint64_t mintxg, maxtxg; + vdev_t *rvd = spa->spa_root_vdev; + + if ((uint_t)type >= POOL_SCRUB_TYPES) + return (ENOTSUP); + + mutex_enter(&spa->spa_scrub_lock); + + /* + * If there's a scrub or resilver already in progress, stop it. + */ + while (spa->spa_scrub_thread != NULL) { + /* + * Don't stop a resilver unless forced. + */ + if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { + mutex_exit(&spa->spa_scrub_lock); + return (EBUSY); + } + spa->spa_scrub_stop = 1; + cv_broadcast(&spa->spa_scrub_cv); + cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); + } + + /* + * Terminate the previous traverse. + */ + if (spa->spa_scrub_th != NULL) { + traverse_fini(spa->spa_scrub_th); + spa->spa_scrub_th = NULL; + } + + if (rvd == NULL) { + ASSERT(spa->spa_scrub_stop == 0); + ASSERT(spa->spa_scrub_type == type); + ASSERT(spa->spa_scrub_restart_txg == 0); + mutex_exit(&spa->spa_scrub_lock); + return (0); + } + + mintxg = TXG_INITIAL - 1; + maxtxg = spa_last_synced_txg(spa) + 1; + + mutex_enter(&rvd->vdev_dtl_lock); + + if (rvd->vdev_dtl_map.sm_space == 0) { + /* + * The pool-wide DTL is empty. + * If this is a resilver, there's nothing to do except + * check whether any in-progress replacements have completed. + */ + if (type == POOL_SCRUB_RESILVER) { + type = POOL_SCRUB_NONE; + spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); + } + } else { + /* + * The pool-wide DTL is non-empty. + * If this is a normal scrub, upgrade to a resilver instead. + */ + if (type == POOL_SCRUB_EVERYTHING) + type = POOL_SCRUB_RESILVER; + } + + if (type == POOL_SCRUB_RESILVER) { + /* + * Determine the resilvering boundaries. + * + * Note: (mintxg, maxtxg) is an open interval, + * i.e. mintxg and maxtxg themselves are not included. + * + * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 + * so we don't claim to resilver a txg that's still changing. + */ + ss = avl_first(&rvd->vdev_dtl_map.sm_root); + mintxg = ss->ss_start - 1; + ss = avl_last(&rvd->vdev_dtl_map.sm_root); + maxtxg = MIN(ss->ss_end, maxtxg); + } + + mutex_exit(&rvd->vdev_dtl_lock); + + spa->spa_scrub_stop = 0; + spa->spa_scrub_type = type; + spa->spa_scrub_restart_txg = 0; + + if (type != POOL_SCRUB_NONE) { + spa->spa_scrub_mintxg = mintxg; + spa->spa_scrub_maxtxg = maxtxg; + spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, + ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, + ZIO_FLAG_CANFAIL); + traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); + spa->spa_scrub_thread = thread_create(NULL, 0, + spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); + } + + mutex_exit(&spa->spa_scrub_lock); + + return (0); +} + +/* + * ========================================================================== + * SPA async task processing + * ========================================================================== + */ + +static void +spa_async_reopen(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *tvd; + int c; + + spa_config_enter(spa, RW_WRITER, FTAG); + + for (c = 0; c < rvd->vdev_children; c++) { + tvd = rvd->vdev_child[c]; + if (tvd->vdev_reopen_wanted) { + tvd->vdev_reopen_wanted = 0; + vdev_reopen(tvd); + } + } + + spa_config_exit(spa, FTAG); +} + +static void +spa_async_thread(void *arg) +{ + spa_t *spa = arg; + int tasks; + + ASSERT(spa->spa_sync_on); + + mutex_enter(&spa->spa_async_lock); + tasks = spa->spa_async_tasks; + spa->spa_async_tasks = 0; + mutex_exit(&spa->spa_async_lock); + + /* + * See if the config needs to be updated. + */ + if (tasks & SPA_ASYNC_CONFIG_UPDATE) { + mutex_enter(&spa_namespace_lock); + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + mutex_exit(&spa_namespace_lock); + } + + /* + * See if any devices need to be reopened. + */ + if (tasks & SPA_ASYNC_REOPEN) + spa_async_reopen(spa); + + /* + * If any devices are done replacing, detach them. + */ + if (tasks & SPA_ASYNC_REPLACE_DONE) + spa_vdev_replace_done(spa); + + /* + * Kick off a scrub. + */ + if (tasks & SPA_ASYNC_SCRUB) + VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); + + /* + * Kick off a resilver. + */ + if (tasks & SPA_ASYNC_RESILVER) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + + /* + * Let the world know that we're done. + */ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_thread = NULL; + cv_broadcast(&spa->spa_async_cv); + mutex_exit(&spa->spa_async_lock); + thread_exit(); +} + +void +spa_async_suspend(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_suspended++; + while (spa->spa_async_thread != NULL) + cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); + mutex_exit(&spa->spa_async_lock); +} + +void +spa_async_resume(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + ASSERT(spa->spa_async_suspended != 0); + spa->spa_async_suspended--; + mutex_exit(&spa->spa_async_lock); +} + +static void +spa_async_dispatch(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + if (spa->spa_async_tasks && !spa->spa_async_suspended && + spa->spa_async_thread == NULL && + rootdir != NULL && !vn_is_readonly(rootdir)) + spa->spa_async_thread = thread_create(NULL, 0, + spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); + mutex_exit(&spa->spa_async_lock); +} + +void +spa_async_request(spa_t *spa, int task) +{ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_tasks |= task; + mutex_exit(&spa->spa_async_lock); +} + +/* + * ========================================================================== + * SPA syncing routines + * ========================================================================== + */ + +static void +spa_sync_deferred_frees(spa_t *spa, uint64_t txg) +{ + bplist_t *bpl = &spa->spa_sync_bplist; + dmu_tx_t *tx; + blkptr_t blk; + uint64_t itor = 0; + zio_t *zio; + int error; + uint8_t c = 1; + + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); + + while (bplist_iterate(bpl, &itor, &blk) == 0) + zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); + + error = zio_wait(zio); + ASSERT3U(error, ==, 0); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + bplist_vacate(bpl, tx); + + /* + * Pre-dirty the first block so we sync to convergence faster. + * (Usually only the first block is needed.) + */ + dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); + dmu_tx_commit(tx); +} + +static void +spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) +{ + char *packed = NULL; + size_t nvsize = 0; + dmu_buf_t *db; + + VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); + + packed = kmem_alloc(nvsize, KM_SLEEP); + + VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, + KM_SLEEP) == 0); + + dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); + + kmem_free(packed, nvsize); + + VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = nvsize; + dmu_buf_rele(db, FTAG); +} + +static void +spa_sync_spares(spa_t *spa, dmu_tx_t *tx) +{ + nvlist_t *nvroot; + nvlist_t **spares; + int i; + + if (!spa->spa_sync_spares) + return; + + /* + * Update the MOS nvlist describing the list of available spares. + * spa_validate_spares() will have already made sure this nvlist is + * valid and the vdevs are labelled appropriately. + */ + if (spa->spa_spares_object == 0) { + spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_PACKED_NVLIST, 1 << 14, + DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); + VERIFY(zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, + sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); + } + + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if (spa->spa_nspares == 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + NULL, 0) == 0); + } else { + spares = kmem_alloc(spa->spa_nspares * sizeof (void *), + KM_SLEEP); + for (i = 0; i < spa->spa_nspares; i++) + spares[i] = vdev_config_generate(spa, + spa->spa_spares[i], B_FALSE, B_TRUE); + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + spares, spa->spa_nspares) == 0); + for (i = 0; i < spa->spa_nspares; i++) + nvlist_free(spares[i]); + kmem_free(spares, spa->spa_nspares * sizeof (void *)); + } + + spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); + nvlist_free(nvroot); + + spa->spa_sync_spares = B_FALSE; +} + +static void +spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) +{ + nvlist_t *config; + + if (list_is_empty(&spa->spa_dirty_list)) + return; + + config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); + + if (spa->spa_config_syncing) + nvlist_free(spa->spa_config_syncing); + spa->spa_config_syncing = config; + + spa_sync_nvlist(spa, spa->spa_config_object, config, tx); +} + +static void +spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + nvlist_t *nvp = arg2; + nvpair_t *nvpair; + objset_t *mos = spa->spa_meta_objset; + uint64_t zapobj; + + mutex_enter(&spa->spa_props_lock); + if (spa->spa_pool_props_object == 0) { + zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); + VERIFY(zapobj > 0); + + spa->spa_pool_props_object = zapobj; + + VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_PROPS, 8, 1, + &spa->spa_pool_props_object, tx) == 0); + } + mutex_exit(&spa->spa_props_lock); + + nvpair = NULL; + while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { + switch (zpool_name_to_prop(nvpair_name(nvpair))) { + case ZFS_PROP_BOOTFS: + VERIFY(nvlist_lookup_uint64(nvp, + nvpair_name(nvpair), &spa->spa_bootfs) == 0); + VERIFY(zap_update(mos, + spa->spa_pool_props_object, + zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, + &spa->spa_bootfs, tx) == 0); + break; + } + } +} + +/* + * Sync the specified transaction group. New blocks may be dirtied as + * part of the process, so we iterate until it converges. + */ +void +spa_sync(spa_t *spa, uint64_t txg) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + objset_t *mos = spa->spa_meta_objset; + bplist_t *bpl = &spa->spa_sync_bplist; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd; + dmu_tx_t *tx; + int dirty_vdevs; + + /* + * Lock out configuration changes. + */ + spa_config_enter(spa, RW_READER, FTAG); + + spa->spa_syncing_txg = txg; + spa->spa_sync_pass = 0; + + VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); + + tx = dmu_tx_create_assigned(dp, txg); + + /* + * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, + * set spa_deflate if we have no raid-z vdevs. + */ + if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && + spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { + int i; + + for (i = 0; i < rvd->vdev_children; i++) { + vd = rvd->vdev_child[i]; + if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) + break; + } + if (i == rvd->vdev_children) { + spa->spa_deflate = TRUE; + VERIFY(0 == zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx)); + } + } + + /* + * If anything has changed in this txg, push the deferred frees + * from the previous txg. If not, leave them alone so that we + * don't generate work on an otherwise idle system. + */ + if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || + !txg_list_empty(&dp->dp_dirty_dirs, txg) || + !txg_list_empty(&dp->dp_sync_tasks, txg)) + spa_sync_deferred_frees(spa, txg); + + /* + * Iterate to convergence. + */ + do { + spa->spa_sync_pass++; + + spa_sync_config_object(spa, tx); + spa_sync_spares(spa, tx); + spa_errlog_sync(spa, txg); + dsl_pool_sync(dp, txg); + + dirty_vdevs = 0; + while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { + vdev_sync(vd, txg); + dirty_vdevs++; + } + + bplist_sync(bpl, tx); + } while (dirty_vdevs); + + bplist_close(bpl); + + dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); + + /* + * Rewrite the vdev configuration (which includes the uberblock) + * to commit the transaction group. + * + * If there are any dirty vdevs, sync the uberblock to all vdevs. + * Otherwise, pick a random top-level vdev that's known to be + * visible in the config cache (see spa_vdev_add() for details). + * If the write fails, try the next vdev until we're tried them all. + */ + if (!list_is_empty(&spa->spa_dirty_list)) { + VERIFY(vdev_config_sync(rvd, txg) == 0); + } else { + int children = rvd->vdev_children; + int c0 = spa_get_random(children); + int c; + + for (c = 0; c < children; c++) { + vd = rvd->vdev_child[(c0 + c) % children]; + if (vd->vdev_ms_array == 0) + continue; + if (vdev_config_sync(vd, txg) == 0) + break; + } + if (c == children) + VERIFY(vdev_config_sync(rvd, txg) == 0); + } + + dmu_tx_commit(tx); + + /* + * Clear the dirty config list. + */ + while ((vd = list_head(&spa->spa_dirty_list)) != NULL) + vdev_config_clean(vd); + + /* + * Now that the new config has synced transactionally, + * let it become visible to the config cache. + */ + if (spa->spa_config_syncing != NULL) { + spa_config_set(spa, spa->spa_config_syncing); + spa->spa_config_txg = txg; + spa->spa_config_syncing = NULL; + } + + /* + * Make a stable copy of the fully synced uberblock. + * We use this as the root for pool traversals. + */ + spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ + + spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ + + rw_enter(&spa->spa_traverse_lock, RW_WRITER); + spa->spa_traverse_wanted = 0; + spa->spa_ubsync = spa->spa_uberblock; + rw_exit(&spa->spa_traverse_lock); + + spa_scrub_resume(spa); /* resume scrub with new ubsync */ + + /* + * Clean up the ZIL records for the synced txg. + */ + dsl_pool_zil_clean(dp); + + /* + * Update usable space statistics. + */ + while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) + vdev_sync_done(vd, txg); + + /* + * It had better be the case that we didn't dirty anything + * since vdev_config_sync(). + */ + ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); + ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); + ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); + ASSERT(bpl->bpl_queue == NULL); + + spa_config_exit(spa, FTAG); + + /* + * If any async tasks have been requested, kick them off. + */ + spa_async_dispatch(spa); +} + +/* + * Sync all pools. We don't want to hold the namespace lock across these + * operations, so we take a reference on the spa_t and drop the lock during the + * sync. + */ +void +spa_sync_allpools(void) +{ + spa_t *spa = NULL; + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (spa_state(spa) != POOL_STATE_ACTIVE) + continue; + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + txg_wait_synced(spa_get_dsl(spa), 0); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + } + mutex_exit(&spa_namespace_lock); +} + +/* + * ========================================================================== + * Miscellaneous routines + * ========================================================================== + */ + +/* + * Remove all pools in the system. + */ +void +spa_evict_all(void) +{ + spa_t *spa; + + /* + * Remove all cached state. All pools should be closed now, + * so every spa in the AVL tree should be unreferenced. + */ + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(NULL)) != NULL) { + /* + * Stop async tasks. The async thread may need to detach + * a device that's been replaced, which requires grabbing + * spa_namespace_lock, so we must drop it here. + */ + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + spa_async_suspend(spa); + VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + + if (spa->spa_state != POOL_STATE_UNINITIALIZED) { + spa_unload(spa); + spa_deactivate(spa); + } + spa_remove(spa); + } + mutex_exit(&spa_namespace_lock); +} + +vdev_t * +spa_lookup_by_guid(spa_t *spa, uint64_t guid) +{ + return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); +} + +void +spa_upgrade(spa_t *spa) +{ + spa_config_enter(spa, RW_WRITER, FTAG); + + /* + * This should only be called for a non-faulted pool, and since a + * future version would result in an unopenable pool, this shouldn't be + * possible. + */ + ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); + + spa->spa_uberblock.ub_version = ZFS_VERSION; + vdev_config_dirty(spa->spa_root_vdev); + + spa_config_exit(spa, FTAG); + + txg_wait_synced(spa_get_dsl(spa), 0); +} + +boolean_t +spa_has_spare(spa_t *spa, uint64_t guid) +{ + int i; + uint64_t spareguid; + + for (i = 0; i < spa->spa_nspares; i++) + if (spa->spa_spares[i]->vdev_guid == guid) + return (B_TRUE); + + for (i = 0; i < spa->spa_pending_nspares; i++) { + if (nvlist_lookup_uint64(spa->spa_pending_spares[i], + ZPOOL_CONFIG_GUID, &spareguid) == 0 && + spareguid == guid) + return (B_TRUE); + } + + return (B_FALSE); +} + +int +spa_set_props(spa_t *spa, nvlist_t *nvp) +{ + return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, + spa, nvp, 3)); +} + +int +spa_get_props(spa_t *spa, nvlist_t **nvp) +{ + zap_cursor_t zc; + zap_attribute_t za; + objset_t *mos = spa->spa_meta_objset; + zfs_source_t src; + zfs_prop_t prop; + nvlist_t *propval; + uint64_t value; + int err; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + mutex_enter(&spa->spa_props_lock); + /* If no props object, then just return empty nvlist */ + if (spa->spa_pool_props_object == 0) { + mutex_exit(&spa->spa_props_lock); + return (0); + } + + for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + + if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) + continue; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + switch (za.za_integer_length) { + case 8: + if (zfs_prop_default_numeric(prop) == + za.za_first_integer) + src = ZFS_SRC_DEFAULT; + else + src = ZFS_SRC_LOCAL; + value = za.za_first_integer; + + if (prop == ZFS_PROP_BOOTFS) { + dsl_pool_t *dp; + dsl_dataset_t *ds = NULL; + char strval[MAXPATHLEN]; + + dp = spa_get_dsl(spa); + rw_enter(&dp->dp_config_rwlock, RW_READER); + if ((err = dsl_dataset_open_obj(dp, + za.za_first_integer, NULL, DS_MODE_NONE, + FTAG, &ds)) != 0) { + rw_exit(&dp->dp_config_rwlock); + break; + } + dsl_dataset_name(ds, strval); + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + rw_exit(&dp->dp_config_rwlock); + + VERIFY(nvlist_add_uint64(propval, + ZFS_PROP_SOURCE, src) == 0); + VERIFY(nvlist_add_string(propval, + ZFS_PROP_VALUE, strval) == 0); + } else { + VERIFY(nvlist_add_uint64(propval, + ZFS_PROP_SOURCE, src) == 0); + VERIFY(nvlist_add_uint64(propval, + ZFS_PROP_VALUE, value) == 0); + } + VERIFY(nvlist_add_nvlist(*nvp, za.za_name, + propval) == 0); + break; + } + nvlist_free(propval); + } + zap_cursor_fini(&zc); + mutex_exit(&spa->spa_props_lock); + if (err && err != ENOENT) { + nvlist_free(*nvp); + return (err); + } + + return (0); +} + +/* + * If the bootfs property value is dsobj, clear it. + */ +void +spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) +{ + if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { + VERIFY(zap_remove(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); + spa->spa_bootfs = 0; + } +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c new file mode 100644 index 0000000..b5d8c38 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c @@ -0,0 +1,361 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/nvpair.h> +#include <sys/uio.h> +#include <sys/fs/zfs.h> +#include <sys/vdev_impl.h> +#include <sys/zfs_ioctl.h> +#ifdef _KERNEL +#include <sys/kobj.h> +#endif + +/* + * Pool configuration repository. + * + * The configuration for all pools, in addition to being stored on disk, is + * stored in /etc/zfs/zpool.cache as a packed nvlist. The kernel maintains + * this list as pools are created, destroyed, or modified. + * + * We have a single nvlist which holds all the configuration information. When + * the module loads, we read this information from the cache and populate the + * SPA namespace. This namespace is maintained independently in spa.c. + * Whenever the namespace is modified, or the configuration of a pool is + * changed, we call spa_config_sync(), which walks through all the active pools + * and writes the configuration to disk. + */ + +static uint64_t spa_config_generation = 1; + +/* + * This can be overridden in userland to preserve an alternate namespace for + * userland pools when doing testing. + */ +const char *spa_config_dir = ZPOOL_CACHE_DIR; + +/* + * Called when the module is first loaded, this routine loads the configuration + * file into the SPA namespace. It does not actually open or load the pools; it + * only populates the namespace. + */ +void +spa_config_load(void) +{ + void *buf = NULL; + nvlist_t *nvlist, *child; + nvpair_t *nvpair; + spa_t *spa; + char pathname[128]; + struct _buf *file; + uint64_t fsize; + + root_mount_wait(); + + /* + * Open the configuration file. + */ + (void) snprintf(pathname, sizeof (pathname), "%s/%s", + spa_config_dir, ZPOOL_CACHE_FILE); + + file = kobj_open_file(pathname); + if (file == (struct _buf *)-1) + return; + + if (kobj_get_filesize(file, &fsize) != 0) + goto out; + + buf = kmem_alloc(fsize, KM_SLEEP); + + /* + * Read the nvlist from the file. + */ + if (kobj_read_file(file, buf, fsize, 0) < 0) + goto out; + + /* + * Unpack the nvlist. + */ + if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) + goto out; + + /* + * Iterate over all elements in the nvlist, creating a new spa_t for + * each one with the specified configuration. + */ + mutex_enter(&spa_namespace_lock); + nvpair = NULL; + while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { + + if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) + continue; + + VERIFY(nvpair_value_nvlist(nvpair, &child) == 0); + + if (spa_lookup(nvpair_name(nvpair)) != NULL) + continue; + spa = spa_add(nvpair_name(nvpair), NULL); + + /* + * We blindly duplicate the configuration here. If it's + * invalid, we will catch it when the pool is first opened. + */ + VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0); + } + mutex_exit(&spa_namespace_lock); + + nvlist_free(nvlist); + +out: + if (buf != NULL) + kmem_free(buf, fsize); + + kobj_close_file(file); +} + +/* + * Synchronize all pools to disk. This must be called with the namespace lock + * held. + */ +void +spa_config_sync(void) +{ + spa_t *spa = NULL; + nvlist_t *config; + size_t buflen; + char *buf; + vnode_t *vp; + int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; + char pathname[128]; + char pathname2[128]; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + /* + * Add all known pools to the configuration list, ignoring those with + * alternate root paths. + */ + spa = NULL; + while ((spa = spa_next(spa)) != NULL) { + mutex_enter(&spa->spa_config_cache_lock); + if (spa->spa_config && spa->spa_name && spa->spa_root == NULL) + VERIFY(nvlist_add_nvlist(config, spa->spa_name, + spa->spa_config) == 0); + mutex_exit(&spa->spa_config_cache_lock); + } + + /* + * Pack the configuration into a buffer. + */ + VERIFY(nvlist_size(config, &buflen, NV_ENCODE_XDR) == 0); + + buf = kmem_alloc(buflen, KM_SLEEP); + + VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR, + KM_SLEEP) == 0); + + /* + * Write the configuration to disk. We need to do the traditional + * 'write to temporary file, sync, move over original' to make sure we + * always have a consistent view of the data. + */ + (void) snprintf(pathname, sizeof (pathname), "%s/%s", spa_config_dir, + ZPOOL_CACHE_TMP); + + if (vn_open(pathname, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) != 0) + goto out; + + if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, + 0, RLIM64_INFINITY, kcred, NULL) == 0 && + VOP_FSYNC(vp, FSYNC, kcred) == 0) { + (void) snprintf(pathname2, sizeof (pathname2), "%s/%s", + spa_config_dir, ZPOOL_CACHE_FILE); + (void) vn_rename(pathname, pathname2, UIO_SYSSPACE); + } + + (void) VOP_CLOSE(vp, oflags, 1, 0, kcred); + VN_RELE(vp); + +out: + (void) vn_remove(pathname, UIO_SYSSPACE, RMFILE); + spa_config_generation++; + + kmem_free(buf, buflen); + nvlist_free(config); +} + +/* + * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, + * and we don't want to allow the local zone to see all the pools anyway. + * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration + * information for all pool visible within the zone. + */ +nvlist_t * +spa_all_configs(uint64_t *generation) +{ + nvlist_t *pools; + spa_t *spa; + + if (*generation == spa_config_generation) + return (NULL); + + VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + spa = NULL; + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (INGLOBALZONE(curproc) || + zone_dataset_visible(spa_name(spa), NULL)) { + mutex_enter(&spa->spa_config_cache_lock); + VERIFY(nvlist_add_nvlist(pools, spa_name(spa), + spa->spa_config) == 0); + mutex_exit(&spa->spa_config_cache_lock); + } + } + mutex_exit(&spa_namespace_lock); + + *generation = spa_config_generation; + + return (pools); +} + +void +spa_config_set(spa_t *spa, nvlist_t *config) +{ + mutex_enter(&spa->spa_config_cache_lock); + if (spa->spa_config != NULL) + nvlist_free(spa->spa_config); + spa->spa_config = config; + mutex_exit(&spa->spa_config_cache_lock); +} + +/* + * Generate the pool's configuration based on the current in-core state. + * We infer whether to generate a complete config or just one top-level config + * based on whether vd is the root vdev. + */ +nvlist_t * +spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) +{ + nvlist_t *config, *nvroot; + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(spa_config_held(spa, RW_READER)); + + if (vd == NULL) + vd = rvd; + + /* + * If txg is -1, report the current value of spa->spa_config_txg. + */ + if (txg == -1ULL) + txg = spa->spa_config_txg; + + VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, + spa_name(spa)) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + spa_state(spa)) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, + txg) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, + spa_guid(spa)) == 0); + + if (vd != rvd) { + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, + vd->vdev_top->vdev_guid) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + if (vd->vdev_isspare) + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, + 1ULL) == 0); + vd = vd->vdev_top; /* label contains top config */ + } + + nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + nvlist_free(nvroot); + + return (config); +} + +/* + * Update all disk labels, generate a fresh config based on the current + * in-core state, and sync the global config cache. + */ +void +spa_config_update(spa_t *spa, int what) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t txg; + int c; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa_config_enter(spa, RW_WRITER, FTAG); + txg = spa_last_synced_txg(spa) + 1; + if (what == SPA_CONFIG_UPDATE_POOL) { + vdev_config_dirty(rvd); + } else { + /* + * If we have top-level vdevs that were added but have + * not yet been prepared for allocation, do that now. + * (It's safe now because the config cache is up to date, + * so it will be able to translate the new DVAs.) + * See comments in spa_vdev_add() for full details. + */ + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + if (tvd->vdev_ms_array == 0) { + vdev_init(tvd, txg); + vdev_config_dirty(tvd); + } + } + } + spa_config_exit(spa, FTAG); + + /* + * Wait for the mosconfig to be regenerated and synced. + */ + txg_wait_synced(spa->spa_dsl_pool, txg); + + /* + * Update the global config cache to reflect the new mosconfig. + */ + spa_config_sync(); + + if (what == SPA_CONFIG_UPDATE_POOL) + spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c new file mode 100644 index 0000000..c52acaf --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c @@ -0,0 +1,440 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Routines to manage the on-disk persistent error log. + * + * Each pool stores a log of all logical data errors seen during normal + * operation. This is actually the union of two distinct logs: the last log, + * and the current log. All errors seen are logged to the current log. When a + * scrub completes, the current log becomes the last log, the last log is thrown + * out, and the current log is reinitialized. This way, if an error is somehow + * corrected, a new scrub will show that that it no longer exists, and will be + * deleted from the log when the scrub completes. + * + * The log is stored using a ZAP object whose key is a string form of the + * zbookmark tuple (objset, object, level, blkid), and whose contents is an + * optional 'objset:object' human-readable string describing the data. When an + * error is first logged, this string will be empty, indicating that no name is + * known. This prevents us from having to issue a potentially large amount of + * I/O to discover the object name during an error path. Instead, we do the + * calculation when the data is requested, storing the result so future queries + * will be faster. + * + * This log is then shipped into an nvlist where the key is the dataset name and + * the value is the object name. Userland is then responsible for uniquifying + * this list and displaying it to the user. + */ + +#include <sys/dmu_tx.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zap.h> +#include <sys/zio.h> + +/* + * This is a stripped-down version of strtoull, suitable only for converting + * lowercase hexidecimal numbers that don't overflow. + */ +#ifdef _KERNEL +static uint64_t +_strtonum(char *str, char **nptr) +{ + uint64_t val = 0; + char c; + int digit; + + while ((c = *str) != '\0') { + if (c >= '0' && c <= '9') + digit = c - '0'; + else if (c >= 'a' && c <= 'f') + digit = 10 + c - 'a'; + else + break; + + val *= 16; + val += digit; + + str++; + } + + *nptr = str; + + return (val); +} +#endif + +/* + * Convert a bookmark to a string. + */ +static void +bookmark_to_name(zbookmark_t *zb, char *buf, size_t len) +{ + (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", + (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid); +} + +/* + * Convert a string to a bookmark + */ +#ifdef _KERNEL +static void +name_to_bookmark(char *buf, zbookmark_t *zb) +{ + zb->zb_objset = _strtonum(buf, &buf); + ASSERT(*buf == ':'); + zb->zb_object = _strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_level = (int)_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_blkid = _strtonum(buf + 1, &buf); + ASSERT(*buf == '\0'); +} +#endif + +/* + * Log an uncorrectable error to the persistent error log. We add it to the + * spa's list of pending errors. The changes are actually synced out to disk + * during spa_errlog_sync(). + */ +void +spa_log_error(spa_t *spa, zio_t *zio) +{ + zbookmark_t *zb = &zio->io_logical->io_bookmark; + spa_error_entry_t search; + spa_error_entry_t *new; + avl_tree_t *tree; + avl_index_t where; + + /* + * If we are trying to import a pool, ignore any errors, as we won't be + * writing to the pool any time soon. + */ + if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) + return; + + mutex_enter(&spa->spa_errlist_lock); + + /* + * If we have had a request to rotate the log, log it to the next list + * instead of the current one. + */ + if (spa->spa_scrub_active || spa->spa_scrub_finished) + tree = &spa->spa_errlist_scrub; + else + tree = &spa->spa_errlist_last; + + search.se_bookmark = *zb; + if (avl_find(tree, &search, &where) != NULL) { + mutex_exit(&spa->spa_errlist_lock); + return; + } + + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *zb; + avl_insert(tree, new, where); + + mutex_exit(&spa->spa_errlist_lock); +} + +/* + * Return the number of errors currently in the error log. This is actually the + * sum of both the last log and the current log, since we don't know the union + * of these logs until we reach userland. + */ +uint64_t +spa_get_errlog_size(spa_t *spa) +{ + uint64_t total = 0, count; + + mutex_enter(&spa->spa_errlog_lock); + if (spa->spa_errlog_scrub != 0 && + zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, + &count) == 0) + total += count; + + if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && + zap_count(spa->spa_meta_objset, spa->spa_errlog_last, + &count) == 0) + total += count; + mutex_exit(&spa->spa_errlog_lock); + + mutex_enter(&spa->spa_errlist_lock); + total += avl_numnodes(&spa->spa_errlist_last); + total += avl_numnodes(&spa->spa_errlist_scrub); + mutex_exit(&spa->spa_errlist_lock); + + return (total); +} + +#ifdef _KERNEL +static int +process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) +{ + zap_cursor_t zc; + zap_attribute_t za; + zbookmark_t zb; + + if (obj == 0) + return (0); + + for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + + if (*count == 0) { + zap_cursor_fini(&zc); + return (ENOMEM); + } + + name_to_bookmark(za.za_name, &zb); + + if (copyout(&zb, (char *)addr + + (*count - 1) * sizeof (zbookmark_t), + sizeof (zbookmark_t)) != 0) + return (EFAULT); + + *count -= 1; + } + + zap_cursor_fini(&zc); + + return (0); +} + +static int +process_error_list(avl_tree_t *list, void *addr, size_t *count) +{ + spa_error_entry_t *se; + + for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + + if (*count == 0) + return (ENOMEM); + + if (copyout(&se->se_bookmark, (char *)addr + + (*count - 1) * sizeof (zbookmark_t), + sizeof (zbookmark_t)) != 0) + return (EFAULT); + + *count -= 1; + } + + return (0); +} +#endif + +/* + * Copy all known errors to userland as an array of bookmarks. This is + * actually a union of the on-disk last log and current log, as well as any + * pending error requests. + * + * Because the act of reading the on-disk log could cause errors to be + * generated, we have two separate locks: one for the error log and one for the + * in-core error lists. We only need the error list lock to log and error, so + * we grab the error log lock while we read the on-disk logs, and only pick up + * the error list lock when we are finished. + */ +int +spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) +{ + int ret = 0; + +#ifdef _KERNEL + mutex_enter(&spa->spa_errlog_lock); + + ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); + + if (!ret && !spa->spa_scrub_finished) + ret = process_error_log(spa, spa->spa_errlog_last, uaddr, + count); + + mutex_enter(&spa->spa_errlist_lock); + if (!ret) + ret = process_error_list(&spa->spa_errlist_scrub, uaddr, + count); + if (!ret) + ret = process_error_list(&spa->spa_errlist_last, uaddr, + count); + mutex_exit(&spa->spa_errlist_lock); + + mutex_exit(&spa->spa_errlog_lock); +#endif + + return (ret); +} + +/* + * Called when a scrub completes. This simply set a bit which tells which AVL + * tree to add new errors. spa_errlog_sync() is responsible for actually + * syncing the changes to the underlying objects. + */ +void +spa_errlog_rotate(spa_t *spa) +{ + mutex_enter(&spa->spa_errlist_lock); + + ASSERT(!spa->spa_scrub_finished); + spa->spa_scrub_finished = B_TRUE; + + mutex_exit(&spa->spa_errlist_lock); +} + +/* + * Discard any pending errors from the spa_t. Called when unloading a faulted + * pool, as the errors encountered during the open cannot be synced to disk. + */ +void +spa_errlog_drain(spa_t *spa) +{ + spa_error_entry_t *se; + void *cookie; + + mutex_enter(&spa->spa_errlist_lock); + + cookie = NULL; + while ((se = avl_destroy_nodes(&spa->spa_errlist_last, + &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + cookie = NULL; + while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, + &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + + mutex_exit(&spa->spa_errlist_lock); +} + +/* + * Process a list of errors into the current on-disk log. + */ +static void +sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) +{ + spa_error_entry_t *se; + char buf[64]; + void *cookie; + + if (avl_numnodes(t) != 0) { + /* create log if necessary */ + if (*obj == 0) + *obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, + 0, tx); + + /* add errors to the current log */ + for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { + char *name = se->se_name ? se->se_name : ""; + + bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); + + (void) zap_update(spa->spa_meta_objset, + *obj, buf, 1, strlen(name) + 1, name, tx); + } + + /* purge the error list */ + cookie = NULL; + while ((se = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + } +} + +/* + * Sync the error log out to disk. This is a little tricky because the act of + * writing the error log requires the spa_errlist_lock. So, we need to lock the + * error lists, take a copy of the lists, and then reinitialize them. Then, we + * drop the error list lock and take the error log lock, at which point we + * do the errlog processing. Then, if we encounter an I/O error during this + * process, we can successfully add the error to the list. Note that this will + * result in the perpetual recycling of errors, but it is an unlikely situation + * and not a performance critical operation. + */ +void +spa_errlog_sync(spa_t *spa, uint64_t txg) +{ + dmu_tx_t *tx; + avl_tree_t scrub, last; + int scrub_finished; + + mutex_enter(&spa->spa_errlist_lock); + + /* + * Bail out early under normal circumstances. + */ + if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && + avl_numnodes(&spa->spa_errlist_last) == 0 && + !spa->spa_scrub_finished) { + mutex_exit(&spa->spa_errlist_lock); + return; + } + + spa_get_errlists(spa, &last, &scrub); + scrub_finished = spa->spa_scrub_finished; + spa->spa_scrub_finished = B_FALSE; + + mutex_exit(&spa->spa_errlist_lock); + mutex_enter(&spa->spa_errlog_lock); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + /* + * Sync out the current list of errors. + */ + sync_error_list(spa, &last, &spa->spa_errlog_last, tx); + + /* + * Rotate the log if necessary. + */ + if (scrub_finished) { + if (spa->spa_errlog_last != 0) + VERIFY(dmu_object_free(spa->spa_meta_objset, + spa->spa_errlog_last, tx) == 0); + spa->spa_errlog_last = spa->spa_errlog_scrub; + spa->spa_errlog_scrub = 0; + + sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); + } + + /* + * Sync out any pending scrub errors. + */ + sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); + + /* + * Update the MOS to reflect the new values. + */ + (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, + &spa->spa_errlog_last, tx); + (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, + &spa->spa_errlog_scrub, tx); + + dmu_tx_commit(tx); + + mutex_exit(&spa->spa_errlog_lock); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c new file mode 100644 index 0000000..6642801 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c @@ -0,0 +1,354 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa_impl.h> +#include <sys/zap.h> +#include <sys/dsl_synctask.h> + +/* + * Routines to manage the on-disk history log. + * + * The history log is stored as a dmu object containing + * <packed record length, record nvlist> tuples. + * + * Where "record nvlist" is a nvlist containing uint64_ts and strings, and + * "packed record length" is the packed length of the "record nvlist" stored + * as a little endian uint64_t. + * + * The log is implemented as a ring buffer, though the original creation + * of the pool ('zpool create') is never overwritten. + * + * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer + * of 'spa_history' stores the offsets for logging/retrieving history as + * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of + * where the 'zpool create' record is stored. This allows us to never + * overwrite the original creation of the pool. 'sh_phys_max_off' is the + * physical ending offset in bytes of the log. This tells you the length of + * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record + * is added, 'sh_eof' is incremented by the the size of the record. + * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes). + * This is where the consumer should start reading from after reading in + * the 'zpool create' portion of the log. + * + * 'sh_records_lost' keeps track of how many records have been overwritten + * and permanently lost. + */ + +typedef enum history_log_type { + LOG_CMD_CREATE, + LOG_CMD_NO_CREATE +} history_log_type_t; + +typedef struct history_arg { + const char *ha_history_str; + history_log_type_t ha_log_type; +} history_arg_t; + +/* convert a logical offset to physical */ +static uint64_t +spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp) +{ + uint64_t phys_len; + + phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len; + return ((log_off - shpp->sh_pool_create_len) % phys_len + + shpp->sh_pool_create_len); +} + +void +spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) +{ + dmu_buf_t *dbp; + spa_history_phys_t *shpp; + objset_t *mos = spa->spa_meta_objset; + + ASSERT(spa->spa_history == 0); + spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY, + SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, + sizeof (spa_history_phys_t), tx); + + VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_HISTORY, sizeof (uint64_t), 1, + &spa->spa_history, tx) == 0); + + VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); + ASSERT(dbp->db_size >= sizeof (spa_history_phys_t)); + + shpp = dbp->db_data; + dmu_buf_will_dirty(dbp, tx); + + /* + * Figure out maximum size of history log. We set it at + * 1% of pool size, with a max of 32MB and min of 128KB. + */ + shpp->sh_phys_max_off = spa_get_dspace(spa) / 100; + shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20); + shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); + + dmu_buf_rele(dbp, FTAG); +} + +/* + * Change 'sh_bof' to the beginning of the next record. + */ +static int +spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) +{ + objset_t *mos = spa->spa_meta_objset; + uint64_t firstread, reclen, phys_bof; + char buf[sizeof (reclen)]; + int err; + + phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp); + firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); + + if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, + buf)) != 0) + return (err); + if (firstread != sizeof (reclen)) { + if ((err = dmu_read(mos, spa->spa_history, + shpp->sh_pool_create_len, sizeof (reclen) - firstread, + buf + firstread)) != 0) + return (err); + } + + reclen = LE_64(*((uint64_t *)buf)); + shpp->sh_bof += reclen + sizeof (reclen); + shpp->sh_records_lost++; + return (0); +} + +static int +spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, + dmu_tx_t *tx) +{ + uint64_t firstwrite, phys_eof; + objset_t *mos = spa->spa_meta_objset; + int err; + + ASSERT(MUTEX_HELD(&spa->spa_history_lock)); + + /* see if we need to reset logical BOF */ + while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - + (shpp->sh_eof - shpp->sh_bof) <= len) { + if ((err = spa_history_advance_bof(spa, shpp)) != 0) + return (err); + } + + phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); + firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); + shpp->sh_eof += len; + dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); + + len -= firstwrite; + if (len > 0) { + /* write out the rest at the beginning of physical file */ + dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, + len, (char *)buf + firstwrite, tx); + } + + return (0); +} + +/* + * Write out a history event. + */ +void +spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + history_arg_t *hap = arg2; + const char *history_str = hap->ha_history_str; + objset_t *mos = spa->spa_meta_objset; + dmu_buf_t *dbp; + spa_history_phys_t *shpp; + size_t reclen; + uint64_t le_len; + nvlist_t *nvrecord; + char *record_packed = NULL; + int ret; + + if (history_str == NULL) + return; + + /* + * If we have an older pool that doesn't have a command + * history object, create it now. + */ + mutex_enter(&spa->spa_history_lock); + if (!spa->spa_history) + spa_history_create_obj(spa, tx); + mutex_exit(&spa->spa_history_lock); + + /* + * Get the offset of where we need to write via the bonus buffer. + * Update the offset when the write completes. + */ + VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); + shpp = dbp->db_data; + + dmu_buf_will_dirty(dbp, tx); + +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(dbp, &doi); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); + } +#endif + + /* construct a nvlist of the current time and cmd string */ + VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME, + gethrestime_sec()) == 0); + VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, history_str) == 0); + VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen, + NV_ENCODE_XDR, KM_SLEEP) == 0); + + mutex_enter(&spa->spa_history_lock); + if (hap->ha_log_type == LOG_CMD_CREATE) + VERIFY(shpp->sh_eof == shpp->sh_pool_create_len); + + /* write out the packed length as little endian */ + le_len = LE_64((uint64_t)reclen); + ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx); + if (!ret) + ret = spa_history_write(spa, record_packed, reclen, shpp, tx); + + if (!ret && hap->ha_log_type == LOG_CMD_CREATE) { + shpp->sh_pool_create_len += sizeof (le_len) + reclen; + shpp->sh_bof = shpp->sh_pool_create_len; + } + + mutex_exit(&spa->spa_history_lock); + nvlist_free(nvrecord); + kmem_free(record_packed, reclen); + dmu_buf_rele(dbp, FTAG); +} + +/* + * Write out a history event. + */ +int +spa_history_log(spa_t *spa, const char *history_str, uint64_t pool_create) +{ + history_arg_t ha; + + ha.ha_history_str = history_str; + ha.ha_log_type = pool_create ? LOG_CMD_CREATE : LOG_CMD_NO_CREATE; + return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync, + spa, &ha, 0)); +} + +/* + * Read out the command history. + */ +int +spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) +{ + objset_t *mos = spa->spa_meta_objset; + dmu_buf_t *dbp; + uint64_t read_len, phys_read_off, phys_eof; + uint64_t leftover = 0; + spa_history_phys_t *shpp; + int err; + + /* + * If the command history doesn't exist (older pool), + * that's ok, just return ENOENT. + */ + if (!spa->spa_history) + return (ENOENT); + + if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) + return (err); + shpp = dbp->db_data; + +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(dbp, &doi); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); + } +#endif + + mutex_enter(&spa->spa_history_lock); + phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); + + if (*offp < shpp->sh_pool_create_len) { + /* read in just the zpool create history */ + phys_read_off = *offp; + read_len = MIN(*len, shpp->sh_pool_create_len - + phys_read_off); + } else { + /* + * Need to reset passed in offset to BOF if the passed in + * offset has since been overwritten. + */ + *offp = MAX(*offp, shpp->sh_bof); + phys_read_off = spa_history_log_to_phys(*offp, shpp); + + /* + * Read up to the minimum of what the user passed down or + * the EOF (physical or logical). If we hit physical EOF, + * use 'leftover' to read from the physical BOF. + */ + if (phys_read_off <= phys_eof) { + read_len = MIN(*len, phys_eof - phys_read_off); + } else { + read_len = MIN(*len, + shpp->sh_phys_max_off - phys_read_off); + if (phys_read_off + *len > shpp->sh_phys_max_off) { + leftover = MIN(*len - read_len, + phys_eof - shpp->sh_pool_create_len); + } + } + } + + /* offset for consumer to use next */ + *offp += read_len + leftover; + + /* tell the consumer how much you actually read */ + *len = read_len + leftover; + + if (read_len == 0) { + mutex_exit(&spa->spa_history_lock); + dmu_buf_rele(dbp, FTAG); + return (0); + } + + err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf); + if (leftover && err == 0) { + err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, + leftover, buf + read_len); + } + mutex_exit(&spa->spa_history_lock); + + dmu_buf_rele(dbp, FTAG); + return (err); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c new file mode 100644 index 0000000..1de1e5a --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -0,0 +1,1126 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/zap.h> +#include <sys/zil.h> +#include <sys/vdev_impl.h> +#include <sys/metaslab.h> +#include <sys/uberblock_impl.h> +#include <sys/txg.h> +#include <sys/avl.h> +#include <sys/unique.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/fs/zfs.h> + +/* + * SPA locking + * + * There are four basic locks for managing spa_t structures: + * + * spa_namespace_lock (global mutex) + * + * This lock must be acquired to do any of the following: + * + * - Lookup a spa_t by name + * - Add or remove a spa_t from the namespace + * - Increase spa_refcount from non-zero + * - Check if spa_refcount is zero + * - Rename a spa_t + * - add/remove/attach/detach devices + * - Held for the duration of create/destroy/import/export + * + * It does not need to handle recursion. A create or destroy may + * reference objects (files or zvols) in other pools, but by + * definition they must have an existing reference, and will never need + * to lookup a spa_t by name. + * + * spa_refcount (per-spa refcount_t protected by mutex) + * + * This reference count keep track of any active users of the spa_t. The + * spa_t cannot be destroyed or freed while this is non-zero. Internally, + * the refcount is never really 'zero' - opening a pool implicitly keeps + * some references in the DMU. Internally we check against SPA_MINREF, but + * present the image of a zero/non-zero value to consumers. + * + * spa_config_lock (per-spa crazy rwlock) + * + * This SPA special is a recursive rwlock, capable of being acquired from + * asynchronous threads. It has protects the spa_t from config changes, + * and must be held in the following circumstances: + * + * - RW_READER to perform I/O to the spa + * - RW_WRITER to change the vdev config + * + * spa_config_cache_lock (per-spa mutex) + * + * This mutex prevents the spa_config nvlist from being updated. No + * other locks are required to obtain this lock, although implicitly you + * must have the namespace lock or non-zero refcount to have any kind + * of spa_t pointer at all. + * + * The locking order is fairly straightforward: + * + * spa_namespace_lock -> spa_refcount + * + * The namespace lock must be acquired to increase the refcount from 0 + * or to check if it is zero. + * + * spa_refcount -> spa_config_lock + * + * There must be at least one valid reference on the spa_t to acquire + * the config lock. + * + * spa_namespace_lock -> spa_config_lock + * + * The namespace lock must always be taken before the config lock. + * + * + * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and + * are globally visible. + * + * The namespace is manipulated using the following functions, all which require + * the spa_namespace_lock to be held. + * + * spa_lookup() Lookup a spa_t by name. + * + * spa_add() Create a new spa_t in the namespace. + * + * spa_remove() Remove a spa_t from the namespace. This also + * frees up any memory associated with the spa_t. + * + * spa_next() Returns the next spa_t in the system, or the + * first if NULL is passed. + * + * spa_evict_all() Shutdown and remove all spa_t structures in + * the system. + * + * spa_guid_exists() Determine whether a pool/device guid exists. + * + * The spa_refcount is manipulated using the following functions: + * + * spa_open_ref() Adds a reference to the given spa_t. Must be + * called with spa_namespace_lock held if the + * refcount is currently zero. + * + * spa_close() Remove a reference from the spa_t. This will + * not free the spa_t or remove it from the + * namespace. No locking is required. + * + * spa_refcount_zero() Returns true if the refcount is currently + * zero. Must be called with spa_namespace_lock + * held. + * + * The spa_config_lock is manipulated using the following functions: + * + * spa_config_enter() Acquire the config lock as RW_READER or + * RW_WRITER. At least one reference on the spa_t + * must exist. + * + * spa_config_exit() Release the config lock. + * + * spa_config_held() Returns true if the config lock is currently + * held in the given state. + * + * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). + * + * spa_vdev_enter() Acquire the namespace lock and the config lock + * for writing. + * + * spa_vdev_exit() Release the config lock, wait for all I/O + * to complete, sync the updated configs to the + * cache, and release the namespace lock. + * + * The spa_name() function also requires either the spa_namespace_lock + * or the spa_config_lock, as both are needed to do a rename. spa_rename() is + * also implemented within this file since is requires manipulation of the + * namespace. + */ + +static avl_tree_t spa_namespace_avl; +kmutex_t spa_namespace_lock; +static kcondvar_t spa_namespace_cv; +static int spa_active_count; +int spa_max_replication_override = SPA_DVAS_PER_BP; + +static kmutex_t spa_spare_lock; +static avl_tree_t spa_spare_avl; + +kmem_cache_t *spa_buffer_pool; +int spa_mode; + +#ifdef ZFS_DEBUG +int zfs_flags = ~0; +#else +int zfs_flags = 0; +#endif + +/* + * zfs_recover can be set to nonzero to attempt to recover from + * otherwise-fatal errors, typically caused by on-disk corruption. When + * set, calls to zfs_panic_recover() will turn into warning messages. + */ +int zfs_recover = 0; + +#define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */ + +/* + * ========================================================================== + * SPA namespace functions + * ========================================================================== + */ + +/* + * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. + * Returns NULL if no matching spa_t is found. + */ +spa_t * +spa_lookup(const char *name) +{ + spa_t search, *spa; + avl_index_t where; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + search.spa_name = (char *)name; + spa = avl_find(&spa_namespace_avl, &search, &where); + + return (spa); +} + +/* + * Create an uninitialized spa_t with the given name. Requires + * spa_namespace_lock. The caller must ensure that the spa_t doesn't already + * exist by calling spa_lookup() first. + */ +spa_t * +spa_add(const char *name, const char *altroot) +{ + spa_t *spa; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); + + spa->spa_name = spa_strdup(name); + spa->spa_state = POOL_STATE_UNINITIALIZED; + spa->spa_freeze_txg = UINT64_MAX; + spa->spa_final_txg = UINT64_MAX; + + mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); + + cv_init(&spa->spa_scrub_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); + + refcount_create(&spa->spa_refcount); + refcount_create(&spa->spa_config_lock.scl_count); + + avl_add(&spa_namespace_avl, spa); + + /* + * Set the alternate root, if there is one. + */ + if (altroot) { + spa->spa_root = spa_strdup(altroot); + spa_active_count++; + } + + return (spa); +} + +/* + * Removes a spa_t from the namespace, freeing up any memory used. Requires + * spa_namespace_lock. This is called only after the spa_t has been closed and + * deactivated. + */ +void +spa_remove(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + ASSERT(spa->spa_scrub_thread == NULL); + + avl_remove(&spa_namespace_avl, spa); + cv_broadcast(&spa_namespace_cv); + + if (spa->spa_root) { + spa_strfree(spa->spa_root); + spa_active_count--; + } + + if (spa->spa_name) + spa_strfree(spa->spa_name); + + spa_config_set(spa, NULL); + + refcount_destroy(&spa->spa_refcount); + refcount_destroy(&spa->spa_config_lock.scl_count); + + cv_destroy(&spa->spa_async_cv); + cv_destroy(&spa->spa_scrub_io_cv); + cv_destroy(&spa->spa_scrub_cv); + + mutex_destroy(&spa->spa_scrub_lock); + mutex_destroy(&spa->spa_async_lock); + mutex_destroy(&spa->spa_config_cache_lock); + + kmem_free(spa, sizeof (spa_t)); +} + +/* + * Given a pool, return the next pool in the namespace, or NULL if there is + * none. If 'prev' is NULL, return the first pool. + */ +spa_t * +spa_next(spa_t *prev) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + if (prev) + return (AVL_NEXT(&spa_namespace_avl, prev)); + else + return (avl_first(&spa_namespace_avl)); +} + +/* + * ========================================================================== + * SPA refcount functions + * ========================================================================== + */ + +/* + * Add a reference to the given spa_t. Must have at least one reference, or + * have the namespace lock held. + */ +void +spa_open_ref(spa_t *spa, void *tag) +{ + ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF || + MUTEX_HELD(&spa_namespace_lock)); + + (void) refcount_add(&spa->spa_refcount, tag); +} + +/* + * Remove a reference to the given spa_t. Must have at least one reference, or + * have the namespace lock held. + */ +void +spa_close(spa_t *spa, void *tag) +{ + ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF || + MUTEX_HELD(&spa_namespace_lock)); + + (void) refcount_remove(&spa->spa_refcount, tag); +} + +/* + * Check to see if the spa refcount is zero. Must be called with + * spa_namespace_lock held. We really compare against SPA_MINREF, which is the + * number of references acquired when opening a pool + */ +boolean_t +spa_refcount_zero(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + return (refcount_count(&spa->spa_refcount) == SPA_MINREF); +} + +/* + * ========================================================================== + * SPA spare tracking + * ========================================================================== + */ + +/* + * Spares are tracked globally due to the following constraints: + * + * - A spare may be part of multiple pools. + * - A spare may be added to a pool even if it's actively in use within + * another pool. + * - A spare in use in any pool can only be the source of a replacement if + * the target is a spare in the same pool. + * + * We keep track of all spares on the system through the use of a reference + * counted AVL tree. When a vdev is added as a spare, or used as a replacement + * spare, then we bump the reference count in the AVL tree. In addition, we set + * the 'vdev_isspare' member to indicate that the device is a spare (active or + * inactive). When a spare is made active (used to replace a device in the + * pool), we also keep track of which pool its been made a part of. + * + * The 'spa_spare_lock' protects the AVL tree. These functions are normally + * called under the spa_namespace lock as part of vdev reconfiguration. The + * separate spare lock exists for the status query path, which does not need to + * be completely consistent with respect to other vdev configuration changes. + */ + +typedef struct spa_spare { + uint64_t spare_guid; + uint64_t spare_pool; + avl_node_t spare_avl; + int spare_count; +} spa_spare_t; + +static int +spa_spare_compare(const void *a, const void *b) +{ + const spa_spare_t *sa = a; + const spa_spare_t *sb = b; + + if (sa->spare_guid < sb->spare_guid) + return (-1); + else if (sa->spare_guid > sb->spare_guid) + return (1); + else + return (0); +} + +void +spa_spare_add(vdev_t *vd) +{ + avl_index_t where; + spa_spare_t search; + spa_spare_t *spare; + + mutex_enter(&spa_spare_lock); + ASSERT(!vd->vdev_isspare); + + search.spare_guid = vd->vdev_guid; + if ((spare = avl_find(&spa_spare_avl, &search, &where)) != NULL) { + spare->spare_count++; + } else { + spare = kmem_zalloc(sizeof (spa_spare_t), KM_SLEEP); + spare->spare_guid = vd->vdev_guid; + spare->spare_count = 1; + avl_insert(&spa_spare_avl, spare, where); + } + vd->vdev_isspare = B_TRUE; + + mutex_exit(&spa_spare_lock); +} + +void +spa_spare_remove(vdev_t *vd) +{ + spa_spare_t search; + spa_spare_t *spare; + avl_index_t where; + + mutex_enter(&spa_spare_lock); + + search.spare_guid = vd->vdev_guid; + spare = avl_find(&spa_spare_avl, &search, &where); + + ASSERT(vd->vdev_isspare); + ASSERT(spare != NULL); + + if (--spare->spare_count == 0) { + avl_remove(&spa_spare_avl, spare); + kmem_free(spare, sizeof (spa_spare_t)); + } else if (spare->spare_pool == spa_guid(vd->vdev_spa)) { + spare->spare_pool = 0ULL; + } + + vd->vdev_isspare = B_FALSE; + mutex_exit(&spa_spare_lock); +} + +boolean_t +spa_spare_exists(uint64_t guid, uint64_t *pool) +{ + spa_spare_t search, *found; + avl_index_t where; + + mutex_enter(&spa_spare_lock); + + search.spare_guid = guid; + found = avl_find(&spa_spare_avl, &search, &where); + + if (pool) { + if (found) + *pool = found->spare_pool; + else + *pool = 0ULL; + } + + mutex_exit(&spa_spare_lock); + + return (found != NULL); +} + +void +spa_spare_activate(vdev_t *vd) +{ + spa_spare_t search, *found; + avl_index_t where; + + mutex_enter(&spa_spare_lock); + ASSERT(vd->vdev_isspare); + + search.spare_guid = vd->vdev_guid; + found = avl_find(&spa_spare_avl, &search, &where); + ASSERT(found != NULL); + ASSERT(found->spare_pool == 0ULL); + + found->spare_pool = spa_guid(vd->vdev_spa); + mutex_exit(&spa_spare_lock); +} + +/* + * ========================================================================== + * SPA config locking + * ========================================================================== + */ + +/* + * Acquire the config lock. The config lock is a special rwlock that allows for + * recursive enters. Because these enters come from the same thread as well as + * asynchronous threads working on behalf of the owner, we must unilaterally + * allow all reads access as long at least one reader is held (even if a write + * is requested). This has the side effect of write starvation, but write locks + * are extremely rare, and a solution to this problem would be significantly + * more complex (if even possible). + * + * We would like to assert that the namespace lock isn't held, but this is a + * valid use during create. + */ +void +spa_config_enter(spa_t *spa, krw_t rw, void *tag) +{ + spa_config_lock_t *scl = &spa->spa_config_lock; + + mutex_enter(&scl->scl_lock); + + if (scl->scl_writer != curthread) { + if (rw == RW_READER) { + while (scl->scl_writer != NULL) + cv_wait(&scl->scl_cv, &scl->scl_lock); + } else { + while (scl->scl_writer != NULL || + !refcount_is_zero(&scl->scl_count)) + cv_wait(&scl->scl_cv, &scl->scl_lock); + scl->scl_writer = curthread; + } + } + + (void) refcount_add(&scl->scl_count, tag); + + mutex_exit(&scl->scl_lock); +} + +/* + * Release the spa config lock, notifying any waiters in the process. + */ +void +spa_config_exit(spa_t *spa, void *tag) +{ + spa_config_lock_t *scl = &spa->spa_config_lock; + + mutex_enter(&scl->scl_lock); + + ASSERT(!refcount_is_zero(&scl->scl_count)); + if (refcount_remove(&scl->scl_count, tag) == 0) { + cv_broadcast(&scl->scl_cv); + scl->scl_writer = NULL; /* OK in either case */ + } + + mutex_exit(&scl->scl_lock); +} + +/* + * Returns true if the config lock is held in the given manner. + */ +boolean_t +spa_config_held(spa_t *spa, krw_t rw) +{ + spa_config_lock_t *scl = &spa->spa_config_lock; + boolean_t held; + + mutex_enter(&scl->scl_lock); + if (rw == RW_WRITER) + held = (scl->scl_writer == curthread); + else + held = !refcount_is_zero(&scl->scl_count); + mutex_exit(&scl->scl_lock); + + return (held); +} + +/* + * ========================================================================== + * SPA vdev locking + * ========================================================================== + */ + +/* + * Lock the given spa_t for the purpose of adding or removing a vdev. + * Grabs the global spa_namespace_lock plus the spa config lock for writing. + * It returns the next transaction group for the spa_t. + */ +uint64_t +spa_vdev_enter(spa_t *spa) +{ + /* + * Suspend scrub activity while we mess with the config. + */ + spa_scrub_suspend(spa); + + mutex_enter(&spa_namespace_lock); + + spa_config_enter(spa, RW_WRITER, spa); + + return (spa_last_synced_txg(spa) + 1); +} + +/* + * Unlock the spa_t after adding or removing a vdev. Besides undoing the + * locking of spa_vdev_enter(), we also want make sure the transactions have + * synced to disk, and then update the global configuration cache with the new + * information. + */ +int +spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) +{ + int config_changed = B_FALSE; + + ASSERT(txg > spa_last_synced_txg(spa)); + + /* + * Reassess the DTLs. + */ + vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); + + /* + * If the config changed, notify the scrub thread that it must restart. + */ + if (error == 0 && !list_is_empty(&spa->spa_dirty_list)) { + config_changed = B_TRUE; + spa_scrub_restart(spa, txg); + } + + spa_config_exit(spa, spa); + + /* + * Allow scrubbing to resume. + */ + spa_scrub_resume(spa); + + /* + * Note: this txg_wait_synced() is important because it ensures + * that there won't be more than one config change per txg. + * This allows us to use the txg as the generation number. + */ + if (error == 0) + txg_wait_synced(spa->spa_dsl_pool, txg); + + if (vd != NULL) { + ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0); + vdev_free(vd); + } + + /* + * If the config changed, update the config cache. + */ + if (config_changed) + spa_config_sync(); + + mutex_exit(&spa_namespace_lock); + + return (error); +} + +/* + * ========================================================================== + * Miscellaneous functions + * ========================================================================== + */ + +/* + * Rename a spa_t. + */ +int +spa_rename(const char *name, const char *newname) +{ + spa_t *spa; + int err; + + /* + * Lookup the spa_t and grab the config lock for writing. We need to + * actually open the pool so that we can sync out the necessary labels. + * It's OK to call spa_open() with the namespace lock held because we + * allow recursive calls for other reasons. + */ + mutex_enter(&spa_namespace_lock); + if ((err = spa_open(name, &spa, FTAG)) != 0) { + mutex_exit(&spa_namespace_lock); + return (err); + } + + spa_config_enter(spa, RW_WRITER, FTAG); + + avl_remove(&spa_namespace_avl, spa); + spa_strfree(spa->spa_name); + spa->spa_name = spa_strdup(newname); + avl_add(&spa_namespace_avl, spa); + + /* + * Sync all labels to disk with the new names by marking the root vdev + * dirty and waiting for it to sync. It will pick up the new pool name + * during the sync. + */ + vdev_config_dirty(spa->spa_root_vdev); + + spa_config_exit(spa, FTAG); + + txg_wait_synced(spa->spa_dsl_pool, 0); + + /* + * Sync the updated config cache. + */ + spa_config_sync(); + + spa_close(spa, FTAG); + + mutex_exit(&spa_namespace_lock); + + return (0); +} + + +/* + * Determine whether a pool with given pool_guid exists. If device_guid is + * non-zero, determine whether the pool exists *and* contains a device with the + * specified device_guid. + */ +boolean_t +spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) +{ + spa_t *spa; + avl_tree_t *t = &spa_namespace_avl; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { + if (spa->spa_state == POOL_STATE_UNINITIALIZED) + continue; + if (spa->spa_root_vdev == NULL) + continue; + if (spa_guid(spa) == pool_guid) { + if (device_guid == 0) + break; + + if (vdev_lookup_by_guid(spa->spa_root_vdev, + device_guid) != NULL) + break; + + /* + * Check any devices we may in the process of adding. + */ + if (spa->spa_pending_vdev) { + if (vdev_lookup_by_guid(spa->spa_pending_vdev, + device_guid) != NULL) + break; + } + } + } + + return (spa != NULL); +} + +char * +spa_strdup(const char *s) +{ + size_t len; + char *new; + + len = strlen(s); + new = kmem_alloc(len + 1, KM_SLEEP); + bcopy(s, new, len); + new[len] = '\0'; + + return (new); +} + +void +spa_strfree(char *s) +{ + kmem_free(s, strlen(s) + 1); +} + +uint64_t +spa_get_random(uint64_t range) +{ + uint64_t r; + + ASSERT(range != 0); + + (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); + + return (r % range); +} + +void +sprintf_blkptr(char *buf, int len, const blkptr_t *bp) +{ + int d; + + if (bp == NULL) { + (void) snprintf(buf, len, "<NULL>"); + return; + } + + if (BP_IS_HOLE(bp)) { + (void) snprintf(buf, len, "<hole>"); + return; + } + + (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ", + (u_longlong_t)BP_GET_LEVEL(bp), + dmu_ot[BP_GET_TYPE(bp)].ot_name, + (u_longlong_t)BP_GET_LSIZE(bp), + (u_longlong_t)BP_GET_PSIZE(bp)); + + for (d = 0; d < BP_GET_NDVAS(bp); d++) { + const dva_t *dva = &bp->blk_dva[d]; + (void) snprintf(buf + strlen(buf), len - strlen(buf), + "DVA[%d]=<%llu:%llx:%llx> ", d, + (u_longlong_t)DVA_GET_VDEV(dva), + (u_longlong_t)DVA_GET_OFFSET(dva), + (u_longlong_t)DVA_GET_ASIZE(dva)); + } + + (void) snprintf(buf + strlen(buf), len - strlen(buf), + "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx", + zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name, + zio_compress_table[BP_GET_COMPRESS(bp)].ci_name, + BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", + BP_IS_GANG(bp) ? "gang" : "contiguous", + (u_longlong_t)bp->blk_birth, + (u_longlong_t)bp->blk_fill, + (u_longlong_t)bp->blk_cksum.zc_word[0], + (u_longlong_t)bp->blk_cksum.zc_word[1], + (u_longlong_t)bp->blk_cksum.zc_word[2], + (u_longlong_t)bp->blk_cksum.zc_word[3]); +} + +void +spa_freeze(spa_t *spa) +{ + uint64_t freeze_txg = 0; + + spa_config_enter(spa, RW_WRITER, FTAG); + if (spa->spa_freeze_txg == UINT64_MAX) { + freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; + spa->spa_freeze_txg = freeze_txg; + } + spa_config_exit(spa, FTAG); + if (freeze_txg != 0) + txg_wait_synced(spa_get_dsl(spa), freeze_txg); +} + +void +zfs_panic_recover(const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); + va_end(adx); +} + +/* + * ========================================================================== + * Accessor functions + * ========================================================================== + */ + +krwlock_t * +spa_traverse_rwlock(spa_t *spa) +{ + return (&spa->spa_traverse_lock); +} + +int +spa_traverse_wanted(spa_t *spa) +{ + return (spa->spa_traverse_wanted); +} + +dsl_pool_t * +spa_get_dsl(spa_t *spa) +{ + return (spa->spa_dsl_pool); +} + +blkptr_t * +spa_get_rootblkptr(spa_t *spa) +{ + return (&spa->spa_ubsync.ub_rootbp); +} + +void +spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) +{ + spa->spa_uberblock.ub_rootbp = *bp; +} + +void +spa_altroot(spa_t *spa, char *buf, size_t buflen) +{ + if (spa->spa_root == NULL) + buf[0] = '\0'; + else + (void) strncpy(buf, spa->spa_root, buflen); +} + +int +spa_sync_pass(spa_t *spa) +{ + return (spa->spa_sync_pass); +} + +char * +spa_name(spa_t *spa) +{ + /* + * Accessing the name requires holding either the namespace lock or the + * config lock, both of which are required to do a rename. + */ + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER)); + + return (spa->spa_name); +} + +uint64_t +spa_guid(spa_t *spa) +{ + /* + * If we fail to parse the config during spa_load(), we can go through + * the error path (which posts an ereport) and end up here with no root + * vdev. We stash the original pool guid in 'spa_load_guid' to handle + * this case. + */ + if (spa->spa_root_vdev != NULL) + return (spa->spa_root_vdev->vdev_guid); + else + return (spa->spa_load_guid); +} + +uint64_t +spa_last_synced_txg(spa_t *spa) +{ + return (spa->spa_ubsync.ub_txg); +} + +uint64_t +spa_first_txg(spa_t *spa) +{ + return (spa->spa_first_txg); +} + +int +spa_state(spa_t *spa) +{ + return (spa->spa_state); +} + +uint64_t +spa_freeze_txg(spa_t *spa) +{ + return (spa->spa_freeze_txg); +} + +/* + * In the future, this may select among different metaslab classes + * depending on the zdp. For now, there's no such distinction. + */ +metaslab_class_t * +spa_metaslab_class_select(spa_t *spa) +{ + return (spa->spa_normal_class); +} + +/* + * Return how much space is allocated in the pool (ie. sum of all asize) + */ +uint64_t +spa_get_alloc(spa_t *spa) +{ + return (spa->spa_root_vdev->vdev_stat.vs_alloc); +} + +/* + * Return how much (raid-z inflated) space there is in the pool. + */ +uint64_t +spa_get_space(spa_t *spa) +{ + return (spa->spa_root_vdev->vdev_stat.vs_space); +} + +/* + * Return the amount of raid-z-deflated space in the pool. + */ +uint64_t +spa_get_dspace(spa_t *spa) +{ + if (spa->spa_deflate) + return (spa->spa_root_vdev->vdev_stat.vs_dspace); + else + return (spa->spa_root_vdev->vdev_stat.vs_space); +} + +/* ARGSUSED */ +uint64_t +spa_get_asize(spa_t *spa, uint64_t lsize) +{ + /* + * For now, the worst case is 512-byte RAID-Z blocks, in which + * case the space requirement is exactly 2x; so just assume that. + * Add to this the fact that we can have up to 3 DVAs per bp, and + * we have to multiply by a total of 6x. + */ + return (lsize * 6); +} + +uint64_t +spa_version(spa_t *spa) +{ + return (spa->spa_ubsync.ub_version); +} + +int +spa_max_replication(spa_t *spa) +{ + /* + * As of ZFS_VERSION == ZFS_VERSION_DITTO_BLOCKS, we are able to + * handle BPs with more than one DVA allocated. Set our max + * replication level accordingly. + */ + if (spa_version(spa) < ZFS_VERSION_DITTO_BLOCKS) + return (1); + return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); +} + +uint64_t +bp_get_dasize(spa_t *spa, const blkptr_t *bp) +{ + int sz = 0, i; + + if (!spa->spa_deflate) + return (BP_GET_ASIZE(bp)); + + for (i = 0; i < SPA_DVAS_PER_BP; i++) { + vdev_t *vd = + vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i])); + sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> SPA_MINBLOCKSHIFT) * + vd->vdev_deflate_ratio; + } + return (sz); +} + +/* + * ========================================================================== + * Initialization and Termination + * ========================================================================== + */ + +static int +spa_name_compare(const void *a1, const void *a2) +{ + const spa_t *s1 = a1; + const spa_t *s2 = a2; + int s; + + s = strcmp(s1->spa_name, s2->spa_name); + if (s > 0) + return (1); + if (s < 0) + return (-1); + return (0); +} + +int +spa_busy(void) +{ + return (spa_active_count); +} + +void +spa_init(int mode) +{ + mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); + + avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), + offsetof(spa_t, spa_avl)); + + mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_spare_t), + offsetof(spa_spare_t, spare_avl)); + + spa_mode = mode; + + refcount_init(); + unique_init(); + zio_init(); + dmu_init(); + zil_init(); + spa_config_load(); +} + +void +spa_fini(void) +{ + spa_evict_all(); + + zil_fini(); + dmu_fini(); + zio_fini(); + refcount_fini(); + + avl_destroy(&spa_namespace_avl); + avl_destroy(&spa_spare_avl); + + cv_destroy(&spa_namespace_cv); + mutex_destroy(&spa_namespace_lock); + mutex_destroy(&spa_spare_lock); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c new file mode 100644 index 0000000..23313a9 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c @@ -0,0 +1,501 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/zio.h> +#include <sys/space_map.h> + +/* + * Space map routines. + * NOTE: caller is responsible for all locking. + */ +static int +space_map_seg_compare(const void *x1, const void *x2) +{ + const space_seg_t *s1 = x1; + const space_seg_t *s2 = x2; + + if (s1->ss_start < s2->ss_start) { + if (s1->ss_end > s2->ss_start) + return (0); + return (-1); + } + if (s1->ss_start > s2->ss_start) { + if (s1->ss_start < s2->ss_end) + return (0); + return (1); + } + return (0); +} + +void +space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift, + kmutex_t *lp) +{ + bzero(sm, sizeof (*sm)); + + cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL); + avl_create(&sm->sm_root, space_map_seg_compare, + sizeof (space_seg_t), offsetof(struct space_seg, ss_node)); + + sm->sm_start = start; + sm->sm_size = size; + sm->sm_shift = shift; + sm->sm_lock = lp; +} + +void +space_map_destroy(space_map_t *sm) +{ + ASSERT(!sm->sm_loaded && !sm->sm_loading); + VERIFY3U(sm->sm_space, ==, 0); + avl_destroy(&sm->sm_root); + cv_destroy(&sm->sm_load_cv); +} + +void +space_map_add(space_map_t *sm, uint64_t start, uint64_t size) +{ + avl_index_t where; + space_seg_t ssearch, *ss_before, *ss_after, *ss; + uint64_t end = start + size; + int merge_before, merge_after; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + VERIFY(size != 0); + VERIFY3U(start, >=, sm->sm_start); + VERIFY3U(end, <=, sm->sm_start + sm->sm_size); + VERIFY(sm->sm_space + size <= sm->sm_size); + VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); + VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); + + ssearch.ss_start = start; + ssearch.ss_end = end; + ss = avl_find(&sm->sm_root, &ssearch, &where); + + if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) { + zfs_panic_recover("zfs: allocating allocated segment" + "(offset=%llu size=%llu)\n", + (longlong_t)start, (longlong_t)size); + return; + } + + /* Make sure we don't overlap with either of our neighbors */ + VERIFY(ss == NULL); + + ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE); + ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER); + + merge_before = (ss_before != NULL && ss_before->ss_end == start); + merge_after = (ss_after != NULL && ss_after->ss_start == end); + + if (merge_before && merge_after) { + avl_remove(&sm->sm_root, ss_before); + ss_after->ss_start = ss_before->ss_start; + kmem_free(ss_before, sizeof (*ss_before)); + } else if (merge_before) { + ss_before->ss_end = end; + } else if (merge_after) { + ss_after->ss_start = start; + } else { + ss = kmem_alloc(sizeof (*ss), KM_SLEEP); + ss->ss_start = start; + ss->ss_end = end; + avl_insert(&sm->sm_root, ss, where); + } + + sm->sm_space += size; +} + +void +space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) +{ + avl_index_t where; + space_seg_t ssearch, *ss, *newseg; + uint64_t end = start + size; + int left_over, right_over; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + VERIFY(size != 0); + VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); + VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); + + ssearch.ss_start = start; + ssearch.ss_end = end; + ss = avl_find(&sm->sm_root, &ssearch, &where); + + /* Make sure we completely overlap with someone */ + if (ss == NULL) { + zfs_panic_recover("zfs: freeing free segment " + "(offset=%llu size=%llu)", + (longlong_t)start, (longlong_t)size); + return; + } + VERIFY3U(ss->ss_start, <=, start); + VERIFY3U(ss->ss_end, >=, end); + VERIFY(sm->sm_space - size <= sm->sm_size); + + left_over = (ss->ss_start != start); + right_over = (ss->ss_end != end); + + if (left_over && right_over) { + newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP); + newseg->ss_start = end; + newseg->ss_end = ss->ss_end; + ss->ss_end = start; + avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER); + } else if (left_over) { + ss->ss_end = start; + } else if (right_over) { + ss->ss_start = end; + } else { + avl_remove(&sm->sm_root, ss); + kmem_free(ss, sizeof (*ss)); + } + + sm->sm_space -= size; +} + +int +space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) +{ + avl_index_t where; + space_seg_t ssearch, *ss; + uint64_t end = start + size; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + VERIFY(size != 0); + VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); + VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); + + ssearch.ss_start = start; + ssearch.ss_end = end; + ss = avl_find(&sm->sm_root, &ssearch, &where); + + return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end); +} + +void +space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) +{ + space_seg_t *ss; + void *cookie = NULL; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { + if (func != NULL) + func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); + kmem_free(ss, sizeof (*ss)); + } + sm->sm_space = 0; +} + +void +space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) +{ + space_seg_t *ss; + + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); +} + +void +space_map_excise(space_map_t *sm, uint64_t start, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + avl_index_t where; + space_seg_t *ss, search; + uint64_t end = start + size; + uint64_t rm_start, rm_end; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + search.ss_start = start; + search.ss_end = start; + + for (;;) { + ss = avl_find(t, &search, &where); + + if (ss == NULL) + ss = avl_nearest(t, where, AVL_AFTER); + + if (ss == NULL || ss->ss_start >= end) + break; + + rm_start = MAX(ss->ss_start, start); + rm_end = MIN(ss->ss_end, end); + + space_map_remove(sm, rm_start, rm_end - rm_start); + } +} + +/* + * Replace smd with the union of smd and sms. + */ +void +space_map_union(space_map_t *smd, space_map_t *sms) +{ + avl_tree_t *t = &sms->sm_root; + space_seg_t *ss; + + ASSERT(MUTEX_HELD(smd->sm_lock)); + + /* + * For each source segment, remove any intersections with the + * destination, then add the source segment to the destination. + */ + for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) { + space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start); + space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start); + } +} + +/* + * Wait for any in-progress space_map_load() to complete. + */ +void +space_map_load_wait(space_map_t *sm) +{ + ASSERT(MUTEX_HELD(sm->sm_lock)); + + while (sm->sm_loading) + cv_wait(&sm->sm_load_cv, sm->sm_lock); +} + +/* + * Note: space_map_load() will drop sm_lock across dmu_read() calls. + * The caller must be OK with this. + */ +int +space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, + space_map_obj_t *smo, objset_t *os) +{ + uint64_t *entry, *entry_map, *entry_map_end; + uint64_t bufsize, size, offset, end, space; + uint64_t mapstart = sm->sm_start; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + space_map_load_wait(sm); + + if (sm->sm_loaded) + return (0); + + sm->sm_loading = B_TRUE; + end = smo->smo_objsize; + space = smo->smo_alloc; + + ASSERT(sm->sm_ops == NULL); + VERIFY3U(sm->sm_space, ==, 0); + + if (maptype == SM_FREE) { + space_map_add(sm, sm->sm_start, sm->sm_size); + space = sm->sm_size - space; + } + + bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT; + entry_map = zio_buf_alloc(bufsize); + + mutex_exit(sm->sm_lock); + if (end > bufsize) + dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize); + mutex_enter(sm->sm_lock); + + for (offset = 0; offset < end; offset += bufsize) { + size = MIN(end - offset, bufsize); + VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); + VERIFY(size != 0); + + dprintf("object=%llu offset=%llx size=%llx\n", + smo->smo_object, offset, size); + + mutex_exit(sm->sm_lock); + VERIFY3U(dmu_read(os, smo->smo_object, offset, size, + entry_map), ==, 0); + mutex_enter(sm->sm_lock); + + entry_map_end = entry_map + (size / sizeof (uint64_t)); + for (entry = entry_map; entry < entry_map_end; entry++) { + uint64_t e = *entry; + + if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ + continue; + + (SM_TYPE_DECODE(e) == maptype ? + space_map_add : space_map_remove)(sm, + (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart, + SM_RUN_DECODE(e) << sm->sm_shift); + } + } + VERIFY3U(sm->sm_space, ==, space); + + zio_buf_free(entry_map, bufsize); + + sm->sm_loading = B_FALSE; + sm->sm_loaded = B_TRUE; + sm->sm_ops = ops; + + cv_broadcast(&sm->sm_load_cv); + + if (ops != NULL) + ops->smop_load(sm); + + return (0); +} + +void +space_map_unload(space_map_t *sm) +{ + ASSERT(MUTEX_HELD(sm->sm_lock)); + + if (sm->sm_loaded && sm->sm_ops != NULL) + sm->sm_ops->smop_unload(sm); + + sm->sm_loaded = B_FALSE; + sm->sm_ops = NULL; + + space_map_vacate(sm, NULL, NULL); +} + +uint64_t +space_map_alloc(space_map_t *sm, uint64_t size) +{ + uint64_t start; + + start = sm->sm_ops->smop_alloc(sm, size); + if (start != -1ULL) + space_map_remove(sm, start, size); + return (start); +} + +void +space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) +{ + sm->sm_ops->smop_claim(sm, start, size); + space_map_remove(sm, start, size); +} + +void +space_map_free(space_map_t *sm, uint64_t start, uint64_t size) +{ + space_map_add(sm, start, size); + sm->sm_ops->smop_free(sm, start, size); +} + +/* + * Note: space_map_sync() will drop sm_lock across dmu_write() calls. + */ +void +space_map_sync(space_map_t *sm, uint8_t maptype, + space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) +{ + spa_t *spa = dmu_objset_spa(os); + void *cookie = NULL; + space_seg_t *ss; + uint64_t bufsize, start, size, run_len; + uint64_t *entry, *entry_map, *entry_map_end; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + if (sm->sm_space == 0) + return; + + dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n", + smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa), + maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root), + sm->sm_space); + + if (maptype == SM_ALLOC) + smo->smo_alloc += sm->sm_space; + else + smo->smo_alloc -= sm->sm_space; + + bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t); + bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT); + entry_map = zio_buf_alloc(bufsize); + entry_map_end = entry_map + (bufsize / sizeof (uint64_t)); + entry = entry_map; + + *entry++ = SM_DEBUG_ENCODE(1) | + SM_DEBUG_ACTION_ENCODE(maptype) | + SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) | + SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); + + while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { + size = ss->ss_end - ss->ss_start; + start = (ss->ss_start - sm->sm_start) >> sm->sm_shift; + + sm->sm_space -= size; + size >>= sm->sm_shift; + + while (size) { + run_len = MIN(size, SM_RUN_MAX); + + if (entry == entry_map_end) { + mutex_exit(sm->sm_lock); + dmu_write(os, smo->smo_object, smo->smo_objsize, + bufsize, entry_map, tx); + mutex_enter(sm->sm_lock); + smo->smo_objsize += bufsize; + entry = entry_map; + } + + *entry++ = SM_OFFSET_ENCODE(start) | + SM_TYPE_ENCODE(maptype) | + SM_RUN_ENCODE(run_len); + + start += run_len; + size -= run_len; + } + kmem_free(ss, sizeof (*ss)); + } + + if (entry != entry_map) { + size = (entry - entry_map) * sizeof (uint64_t); + mutex_exit(sm->sm_lock); + dmu_write(os, smo->smo_object, smo->smo_objsize, + size, entry_map, tx); + mutex_enter(sm->sm_lock); + smo->smo_objsize += size; + } + + zio_buf_free(entry_map, bufsize); + + VERIFY3U(sm->sm_space, ==, 0); +} + +void +space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) +{ + VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0); + + smo->smo_objsize = 0; + smo->smo_alloc = 0; +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h new file mode 100644 index 0000000..f58ffc0 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -0,0 +1,109 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ARC_H +#define _SYS_ARC_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/zio.h> + +typedef struct arc_buf_hdr arc_buf_hdr_t; +typedef struct arc_buf arc_buf_t; +typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); +typedef void arc_byteswap_func_t(void *buf, size_t size); +typedef int arc_evict_func_t(void *private); + +/* generic arc_done_func_t's which you can use */ +arc_done_func_t arc_bcopy_func; +arc_done_func_t arc_getbuf_func; + +struct arc_buf { + arc_buf_hdr_t *b_hdr; + arc_buf_t *b_next; + void *b_data; + arc_evict_func_t *b_efunc; + void *b_private; +}; + +typedef enum arc_buf_contents { + ARC_BUFC_UNDEF, /* buffer contents undefined */ + ARC_BUFC_DATA, /* buffer contains data */ + ARC_BUFC_METADATA /* buffer contains metadata */ +} arc_buf_contents_t; +/* + * These are the flags we pass into calls to the arc + */ +#define ARC_WAIT (1 << 1) /* perform I/O synchronously */ +#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */ +#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ +#define ARC_CACHED (1 << 4) /* I/O was already in cache */ + +arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, + arc_buf_contents_t type); +void arc_buf_add_ref(arc_buf_t *buf, void *tag); +int arc_buf_remove_ref(arc_buf_t *buf, void *tag); +int arc_buf_size(arc_buf_t *buf); +void arc_release(arc_buf_t *buf, void *tag); +int arc_released(arc_buf_t *buf); +int arc_has_callback(arc_buf_t *buf); +void arc_buf_freeze(arc_buf_t *buf); +void arc_buf_thaw(arc_buf_t *buf); +#ifdef ZFS_DEBUG +int arc_referenced(arc_buf_t *buf); +#endif + +int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, + arc_done_func_t *done, void *private, int priority, int flags, + uint32_t *arc_flags, zbookmark_t *zb); +zio_t *arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, + int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, + int flags, zbookmark_t *zb); +int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private, uint32_t arc_flags); +int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); + +void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); +int arc_buf_evict(arc_buf_t *buf); + +void arc_flush(void); +void arc_tempreserve_clear(uint64_t tempreserve); +int arc_tempreserve_space(uint64_t tempreserve); + +void arc_init(void); +void arc_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ARC_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h new file mode 100644 index 0000000..b4c8376 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_BPLIST_H +#define _SYS_BPLIST_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct bplist_phys { + /* + * This is the bonus buffer for the dead lists. The object's + * contents is an array of bpl_entries blkptr_t's, representing + * a total of bpl_bytes physical space. + */ + uint64_t bpl_entries; + uint64_t bpl_bytes; + uint64_t bpl_comp; + uint64_t bpl_uncomp; +} bplist_phys_t; + +#define BPLIST_SIZE_V0 (2 * sizeof (uint64_t)) + +typedef struct bplist_q { + blkptr_t bpq_blk; + void *bpq_next; +} bplist_q_t; + +typedef struct bplist { + kmutex_t bpl_lock; + objset_t *bpl_mos; + uint64_t bpl_object; + uint8_t bpl_blockshift; + uint8_t bpl_bpshift; + uint8_t bpl_havecomp; + bplist_q_t *bpl_queue; + bplist_phys_t *bpl_phys; + dmu_buf_t *bpl_dbuf; + dmu_buf_t *bpl_cached_dbuf; +} bplist_t; + +extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx); +extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx); +extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); +extern void bplist_close(bplist_t *bpl); +extern boolean_t bplist_empty(bplist_t *bpl); +extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp); +extern int bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx); +extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp); +extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx); +extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); +extern int bplist_space(bplist_t *bpl, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPLIST_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h new file mode 100644 index 0000000..d33657b --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -0,0 +1,334 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DBUF_H +#define _SYS_DBUF_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/arc.h> +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define DB_BONUS_BLKID (-1ULL) +#define IN_DMU_SYNC 2 + +/* + * define flags for dbuf_read + */ + +#define DB_RF_MUST_SUCCEED (1 << 0) +#define DB_RF_CANFAIL (1 << 1) +#define DB_RF_HAVESTRUCT (1 << 2) +#define DB_RF_NOPREFETCH (1 << 3) +#define DB_RF_NEVERWAIT (1 << 4) +#define DB_RF_CACHED (1 << 5) + +/* + * The state transition diagram for dbufs looks like: + * + * +----> READ ----+ + * | | + * | V + * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) + * | ^ + * | | + * +----> FILL ----+ + */ +typedef enum dbuf_states { + DB_UNCACHED, + DB_FILL, + DB_READ, + DB_CACHED, + DB_EVICTING +} dbuf_states_t; + +struct objset_impl; +struct dnode; +struct dmu_tx; + +/* + * level = 0 means the user data + * level = 1 means the single indirect block + * etc. + */ + +#define LIST_LINK_INACTIVE(link) \ + ((link)->list_next == NULL && (link)->list_prev == NULL) + +struct dmu_buf_impl; + +typedef enum override_states { + DR_NOT_OVERRIDDEN, + DR_IN_DMU_SYNC, + DR_OVERRIDDEN +} override_states_t; + +typedef struct dbuf_dirty_record { + /* link on our parents dirty list */ + list_node_t dr_dirty_node; + + /* transaction group this data will sync in */ + uint64_t dr_txg; + + /* zio of outstanding write IO */ + zio_t *dr_zio; + + /* pointer back to our dbuf */ + struct dmu_buf_impl *dr_dbuf; + + /* pointer to next dirty record */ + struct dbuf_dirty_record *dr_next; + + /* pointer to parent dirty record */ + struct dbuf_dirty_record *dr_parent; + + union dirty_types { + struct dirty_indirect { + + /* protect access to list */ + kmutex_t dr_mtx; + + /* Our list of dirty children */ + list_t dr_children; + } di; + struct dirty_leaf { + + /* + * dr_data is set when we dirty the buffer + * so that we can retain the pointer even if it + * gets COW'd in a subsequent transaction group. + */ + arc_buf_t *dr_data; + blkptr_t dr_overridden_by; + override_states_t dr_override_state; + } dl; + } dt; +} dbuf_dirty_record_t; + +typedef struct dmu_buf_impl { + /* + * The following members are immutable, with the exception of + * db.db_data, which is protected by db_mtx. + */ + + /* the publicly visible structure */ + dmu_buf_t db; + + /* the objset we belong to */ + struct objset_impl *db_objset; + + /* + * the dnode we belong to (NULL when evicted) + */ + struct dnode *db_dnode; + + /* + * our parent buffer; if the dnode points to us directly, + * db_parent == db_dnode->dn_dbuf + * only accessed by sync thread ??? + * (NULL when evicted) + */ + struct dmu_buf_impl *db_parent; + + /* + * link for hash table of all dmu_buf_impl_t's + */ + struct dmu_buf_impl *db_hash_next; + + /* our block number */ + uint64_t db_blkid; + + /* + * Pointer to the blkptr_t which points to us. May be NULL if we + * don't have one yet. (NULL when evicted) + */ + blkptr_t *db_blkptr; + + /* + * Our indirection level. Data buffers have db_level==0. + * Indirect buffers which point to data buffers have + * db_level==1. etc. Buffers which contain dnodes have + * db_level==0, since the dnodes are stored in a file. + */ + uint8_t db_level; + + /* db_mtx protects the members below */ + kmutex_t db_mtx; + + /* + * Current state of the buffer + */ + dbuf_states_t db_state; + + /* + * Refcount accessed by dmu_buf_{hold,rele}. + * If nonzero, the buffer can't be destroyed. + * Protected by db_mtx. + */ + refcount_t db_holds; + + /* buffer holding our data */ + arc_buf_t *db_buf; + + kcondvar_t db_changed; + dbuf_dirty_record_t *db_data_pending; + + /* pointer to most recent dirty record for this buffer */ + dbuf_dirty_record_t *db_last_dirty; + + /* + * Our link on the owner dnodes's dn_dbufs list. + * Protected by its dn_dbufs_mtx. + */ + list_node_t db_link; + + /* Data which is unique to data (leaf) blocks: */ + + /* stuff we store for the user (see dmu_buf_set_user) */ + void *db_user_ptr; + void **db_user_data_ptr_ptr; + dmu_buf_evict_func_t *db_evict_func; + + uint8_t db_immediate_evict; + uint8_t db_freed_in_flight; + + uint8_t db_dirtycnt; +} dmu_buf_impl_t; + +/* Note: the dbuf hash table is exposed only for the mdb module */ +#define DBUF_MUTEXES 256 +#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) +typedef struct dbuf_hash_table { + uint64_t hash_table_mask; + dmu_buf_impl_t **hash_table; + kmutex_t hash_mutexes[DBUF_MUTEXES]; +} dbuf_hash_table_t; + + +uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); + +dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); +dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn); + +dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); +dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, + void *tag); +int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, + void *tag, dmu_buf_impl_t **dbp); + +void dbuf_prefetch(struct dnode *dn, uint64_t blkid); + +void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); +uint64_t dbuf_refcount(dmu_buf_impl_t *db); + +void dbuf_rele(dmu_buf_impl_t *db, void *tag); + +dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid); + +int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); +void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); +void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); +dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); + +void dbuf_clear(dmu_buf_impl_t *db); +void dbuf_evict(dmu_buf_impl_t *db); + +void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dbuf_unoverride(dbuf_dirty_record_t *dr); +void dbuf_sync_list(list_t *list, dmu_tx_t *tx); + +void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks, + struct dmu_tx *); + +void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); + +void dbuf_init(void); +void dbuf_fini(void); + +#define DBUF_GET_BUFC_TYPE(db) \ + ((((db)->db_level > 0) || \ + (dmu_ot[(db)->db_dnode->dn_type].ot_metadata)) ? \ + ARC_BUFC_METADATA : ARC_BUFC_DATA); + +#ifdef ZFS_DEBUG + +/* + * There should be a ## between the string literal and fmt, to make it + * clear that we're joining two strings together, but gcc does not + * support that preprocessor token. + */ +#define dprintf_dbuf(dbuf, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char __db_buf[32]; \ + uint64_t __db_obj = (dbuf)->db.db_object; \ + if (__db_obj == DMU_META_DNODE_OBJECT) \ + (void) strcpy(__db_buf, "mdn"); \ + else \ + (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ + (u_longlong_t)__db_obj); \ + dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \ + "obj=%s lvl=%u blkid=%lld " fmt, \ + __db_buf, (dbuf)->db_level, \ + (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ + sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ + dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ + kmem_free(__blkbuf, BP_SPRINTF_LEN); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define DBUF_VERIFY(db) dbuf_verify(db) + +#else + +#define dprintf_dbuf(db, fmt, ...) +#define dprintf_dbuf_bp(db, bp, fmt, ...) +#define DBUF_VERIFY(db) + +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DBUF_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h new file mode 100644 index 0000000..f534015 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -0,0 +1,586 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_H +#define _SYS_DMU_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This file describes the interface that the DMU provides for its + * consumers. + * + * The DMU also interacts with the SPA. That interface is described in + * dmu_spa.h. + */ + +#include <sys/types.h> +#include <sys/param.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct uio; +struct page; +struct vnode; +struct spa; +struct zilog; +struct zio; +struct blkptr; +struct zap_cursor; +struct dsl_dataset; +struct dsl_pool; +struct dnode; +struct drr_begin; +struct drr_end; +struct zbookmark; +struct spa; +struct nvlist; +struct objset_impl; +struct file; + +typedef struct objset objset_t; +typedef struct dmu_tx dmu_tx_t; +typedef struct dsl_dir dsl_dir_t; + +typedef enum dmu_object_type { + DMU_OT_NONE, + /* general: */ + DMU_OT_OBJECT_DIRECTORY, /* ZAP */ + DMU_OT_OBJECT_ARRAY, /* UINT64 */ + DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ + DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ + DMU_OT_BPLIST, /* UINT64 */ + DMU_OT_BPLIST_HDR, /* UINT64 */ + /* spa: */ + DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ + DMU_OT_SPACE_MAP, /* UINT64 */ + /* zil: */ + DMU_OT_INTENT_LOG, /* UINT64 */ + /* dmu: */ + DMU_OT_DNODE, /* DNODE */ + DMU_OT_OBJSET, /* OBJSET */ + /* dsl: */ + DMU_OT_DSL_DIR, /* UINT64 */ + DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ + DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ + DMU_OT_DSL_PROPS, /* ZAP */ + DMU_OT_DSL_DATASET, /* UINT64 */ + /* zpl: */ + DMU_OT_ZNODE, /* ZNODE */ + DMU_OT_ACL, /* ACL */ + DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ + DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ + DMU_OT_MASTER_NODE, /* ZAP */ + DMU_OT_UNLINKED_SET, /* ZAP */ + /* zvol: */ + DMU_OT_ZVOL, /* UINT8 */ + DMU_OT_ZVOL_PROP, /* ZAP */ + /* other; for testing only! */ + DMU_OT_PLAIN_OTHER, /* UINT8 */ + DMU_OT_UINT64_OTHER, /* UINT64 */ + DMU_OT_ZAP_OTHER, /* ZAP */ + /* new object types: */ + DMU_OT_ERROR_LOG, /* ZAP */ + DMU_OT_SPA_HISTORY, /* UINT8 */ + DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ + DMU_OT_POOL_PROPS, /* ZAP */ + + DMU_OT_NUMTYPES +} dmu_object_type_t; + +typedef enum dmu_objset_type { + DMU_OST_NONE, + DMU_OST_META, + DMU_OST_ZFS, + DMU_OST_ZVOL, + DMU_OST_OTHER, /* For testing only! */ + DMU_OST_ANY, /* Be careful! */ + DMU_OST_NUMTYPES +} dmu_objset_type_t; + +void byteswap_uint64_array(void *buf, size_t size); +void byteswap_uint32_array(void *buf, size_t size); +void byteswap_uint16_array(void *buf, size_t size); +void byteswap_uint8_array(void *buf, size_t size); +void zap_byteswap(void *buf, size_t size); +void zfs_acl_byteswap(void *buf, size_t size); +void zfs_znode_byteswap(void *buf, size_t size); + +#define DS_MODE_NONE 0 /* invalid, to aid debugging */ +#define DS_MODE_STANDARD 1 /* normal access, no special needs */ +#define DS_MODE_PRIMARY 2 /* the "main" access, e.g. a mount */ +#define DS_MODE_EXCLUSIVE 3 /* exclusive access, e.g. to destroy */ +#define DS_MODE_LEVELS 4 +#define DS_MODE_LEVEL(x) ((x) & (DS_MODE_LEVELS - 1)) +#define DS_MODE_READONLY 0x8 +#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY) +#define DS_MODE_INCONSISTENT 0x10 +#define DS_MODE_IS_INCONSISTENT(x) ((x) & DS_MODE_INCONSISTENT) + +#define DS_FIND_SNAPSHOTS (1<<0) +#define DS_FIND_CHILDREN (1<<1) + +/* + * The maximum number of bytes that can be accessed as part of one + * operation, including metadata. + */ +#define DMU_MAX_ACCESS (10<<20) /* 10MB */ + +/* + * Public routines to create, destroy, open, and close objsets. + */ +int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, + objset_t **osp); +void dmu_objset_close(objset_t *os); +int dmu_objset_evict_dbufs(objset_t *os, int try); +int dmu_objset_create(const char *name, dmu_objset_type_t type, + objset_t *clone_parent, + void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg); +int dmu_objset_destroy(const char *name); +int dmu_snapshots_destroy(char *fsname, char *snapname); +int dmu_objset_rollback(const char *name); +int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); +int dmu_objset_rename(const char *name, const char *newname); +int dmu_objset_find(char *name, int func(char *, void *), void *arg, + int flags); +void dmu_objset_byteswap(void *buf, size_t size); + +typedef struct dmu_buf { + uint64_t db_object; /* object that this buffer is part of */ + uint64_t db_offset; /* byte offset in this object */ + uint64_t db_size; /* size of buffer in bytes */ + void *db_data; /* data in buffer */ +} dmu_buf_t; + +typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); + +/* + * Callback function to perform byte swapping on a block. + */ +typedef void dmu_byteswap_func_t(void *buf, size_t size); + +/* + * The names of zap entries in the DIRECTORY_OBJECT of the MOS. + */ +#define DMU_POOL_DIRECTORY_OBJECT 1 +#define DMU_POOL_CONFIG "config" +#define DMU_POOL_ROOT_DATASET "root_dataset" +#define DMU_POOL_SYNC_BPLIST "sync_bplist" +#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" +#define DMU_POOL_ERRLOG_LAST "errlog_last" +#define DMU_POOL_SPARES "spares" +#define DMU_POOL_DEFLATE "deflate" +#define DMU_POOL_HISTORY "history" +#define DMU_POOL_PROPS "pool_props" + +/* + * Allocate an object from this objset. The range of object numbers + * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. + * + * The transaction must be assigned to a txg. The newly allocated + * object will be "held" in the transaction (ie. you can modify the + * newly allocated object in this transaction). + * + * dmu_object_alloc() chooses an object and returns it in *objectp. + * + * dmu_object_claim() allocates a specific object number. If that + * number is already allocated, it fails and returns EEXIST. + * + * Return 0 on success, or ENOSPC or EEXIST as specified above. + */ +uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); +int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); +int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + +/* + * Free an object from this objset. + * + * The object's data will be freed as well (ie. you don't need to call + * dmu_free(object, 0, -1, tx)). + * + * The object need not be held in the transaction. + * + * If there are any holds on this object's buffers (via dmu_buf_hold()), + * or tx holds on the object (via dmu_tx_hold_object()), you can not + * free it; it fails and returns EBUSY. + * + * If the object is not allocated, it fails and returns ENOENT. + * + * Return 0 on success, or EBUSY or ENOENT as specified above. + */ +int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); + +/* + * Find the next allocated or free object. + * + * The objectp parameter is in-out. It will be updated to be the next + * object which is allocated. Ignore objects which have not been + * modified since txg. + * + * XXX Can only be called on a objset with no dirty data. + * + * Returns 0 on success, or ENOENT if there are no more objects. + */ +int dmu_object_next(objset_t *os, uint64_t *objectp, + boolean_t hole, uint64_t txg); + +/* + * Set the data blocksize for an object. + * + * The object cannot have any blocks allcated beyond the first. If + * the first block is allocated already, the new size must be greater + * than the current block size. If these conditions are not met, + * ENOTSUP will be returned. + * + * Returns 0 on success, or EBUSY if there are any holds on the object + * contents, or ENOTSUP as described above. + */ +int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, + int ibs, dmu_tx_t *tx); + +/* + * Set the checksum property on a dnode. The new checksum algorithm will + * apply to all newly written blocks; existing blocks will not be affected. + */ +void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, + dmu_tx_t *tx); + +/* + * Set the compress property on a dnode. The new compression algorithm will + * apply to all newly written blocks; existing blocks will not be affected. + */ +void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, + dmu_tx_t *tx); + +/* + * Decide how many copies of a given block we should make. Can be from + * 1 to SPA_DVAS_PER_BP. + */ +int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb, + dmu_object_type_t ot); +/* + * The bonus data is accessed more or less like a regular buffer. + * You must dmu_bonus_hold() to get the buffer, which will give you a + * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus + * data. As with any normal buffer, you must call dmu_buf_read() to + * read db_data, dmu_buf_will_dirty() before modifying it, and the + * object must be held in an assigned transaction before calling + * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus + * buffer as well. You must release your hold with dmu_buf_rele(). + */ +int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); +int dmu_bonus_max(void); + +/* + * Obtain the DMU buffer from the specified object which contains the + * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so + * that it will remain in memory. You must release the hold with + * dmu_buf_rele(). You musn't access the dmu_buf_t after releasing your + * hold. You must have a hold on any dmu_buf_t* you pass to the DMU. + * + * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill + * on the returned buffer before reading or writing the buffer's + * db_data. The comments for those routines describe what particular + * operations are valid after calling them. + * + * The object number must be a valid, allocated object number. + */ +int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **); +void dmu_buf_add_ref(dmu_buf_t *db, void* tag); +void dmu_buf_rele(dmu_buf_t *db, void *tag); +uint64_t dmu_buf_refcount(dmu_buf_t *db); + +/* + * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a + * range of an object. A pointer to an array of dmu_buf_t*'s is + * returned (in *dbpp). + * + * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and + * frees the array. The hold on the array of buffers MUST be released + * with dmu_buf_rele_array. You can NOT release the hold on each buffer + * individually with dmu_buf_rele. + */ +int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); +void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); + +/* + * Returns NULL on success, or the existing user ptr if it's already + * been set. + * + * user_ptr is for use by the user and can be obtained via dmu_buf_get_user(). + * + * user_data_ptr_ptr should be NULL, or a pointer to a pointer which + * will be set to db->db_data when you are allowed to access it. Note + * that db->db_data (the pointer) can change when you do dmu_buf_read(), + * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill(). + * *user_data_ptr_ptr will be set to the new value when it changes. + * + * If non-NULL, pageout func will be called when this buffer is being + * excised from the cache, so that you can clean up the data structure + * pointed to by user_ptr. + * + * dmu_evict_user() will call the pageout func for all buffers in a + * objset with a given pageout func. + */ +void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *pageout_func); +/* + * set_user_ie is the same as set_user, but request immediate eviction + * when hold count goes to zero. + */ +void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, + void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func); +void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, + void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *pageout_func); +void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); + +/* + * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set. + */ +void *dmu_buf_get_user(dmu_buf_t *db); + +/* + * Indicate that you are going to modify the buffer's data (db_data). + * + * The transaction (tx) must be assigned to a txg (ie. you've called + * dmu_tx_assign()). The buffer's object must be held in the tx + * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). + */ +void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); + +/* + * You must create a transaction, then hold the objects which you will + * (or might) modify as part of this transaction. Then you must assign + * the transaction to a transaction group. Once the transaction has + * been assigned, you can modify buffers which belong to held objects as + * part of this transaction. You can't modify buffers before the + * transaction has been assigned; you can't modify buffers which don't + * belong to objects which this transaction holds; you can't hold + * objects once the transaction has been assigned. You may hold an + * object which you are going to free (with dmu_object_free()), but you + * don't have to. + * + * You can abort the transaction before it has been assigned. + * + * Note that you may hold buffers (with dmu_buf_hold) at any time, + * regardless of transaction state. + */ + +#define DMU_NEW_OBJECT (-1ULL) +#define DMU_OBJECT_END (-1ULL) + +dmu_tx_t *dmu_tx_create(objset_t *os); +void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); +void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, + uint64_t len); +void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name); +void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); +void dmu_tx_abort(dmu_tx_t *tx); +int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +void dmu_tx_wait(dmu_tx_t *tx); +void dmu_tx_commit(dmu_tx_t *tx); + +/* + * Free up the data blocks for a defined range of a file. If size is + * zero, the range from offset to end-of-file is freed. + */ +int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, dmu_tx_t *tx); + +/* + * Convenience functions. + * + * Canfail routines will return 0 on success, or an errno if there is a + * nonrecoverable I/O error. + */ +int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf); +void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx); +int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); +int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, + dmu_tx_t *tx); +int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, struct page *pp, dmu_tx_t *tx); + +extern int zfs_prefetch_disable; + +/* + * Asynchronously try to read in the data. + */ +void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, + uint64_t len); + +typedef struct dmu_object_info { + /* All sizes are in bytes. */ + uint32_t doi_data_block_size; + uint32_t doi_metadata_block_size; + uint64_t doi_bonus_size; + dmu_object_type_t doi_type; + dmu_object_type_t doi_bonus_type; + uint8_t doi_indirection; /* 2 = dnode->indirect->data */ + uint8_t doi_checksum; + uint8_t doi_compress; + uint8_t doi_pad[5]; + /* Values below are number of 512-byte blocks. */ + uint64_t doi_physical_blks; /* data + metadata */ + uint64_t doi_max_block_offset; +} dmu_object_info_t; + +typedef struct dmu_object_type_info { + dmu_byteswap_func_t *ot_byteswap; + boolean_t ot_metadata; + char *ot_name; +} dmu_object_type_info_t; + +extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; + +/* + * Get information on a DMU object. + * + * Return 0 on success or ENOENT if object is not allocated. + * + * If doi is NULL, just indicates whether the object exists. + */ +int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); +void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); +void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); +void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, + u_longlong_t *nblk512); + +typedef struct dmu_objset_stats { + uint64_t dds_num_clones; /* number of clones of this */ + uint64_t dds_creation_txg; + dmu_objset_type_t dds_type; + uint8_t dds_is_snapshot; + uint8_t dds_inconsistent; + char dds_clone_of[MAXNAMELEN]; +} dmu_objset_stats_t; + +/* + * Get stats on a dataset. + */ +void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); + +/* + * Add entries to the nvlist for all the objset's properties. See + * zfs_prop_table[] and zfs(1m) for details on the properties. + */ +void dmu_objset_stats(objset_t *os, struct nvlist *nv); + +/* + * Get the space usage statistics for statvfs(). + * + * refdbytes is the amount of space "referenced" by this objset. + * availbytes is the amount of space available to this objset, taking + * into account quotas & reservations, assuming that no other objsets + * use the space first. These values correspond to the 'referenced' and + * 'available' properties, described in the zfs(1m) manpage. + * + * usedobjs and availobjs are the number of objects currently allocated, + * and available. + */ +void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); + +/* + * The fsid_guid is a 56-bit ID that can change to avoid collisions. + * (Contrast with the ds_guid which is a 64-bit ID that will never + * change, so there is a small probability that it will collide.) + */ +uint64_t dmu_objset_fsid_guid(objset_t *os); + +int dmu_objset_is_snapshot(objset_t *os); + +extern struct spa *dmu_objset_spa(objset_t *os); +extern struct zilog *dmu_objset_zil(objset_t *os); +extern struct dsl_pool *dmu_objset_pool(objset_t *os); +extern struct dsl_dataset *dmu_objset_ds(objset_t *os); +extern void dmu_objset_name(objset_t *os, char *buf); +extern dmu_objset_type_t dmu_objset_type(objset_t *os); +extern uint64_t dmu_objset_id(objset_t *os); +extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, + uint64_t *id, uint64_t *offp); +extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, + uint64_t *idp, uint64_t *offp); + +/* + * Return the txg number for the given assigned transaction. + */ +uint64_t dmu_tx_get_txg(dmu_tx_t *tx); + +/* + * Synchronous write. + * If a parent zio is provided this function initiates a write on the + * provided buffer as a child of the parent zio. + * In the absense of a parent zio, the write is completed synchronously. + * At write completion, blk is filled with the bp of the written block. + * Note that while the data covered by this function will be on stable + * storage when the write completes this new data does not become a + * permanent part of the file until the associated transaction commits. + */ +typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg); +int dmu_sync(struct zio *zio, dmu_buf_t *db, + struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg); + +/* + * Find the next hole or data block in file starting at *off + * Return found offset in *off. Return ESRCH for end of file. + */ +int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, + uint64_t *off); + +/* + * Initial setup and final teardown. + */ +extern void dmu_init(void); +extern void dmu_fini(void); + +typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, + uint64_t object, uint64_t offset, int len); +void dmu_traverse_objset(objset_t *os, uint64_t txg_start, + dmu_traverse_cb_t cb, void *arg); + +int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp); +int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, + boolean_t force, struct file *fp, uint64_t voffset); + +/* CRC64 table */ +#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ +extern uint64_t zfs_crc64_table[256]; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h new file mode 100644 index 0000000..807011e --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h @@ -0,0 +1,237 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_IMPL_H +#define _SYS_DMU_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/txg_impl.h> +#include <sys/zio.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This is the locking strategy for the DMU. Numbers in parenthesis are + * cases that use that lock order, referenced below: + * + * ARC is self-contained + * bplist is self-contained + * refcount is self-contained + * txg is self-contained (hopefully!) + * zst_lock + * zf_rwlock + * + * XXX try to improve evicting path? + * + * dp_config_rwlock > os_obj_lock > dn_struct_rwlock > + * dn_dbufs_mtx > hash_mutexes > db_mtx > leafs + * + * dp_config_rwlock + * must be held before: everything + * protects dd namespace changes + * protects property changes globally + * held from: + * dsl_dir_open/r: + * dsl_dir_create_sync/w: + * dsl_dir_sync_destroy/w: + * dsl_dir_rename_sync/w: + * dsl_prop_changed_notify/r: + * + * os_obj_lock + * must be held before: + * everything except dp_config_rwlock + * protects os_obj_next + * held from: + * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock + * + * dn_struct_rwlock + * must be held before: + * everything except dp_config_rwlock and os_obj_lock + * protects structure of dnode (eg. nlevels) + * db_blkptr can change when syncing out change to nlevels + * dn_maxblkid + * dn_nlevels + * dn_*blksz* + * phys nlevels, maxblkid, physical blkptr_t's (?) + * held from: + * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch + * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) + * dmu_tx_count_free: + * dbuf_read_impl: db_mtx, dmu_zfetch() + * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() + * dbuf_new_size: db_mtx + * dbuf_dirty: db_mtx + * dbuf_findbp: (callers, phys? - the real need) + * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?) + * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx + * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp() + * dnode_sync/w (increase_indirection): db_mtx (phys) + * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*) + * dnode_new_blkid/w: (dn_maxblkid) + * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid) + * dnode_next_offset: (phys) + * + * dn_dbufs_mtx + * must be held before: + * db_mtx, hash_mutexes + * protects: + * dn_dbufs + * dn_evicted + * held from: + * dmu_evict_user: db_mtx (dn_dbufs) + * dbuf_free_range: db_mtx (dn_dbufs) + * dbuf_remove_ref: db_mtx, callees: + * dbuf_hash_remove: hash_mutexes, db_mtx + * dbuf_create: hash_mutexes, db_mtx (dn_dbufs) + * dnode_set_blksz: (dn_dbufs) + * + * hash_mutexes (global) + * must be held before: + * db_mtx + * protects dbuf_hash_table (global) and db_hash_next + * held from: + * dbuf_find: db_mtx + * dbuf_hash_insert: db_mtx + * dbuf_hash_remove: db_mtx + * + * db_mtx (meta-leaf) + * must be held before: + * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes) + * protects: + * db_state + * db_holds + * db_buf + * db_changed + * db_data_pending + * db_dirtied + * db_link + * db_dirty_node (??) + * db_dirtycnt + * db_d.* + * db.* + * held from: + * dbuf_dirty: dn_mtx, dn_dirty_mtx + * dbuf_dirty->dsl_dir_willuse_space: dd_lock + * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock + * dbuf_undirty: dn_dirty_mtx (db_d) + * dbuf_write_done: dn_dirty_mtx (db_state) + * dbuf_* + * dmu_buf_update_user: none (db_d) + * dmu_evict_user: none (db_d) (maybe can eliminate) + * dbuf_find: none (db_holds) + * dbuf_hash_insert: none (db_holds) + * dmu_buf_read_array_impl: none (db_state, db_changed) + * dmu_sync: none (db_dirty_node, db_d) + * dnode_reallocate: none (db) + * + * dn_mtx (leaf) + * protects: + * dn_dirty_dbufs + * dn_ranges + * phys accounting + * dn_allocated_txg + * dn_free_txg + * dn_assigned_txg + * dd_assigned_tx + * dn_notxholds + * dn_dirtyctx + * dn_dirtyctx_firstset + * (dn_phys copy fields?) + * (dn_phys contents?) + * held from: + * dnode_* + * dbuf_dirty: none + * dbuf_sync: none (phys accounting) + * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs) + * dbuf_write_done: none (phys accounting) + * dmu_object_info_from_dnode: none (accounting) + * dmu_tx_commit: none + * dmu_tx_hold_object_impl: none + * dmu_tx_try_assign: dn_notxholds(cv) + * dmu_tx_unassign: none + * + * dd_lock (leaf) + * protects: + * dd_prop_cbs + * dd_sync_* + * dd_used_bytes + * dd_tempreserved + * dd_space_towrite + * dd_myname + * dd_phys accounting? + * held from: + * dsl_dir_* + * dsl_prop_changed_notify: none (dd_prop_cbs) + * dsl_prop_register: none (dd_prop_cbs) + * dsl_prop_unregister: none (dd_prop_cbs) + * dsl_dataset_block_freeable: none (dd_sync_*) + * + * os_lock (leaf) + * protects: + * os_dirty_dnodes + * os_free_dnodes + * os_dnodes + * os_downgraded_dbufs + * dn_dirtyblksz + * dn_dirty_link + * held from: + * dnode_create: none (os_dnodes) + * dnode_destroy: none (os_dnodes) + * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes) + * dnode_free: none (dn_dirtyblksz, os_*_dnodes) + * + * ds_lock (leaf) + * protects: + * ds_user_ptr + * ds_user_evice_func + * ds_open_refcount + * ds_snapname + * ds_phys accounting + * held from: + * dsl_dataset_* + * + * dr_mtx (leaf) + * protects: + * dr_children + * held from: + * dbuf_dirty + * dbuf_undirty + * dbuf_sync_indirect + * dnode_new_blkid + */ + +struct objset; +struct dmu_pool; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h new file mode 100644 index 0000000..8293a3b --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_OBJSET_H +#define _SYS_DMU_OBJSET_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/arc.h> +#include <sys/txg.h> +#include <sys/zfs_context.h> +#include <sys/dnode.h> +#include <sys/zio.h> +#include <sys/zil.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; +struct dmu_tx; +struct objset_impl; + +typedef struct objset_phys { + dnode_phys_t os_meta_dnode; + zil_header_t os_zil_header; + uint64_t os_type; + char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) - + sizeof (uint64_t)]; +} objset_phys_t; + +struct objset { + struct objset_impl *os; + int os_mode; +}; + +typedef struct objset_impl { + /* Immutable: */ + struct dsl_dataset *os_dsl_dataset; + spa_t *os_spa; + arc_buf_t *os_phys_buf; + objset_phys_t *os_phys; + dnode_t *os_meta_dnode; + zilog_t *os_zil; + objset_t os; + uint8_t os_checksum; /* can change, under dsl_dir's locks */ + uint8_t os_compress; /* can change, under dsl_dir's locks */ + uint8_t os_copies; /* can change, under dsl_dir's locks */ + uint8_t os_md_checksum; + uint8_t os_md_compress; + + /* no lock needed: */ + struct dmu_tx *os_synctx; /* XXX sketchy */ + blkptr_t *os_rootbp; + + /* Protected by os_obj_lock */ + kmutex_t os_obj_lock; + uint64_t os_obj_next; + + /* Protected by os_lock */ + kmutex_t os_lock; + list_t os_dirty_dnodes[TXG_SIZE]; + list_t os_free_dnodes[TXG_SIZE]; + list_t os_dnodes; + list_t os_downgraded_dbufs; +} objset_impl_t; + +#define DMU_META_DNODE_OBJECT 0 + +/* called from zpl */ +int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, + objset_t **osp); +void dmu_objset_close(objset_t *os); +int dmu_objset_create(const char *name, dmu_objset_type_t type, + objset_t *clone_parent, + void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg); +int dmu_objset_destroy(const char *name); +int dmu_objset_rollback(const char *name); +int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); +void dmu_objset_stats(objset_t *os, nvlist_t *nv); +void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); +void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); +uint64_t dmu_objset_fsid_guid(objset_t *os); +int dmu_objset_find(char *name, int func(char *, void *), void *arg, + int flags); +void dmu_objset_byteswap(void *buf, size_t size); +int dmu_objset_evict_dbufs(objset_t *os, int try); + +/* called from dsl */ +void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx); +objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, + blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); +int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, + objset_impl_t **osip); +void dmu_objset_evict(struct dsl_dataset *ds, void *arg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_OBJSET_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h new file mode 100644 index 0000000..ea9fa6c --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_TRAVERSE_H +#define _SYS_DMU_TRAVERSE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu.h> +#include <sys/dnode.h> +#include <sys/arc.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ADVANCE_POST 0 /* post-order traversal */ +#define ADVANCE_PRE 0x01 /* pre-order traversal */ +#define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */ +#define ADVANCE_DATA 0x04 /* read user data blocks */ +#define ADVANCE_HOLES 0x08 /* visit holes */ +#define ADVANCE_ZIL 0x10 /* visit intent log blocks */ +#define ADVANCE_NOLOCK 0x20 /* Don't grab SPA sync lock */ + +#define ZB_NO_LEVEL -2 +#define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */ +#define ZB_MAXBLKID (1ULL << 62) +#define ZB_MAXOBJSET (1ULL << 62) +#define ZB_MAXOBJECT (1ULL << 62) + +#define ZB_MOS_CACHE 0 +#define ZB_MDN_CACHE 1 +#define ZB_DN_CACHE 2 +#define ZB_DEPTH 3 + +typedef struct zseg { + uint64_t seg_mintxg; + uint64_t seg_maxtxg; + zbookmark_t seg_start; + zbookmark_t seg_end; + list_node_t seg_node; +} zseg_t; + +typedef struct traverse_blk_cache { + zbookmark_t bc_bookmark; + blkptr_t bc_blkptr; + void *bc_data; + dnode_phys_t *bc_dnode; + int bc_errno; + int bc_pad1; + uint64_t bc_pad2; +} traverse_blk_cache_t; + +typedef int (blkptr_cb_t)(traverse_blk_cache_t *bc, spa_t *spa, void *arg); + +struct traverse_handle { + spa_t *th_spa; + blkptr_cb_t *th_func; + void *th_arg; + uint16_t th_advance; + uint16_t th_locked; + int th_zio_flags; + list_t th_seglist; + traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL]; + traverse_blk_cache_t th_zil_cache; + uint64_t th_hits; + uint64_t th_arc_hits; + uint64_t th_reads; + uint64_t th_callbacks; + uint64_t th_syncs; + uint64_t th_restarts; + zbookmark_t th_noread; + zbookmark_t th_lastcb; +}; + +int traverse_dsl_dataset(struct dsl_dataset *ds, uint64_t txg_start, + int advance, blkptr_cb_t func, void *arg); + +traverse_handle_t *traverse_init(spa_t *spa, blkptr_cb_t *func, void *arg, + int advance, int zio_flags); +void traverse_fini(traverse_handle_t *th); + +void traverse_add_dnode(traverse_handle_t *th, + uint64_t mintxg, uint64_t maxtxg, uint64_t objset, uint64_t object); +void traverse_add_objset(traverse_handle_t *th, + uint64_t mintxg, uint64_t maxtxg, uint64_t objset); +void traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg); + +int traverse_more(traverse_handle_t *th); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_TRAVERSE_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h new file mode 100644 index 0000000..89f4799 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h @@ -0,0 +1,134 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_TX_H +#define _SYS_DMU_TX_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/txg.h> +#include <sys/refcount.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dmu_buf_impl; +struct dmu_tx_hold; +struct dnode_link; +struct dsl_pool; +struct dnode; +struct dsl_dir; + +struct dmu_tx { + /* + * No synchronization is needed because a tx can only be handled + * by one thread. + */ + list_t tx_holds; /* list of dmu_tx_hold_t */ + objset_t *tx_objset; + struct dsl_dir *tx_dir; + struct dsl_pool *tx_pool; + uint64_t tx_txg; + uint64_t tx_lastsnap_txg; + uint64_t tx_lasttried_txg; + txg_handle_t tx_txgh; + void *tx_tempreserve_cookie; + struct dmu_tx_hold *tx_needassign_txh; + uint8_t tx_anyobj; + int tx_err; +#ifdef ZFS_DEBUG + uint64_t tx_space_towrite; + uint64_t tx_space_tofree; + uint64_t tx_space_tooverwrite; + refcount_t tx_space_written; + refcount_t tx_space_freed; +#endif +}; + +enum dmu_tx_hold_type { + THT_NEWOBJECT, + THT_WRITE, + THT_BONUS, + THT_FREE, + THT_ZAP, + THT_SPACE, + THT_NUMTYPES +}; + +typedef struct dmu_tx_hold { + dmu_tx_t *txh_tx; + list_node_t txh_node; + struct dnode *txh_dnode; + uint64_t txh_space_towrite; + uint64_t txh_space_tofree; + uint64_t txh_space_tooverwrite; +#ifdef ZFS_DEBUG + enum dmu_tx_hold_type txh_type; + uint64_t txh_arg1; + uint64_t txh_arg2; +#endif +} dmu_tx_hold_t; + + +/* + * These routines are defined in dmu.h, and are called by the user. + */ +dmu_tx_t *dmu_tx_create(objset_t *dd); +int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +void dmu_tx_commit(dmu_tx_t *tx); +void dmu_tx_abort(dmu_tx_t *tx); +uint64_t dmu_tx_get_txg(dmu_tx_t *tx); +void dmu_tx_wait(dmu_tx_t *tx); + +/* + * These routines are defined in dmu_spa.h, and are called by the SPA. + */ +extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg); + +/* + * These routines are only called by the DMU. + */ +dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd); +int dmu_tx_is_syncing(dmu_tx_t *tx); +int dmu_tx_private_ok(dmu_tx_t *tx); +void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object); +void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta); +void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); +int dmu_tx_holds(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); + +#ifdef ZFS_DEBUG +#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db) +#else +#define DMU_TX_DIRTY_BUF(tx, db) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_TX_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h new file mode 100644 index 0000000..c94bced --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DFETCH_H +#define _DFETCH_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern uint64_t zfetch_array_rd_sz; + +struct dnode; /* so we can reference dnode */ + +typedef enum zfetch_dirn { + ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */ + ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */ +} zfetch_dirn_t; + +typedef struct zstream { + uint64_t zst_offset; /* offset of starting block in range */ + uint64_t zst_len; /* length of range, in blocks */ + zfetch_dirn_t zst_direction; /* direction of prefetch */ + uint64_t zst_stride; /* length of stride, in blocks */ + uint64_t zst_ph_offset; /* prefetch offset, in blocks */ + uint64_t zst_cap; /* prefetch limit (cap), in blocks */ + kmutex_t zst_lock; /* protects stream */ + clock_t zst_last; /* lbolt of last prefetch */ + avl_node_t zst_node; /* embed avl node here */ +} zstream_t; + +typedef struct zfetch { + krwlock_t zf_rwlock; /* protects zfetch structure */ + list_t zf_stream; /* AVL tree of zstream_t's */ + struct dnode *zf_dnode; /* dnode that owns this zfetch */ + uint32_t zf_stream_cnt; /* # of active streams */ + uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */ +} zfetch_t; + +void dmu_zfetch_init(zfetch_t *, struct dnode *); +void dmu_zfetch_rele(zfetch_t *); +void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int); + + +#ifdef __cplusplus +} +#endif + +#endif /* _DFETCH_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h new file mode 100644 index 0000000..327e538 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h @@ -0,0 +1,267 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DNODE_H +#define _SYS_DNODE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/avl.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/refcount.h> +#include <sys/dmu_zfetch.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Flags. + */ +#define DNODE_MUST_BE_ALLOCATED 1 +#define DNODE_MUST_BE_FREE 2 + +/* + * Fixed constants. + */ +#define DNODE_SHIFT 9 /* 512 bytes */ +#define DN_MIN_INDBLKSHIFT 10 /* 1k */ +#define DN_MAX_INDBLKSHIFT 14 /* 16k */ +#define DNODE_BLOCK_SHIFT 14 /* 16k */ +#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ +#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ +#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ + +/* + * Derived constants. + */ +#define DNODE_SIZE (1 << DNODE_SHIFT) +#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) +#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) +#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) + +#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) +#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) +#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) + +/* The +2 here is a cheesy way to round up */ +#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ + (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) + +#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ + (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) + +#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ + (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) + +#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) + +struct dmu_buf_impl; +struct objset_impl; +struct zio; + +enum dnode_dirtycontext { + DN_UNDIRTIED, + DN_DIRTY_OPEN, + DN_DIRTY_SYNC +}; + +/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ +#define DNODE_FLAG_USED_BYTES (1<<0) + +typedef struct dnode_phys { + uint8_t dn_type; /* dmu_object_type_t */ + uint8_t dn_indblkshift; /* ln2(indirect block size) */ + uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */ + uint8_t dn_nblkptr; /* length of dn_blkptr */ + uint8_t dn_bonustype; /* type of data in bonus buffer */ + uint8_t dn_checksum; /* ZIO_CHECKSUM type */ + uint8_t dn_compress; /* ZIO_COMPRESS type */ + uint8_t dn_flags; /* DNODE_FLAG_* */ + uint16_t dn_datablkszsec; /* data block size in 512b sectors */ + uint16_t dn_bonuslen; /* length of dn_bonus */ + uint8_t dn_pad2[4]; + + /* accounting is protected by dn_dirty_mtx */ + uint64_t dn_maxblkid; /* largest allocated block ID */ + uint64_t dn_used; /* bytes (or sectors) of disk space */ + + uint64_t dn_pad3[4]; + + blkptr_t dn_blkptr[1]; + uint8_t dn_bonus[DN_MAX_BONUSLEN]; +} dnode_phys_t; + +typedef struct dnode { + /* + * dn_struct_rwlock protects the structure of the dnode, + * including the number of levels of indirection (dn_nlevels), + * dn_maxblkid, and dn_next_* + */ + krwlock_t dn_struct_rwlock; + + /* + * Our link on dataset's dd_dnodes list. + * Protected by dd_accounting_mtx. + */ + list_node_t dn_link; + + /* immutable: */ + struct objset_impl *dn_objset; + uint64_t dn_object; + struct dmu_buf_impl *dn_dbuf; + dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ + + /* + * Copies of stuff in dn_phys. They're valid in the open + * context (eg. even before the dnode is first synced). + * Where necessary, these are protected by dn_struct_rwlock. + */ + dmu_object_type_t dn_type; /* object type */ + uint16_t dn_bonuslen; /* bonus length */ + uint8_t dn_bonustype; /* bonus type */ + uint8_t dn_nblkptr; /* number of blkptrs (immutable) */ + uint8_t dn_checksum; /* ZIO_CHECKSUM type */ + uint8_t dn_compress; /* ZIO_COMPRESS type */ + uint8_t dn_nlevels; + uint8_t dn_indblkshift; + uint8_t dn_datablkshift; /* zero if blksz not power of 2! */ + uint16_t dn_datablkszsec; /* in 512b sectors */ + uint32_t dn_datablksz; /* in bytes */ + uint64_t dn_maxblkid; + uint8_t dn_next_nlevels[TXG_SIZE]; + uint8_t dn_next_indblkshift[TXG_SIZE]; + uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ + + /* protected by os_lock: */ + list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ + + /* protected by dn_mtx: */ + kmutex_t dn_mtx; + list_t dn_dirty_records[TXG_SIZE]; + avl_tree_t dn_ranges[TXG_SIZE]; + uint64_t dn_allocated_txg; + uint64_t dn_free_txg; + uint64_t dn_assigned_txg; + kcondvar_t dn_notxholds; + enum dnode_dirtycontext dn_dirtyctx; + uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ + + /* protected by own devices */ + refcount_t dn_tx_holds; + refcount_t dn_holds; + + kmutex_t dn_dbufs_mtx; + list_t dn_dbufs; /* linked list of descendent dbuf_t's */ + struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ + + /* parent IO for current sync write */ + zio_t *dn_zio; + + /* holds prefetch structure */ + struct zfetch dn_zfetch; +} dnode_t; + +typedef struct free_range { + avl_node_t fr_node; + uint64_t fr_blkid; + uint64_t fr_nblks; +} free_range_t; + +dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp, + uint64_t object); +void dnode_special_close(dnode_t *dn); + +int dnode_hold(struct objset_impl *dd, uint64_t object, + void *ref, dnode_t **dnp); +int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, + void *ref, dnode_t **dnp); +void dnode_add_ref(dnode_t *dn, void *ref); +void dnode_rele(dnode_t *dn, void *ref); +void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); +void dnode_sync(dnode_t *dn, dmu_tx_t *tx); +void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +void dnode_free(dnode_t *dn, dmu_tx_t *tx); +void dnode_byteswap(dnode_phys_t *dnp); +void dnode_buf_byteswap(void *buf, size_t size); +void dnode_verify(dnode_t *dn); +int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); +uint64_t dnode_current_max_length(dnode_t *dn); +void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); +void dnode_clear_range(dnode_t *dn, uint64_t blkid, + uint64_t nblks, dmu_tx_t *tx); +void dnode_diduse_space(dnode_t *dn, int64_t space); +void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx); +void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx); +uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); +void dnode_init(void); +void dnode_fini(void); +int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl, + uint64_t blkfill, uint64_t txg); +int dnode_evict_dbufs(dnode_t *dn, int try); + +#ifdef ZFS_DEBUG + +/* + * There should be a ## between the string literal and fmt, to make it + * clear that we're joining two strings together, but that piece of shit + * gcc doesn't support that preprocessor token. + */ +#define dprintf_dnode(dn, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char __db_buf[32]; \ + uint64_t __db_obj = (dn)->dn_object; \ + if (__db_obj == DMU_META_DNODE_OBJECT) \ + (void) strcpy(__db_buf, "mdn"); \ + else \ + (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ + (u_longlong_t)__db_obj);\ + dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \ + __db_buf, __VA_ARGS__); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define DNODE_VERIFY(dn) dnode_verify(dn) +#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx) + +#else + +#define dprintf_dnode(db, fmt, ...) +#define DNODE_VERIFY(dn) +#define FREE_VERIFY(db, start, end, tx) + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DNODE_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h new file mode 100644 index 0000000..8929dbc --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -0,0 +1,185 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_DATASET_H +#define _SYS_DSL_DATASET_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/bplist.h> +#include <sys/dsl_synctask.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; +struct dsl_dir; +struct dsl_pool; + +typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); + +#define DS_FLAG_INCONSISTENT (1ULL<<0) +/* + * NB: nopromote can not yet be set, but we want support for it in this + * on-disk version, so that we don't need to upgrade for it later. It + * will be needed when we implement 'zfs split' (where the split off + * clone should not be promoted). + */ +#define DS_FLAG_NOPROMOTE (1ULL<<1) + +typedef struct dsl_dataset_phys { + uint64_t ds_dir_obj; + uint64_t ds_prev_snap_obj; + uint64_t ds_prev_snap_txg; + uint64_t ds_next_snap_obj; + uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */ + uint64_t ds_num_children; /* clone/snap children; ==0 for head */ + uint64_t ds_creation_time; /* seconds since 1970 */ + uint64_t ds_creation_txg; + uint64_t ds_deadlist_obj; + uint64_t ds_used_bytes; + uint64_t ds_compressed_bytes; + uint64_t ds_uncompressed_bytes; + uint64_t ds_unique_bytes; /* only relevant to snapshots */ + /* + * The ds_fsid_guid is a 56-bit ID that can change to avoid + * collisions. The ds_guid is a 64-bit ID that will never + * change, so there is a small probability that it will collide. + */ + uint64_t ds_fsid_guid; + uint64_t ds_guid; + uint64_t ds_flags; + blkptr_t ds_bp; + uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */ +} dsl_dataset_phys_t; + +typedef struct dsl_dataset { + /* Immutable: */ + struct dsl_dir *ds_dir; + dsl_dataset_phys_t *ds_phys; + dmu_buf_t *ds_dbuf; + uint64_t ds_object; + + /* only used in syncing context: */ + struct dsl_dataset *ds_prev; /* only valid for non-snapshots */ + + /* has internal locking: */ + bplist_t ds_deadlist; + + /* protected by lock on pool's dp_dirty_datasets list */ + txg_node_t ds_dirty_link; + list_node_t ds_synced_link; + + /* + * ds_phys->ds_<accounting> is also protected by ds_lock. + * Protected by ds_lock: + */ + kmutex_t ds_lock; + void *ds_user_ptr; + dsl_dataset_evict_func_t *ds_user_evict_func; + uint64_t ds_open_refcount; + + /* no locking; only for making guesses */ + uint64_t ds_trysnap_txg; + + /* Protected by ds_lock; keep at end of struct for better locality */ + char ds_snapname[MAXNAMELEN]; +} dsl_dataset_t; + +#define dsl_dataset_is_snapshot(ds) \ + ((ds)->ds_phys->ds_num_children != 0) + +int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, + void *tag, dsl_dataset_t **dsp); +int dsl_dataset_open(const char *name, int mode, void *tag, + dsl_dataset_t **dsp); +int dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj, + const char *tail, int mode, void *tag, dsl_dataset_t **); +void dsl_dataset_name(dsl_dataset_t *ds, char *name); +void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag); +uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, + const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx); +int dsl_dataset_destroy(const char *name); +int dsl_snapshots_destroy(char *fsname, char *snapname); +dsl_checkfunc_t dsl_dataset_snapshot_check; +dsl_syncfunc_t dsl_dataset_snapshot_sync; +int dsl_dataset_rollback(dsl_dataset_t *ds); +int dsl_dataset_rename(const char *name, const char *newname); +int dsl_dataset_promote(const char *name); + +void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds, + void *p, dsl_dataset_evict_func_t func); +void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds); + +blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); +void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); + +spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); + +void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); + +void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); +void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, + dmu_tx_t *tx); +int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); +uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); + +void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); +void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv); +void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat); +void dsl_dataset_space(dsl_dataset_t *ds, + uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); +uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); + +void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp, + dmu_tx_t *tx); + +int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); + +#ifdef ZFS_DEBUG +#define dprintf_ds(ds, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \ + dsl_dataset_name(ds, __ds_name); \ + dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \ + kmem_free(__ds_name, MAXNAMELEN); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_ds(dd, fmt, ...) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DATASET_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h new file mode 100644 index 0000000..f33776a --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h @@ -0,0 +1,142 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_DIR_H +#define _SYS_DSL_DIR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/refcount.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; + +typedef struct dsl_dir_phys { + uint64_t dd_creation_time; /* not actually used */ + uint64_t dd_head_dataset_obj; + uint64_t dd_parent_obj; + uint64_t dd_clone_parent_obj; + uint64_t dd_child_dir_zapobj; + /* + * how much space our children are accounting for; for leaf + * datasets, == physical space used by fs + snaps + */ + uint64_t dd_used_bytes; + uint64_t dd_compressed_bytes; + uint64_t dd_uncompressed_bytes; + /* Administrative quota setting */ + uint64_t dd_quota; + /* Administrative reservation setting */ + uint64_t dd_reserved; + uint64_t dd_props_zapobj; + uint64_t dd_pad[21]; /* pad out to 256 bytes for good measure */ +} dsl_dir_phys_t; + +struct dsl_dir { + /* These are immutable; no lock needed: */ + uint64_t dd_object; + dsl_dir_phys_t *dd_phys; + dmu_buf_t *dd_dbuf; + dsl_pool_t *dd_pool; + + /* protected by lock on pool's dp_dirty_dirs list */ + txg_node_t dd_dirty_link; + + /* protected by dp_config_rwlock */ + dsl_dir_t *dd_parent; + + /* Protected by dd_lock */ + kmutex_t dd_lock; + list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ + + /* Accounting */ + /* reflects any changes to dd_phys->dd_used_bytes made this syncing */ + int64_t dd_used_bytes; + /* gross estimate of space used by in-flight tx's */ + uint64_t dd_tempreserved[TXG_SIZE]; + /* amount of space we expect to write; == amount of dirty data */ + int64_t dd_space_towrite[TXG_SIZE]; + + /* protected by dd_lock; keep at end of struct for better locality */ + char dd_myname[MAXNAMELEN]; +}; + +void dsl_dir_close(dsl_dir_t *dd, void *tag); +int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail); +int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **, + const char **tailp); +int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, + const char *tail, void *tag, dsl_dir_t **); +void dsl_dir_name(dsl_dir_t *dd, char *buf); +int dsl_dir_is_private(dsl_dir_t *dd); +uint64_t dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx); +void dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx); +dsl_checkfunc_t dsl_dir_destroy_check; +dsl_syncfunc_t dsl_dir_destroy_sync; +void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); +uint64_t dsl_dir_space_available(dsl_dir_t *dd, + dsl_dir_t *ancestor, int64_t delta, int ondiskonly); +void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); +void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); +int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, + uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx); +void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); +void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); +void dsl_dir_diduse_space(dsl_dir_t *dd, + int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); +int dsl_dir_set_quota(const char *ddname, uint64_t quota); +int dsl_dir_set_reservation(const char *ddname, uint64_t reservation); +int dsl_dir_rename(dsl_dir_t *dd, const char *newname); +int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); + +/* internal reserved dir name */ +#define MOS_DIR_NAME "$MOS" + +#ifdef ZFS_DEBUG +#define dprintf_dd(dd, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \ + KM_SLEEP); \ + dsl_dir_name(dd, __ds_name); \ + dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \ + kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_dd(dd, fmt, ...) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DIR_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h new file mode 100644 index 0000000..f7ec67a --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h @@ -0,0 +1,82 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_POOL_H +#define _SYS_DSL_POOL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/txg_impl.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct objset; +struct dsl_dir; + +typedef struct dsl_pool { + /* Immutable */ + spa_t *dp_spa; + struct objset *dp_meta_objset; + struct dsl_dir *dp_root_dir; + struct dsl_dir *dp_mos_dir; + uint64_t dp_root_dir_obj; + + /* No lock needed - sync context only */ + blkptr_t dp_meta_rootbp; + list_t dp_synced_objsets; + + /* Has its own locking */ + tx_state_t dp_tx; + txg_list_t dp_dirty_datasets; + txg_list_t dp_dirty_dirs; + txg_list_t dp_sync_tasks; + + /* + * Protects administrative changes (properties, namespace) + * It is only held for write in syncing context. Therefore + * syncing context does not need to ever have it for read, since + * nobody else could possibly have it for write. + */ + krwlock_t dp_config_rwlock; +} dsl_pool_t; + +int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); +void dsl_pool_close(dsl_pool_t *dp); +dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg); +void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); +void dsl_pool_zil_clean(dsl_pool_t *dp); +int dsl_pool_sync_context(dsl_pool_t *dp); +uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_POOL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h new file mode 100644 index 0000000..d2debff --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_PROP_H +#define _SYS_DSL_PROP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; + +/* The callback func may not call into the DMU or DSL! */ +typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval); + +typedef struct dsl_prop_cb_record { + list_node_t cbr_node; /* link on dd_prop_cbs */ + struct dsl_dataset *cbr_ds; + const char *cbr_propname; + dsl_prop_changed_cb_t *cbr_func; + void *cbr_arg; +} dsl_prop_cb_record_t; + +int dsl_prop_register(struct dsl_dataset *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg); +int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg); +int dsl_prop_numcb(struct dsl_dataset *ds); + +int dsl_prop_get(const char *ddname, const char *propname, + int intsz, int numints, void *buf, char *setpoint); +int dsl_prop_get_integer(const char *ddname, const char *propname, + uint64_t *valuep, char *setpoint); +int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); + +int dsl_prop_set(const char *ddname, const char *propname, + int intsz, int numints, const void *buf); +int dsl_prop_set_dd(dsl_dir_t *dd, const char *propname, + int intsz, int numints, const void *buf); + +void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); +void dsl_prop_nvlist_add_string(nvlist_t *nv, + zfs_prop_t prop, const char *value); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_PROP_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h new file mode 100644 index 0000000..e695b18 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_SYNCTASK_H +#define _SYS_DSL_SYNCTASK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/txg.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_pool; + +typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); +typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *); + +typedef struct dsl_sync_task { + list_node_t dst_node; + dsl_checkfunc_t *dst_checkfunc; + dsl_syncfunc_t *dst_syncfunc; + void *dst_arg1; + void *dst_arg2; + int dst_err; +} dsl_sync_task_t; + +typedef struct dsl_sync_task_group { + txg_node_t dstg_node; + list_t dstg_tasks; + struct dsl_pool *dstg_pool; + uint64_t dstg_txg; + int dstg_err; + int dstg_space; +} dsl_sync_task_group_t; + +dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp); +void dsl_sync_task_create(dsl_sync_task_group_t *dstg, + dsl_checkfunc_t *, dsl_syncfunc_t *, + void *arg1, void *arg2, int blocks_modified); +int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg); +void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg); +void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); + +int dsl_sync_task_do(struct dsl_pool *dp, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_SYNCTASK_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h new file mode 100644 index 0000000..095dd3c --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h @@ -0,0 +1,69 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_METASLAB_H +#define _SYS_METASLAB_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/space_map.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct metaslab_class metaslab_class_t; +typedef struct metaslab_group metaslab_group_t; + +extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, + uint64_t start, uint64_t size, uint64_t txg); +extern void metaslab_fini(metaslab_t *msp); +extern void metaslab_sync(metaslab_t *msp, uint64_t txg); +extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); + +extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, + int ncopies, uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid); +extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, + boolean_t now); +extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); + +extern metaslab_class_t *metaslab_class_create(void); +extern void metaslab_class_destroy(metaslab_class_t *mc); +extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg); +extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg); + +extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, + vdev_t *vd); +extern void metaslab_group_destroy(metaslab_group_t *mg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_METASLAB_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h new file mode 100644 index 0000000..5980cbc --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h @@ -0,0 +1,81 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_METASLAB_IMPL_H +#define _SYS_METASLAB_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/metaslab.h> +#include <sys/space_map.h> +#include <sys/vdev.h> +#include <sys/txg.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct metaslab_class { + metaslab_group_t *mc_rotor; + uint64_t mc_allocated; +}; + +struct metaslab_group { + kmutex_t mg_lock; + avl_tree_t mg_metaslab_tree; + uint64_t mg_aliquot; + int64_t mg_bias; + metaslab_class_t *mg_class; + vdev_t *mg_vd; + metaslab_group_t *mg_prev; + metaslab_group_t *mg_next; +}; + +/* + * Each metaslab's free space is tracked in space map object in the MOS, + * which is only updated in syncing context. Each time we sync a txg, + * we append the allocs and frees from that txg to the space map object. + * When the txg is done syncing, metaslab_sync_done() updates ms_smo + * to ms_smo_syncing. Everything in ms_smo is always safe to allocate. + */ +struct metaslab { + kmutex_t ms_lock; /* metaslab lock */ + space_map_obj_t ms_smo; /* synced space map object */ + space_map_obj_t ms_smo_syncing; /* syncing space map object */ + space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */ + space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */ + space_map_t ms_map; /* in-core free space map */ + uint64_t ms_weight; /* weight vs. others in group */ + metaslab_group_t *ms_group; /* metaslab group */ + avl_node_t ms_group_node; /* node in metaslab group tree */ + txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_METASLAB_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h new file mode 100644 index 0000000..4de1cae --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h @@ -0,0 +1,103 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_REFCOUNT_H +#define _SYS_REFCOUNT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/list.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * If the reference is held only by the calling function and not any + * particular object, use FTAG (which is a string) for the holder_tag. + * Otherwise, use the object that holds the reference. + */ +#define FTAG ((char *)__func__) + +#if defined(DEBUG) || !defined(_KERNEL) +typedef struct reference { + list_node_t ref_link; + void *ref_holder; + uint64_t ref_number; + uint8_t *ref_removed; +} reference_t; + +typedef struct refcount { + kmutex_t rc_mtx; + list_t rc_list; + list_t rc_removed; + int64_t rc_count; + int64_t rc_removed_count; +} refcount_t; + +/* Note: refcount_t should be initialized to zero before use. */ + +void refcount_create(refcount_t *rc); +void refcount_destroy(refcount_t *rc); +void refcount_destroy_many(refcount_t *rc, uint64_t number); +int refcount_is_zero(refcount_t *rc); +int64_t refcount_count(refcount_t *rc); +int64_t refcount_add(refcount_t *rc, void *holder_tag); +int64_t refcount_remove(refcount_t *rc, void *holder_tag); +int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); +int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); + +void refcount_init(void); +void refcount_fini(void); + +#else /* DEBUG */ + +typedef struct refcount { + uint64_t rc_count; +} refcount_t; + +#define refcount_create(rc) ((rc)->rc_count = 0) +#define refcount_destroy(rc) ((rc)->rc_count = 0) +#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0) +#define refcount_is_zero(rc) ((rc)->rc_count == 0) +#define refcount_count(rc) ((rc)->rc_count) +#define refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1) +#define refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1) +#define refcount_add_many(rc, number, holder) \ + atomic_add_64_nv(&(rc)->rc_count, number) +#define refcount_remove_many(rc, number, holder) \ + atomic_add_64_nv(&(rc)->rc_count, -number) + +#define refcount_init() +#define refcount_fini() + +#endif /* DEBUG */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_REFCOUNT_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h new file mode 100644 index 0000000..2bcf4c8 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -0,0 +1,491 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SPA_H +#define _SYS_SPA_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/avl.h> +#include <sys/zfs_context.h> +#include <sys/nvpair.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/fs/zfs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Forward references that lots of things need. + */ +typedef struct spa spa_t; +typedef struct vdev vdev_t; +typedef struct metaslab metaslab_t; +typedef struct zilog zilog_t; +typedef struct traverse_handle traverse_handle_t; +struct dsl_pool; + +/* + * General-purpose 32-bit and 64-bit bitfield encodings. + */ +#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) +#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) +#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) +#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) + +#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) +#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) + +#define BF32_SET(x, low, len, val) \ + ((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len)) +#define BF64_SET(x, low, len, val) \ + ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)) + +#define BF32_GET_SB(x, low, len, shift, bias) \ + ((BF32_GET(x, low, len) + (bias)) << (shift)) +#define BF64_GET_SB(x, low, len, shift, bias) \ + ((BF64_GET(x, low, len) + (bias)) << (shift)) + +#define BF32_SET_SB(x, low, len, shift, bias, val) \ + BF32_SET(x, low, len, ((val) >> (shift)) - (bias)) +#define BF64_SET_SB(x, low, len, shift, bias, val) \ + BF64_SET(x, low, len, ((val) >> (shift)) - (bias)) + +/* + * We currently support nine block sizes, from 512 bytes to 128K. + * We could go higher, but the benefits are near-zero and the cost + * of COWing a giant block to modify one byte would become excessive. + */ +#define SPA_MINBLOCKSHIFT 9 +#define SPA_MAXBLOCKSHIFT 17 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + +#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1) + +/* + * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. + * The ASIZE encoding should be at least 64 times larger (6 more bits) + * to support up to 4-way RAID-Z mirror mode with worst-case gang block + * overhead, three DVAs per bp, plus one more bit in case we do anything + * else that expands the ASIZE. + */ +#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ +#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ +#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ + +/* + * All SPA data is represented by 128-bit data virtual addresses (DVAs). + * The members of the dva_t should be considered opaque outside the SPA. + */ +typedef struct dva { + uint64_t dva_word[2]; +} dva_t; + +/* + * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. + */ +typedef struct zio_cksum { + uint64_t zc_word[4]; +} zio_cksum_t; + +/* + * Each block is described by its DVAs, time of birth, checksum, etc. + * The word-by-word, bit-by-bit layout of the blkptr is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | vdev1 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 1 |G| offset1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 2 | vdev2 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 3 |G| offset2 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 4 | vdev3 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 5 |G| offset3 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 8 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 9 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | fill count | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * c | checksum[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * d | checksum[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * e | checksum[2] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * f | checksum[3] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * vdev virtual device ID + * offset offset into virtual device + * LSIZE logical size + * PSIZE physical size (after compression) + * ASIZE allocated size (including RAID-Z parity and gang block headers) + * GRID RAID-Z layout information (reserved for future use) + * cksum checksum function + * comp compression function + * G gang block indicator + * E endianness + * type DMU object type + * lvl level of indirection + * birth txg transaction group in which the block was born + * fill count number of non-zero blocks under this bp + * checksum[4] 256-bit checksum of the data this bp describes + */ +typedef struct blkptr { + dva_t blk_dva[3]; /* 128-bit Data Virtual Address */ + uint64_t blk_prop; /* size, compression, type, etc */ + uint64_t blk_pad[3]; /* Extra space for the future */ + uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_fill; /* fill count */ + zio_cksum_t blk_cksum; /* 256-bit checksum */ +} blkptr_t; + +#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ +#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ + +/* + * Macros to get and set fields in a bp or DVA. + */ +#define DVA_GET_ASIZE(dva) \ + BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_ASIZE(dva, x) \ + BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) +#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) + +#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32) +#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x) + +#define DVA_GET_OFFSET(dva) \ + BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_OFFSET(dva, x) \ + BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) +#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) + +#define BP_GET_LSIZE(bp) \ + (BP_IS_HOLE(bp) ? 0 : \ + BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)) +#define BP_SET_LSIZE(bp, x) \ + BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define BP_GET_PSIZE(bp) \ + BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) +#define BP_SET_PSIZE(bp, x) \ + BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) +#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) + +#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) +#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) + +#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) +#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) + +#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) +#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) + +#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) +#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) + +#define BP_GET_ASIZE(bp) \ + (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + DVA_GET_ASIZE(&(bp)->blk_dva[2])) + +#define BP_GET_UCSIZE(bp) \ + ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \ + BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)); + +#define BP_GET_NDVAS(bp) \ + (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) + +#define BP_COUNT_GANG(bp) \ + (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ + DVA_GET_GANG(&(bp)->blk_dva[1]) + \ + DVA_GET_GANG(&(bp)->blk_dva[2])) + +#define DVA_EQUAL(dva1, dva2) \ + ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ + (dva1)->dva_word[0] == (dva2)->dva_word[0]) + +#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ + (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ + ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ + ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ + ((zc1).zc_word[3] - (zc2).zc_word[3]))) + + +#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) + +#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ +{ \ + (zcp)->zc_word[0] = w0; \ + (zcp)->zc_word[1] = w1; \ + (zcp)->zc_word[2] = w2; \ + (zcp)->zc_word[3] = w3; \ +} + +#define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) +#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) +#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) +#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg)) + +#define BP_ZERO(bp) \ +{ \ + (bp)->blk_dva[0].dva_word[0] = 0; \ + (bp)->blk_dva[0].dva_word[1] = 0; \ + (bp)->blk_dva[1].dva_word[0] = 0; \ + (bp)->blk_dva[1].dva_word[1] = 0; \ + (bp)->blk_dva[2].dva_word[0] = 0; \ + (bp)->blk_dva[2].dva_word[1] = 0; \ + (bp)->blk_prop = 0; \ + (bp)->blk_pad[0] = 0; \ + (bp)->blk_pad[1] = 0; \ + (bp)->blk_pad[2] = 0; \ + (bp)->blk_birth = 0; \ + (bp)->blk_fill = 0; \ + ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ +} + +/* + * Note: the byteorder is either 0 or -1, both of which are palindromes. + * This simplifies the endianness handling a bit. + */ +#ifdef _BIG_ENDIAN +#define ZFS_HOST_BYTEORDER (0ULL) +#else +#define ZFS_HOST_BYTEORDER (-1ULL) +#endif + +#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) + +#define BP_SPRINTF_LEN 320 + +#include <sys/dmu.h> + +#define BP_GET_BUFC_TYPE(bp) \ + (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \ + ARC_BUFC_METADATA : ARC_BUFC_DATA); +/* + * Routines found in spa.c + */ + +/* state manipulation functions */ +extern int spa_open(const char *pool, spa_t **, void *tag); +extern int spa_get_stats(const char *pool, nvlist_t **config, + char *altroot, size_t buflen); +extern int spa_create(const char *pool, nvlist_t *config, const char *altroot); +extern int spa_import(const char *pool, nvlist_t *config, const char *altroot); +extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); +extern int spa_destroy(char *pool); +extern int spa_export(char *pool, nvlist_t **oldconfig); +extern int spa_reset(char *pool); +extern void spa_async_request(spa_t *spa, int flag); +extern void spa_async_suspend(spa_t *spa); +extern void spa_async_resume(spa_t *spa); +extern spa_t *spa_inject_addref(char *pool); +extern void spa_inject_delref(spa_t *spa); + +#define SPA_ASYNC_REOPEN 0x01 +#define SPA_ASYNC_REPLACE_DONE 0x02 +#define SPA_ASYNC_SCRUB 0x04 +#define SPA_ASYNC_RESILVER 0x08 +#define SPA_ASYNC_CONFIG_UPDATE 0x10 + +/* device manipulation */ +extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); +extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, + int replacing); +extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done); +extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); +extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); + +/* spare state (which is global across all pools) */ +extern void spa_spare_add(vdev_t *vd); +extern void spa_spare_remove(vdev_t *vd); +extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool); +extern void spa_spare_activate(vdev_t *vd); + +/* scrubbing */ +extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force); +extern void spa_scrub_suspend(spa_t *spa); +extern void spa_scrub_resume(spa_t *spa); +extern void spa_scrub_restart(spa_t *spa, uint64_t txg); + +/* spa syncing */ +extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ +extern void spa_sync_allpools(void); + +/* + * SPA configuration functions in spa_config.c + */ + +#define SPA_CONFIG_UPDATE_POOL 0 +#define SPA_CONFIG_UPDATE_VDEVS 1 + +extern void spa_config_sync(void); +extern void spa_config_load(void); +extern nvlist_t *spa_all_configs(uint64_t *); +extern void spa_config_set(spa_t *spa, nvlist_t *config); +extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, + int getstats); +extern void spa_config_update(spa_t *spa, int what); + +/* + * Miscellaneous SPA routines in spa_misc.c + */ + +/* Namespace manipulation */ +extern spa_t *spa_lookup(const char *name); +extern spa_t *spa_add(const char *name, const char *altroot); +extern void spa_remove(spa_t *spa); +extern spa_t *spa_next(spa_t *prev); + +/* Refcount functions */ +extern void spa_open_ref(spa_t *spa, void *tag); +extern void spa_close(spa_t *spa, void *tag); +extern boolean_t spa_refcount_zero(spa_t *spa); + +/* Pool configuration lock */ +extern void spa_config_enter(spa_t *spa, krw_t rw, void *tag); +extern void spa_config_exit(spa_t *spa, void *tag); +extern boolean_t spa_config_held(spa_t *spa, krw_t rw); + +/* Pool vdev add/remove lock */ +extern uint64_t spa_vdev_enter(spa_t *spa); +extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); + +/* Accessor functions */ +extern krwlock_t *spa_traverse_rwlock(spa_t *spa); +extern int spa_traverse_wanted(spa_t *spa); +extern struct dsl_pool *spa_get_dsl(spa_t *spa); +extern blkptr_t *spa_get_rootblkptr(spa_t *spa); +extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); +extern void spa_altroot(spa_t *, char *, size_t); +extern int spa_sync_pass(spa_t *spa); +extern char *spa_name(spa_t *spa); +extern uint64_t spa_guid(spa_t *spa); +extern uint64_t spa_last_synced_txg(spa_t *spa); +extern uint64_t spa_first_txg(spa_t *spa); +extern uint64_t spa_version(spa_t *spa); +extern int spa_state(spa_t *spa); +extern uint64_t spa_freeze_txg(spa_t *spa); +struct metaslab_class; +extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa); +extern uint64_t spa_get_alloc(spa_t *spa); +extern uint64_t spa_get_space(spa_t *spa); +extern uint64_t spa_get_dspace(spa_t *spa); +extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); +extern uint64_t spa_version(spa_t *spa); +extern int spa_max_replication(spa_t *spa); +extern int spa_busy(void); + +/* Miscellaneous support routines */ +extern int spa_rename(const char *oldname, const char *newname); +extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); +extern char *spa_strdup(const char *); +extern void spa_strfree(char *); +extern uint64_t spa_get_random(uint64_t range); +extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp); +extern void spa_freeze(spa_t *spa); +extern void spa_upgrade(spa_t *spa); +extern void spa_evict_all(void); +extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid); +extern boolean_t spa_has_spare(spa_t *, uint64_t guid); +extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp); + +/* history logging */ +extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); +extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, + char *his_buf); +extern int spa_history_log(spa_t *spa, const char *his_buf, + uint64_t pool_create); + +/* error handling */ +struct zbookmark; +struct zio; +extern void spa_log_error(spa_t *spa, struct zio *zio); +extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t stateoroffset, uint64_t length); +extern void zfs_post_ok(spa_t *spa, vdev_t *vd); +extern uint64_t spa_get_errlog_size(spa_t *spa); +extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); +extern void spa_errlog_rotate(spa_t *spa); +extern void spa_errlog_drain(spa_t *spa); +extern void spa_errlog_sync(spa_t *spa, uint64_t txg); +extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); + +/* Initialization and termination */ +extern void spa_init(int flags); +extern void spa_fini(void); + +/* properties */ +extern int spa_set_props(spa_t *spa, nvlist_t *nvp); +extern int spa_get_props(spa_t *spa, nvlist_t **nvp); +extern void spa_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); +extern boolean_t spa_has_bootfs(spa_t *spa); + +#ifdef ZFS_DEBUG +#define dprintf_bp(bp, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ + sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ + dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ + kmem_free(__blkbuf, BP_SPRINTF_LEN); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_bp(bp, fmt, ...) +#endif + +extern int spa_mode; /* mode, e.g. FREAD | FWRITE */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h new file mode 100644 index 0000000..8c57123 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -0,0 +1,168 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SPA_IMPL_H +#define _SYS_SPA_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/vdev.h> +#include <sys/metaslab.h> +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/uberblock_impl.h> +#include <sys/zfs_context.h> +#include <sys/avl.h> +#include <sys/refcount.h> +#include <sys/bplist.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct spa_config_lock { + kmutex_t scl_lock; + refcount_t scl_count; + kthread_t *scl_writer; + kcondvar_t scl_cv; +} spa_config_lock_t; + +typedef struct spa_error_entry { + zbookmark_t se_bookmark; + char *se_name; + avl_node_t se_avl; +} spa_error_entry_t; + +typedef struct spa_history_phys { + uint64_t sh_pool_create_len; /* ending offset of zpool create */ + uint64_t sh_phys_max_off; /* physical EOF */ + uint64_t sh_bof; /* logical BOF */ + uint64_t sh_eof; /* logical EOF */ + uint64_t sh_records_lost; /* num of records overwritten */ +} spa_history_phys_t; + +typedef struct spa_props { + nvlist_t *spa_props_nvp; + list_node_t spa_list_node; +} spa_props_t; + +struct spa { + /* + * Fields protected by spa_namespace_lock. + */ + char *spa_name; /* pool name */ + avl_node_t spa_avl; /* node in spa_namespace_avl */ + nvlist_t *spa_config; /* last synced config */ + nvlist_t *spa_config_syncing; /* currently syncing config */ + uint64_t spa_config_txg; /* txg of last config change */ + kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */ + int spa_sync_pass; /* iterate-to-convergence */ + int spa_state; /* pool state */ + int spa_inject_ref; /* injection references */ + uint8_t spa_traverse_wanted; /* traverse lock wanted */ + uint8_t spa_sync_on; /* sync threads are running */ + spa_load_state_t spa_load_state; /* current load operation */ + taskq_t *spa_zio_issue_taskq[ZIO_TYPES]; + taskq_t *spa_zio_intr_taskq[ZIO_TYPES]; + dsl_pool_t *spa_dsl_pool; + metaslab_class_t *spa_normal_class; /* normal data class */ + uint64_t spa_first_txg; /* first txg after spa_open() */ + uint64_t spa_final_txg; /* txg of export/destroy */ + uint64_t spa_freeze_txg; /* freeze pool at this txg */ + objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ + txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ + vdev_t *spa_root_vdev; /* top-level vdev container */ + uint64_t spa_load_guid; /* initial guid for spa_load */ + list_t spa_dirty_list; /* vdevs with dirty labels */ + uint64_t spa_spares_object; /* MOS object for spare list */ + nvlist_t *spa_sparelist; /* cached spare config */ + vdev_t **spa_spares; /* available hot spares */ + int spa_nspares; /* number of hot spares */ + boolean_t spa_sync_spares; /* sync the spares list */ + uint64_t spa_config_object; /* MOS object for pool config */ + uint64_t spa_syncing_txg; /* txg currently syncing */ + uint64_t spa_sync_bplist_obj; /* object for deferred frees */ + bplist_t spa_sync_bplist; /* deferred-free bplist */ + krwlock_t spa_traverse_lock; /* traverse vs. spa_sync() */ + uberblock_t spa_ubsync; /* last synced uberblock */ + uberblock_t spa_uberblock; /* current uberblock */ + kmutex_t spa_scrub_lock; /* resilver/scrub lock */ + kthread_t *spa_scrub_thread; /* scrub/resilver thread */ + traverse_handle_t *spa_scrub_th; /* scrub traverse handle */ + uint64_t spa_scrub_restart_txg; /* need to restart */ + uint64_t spa_scrub_mintxg; /* min txg we'll scrub */ + uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */ + uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ + uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */ + uint64_t spa_scrub_errors; /* scrub I/O error count */ + int spa_scrub_suspended; /* tell scrubber to suspend */ + kcondvar_t spa_scrub_cv; /* scrub thread state change */ + kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ + uint8_t spa_scrub_stop; /* tell scrubber to stop */ + uint8_t spa_scrub_active; /* active or suspended? */ + uint8_t spa_scrub_type; /* type of scrub we're doing */ + uint8_t spa_scrub_finished; /* indicator to rotate logs */ + kmutex_t spa_async_lock; /* protect async state */ + kthread_t *spa_async_thread; /* thread doing async task */ + int spa_async_suspended; /* async tasks suspended */ + kcondvar_t spa_async_cv; /* wait for thread_exit() */ + uint16_t spa_async_tasks; /* async task mask */ + char *spa_root; /* alternate root directory */ + kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */ + uint64_t spa_ena; /* spa-wide ereport ENA */ + boolean_t spa_last_open_failed; /* true if last open faled */ + kmutex_t spa_errlog_lock; /* error log lock */ + uint64_t spa_errlog_last; /* last error log object */ + uint64_t spa_errlog_scrub; /* scrub error log object */ + kmutex_t spa_errlist_lock; /* error list/ereport lock */ + avl_tree_t spa_errlist_last; /* last error list */ + avl_tree_t spa_errlist_scrub; /* scrub error list */ + uint64_t spa_deflate; /* should we deflate? */ + uint64_t spa_history; /* history object */ + kmutex_t spa_history_lock; /* history lock */ + vdev_t *spa_pending_vdev; /* pending vdev additions */ + nvlist_t **spa_pending_spares; /* pending spare additions */ + uint_t spa_pending_nspares; /* # pending spares */ + kmutex_t spa_props_lock; /* property lock */ + uint64_t spa_pool_props_object; /* object for properties */ + uint64_t spa_bootfs; /* default boot filesystem */ + /* + * spa_refcnt must be the last element because it changes size based on + * compilation options. In order for the MDB module to function + * correctly, the other fields must remain in the same location. + */ + spa_config_lock_t spa_config_lock; /* configuration changes */ + refcount_t spa_refcount; /* number of opens */ +}; + +extern const char *spa_config_dir; +extern kmutex_t spa_namespace_lock; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h new file mode 100644 index 0000000..db9daef --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h @@ -0,0 +1,162 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SPACE_MAP_H +#define _SYS_SPACE_MAP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/avl.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct space_map_ops space_map_ops_t; + +typedef struct space_map { + avl_tree_t sm_root; /* AVL tree of map segments */ + uint64_t sm_space; /* sum of all segments in the map */ + uint64_t sm_start; /* start of map */ + uint64_t sm_size; /* size of map */ + uint8_t sm_shift; /* unit shift */ + uint8_t sm_pad[3]; /* unused */ + uint8_t sm_loaded; /* map loaded? */ + uint8_t sm_loading; /* map loading? */ + kcondvar_t sm_load_cv; /* map load completion */ + space_map_ops_t *sm_ops; /* space map block picker ops vector */ + void *sm_ppd; /* picker-private data */ + kmutex_t *sm_lock; /* pointer to lock that protects map */ +} space_map_t; + +typedef struct space_seg { + avl_node_t ss_node; /* AVL node */ + uint64_t ss_start; /* starting offset of this segment */ + uint64_t ss_end; /* ending offset (non-inclusive) */ +} space_seg_t; + +typedef struct space_map_obj { + uint64_t smo_object; /* on-disk space map object */ + uint64_t smo_objsize; /* size of the object */ + uint64_t smo_alloc; /* space allocated from the map */ +} space_map_obj_t; + +struct space_map_ops { + void (*smop_load)(space_map_t *sm); + void (*smop_unload)(space_map_t *sm); + uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size); + void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size); + void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size); +}; + +/* + * debug entry + * + * 1 3 10 50 + * ,---+--------+------------+---------------------------------. + * | 1 | action | syncpass | txg (lower bits) | + * `---+--------+------------+---------------------------------' + * 63 62 60 59 50 49 0 + * + * + * + * non-debug entry + * + * 1 47 1 15 + * ,-----------------------------------------------------------. + * | 0 | offset (sm_shift units) | type | run | + * `-----------------------------------------------------------' + * 63 62 17 16 15 0 + */ + +/* All this stuff takes and returns bytes */ +#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1) +#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15) +#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) +#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) +#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47) +#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47) +#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1) +#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1) + +#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3) +#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3) + +#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) +#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) + +#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) +#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) + +#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) + +#define SM_ALLOC 0x0 +#define SM_FREE 0x1 + +/* + * The data for a given space map can be kept on blocks of any size. + * Larger blocks entail fewer i/o operations, but they also cause the + * DMU to keep more data in-core, and also to waste more i/o bandwidth + * when only a few blocks have changed since the last transaction group. + * This could use a lot more research, but for now, set the freelist + * block size to 4k (2^12). + */ +#define SPACE_MAP_BLOCKSHIFT 12 + +typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size); + +extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size, + uint8_t shift, kmutex_t *lp); +extern void space_map_destroy(space_map_t *sm); +extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); +extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_vacate(space_map_t *sm, + space_map_func_t *func, space_map_t *mdest); +extern void space_map_walk(space_map_t *sm, + space_map_func_t *func, space_map_t *mdest); +extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_union(space_map_t *smd, space_map_t *sms); + +extern void space_map_load_wait(space_map_t *sm); +extern int space_map_load(space_map_t *sm, space_map_ops_t *ops, + uint8_t maptype, space_map_obj_t *smo, objset_t *os); +extern void space_map_unload(space_map_t *sm); + +extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size); +extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size); + +extern void space_map_sync(space_map_t *sm, uint8_t maptype, + space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); +extern void space_map_truncate(space_map_obj_t *smo, + objset_t *os, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPACE_MAP_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h new file mode 100644 index 0000000..dae129c --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_TXG_H +#define _SYS_TXG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */ +#define TXG_SIZE 4 /* next power of 2 */ +#define TXG_MASK (TXG_SIZE - 1) /* mask for size */ +#define TXG_INITIAL TXG_SIZE /* initial txg */ +#define TXG_IDX (txg & TXG_MASK) + +#define TXG_WAIT 1ULL +#define TXG_NOWAIT 2ULL + +typedef struct tx_cpu tx_cpu_t; + +typedef struct txg_handle { + tx_cpu_t *th_cpu; + uint64_t th_txg; +} txg_handle_t; + +typedef struct txg_node { + struct txg_node *tn_next[TXG_SIZE]; + uint8_t tn_member[TXG_SIZE]; +} txg_node_t; + +typedef struct txg_list { + kmutex_t tl_lock; + size_t tl_offset; + txg_node_t *tl_head[TXG_SIZE]; +} txg_list_t; + +struct dsl_pool; + +extern void txg_init(struct dsl_pool *dp, uint64_t txg); +extern void txg_fini(struct dsl_pool *dp); +extern void txg_sync_start(struct dsl_pool *dp); +extern void txg_sync_stop(struct dsl_pool *dp); +extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); +extern void txg_rele_to_quiesce(txg_handle_t *txghp); +extern void txg_rele_to_sync(txg_handle_t *txghp); +extern void txg_suspend(struct dsl_pool *dp); +extern void txg_resume(struct dsl_pool *dp); + +/* + * Wait until the given transaction group has finished syncing. + * Try to make this happen as soon as possible (eg. kick off any + * necessary syncs immediately). If txg==0, wait for the currently open + * txg to finish syncing. + */ +extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); + +/* + * Wait until the given transaction group, or one after it, is + * the open transaction group. Try to make this happen as soon + * as possible (eg. kick off any necessary syncs immediately). + * If txg == 0, wait for the next open txg. + */ +extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg); + +/* + * Returns TRUE if we are "backed up" waiting for the syncing + * transaction to complete; otherwise returns FALSE. + */ +extern int txg_stalled(struct dsl_pool *dp); + +/* + * Per-txg object lists. + */ + +#define TXG_CLEAN(txg) ((txg) - 1) + +extern void txg_list_create(txg_list_t *tl, size_t offset); +extern void txg_list_destroy(txg_list_t *tl); +extern int txg_list_empty(txg_list_t *tl, uint64_t txg); +extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg); +extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); +extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); +extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg); +extern void *txg_list_head(txg_list_t *tl, uint64_t txg); +extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TXG_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h new file mode 100644 index 0000000..45a138a --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_TXG_IMPL_H +#define _SYS_TXG_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/txg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct tx_cpu { + kmutex_t tc_lock; + kcondvar_t tc_cv[TXG_SIZE]; + uint64_t tc_count[TXG_SIZE]; + char tc_pad[16]; +}; + +typedef struct tx_state { + tx_cpu_t *tx_cpu; /* protects right to enter txg */ + kmutex_t tx_sync_lock; /* protects tx_state_t */ + krwlock_t tx_suspend; + uint64_t tx_open_txg; /* currently open txg id */ + uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ + uint64_t tx_syncing_txg; /* currently syncing txg id */ + uint64_t tx_synced_txg; /* last synced txg id */ + + uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */ + uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */ + + kcondvar_t tx_sync_more_cv; + kcondvar_t tx_sync_done_cv; + kcondvar_t tx_quiesce_more_cv; + kcondvar_t tx_quiesce_done_cv; + kcondvar_t tx_timeout_exit_cv; + kcondvar_t tx_exit_cv; /* wait for all threads to exit */ + + uint8_t tx_threads; /* number of threads */ + uint8_t tx_exiting; /* set when we're exiting */ + + kthread_t *tx_sync_thread; + kthread_t *tx_quiesce_thread; + kthread_t *tx_timelimit_thread; +} tx_state_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TXG_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h new file mode 100644 index 0000000..93d936a --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UBERBLOCK_H +#define _SYS_UBERBLOCK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/vdev.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct uberblock uberblock_t; + +extern int uberblock_verify(uberblock_t *ub); +extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UBERBLOCK_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h new file mode 100644 index 0000000..ab0f2dc --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UBERBLOCK_IMPL_H +#define _SYS_UBERBLOCK_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/uberblock.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The uberblock version is incremented whenever an incompatible on-disk + * format change is made to the SPA, DMU, or ZAP. + * + * Note: the first two fields should never be moved. When a storage pool + * is opened, the uberblock must be read off the disk before the version + * can be checked. If the ub_version field is moved, we may not detect + * version mismatch. If the ub_magic field is moved, applications that + * expect the magic number in the first word won't work. + */ +#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ +#define UBERBLOCK_SHIFT 10 /* up to 1K */ + +struct uberblock { + uint64_t ub_magic; /* UBERBLOCK_MAGIC */ + uint64_t ub_version; /* ZFS_VERSION */ + uint64_t ub_txg; /* txg of last sync */ + uint64_t ub_guid_sum; /* sum of all vdev guids */ + uint64_t ub_timestamp; /* UTC time of last sync */ + blkptr_t ub_rootbp; /* MOS objset_phys_t */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UBERBLOCK_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h new file mode 100644 index 0000000..c8c177e --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UNIQUE_H +#define _SYS_UNIQUE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* The number of significant bits in each unique value. */ +#define UNIQUE_BITS 56 + +void unique_init(void); + +/* Return a new unique value. */ +uint64_t unique_create(void); + +/* Return a unique value, which equals the one passed in if possible. */ +uint64_t unique_insert(uint64_t value); + +/* Indicate that this value no longer needs to be uniquified against. */ +void unique_remove(uint64_t value); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UNIQUE_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h new file mode 100644 index 0000000..3120811 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h @@ -0,0 +1,132 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_H +#define _SYS_VDEV_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu.h> +#include <sys/space_map.h> +#include <sys/fs/zfs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern boolean_t zfs_nocacheflush; + +/* + * Fault injection modes. + */ +#define VDEV_FAULT_NONE 0 +#define VDEV_FAULT_RANDOM 1 +#define VDEV_FAULT_COUNT 2 + +extern int vdev_open(vdev_t *); +extern int vdev_validate(vdev_t *); +extern void vdev_close(vdev_t *); +extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); +extern void vdev_init(vdev_t *, uint64_t txg); +extern void vdev_reopen(vdev_t *); +extern int vdev_validate_spare(vdev_t *); + +extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); +extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); +extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size); +extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size); +extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, + int scrub_done); + +extern const char *vdev_description(vdev_t *vd); + +extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); +extern void vdev_metaslab_fini(vdev_t *vd); + +extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); +extern void vdev_stat_update(zio_t *zio); +extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, + boolean_t complete); +extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec); +extern void vdev_propagate_state(vdev_t *vd); +extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, + vdev_aux_t aux); + +extern void vdev_space_update(vdev_t *vd, int64_t space_delta, + int64_t alloc_delta); + +extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); + +extern void vdev_io_start(zio_t *zio); +extern void vdev_io_done(zio_t *zio); + +extern int vdev_online(spa_t *spa, uint64_t guid); +extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp); +extern void vdev_clear(spa_t *spa, vdev_t *vd); + +extern int vdev_error_inject(vdev_t *vd, zio_t *zio); +extern int vdev_is_dead(vdev_t *vd); + +extern void vdev_cache_init(vdev_t *vd); +extern void vdev_cache_fini(vdev_t *vd); +extern int vdev_cache_read(zio_t *zio); +extern void vdev_cache_write(zio_t *zio); + +extern void vdev_queue_init(vdev_t *vd); +extern void vdev_queue_fini(vdev_t *vd); +extern zio_t *vdev_queue_io(zio_t *zio); +extern void vdev_queue_io_done(zio_t *zio); + +extern void vdev_config_dirty(vdev_t *vd); +extern void vdev_config_clean(vdev_t *vd); +extern int vdev_config_sync(vdev_t *vd, uint64_t txg); + +extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, + boolean_t getstats, boolean_t isspare); + +/* + * Label routines + */ +struct uberblock; +extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); +extern nvlist_t *vdev_label_read_config(vdev_t *vd); +extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub); + +typedef enum { + VDEV_LABEL_CREATE, /* create/add a new device */ + VDEV_LABEL_REPLACE, /* replace an existing device */ + VDEV_LABEL_SPARE, /* add a new hot spare */ + VDEV_LABEL_REMOVE /* remove an existing device */ +} vdev_labeltype_t; + +extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h new file mode 100644 index 0000000..95536a7 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_DISK_H +#define _SYS_VDEV_DISK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/vdev.h> +#ifdef _KERNEL +#include <sys/sunldi.h> +#include <sys/sunddi.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_disk { + ddi_devid_t vd_devid; + char *vd_minor; + ldi_handle_t vd_lh; +} vdev_disk_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_DISK_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h new file mode 100644 index 0000000..cd49673 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_FILE_H +#define _SYS_VDEV_FILE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/vdev.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_file { + vnode_t *vf_vnode; +} vdev_file_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_FILE_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h new file mode 100644 index 0000000..aba7567 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -0,0 +1,298 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_IMPL_H +#define _SYS_VDEV_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/avl.h> +#include <sys/dmu.h> +#include <sys/metaslab.h> +#include <sys/nvpair.h> +#include <sys/space_map.h> +#include <sys/vdev.h> +#include <sys/dkio.h> +#include <sys/uberblock_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Virtual device descriptors. + * + * All storage pool operations go through the virtual device framework, + * which provides data replication and I/O scheduling. + */ + +/* + * Forward declarations that lots of things need. + */ +typedef struct vdev_queue vdev_queue_t; +typedef struct vdev_cache vdev_cache_t; +typedef struct vdev_cache_entry vdev_cache_entry_t; + +/* + * Virtual device operations + */ +typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift); +typedef void vdev_close_func_t(vdev_t *vd); +typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); +typedef void vdev_io_start_func_t(zio_t *zio); +typedef void vdev_io_done_func_t(zio_t *zio); +typedef void vdev_state_change_func_t(vdev_t *vd, int, int); + +typedef struct vdev_ops { + vdev_open_func_t *vdev_op_open; + vdev_close_func_t *vdev_op_close; + vdev_asize_func_t *vdev_op_asize; + vdev_io_start_func_t *vdev_op_io_start; + vdev_io_done_func_t *vdev_op_io_done; + vdev_state_change_func_t *vdev_op_state_change; + char vdev_op_type[16]; + boolean_t vdev_op_leaf; +} vdev_ops_t; + +/* + * Virtual device properties + */ +struct vdev_cache_entry { + char *ve_data; + uint64_t ve_offset; + uint64_t ve_lastused; + avl_node_t ve_offset_node; + avl_node_t ve_lastused_node; + uint32_t ve_hits; + uint16_t ve_missed_update; + zio_t *ve_fill_io; +}; + +struct vdev_cache { + avl_tree_t vc_offset_tree; + avl_tree_t vc_lastused_tree; + kmutex_t vc_lock; +}; + +struct vdev_queue { + avl_tree_t vq_deadline_tree; + avl_tree_t vq_read_tree; + avl_tree_t vq_write_tree; + avl_tree_t vq_pending_tree; + kmutex_t vq_lock; +}; + +/* + * Virtual device descriptor + */ +struct vdev { + /* + * Common to all vdev types. + */ + uint64_t vdev_id; /* child number in vdev parent */ + uint64_t vdev_guid; /* unique ID for this vdev */ + uint64_t vdev_guid_sum; /* self guid + all child guids */ + uint64_t vdev_asize; /* allocatable device capacity */ + uint64_t vdev_ashift; /* block alignment shift */ + uint64_t vdev_state; /* see VDEV_STATE_* #defines */ + uint64_t vdev_prevstate; /* used when reopening a vdev */ + vdev_ops_t *vdev_ops; /* vdev operations */ + spa_t *vdev_spa; /* spa for this vdev */ + void *vdev_tsd; /* type-specific data */ + vdev_t *vdev_top; /* top-level vdev */ + vdev_t *vdev_parent; /* parent vdev */ + vdev_t **vdev_child; /* array of children */ + uint64_t vdev_children; /* number of children */ + space_map_t vdev_dtl_map; /* dirty time log in-core state */ + space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */ + vdev_stat_t vdev_stat; /* virtual device statistics */ + + /* + * Top-level vdev state. + */ + uint64_t vdev_ms_array; /* metaslab array object */ + uint64_t vdev_ms_shift; /* metaslab size shift */ + uint64_t vdev_ms_count; /* number of metaslabs */ + metaslab_group_t *vdev_mg; /* metaslab group */ + metaslab_t **vdev_ms; /* metaslab array */ + txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ + txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ + txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ + uint8_t vdev_reopen_wanted; /* async reopen wanted? */ + list_node_t vdev_dirty_node; /* config dirty list */ + uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ + + /* + * Leaf vdev state. + */ + uint64_t vdev_psize; /* physical device capacity */ + space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ + txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ + uint64_t vdev_wholedisk; /* true if this is a whole disk */ + uint64_t vdev_offline; /* device taken offline? */ + uint64_t vdev_nparity; /* number of parity devices for raidz */ + char *vdev_path; /* vdev path (if any) */ + char *vdev_devid; /* vdev devid (if any) */ + uint64_t vdev_fault_arg; /* fault injection paramater */ + int vdev_fault_mask; /* zio types to fault */ + uint8_t vdev_fault_mode; /* fault injection mode */ + uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */ + uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ + uint8_t vdev_detached; /* device detached? */ + uint64_t vdev_isspare; /* was a hot spare */ + vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ + vdev_cache_t vdev_cache; /* physical block cache */ + uint64_t vdev_not_present; /* not present during import */ + hrtime_t vdev_last_try; /* last reopen time */ + boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + + /* + * For DTrace to work in userland (libzpool) context, these fields must + * remain at the end of the structure. DTrace will use the kernel's + * CTF definition for 'struct vdev', and since the size of a kmutex_t is + * larger in userland, the offsets for the rest fields would be + * incorrect. + */ + kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ + kmutex_t vdev_stat_lock; /* vdev_stat */ +}; + +#define VDEV_SKIP_SIZE (8 << 10) +#define VDEV_BOOT_HEADER_SIZE (8 << 10) +#define VDEV_PHYS_SIZE (112 << 10) +#define VDEV_UBERBLOCK_RING (128 << 10) + +#define VDEV_UBERBLOCK_SHIFT(vd) \ + MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT) +#define VDEV_UBERBLOCK_COUNT(vd) \ + (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) +#define VDEV_UBERBLOCK_OFFSET(vd, n) \ + offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) +#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) + +/* ZFS boot block */ +#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL +#define VDEV_BOOT_VERSION 1 /* version number */ + +typedef struct vdev_boot_header { + uint64_t vb_magic; /* VDEV_BOOT_MAGIC */ + uint64_t vb_version; /* VDEV_BOOT_VERSION */ + uint64_t vb_offset; /* start offset (bytes) */ + uint64_t vb_size; /* size (bytes) */ + char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)]; +} vdev_boot_header_t; + +typedef struct vdev_phys { + char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)]; + zio_block_tail_t vp_zbt; +} vdev_phys_t; + +typedef struct vdev_label { + char vl_pad[VDEV_SKIP_SIZE]; /* 8K */ + vdev_boot_header_t vl_boot_header; /* 8K */ + vdev_phys_t vl_vdev_phys; /* 112K */ + char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ +} vdev_label_t; /* 256K total */ + +/* + * vdev_dirty() flags + */ +#define VDD_METASLAB 0x01 +#define VDD_DTL 0x02 + +/* + * Size and offset of embedded boot loader region on each label. + * The total size of the first two labels plus the boot area is 4MB. + */ +#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) +#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ + +/* + * Size of label regions at the start and end of each leaf device. + */ +#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) +#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) +#define VDEV_LABELS 4 + +#define VDEV_ALLOC_LOAD 0 +#define VDEV_ALLOC_ADD 1 +#define VDEV_ALLOC_SPARE 2 + +/* + * Allocate or free a vdev + */ +extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, + vdev_t *parent, uint_t id, int alloctype); +extern void vdev_free(vdev_t *vd); + +/* + * Add or remove children and parents + */ +extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); +extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); +extern void vdev_compact_children(vdev_t *pvd); +extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); +extern void vdev_remove_parent(vdev_t *cvd); + +/* + * vdev sync load and sync + */ +extern void vdev_load(vdev_t *vd); +extern void vdev_sync(vdev_t *vd, uint64_t txg); +extern void vdev_sync_done(vdev_t *vd, uint64_t txg); +extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); + +/* + * Available vdev types. + */ +extern vdev_ops_t vdev_root_ops; +extern vdev_ops_t vdev_mirror_ops; +extern vdev_ops_t vdev_replacing_ops; +extern vdev_ops_t vdev_raidz_ops; +#ifdef _KERNEL +extern vdev_ops_t vdev_geom_ops; +#else +extern vdev_ops_t vdev_disk_ops; +extern vdev_ops_t vdev_file_ops; +#endif +extern vdev_ops_t vdev_missing_ops; +extern vdev_ops_t vdev_spare_ops; + +/* + * Common size functions + */ +extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_get_rsize(vdev_t *vd); + +/* + * zdb uses this tunable, so it must be declared here to make lint happy. + */ +extern int zfs_vdev_cache_size; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h new file mode 100644 index 0000000..f89d938 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h @@ -0,0 +1,359 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZAP_H +#define _SYS_ZAP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ZAP - ZFS Attribute Processor + * + * The ZAP is a module which sits on top of the DMU (Data Managemnt + * Unit) and implements a higher-level storage primitive using DMU + * objects. Its primary consumer is the ZPL (ZFS Posix Layer). + * + * A "zapobj" is a DMU object which the ZAP uses to stores attributes. + * Users should use only zap routines to access a zapobj - they should + * not access the DMU object directly using DMU routines. + * + * The attributes stored in a zapobj are name-value pairs. The name is + * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including + * terminating NULL). The value is an array of integers, which may be + * 1, 2, 4, or 8 bytes long. The total space used by the array (number + * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes. + * Note that an 8-byte integer value can be used to store the location + * (object number) of another dmu object (which may be itself a zapobj). + * Note that you can use a zero-length attribute to store a single bit + * of information - the attribute is present or not. + * + * The ZAP routines are thread-safe. However, you must observe the + * DMU's restriction that a transaction may not be operated on + * concurrently. + * + * Any of the routines that return an int may return an I/O error (EIO + * or ECHECKSUM). + * + * + * Implementation / Performance Notes: + * + * The ZAP is intended to operate most efficiently on attributes with + * short (49 bytes or less) names and single 8-byte values, for which + * the microzap will be used. The ZAP should be efficient enough so + * that the user does not need to cache these attributes. + * + * The ZAP's locking scheme makes its routines thread-safe. Operations + * on different zapobjs will be processed concurrently. Operations on + * the same zapobj which only read data will be processed concurrently. + * Operations on the same zapobj which modify data will be processed + * concurrently when there are many attributes in the zapobj (because + * the ZAP uses per-block locking - more than 128 * (number of cpus) + * small attributes will suffice). + */ + +/* + * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C + * strings) for the names of attributes, rather than a byte string + * bounded by an explicit length. If some day we want to support names + * in character sets which have embedded zeros (eg. UTF-16, UTF-32), + * we'll have to add routines for using length-bounded strings. + */ + +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZAP_MAXNAMELEN 256 +#define ZAP_MAXVALUELEN 1024 + +/* + * Create a new zapobj with no attributes and return its object number. + */ +uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + +/* + * Create a new zapobj with no attributes from the given (unallocated) + * object number. + */ +int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + +/* + * The zapobj passed in must be a valid ZAP object for all of the + * following routines. + */ + +/* + * Destroy this zapobj and all its attributes. + * + * Frees the object number using dmu_object_free. + */ +int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); + +/* + * Manipulate attributes. + * + * 'integer_size' is in bytes, and must be 1, 2, 4, or 8. + */ + +/* + * Retrieve the contents of the attribute with the given name. + * + * If the requested attribute does not exist, the call will fail and + * return ENOENT. + * + * If 'integer_size' is smaller than the attribute's integer size, the + * call will fail and return EINVAL. + * + * If 'integer_size' is equal to or larger than the attribute's integer + * size, the call will succeed and return 0. * When converting to a + * larger integer size, the integers will be treated as unsigned (ie. no + * sign-extension will be performed). + * + * 'num_integers' is the length (in integers) of 'buf'. + * + * If the attribute is longer than the buffer, as many integers as will + * fit will be transferred to 'buf'. If the entire attribute was not + * transferred, the call will return EOVERFLOW. + */ +int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf); + +/* + * Create an attribute with the given name and value. + * + * If an attribute with the given name already exists, the call will + * fail and return EEXIST. + */ +int zap_add(objset_t *ds, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); + +/* + * Set the attribute with the given name to the given value. If an + * attribute with the given name does not exist, it will be created. If + * an attribute with the given name already exists, the previous value + * will be overwritten. The integer_size may be different from the + * existing attribute's integer size, in which case the attribute's + * integer size will be updated to the new value. + */ +int zap_update(objset_t *ds, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); + +/* + * Get the length (in integers) and the integer size of the specified + * attribute. + * + * If the requested attribute does not exist, the call will fail and + * return ENOENT. + */ +int zap_length(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t *integer_size, uint64_t *num_integers); + +/* + * Remove the specified attribute. + * + * If the specified attribute does not exist, the call will fail and + * return ENOENT. + */ +int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); + +/* + * Returns (in *count) the number of attributes in the specified zap + * object. + */ +int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); + + +/* + * Returns (in name) the name of the entry whose value + * (za_first_integer) is value, or ENOENT if not found. The string + * pointed to by name must be at least 256 bytes long. + */ +int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name); + +struct zap; +struct zap_leaf; +typedef struct zap_cursor { + /* This structure is opaque! */ + objset_t *zc_objset; + struct zap *zc_zap; + struct zap_leaf *zc_leaf; + uint64_t zc_zapobj; + uint64_t zc_hash; + uint32_t zc_cd; +} zap_cursor_t; + +typedef struct { + int za_integer_length; + uint64_t za_num_integers; + uint64_t za_first_integer; /* no sign extension for <8byte ints */ + char za_name[MAXNAMELEN]; +} zap_attribute_t; + +/* + * The interface for listing all the attributes of a zapobj can be + * thought of as cursor moving down a list of the attributes one by + * one. The cookie returned by the zap_cursor_serialize routine is + * persistent across system calls (and across reboot, even). + */ + +/* + * Initialize a zap cursor, pointing to the "first" attribute of the + * zapobj. You must _fini the cursor when you are done with it. + */ +void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj); +void zap_cursor_fini(zap_cursor_t *zc); + +/* + * Get the attribute currently pointed to by the cursor. Returns + * ENOENT if at the end of the attributes. + */ +int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za); + +/* + * Advance the cursor to the next attribute. + */ +void zap_cursor_advance(zap_cursor_t *zc); + +/* + * Get a persistent cookie pointing to the current position of the zap + * cursor. The low 4 bits in the cookie are always zero, and thus can + * be used as to differentiate a serialized cookie from a different type + * of value. The cookie will be less than 2^32 as long as there are + * fewer than 2^22 (4.2 million) entries in the zap object. + */ +uint64_t zap_cursor_serialize(zap_cursor_t *zc); + +/* + * Initialize a zap cursor pointing to the position recorded by + * zap_cursor_serialize (in the "serialized" argument). You can also + * use a "serialized" argument of 0 to start at the beginning of the + * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to + * zap_cursor_init(...).) + */ +void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds, + uint64_t zapobj, uint64_t serialized); + + +#define ZAP_HISTOGRAM_SIZE 10 + +typedef struct zap_stats { + /* + * Size of the pointer table (in number of entries). + * This is always a power of 2, or zero if it's a microzap. + * In general, it should be considerably greater than zs_num_leafs. + */ + uint64_t zs_ptrtbl_len; + + uint64_t zs_blocksize; /* size of zap blocks */ + + /* + * The number of blocks used. Note that some blocks may be + * wasted because old ptrtbl's and large name/value blocks are + * not reused. (Although their space is reclaimed, we don't + * reuse those offsets in the object.) + */ + uint64_t zs_num_blocks; + + /* + * Pointer table values from zap_ptrtbl in the zap_phys_t + */ + uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */ + uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */ + uint64_t zs_ptrtbl_zt_blk; /* starting block number */ + uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */ + uint64_t zs_ptrtbl_zt_shift; /* bits to index it */ + + /* + * Values of the other members of the zap_phys_t + */ + uint64_t zs_block_type; /* ZBT_HEADER */ + uint64_t zs_magic; /* ZAP_MAGIC */ + uint64_t zs_num_leafs; /* The number of leaf blocks */ + uint64_t zs_num_entries; /* The number of zap entries */ + uint64_t zs_salt; /* salt to stir into hash function */ + + /* + * Histograms. For all histograms, the last index + * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater + * than what can be represented. For example + * zs_leafs_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number + * of leafs with more than 45 entries. + */ + + /* + * zs_leafs_with_n_pointers[n] is the number of leafs with + * 2^n pointers to it. + */ + uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_leafs_with_n_entries[n] is the number of leafs with + * [n*5, (n+1)*5) entries. In the current implementation, there + * can be at most 55 entries in any block, but there may be + * fewer if the name or value is large, or the block is not + * completely full. + */ + uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_leafs_n_tenths_full[n] is the number of leafs whose + * fullness is in the range [n/10, (n+1)/10). + */ + uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_entries_using_n_chunks[n] is the number of entries which + * consume n 24-byte chunks. (Note, large names/values only use + * one chunk, but contribute to zs_num_blocks_large.) + */ + uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_buckets_with_n_entries[n] is the number of buckets (each + * leaf has 64 buckets) with n entries. + * zs_buckets_with_n_entries[1] should be very close to + * zs_num_entries. + */ + uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE]; +} zap_stats_t; + +/* + * Get statistics about a ZAP object. Note: you need to be aware of the + * internal implementation of the ZAP to correctly interpret some of the + * statistics. This interface shouldn't be relied on unless you really + * know what you're doing. + */ +int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h new file mode 100644 index 0000000..4e43f4a --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h @@ -0,0 +1,204 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZAP_IMPL_H +#define _SYS_ZAP_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zap.h> +#include <sys/zfs_context.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern int fzap_default_block_shift; + +#define ZAP_MAGIC 0x2F52AB2ABULL + +#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) + +#define ZAP_MAXCD (uint32_t)(-1) +#define ZAP_HASHBITS 28 +#define MZAP_ENT_LEN 64 +#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) +#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT +#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT) + +typedef struct mzap_ent_phys { + uint64_t mze_value; + uint32_t mze_cd; + uint16_t mze_pad; /* in case we want to chain them someday */ + char mze_name[MZAP_NAME_LEN]; +} mzap_ent_phys_t; + +typedef struct mzap_phys { + uint64_t mz_block_type; /* ZBT_MICRO */ + uint64_t mz_salt; + uint64_t mz_pad[6]; + mzap_ent_phys_t mz_chunk[1]; + /* actually variable size depending on block size */ +} mzap_phys_t; + +typedef struct mzap_ent { + avl_node_t mze_node; + int mze_chunkid; + uint64_t mze_hash; + mzap_ent_phys_t mze_phys; +} mzap_ent_t; + + +/* + * The (fat) zap is stored in one object. It is an array of + * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of: + * + * ptrtbl fits in first block: + * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ... + * + * ptrtbl too big for first block: + * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ... + * + */ + +struct dmu_buf; +struct zap_leaf; + +#define ZBT_LEAF ((1ULL << 63) + 0) +#define ZBT_HEADER ((1ULL << 63) + 1) +#define ZBT_MICRO ((1ULL << 63) + 3) +/* any other values are ptrtbl blocks */ + +/* + * the embedded pointer table takes up half a block: + * block size / entry size (2^3) / 2 + */ +#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1) + +/* + * The embedded pointer table starts half-way through the block. Since + * the pointer table itself is half the block, it starts at (64-bit) + * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)). + */ +#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \ + ((uint64_t *)(zap)->zap_f.zap_phys) \ + [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))] + +/* + * TAKE NOTE: + * If zap_phys_t is modified, zap_byteswap() must be modified. + */ +typedef struct zap_phys { + uint64_t zap_block_type; /* ZBT_HEADER */ + uint64_t zap_magic; /* ZAP_MAGIC */ + + struct zap_table_phys { + uint64_t zt_blk; /* starting block number */ + uint64_t zt_numblks; /* number of blocks */ + uint64_t zt_shift; /* bits to index it */ + uint64_t zt_nextblk; /* next (larger) copy start block */ + uint64_t zt_blks_copied; /* number source blocks copied */ + } zap_ptrtbl; + + uint64_t zap_freeblk; /* the next free block */ + uint64_t zap_num_leafs; /* number of leafs */ + uint64_t zap_num_entries; /* number of entries */ + uint64_t zap_salt; /* salt to stir into hash function */ + /* + * This structure is followed by padding, and then the embedded + * pointer table. The embedded pointer table takes up second + * half of the block. It is accessed using the + * ZAP_EMBEDDED_PTRTBL_ENT() macro. + */ +} zap_phys_t; + +typedef struct zap_table_phys zap_table_phys_t; + +typedef struct zap { + objset_t *zap_objset; + uint64_t zap_object; + struct dmu_buf *zap_dbuf; + krwlock_t zap_rwlock; + int zap_ismicro; + uint64_t zap_salt; + union { + struct { + zap_phys_t *zap_phys; + + /* + * zap_num_entries_mtx protects + * zap_num_entries + */ + kmutex_t zap_num_entries_mtx; + int zap_block_shift; + } zap_fat; + struct { + mzap_phys_t *zap_phys; + int16_t zap_num_entries; + int16_t zap_num_chunks; + int16_t zap_alloc_next; + avl_tree_t zap_avl; + } zap_micro; + } zap_u; +} zap_t; + +#define zap_f zap_u.zap_fat +#define zap_m zap_u.zap_micro + +uint64_t zap_hash(zap_t *zap, const char *name); +int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, int fatreader, zap_t **zapp); +void zap_unlockdir(zap_t *zap); +void zap_evict(dmu_buf_t *db, void *vmzap); + +#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) + +void fzap_byteswap(void *buf, size_t size); +int fzap_count(zap_t *zap, uint64_t *count); +int fzap_lookup(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf); +int fzap_add(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); +int fzap_update(zap_t *zap, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int fzap_length(zap_t *zap, const char *name, + uint64_t *integer_size, uint64_t *num_integers); +int fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx); +int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); +void fzap_get_stats(zap_t *zap, zap_stats_t *zs); +void zap_put_leaf(struct zap_leaf *l); + +int fzap_add_cd(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, dmu_tx_t *tx); +void fzap_upgrade(zap_t *zap, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h new file mode 100644 index 0000000..147fb72 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h @@ -0,0 +1,234 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZAP_LEAF_H +#define _SYS_ZAP_LEAF_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +struct zap; + +#define ZAP_LEAF_MAGIC 0x2AB1EAF + +/* chunk size = 24 bytes */ +#define ZAP_LEAF_CHUNKSIZE 24 + +/* + * The amount of space available for chunks is: + * block size (1<<l->l_bs) - hash entry size (2) * number of hash + * entries - header space (2*chunksize) + */ +#define ZAP_LEAF_NUMCHUNKS(l) \ + (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \ + ZAP_LEAF_CHUNKSIZE - 2) + +/* + * The amount of space within the chunk available for the array is: + * chunk size - space for type (1) - space for next pointer (2) + */ +#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3) + +#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \ + (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES) + +/* + * Low water mark: when there are only this many chunks free, start + * growing the ptrtbl. Ideally, this should be larger than a + * "reasonably-sized" entry. 20 chunks is more than enough for the + * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value), + * while still being only around 3% for 16k blocks. + */ +#define ZAP_LEAF_LOW_WATER (20) + +/* + * The leaf hash table has block size / 2^5 (32) number of entries, + * which should be more than enough for the maximum number of entries, + * which is less than block size / CHUNKSIZE (24) / minimum number of + * chunks per entry (3). + */ +#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5) +#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l)) + +/* + * The chunks start immediately after the hash table. The end of the + * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a + * chunk_t. + */ +#define ZAP_LEAF_CHUNK(l, idx) \ + ((zap_leaf_chunk_t *) \ + ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx] +#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry) + +typedef enum zap_chunk_type { + ZAP_CHUNK_FREE = 253, + ZAP_CHUNK_ENTRY = 252, + ZAP_CHUNK_ARRAY = 251, + ZAP_CHUNK_TYPE_MAX = 250 +} zap_chunk_type_t; + +/* + * TAKE NOTE: + * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. + */ +typedef struct zap_leaf_phys { + struct zap_leaf_header { + uint64_t lh_block_type; /* ZBT_LEAF */ + uint64_t lh_pad1; + uint64_t lh_prefix; /* hash prefix of this leaf */ + uint32_t lh_magic; /* ZAP_LEAF_MAGIC */ + uint16_t lh_nfree; /* number free chunks */ + uint16_t lh_nentries; /* number of entries */ + uint16_t lh_prefix_len; /* num bits used to id this */ + +/* above is accessable to zap, below is zap_leaf private */ + + uint16_t lh_freelist; /* chunk head of free list */ + uint8_t lh_pad2[12]; + } l_hdr; /* 2 24-byte chunks */ + + /* + * The header is followed by a hash table with + * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is + * followed by an array of ZAP_LEAF_NUMCHUNKS(zap) + * zap_leaf_chunk structures. These structures are accessed + * with the ZAP_LEAF_CHUNK() macro. + */ + + uint16_t l_hash[1]; +} zap_leaf_phys_t; + +typedef union zap_leaf_chunk { + struct zap_leaf_entry { + uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ + uint8_t le_int_size; /* size of ints */ + uint16_t le_next; /* next entry in hash chain */ + uint16_t le_name_chunk; /* first chunk of the name */ + uint16_t le_name_length; /* bytes in name, incl null */ + uint16_t le_value_chunk; /* first chunk of the value */ + uint16_t le_value_length; /* value length in ints */ + uint32_t le_cd; /* collision differentiator */ + uint64_t le_hash; /* hash value of the name */ + } l_entry; + struct zap_leaf_array { + uint8_t la_type; /* always ZAP_CHUNK_ARRAY */ + uint8_t la_array[ZAP_LEAF_ARRAY_BYTES]; + uint16_t la_next; /* next blk or CHAIN_END */ + } l_array; + struct zap_leaf_free { + uint8_t lf_type; /* always ZAP_CHUNK_FREE */ + uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES]; + uint16_t lf_next; /* next in free list, or CHAIN_END */ + } l_free; +} zap_leaf_chunk_t; + +typedef struct zap_leaf { + krwlock_t l_rwlock; /* only used on head of chain */ + uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */ + int l_bs; /* block size shift */ + dmu_buf_t *l_dbuf; + zap_leaf_phys_t *l_phys; +} zap_leaf_t; + + +typedef struct zap_entry_handle { + /* below is set by zap_leaf.c and is public to zap.c */ + uint64_t zeh_num_integers; + uint64_t zeh_hash; + uint32_t zeh_cd; + uint8_t zeh_integer_size; + + /* below is private to zap_leaf.c */ + uint16_t zeh_fakechunk; + uint16_t *zeh_chunkp; + zap_leaf_t *zeh_leaf; +} zap_entry_handle_t; + +/* + * Return a handle to the named entry, or ENOENT if not found. The hash + * value must equal zap_hash(name). + */ +extern int zap_leaf_lookup(zap_leaf_t *l, + const char *name, uint64_t h, zap_entry_handle_t *zeh); + +/* + * Return a handle to the entry with this hash+cd, or the entry with the + * next closest hash+cd. + */ +extern int zap_leaf_lookup_closest(zap_leaf_t *l, + uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh); + +/* + * Read the first num_integers in the attribute. Integer size + * conversion will be done without sign extension. Return EINVAL if + * integer_size is too small. Return EOVERFLOW if there are more than + * num_integers in the attribute. + */ +extern int zap_entry_read(const zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, void *buf); + +extern int zap_entry_read_name(const zap_entry_handle_t *zeh, + uint16_t buflen, char *buf); + +/* + * Replace the value of an existing entry. + * + * zap_entry_update may fail if it runs out of space (ENOSPC). + */ +extern int zap_entry_update(zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, const void *buf); + +/* + * Remove an entry. + */ +extern void zap_entry_remove(zap_entry_handle_t *zeh); + +/* + * Create an entry. An equal entry must not exist, and this entry must + * belong in this leaf (according to its hash value). Fills in the + * entry handle on success. Returns 0 on success or ENOSPC on failure. + */ +extern int zap_entry_create(zap_leaf_t *l, + const char *name, uint64_t h, uint32_t cd, + uint8_t integer_size, uint64_t num_integers, const void *buf, + zap_entry_handle_t *zeh); + +/* + * Other stuff. + */ + +extern void zap_leaf_init(zap_leaf_t *l); +extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); +extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl); +extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_LEAF_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h new file mode 100644 index 0000000..3250b76 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h @@ -0,0 +1,115 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_ACL_H +#define _SYS_FS_ZFS_ACL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef _KERNEL +#include <sys/cred.h> +#endif +#include <sys/acl.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct znode_phys; + +#define ACCESS_UNDETERMINED -1 + +#define ACE_SLOT_CNT 6 + +typedef struct zfs_znode_acl { + uint64_t z_acl_extern_obj; /* ext acl pieces */ + uint32_t z_acl_count; /* Number of ACEs */ + uint16_t z_acl_version; /* acl version */ + uint16_t z_acl_pad; /* pad */ + ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ +} zfs_znode_acl_t; + +#define ACL_DATA_ALLOCED 0x1 + +/* + * Max ACL size is prepended deny for all entries + the + * canonical six tacked on * the end. + */ +#define MAX_ACL_SIZE (MAX_ACL_ENTRIES * 2 + 6) + +typedef struct zfs_acl { + int z_slots; /* number of allocated slots for ACEs */ + int z_acl_count; + uint_t z_state; + ace_t *z_acl; +} zfs_acl_t; + +#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) + +/* + * Property values for acl_mode and acl_inherit. + * + * acl_mode can take discard, noallow, groupmask and passthrough. + * whereas acl_inherit has secure instead of groupmask. + */ + +#define ZFS_ACL_DISCARD 0 +#define ZFS_ACL_NOALLOW 1 +#define ZFS_ACL_GROUPMASK 2 +#define ZFS_ACL_PASSTHROUGH 3 +#define ZFS_ACL_SECURE 4 + +struct znode; + +#ifdef _KERNEL +void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *, + dmu_tx_t *, cred_t *); +#ifdef TODO +int zfs_getacl(struct znode *, vsecattr_t *, cred_t *); +#endif +int zfs_mode_update(struct znode *, uint64_t, dmu_tx_t *); +#ifdef TODO +int zfs_setacl(struct znode *, vsecattr_t *, cred_t *); +#endif +void zfs_acl_rele(void *); +void zfs_ace_byteswap(ace_t *, int); +extern int zfs_zaccess(struct znode *, int, cred_t *); +extern int zfs_zaccess_rwx(struct znode *, mode_t, cred_t *); +extern int zfs_acl_access(struct znode *, int, cred_t *); +int zfs_acl_chmod_setattr(struct znode *, uint64_t, dmu_tx_t *); +int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); +int zfs_zaccess_rename(struct znode *, struct znode *, + struct znode *, struct znode *, cred_t *cr); +int zfs_zaccess_v4_perm(struct znode *, int, cred_t *); +void zfs_acl_free(zfs_acl_t *); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* !ZFS_NO_ACL */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h new file mode 100644 index 0000000..c91d807 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h @@ -0,0 +1,122 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_CONTEXT_H +#define _SYS_ZFS_CONTEXT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/param.h> +#include <sys/stdint.h> +#include <sys/note.h> +#include <sys/kernel.h> +#include <sys/debug.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/sysmacros.h> +#include <sys/bitmap.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/taskq.h> +#include <sys/systm.h> +#include <sys/kobj.h> +#include <sys/conf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/random.h> +#include <sys/byteorder.h> +#include <sys/systm.h> +#include <sys/list.h> +#include <sys/uio.h> +#include <sys/dirent.h> +#include <sys/time.h> +#include <sys/uio.h> +#include <sys/fcntl.h> +#include <sys/limits.h> +#include <sys/string.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/cred.h> +#include <sys/sdt.h> +#include <sys/file.h> +#include <sys/vfs.h> +#include <sys/sysctl.h> +#include <sys/sbuf.h> +#include <sys/priv.h> +#include <sys/kdb.h> +#include <sys/ktr.h> +#include <sys/stack.h> +#include <sys/lockf.h> +#include <sys/policy.h> +#include <sys/zone.h> +#include <sys/eventhandler.h> +#include <sys/zfs_debug.h> + +#include <machine/stdarg.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_pager.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +/* There is clash. vm_map.h defines the two below and vdev_cache.c use them. */ +#ifdef min_offset +#undef min_offset +#endif +#ifdef max_offset +#undef max_offset +#endif +#include <vm/vm_extern.h> +#include <vm/vnode_pager.h> + +#define CPU_SEQID (curcpu) + +#ifdef __cplusplus +} +#endif + +#define physmem (vm_kmem_size / PAGE_SIZE) + +extern int zfs_debug_level; +extern struct mtx zfs_debug_mtx; +#define ZFS_LOG(lvl, ...) do { \ + if (((lvl) & 0xff) <= zfs_debug_level) { \ + mtx_lock(&zfs_debug_mtx); \ + printf("%s:%u[%d]: ", __func__, __LINE__, (lvl)); \ + printf(__VA_ARGS__); \ + printf("\n"); \ + if ((lvl) & 0x100) \ + kdb_backtrace(); \ + mtx_unlock(&zfs_debug_mtx); \ + } \ +} while (0) + +#endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h new file mode 100644 index 0000000..a676533 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZFS_CTLDIR_H +#define _ZFS_CTLDIR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/vnode.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_CTLDIR_NAME ".zfs" + +#define zfs_has_ctldir(zdp) \ + ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \ + ((zdp)->z_zfsvfs->z_ctldir != NULL)) +#define zfs_show_ctldir(zdp) \ + (zfs_has_ctldir(zdp) && \ + ((zdp)->z_zfsvfs->z_show_ctldir)) + +void zfsctl_create(zfsvfs_t *); +void zfsctl_destroy(zfsvfs_t *); +vnode_t *zfsctl_root(znode_t *); +void zfsctl_init(void); +void zfsctl_fini(void); + +int zfsctl_rename_snapshot(const char *from, const char *to); +int zfsctl_destroy_snapshot(const char *snapname, int force); +int zfsctl_umount_snapshots(vfs_t *, int, cred_t *); + +int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr); + +int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); + +#define ZFSCTL_INO_ROOT 0x1 +#define ZFSCTL_INO_SNAPDIR 0x2 + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_CTLDIR_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h new file mode 100644 index 0000000..450ac1c --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_DEBUG_H +#define _SYS_ZFS_DEBUG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +/* + * ZFS debugging + */ + +#if defined(DEBUG) || !defined(_KERNEL) +#define ZFS_DEBUG +#endif + +extern int zfs_flags; + +#define ZFS_DEBUG_DPRINTF 0x0001 +#define ZFS_DEBUG_DBUF_VERIFY 0x0002 +#define ZFS_DEBUG_DNODE_VERIFY 0x0004 +#define ZFS_DEBUG_SNAPNAMES 0x0008 +#define ZFS_DEBUG_MODIFY 0x0010 + +#ifdef ZFS_DEBUG +extern void __dprintf(const char *file, const char *func, + int line, const char *fmt, ...); +#define dprintf(...) \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) \ + __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__) +#else +#define dprintf(...) ((void)0) +#endif /* ZFS_DEBUG */ + +extern void zfs_panic_recover(const char *fmt, ...); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_DEBUG_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h new file mode 100644 index 0000000..f60d614 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_DIR_H +#define _SYS_FS_ZFS_DIR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/zfs_znode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* zfs_dirent_lock() flags */ +#define ZNEW 0x0001 /* entry should not exist */ +#define ZEXISTS 0x0002 /* entry should exist */ +#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ +#define ZXATTR 0x0008 /* we want the xattr dir */ +#define ZRENAMING 0x0010 /* znode is being renamed */ + +/* mknode flags */ +#define IS_ROOT_NODE 0x01 /* create a root node */ +#define IS_XATTR 0x02 /* create an extended attribute node */ +#define IS_REPLAY 0x04 /* we are replaying intent log */ + +extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, + int); +extern void zfs_dirent_unlock(zfs_dirlock_t *); +extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); +extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, + boolean_t *); +extern int zfs_dirlook(znode_t *, char *, vnode_t **); +extern void zfs_mknode(znode_t *, vattr_t *, uint64_t *, + dmu_tx_t *, cred_t *, uint_t, znode_t **, int); +extern void zfs_rmnode(znode_t *); +extern boolean_t zfs_dirempty(znode_t *); +extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); +extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); +extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); +extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *, int); +extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_DIR_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h new file mode 100644 index 0000000..d28729b --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -0,0 +1,162 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_IOCTL_H +#define _SYS_ZFS_IOCTL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/cred.h> +#include <sys/dmu.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Property values for snapdir + */ +#define ZFS_SNAPDIR_HIDDEN 0 +#define ZFS_SNAPDIR_VISIBLE 1 + +#define DMU_BACKUP_VERSION (1ULL) +#define DMU_BACKUP_MAGIC 0x2F5bacbacULL + +/* + * zfs ioctl command structure + */ +typedef struct dmu_replay_record { + enum { + DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, + DRR_WRITE, DRR_FREE, DRR_END, + } drr_type; + uint32_t drr_pad; + union { + struct drr_begin { + uint64_t drr_magic; + uint64_t drr_version; + uint64_t drr_creation_time; + dmu_objset_type_t drr_type; + uint32_t drr_pad; + uint64_t drr_toguid; + uint64_t drr_fromguid; + char drr_toname[MAXNAMELEN]; + } drr_begin; + struct drr_end { + zio_cksum_t drr_checksum; + } drr_end; + struct drr_object { + uint64_t drr_object; + dmu_object_type_t drr_type; + dmu_object_type_t drr_bonustype; + uint32_t drr_blksz; + uint32_t drr_bonuslen; + uint8_t drr_checksum; + uint8_t drr_compress; + uint8_t drr_pad[6]; + /* bonus content follows */ + } drr_object; + struct drr_freeobjects { + uint64_t drr_firstobj; + uint64_t drr_numobjs; + } drr_freeobjects; + struct drr_write { + uint64_t drr_object; + dmu_object_type_t drr_type; + uint32_t drr_pad; + uint64_t drr_offset; + uint64_t drr_length; + /* content follows */ + } drr_write; + struct drr_free { + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + } drr_free; + } drr_u; +} dmu_replay_record_t; + +typedef struct zinject_record { + uint64_t zi_objset; + uint64_t zi_object; + uint64_t zi_start; + uint64_t zi_end; + uint64_t zi_guid; + uint32_t zi_level; + uint32_t zi_error; + uint64_t zi_type; + uint32_t zi_freq; +} zinject_record_t; + +#define ZINJECT_NULL 0x1 +#define ZINJECT_FLUSH_ARC 0x2 +#define ZINJECT_UNLOAD_SPA 0x4 + +typedef struct zfs_cmd { + char zc_name[MAXPATHLEN]; + char zc_value[MAXPATHLEN * 2]; + uint64_t zc_guid; + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + uint64_t zc_cookie; + uint64_t zc_cred; + uint64_t zc_dev; + uint64_t zc_objset_type; + uint64_t zc_history; /* really (char *) */ + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_t zc_inject_record; +} zfs_cmd_t; + +#ifdef _KERNEL +typedef struct zfs_create_data { + cred_t *zc_cred; + dev_t zc_dev; + nvlist_t *zc_props; +} zfs_create_data_t; +#endif + +#define ZVOL_MAX_MINOR (1 << 16) +#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) + +#ifdef _KERNEL + +extern int zfs_secpolicy_write(const char *dataset, cred_t *cr); +extern int zfs_busy(void); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_IOCTL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h new file mode 100644 index 0000000..f302b66 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_RLOCK_H +#define _SYS_FS_ZFS_RLOCK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#include <sys/zfs_znode.h> + +typedef enum { + RL_READER, + RL_WRITER, + RL_APPEND +} rl_type_t; + +typedef struct rl { + znode_t *r_zp; /* znode this lock applies to */ + avl_node_t r_node; /* avl node link */ + uint64_t r_off; /* file range offset */ + uint64_t r_len; /* file range length */ + uint_t r_cnt; /* range reference count in tree */ + rl_type_t r_type; /* range type */ + kcondvar_t r_wr_cv; /* cv for waiting writers */ + kcondvar_t r_rd_cv; /* cv for waiting readers */ + uint8_t r_proxy; /* acting for original range */ + uint8_t r_write_wanted; /* writer wants to lock this range */ + uint8_t r_read_wanted; /* reader wants to lock this range */ +} rl_t; + +/* + * Lock a range (offset, length) as either shared (READER) + * or exclusive (WRITER or APPEND). APPEND is a special type that + * is converted to WRITER that specified to lock from the start of the + * end of file. zfs_range_lock() returns the range lock structure. + */ +rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type); + +/* + * Unlock range and destroy range lock structure. + */ +void zfs_range_unlock(rl_t *rl); + +/* + * Reduce range locked as RW_WRITER from whole file to specified range. + * Asserts the whole file was previously locked. + */ +void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len); + +/* + * AVL comparison function used to compare range locks + */ +int zfs_range_compare(const void *arg1, const void *arg2); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_RLOCK_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h new file mode 100644 index 0000000..aa82cc1 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h @@ -0,0 +1,100 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_VFSOPS_H +#define _SYS_FS_ZFS_VFSOPS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/list.h> +#include <sys/vfs.h> +#include <sys/zil.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zfsvfs zfsvfs_t; + +struct zfsvfs { + vfs_t *z_vfs; /* generic fs struct */ + zfsvfs_t *z_parent; /* parent fs */ + objset_t *z_os; /* objset reference */ + uint64_t z_root; /* id of root znode */ + uint64_t z_unlinkedobj; /* id of unlinked zapobj */ + uint64_t z_max_blksz; /* maximum block size for files */ + uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */ + zilog_t *z_log; /* intent log pointer */ + uint_t z_acl_mode; /* acl chmod/mode behavior */ + uint_t z_acl_inherit; /* acl inheritance behavior */ + boolean_t z_atime; /* enable atimes mount option */ + boolean_t z_unmounted1; /* unmounted phase 1 */ + boolean_t z_unmounted2; /* unmounted phase 2 */ + uint32_t z_op_cnt; /* vnode/vfs operations ref count */ + krwlock_t z_um_lock; /* rw lock for umount phase 2 */ + list_t z_all_znodes; /* all vnodes in the fs */ + kmutex_t z_znodes_lock; /* lock for z_all_znodes */ + vnode_t *z_ctldir; /* .zfs directory pointer */ + boolean_t z_show_ctldir; /* expose .zfs in the root dir */ + boolean_t z_issnap; /* true if this is a snapshot */ +#define ZFS_OBJ_MTX_SZ 64 + kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ +}; + +/* + * The total file ID size is limited to 12 bytes (including the length + * field) in the NFSv2 protocol. For historical reasons, this same limit + * is currently being imposed by the Solaris NFSv3 implementation... + * although the protocol actually permits a maximum of 64 bytes. It will + * not be possible to expand beyond 12 bytes without abandoning support + * of NFSv2 and making some changes to the Solaris NFSv3 implementation. + * + * For the time being, we will partition up the available space as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * We reserve only 48 bits for the object number, as this is the limit + * currently defined and imposed by the DMU. + */ +typedef struct zfid_short { + uint16_t zf_len; + uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_short_t; + +typedef struct zfid_long { + zfid_short_t z_fid; + uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */ +} zfid_long_t; + +#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) +#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h new file mode 100644 index 0000000..c9c317e --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h @@ -0,0 +1,298 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_ZNODE_H +#define _SYS_FS_ZFS_ZNODE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef _KERNEL +#include <sys/list.h> +#include <sys/dmu.h> +#include <sys/zfs_vfsops.h> +#endif +#include <sys/zfs_acl.h> +#include <sys/zil.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Define special zfs pflags + */ +#define ZFS_XATTR 0x1 /* is an extended attribute */ +#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ +#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ + +#define MASTER_NODE_OBJ 1 + +/* + * special attributes for master node. + */ + +#define ZFS_FSID "FSID" +#define ZFS_UNLINKED_SET "DELETE_QUEUE" +#define ZFS_ROOT_OBJ "ROOT" +#define ZPL_VERSION_OBJ "VERSION" +#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE" +#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS" + +#define ZFS_FLAG_BLOCKPERPAGE 0x1 +#define ZFS_FLAG_NOGROWBLOCKS 0x2 + +/* + * ZPL version - rev'd whenever an incompatible on-disk format change + * occurs. Independent of SPA/DMU/ZAP versioning. + */ + +#define ZPL_VERSION 1ULL + +#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) + +/* Path component length */ +/* + * The generic fs code uses MAXNAMELEN to represent + * what the largest component length is. Unfortunately, + * this length includes the terminating NULL. ZFS needs + * to tell the users via pathconf() and statvfs() what the + * true maximum length of a component is, excluding the NULL. + */ +#define ZFS_MAXNAMELEN (MAXNAMELEN - 1) + +/* + * The directory entry has the type (currently unused on Solaris) in the + * top 4 bits, and the object number in the low 48 bits. The "middle" + * 12 bits are unused. + */ +#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) +#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) +#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)type << 60) | obj) + + +/* + * This is the persistent portion of the znode. It is stored + * in the "bonus buffer" of the file. Short symbolic links + * are also stored in the bonus buffer. + */ +typedef struct znode_phys { + uint64_t zp_atime[2]; /* 0 - last file access time */ + uint64_t zp_mtime[2]; /* 16 - last file modification time */ + uint64_t zp_ctime[2]; /* 32 - last file change time */ + uint64_t zp_crtime[2]; /* 48 - creation time */ + uint64_t zp_gen; /* 64 - generation (txg of creation) */ + uint64_t zp_mode; /* 72 - file mode bits */ + uint64_t zp_size; /* 80 - size of file */ + uint64_t zp_parent; /* 88 - directory parent (`..') */ + uint64_t zp_links; /* 96 - number of links to file */ + uint64_t zp_xattr; /* 104 - DMU object for xattrs */ + uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ + uint64_t zp_flags; /* 120 - persistent flags */ + uint64_t zp_uid; /* 128 - file owner */ + uint64_t zp_gid; /* 136 - owning group */ + uint64_t zp_pad[4]; /* 144 - future */ + zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */ + /* + * Data may pad out any remaining bytes in the znode buffer, eg: + * + * |<---------------------- dnode_phys (512) ------------------------>| + * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| + * |<---- znode (264) ---->|<---- data (56) ---->| + * + * At present, we only use this space to store symbolic links. + */ +} znode_phys_t; + +/* + * Directory entry locks control access to directory entries. + * They are used to protect creates, deletes, and renames. + * Each directory znode has a mutex and a list of locked names. + */ +#ifdef _KERNEL +typedef struct zfs_dirlock { + char *dl_name; /* directory entry being locked */ + uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ + uint16_t dl_namesize; /* set if dl_name was allocated */ + kcondvar_t dl_cv; /* wait for entry to be unlocked */ + struct znode *dl_dzp; /* directory znode */ + struct zfs_dirlock *dl_next; /* next in z_dirlocks list */ +} zfs_dirlock_t; + +typedef struct znode { + struct zfsvfs *z_zfsvfs; + vnode_t *z_vnode; + uint64_t z_id; /* object ID for this znode */ + kmutex_t z_lock; /* znode modification lock */ + krwlock_t z_map_lock; /* page map lock */ + krwlock_t z_parent_lock; /* parent lock for directories */ + krwlock_t z_name_lock; /* "master" lock for dirent locks */ + zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ + kmutex_t z_range_lock; /* protects changes to z_range_avl */ + avl_tree_t z_range_avl; /* avl tree of file range locks */ + uint8_t z_unlinked; /* file has been unlinked */ + uint8_t z_atime_dirty; /* atime needs to be synced */ + uint8_t z_dbuf_held; /* Is z_dbuf already held? */ + uint8_t z_zn_prefetch; /* Prefetch znodes? */ + uint_t z_blksz; /* block size in bytes */ + uint_t z_seq; /* modification sequence number */ + uint64_t z_mapcnt; /* number of pages mapped to file */ + uint64_t z_last_itx; /* last ZIL itx on this znode */ + uint32_t z_sync_cnt; /* synchronous open count */ + kmutex_t z_acl_lock; /* acl data lock */ + list_node_t z_link_node; /* all znodes in fs link */ + struct lockf *z_lockf; /* Head of byte-level lock list. */ + /* + * These are dmu managed fields. + */ + znode_phys_t *z_phys; /* pointer to persistent znode */ + dmu_buf_t *z_dbuf; /* buffer containing the z_phys */ +} znode_t; + + +/* + * Range locking rules + * -------------------- + * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole + * file range needs to be locked as RL_WRITER. Only then can the pages be + * freed etc and zp_size reset. zp_size must be set within range lock. + * 2. For writes and punching holes (zfs_write & zfs_space) just the range + * being written or freed needs to be locked as RL_WRITER. + * Multiple writes at the end of the file must coordinate zp_size updates + * to ensure data isn't lost. A compare and swap loop is currently used + * to ensure the file size is at least the offset last written. + * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being + * read needs to be locked as RL_READER. A check against zp_size can then + * be made for reading beyond end of file. + */ + +/* + * Convert between znode pointers and vnode pointers + */ +#define ZTOV(ZP) ((ZP)->z_vnode) +#define VTOZ(VP) ((znode_t *)(VP)->v_data) + +/* + * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation. + * ZFS_EXIT() must be called before exitting the vop. + */ +#define ZFS_ENTER(zfsvfs) \ + { \ + atomic_add_32(&(zfsvfs)->z_op_cnt, 1); \ + if ((zfsvfs)->z_unmounted1) { \ + ZFS_EXIT(zfsvfs); \ + return (EIO); \ + } \ + } +#define ZFS_EXIT(zfsvfs) atomic_add_32(&(zfsvfs)->z_op_cnt, -1) + +/* + * Macros for dealing with dmu_buf_hold + */ +#define ZFS_OBJ_HASH(obj_num) (obj_num & (ZFS_OBJ_MTX_SZ - 1)) +#define ZFS_OBJ_MUTEX(zp) \ + (&zp->z_zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(zp->z_id)]) +#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ + mutex_enter(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]); + +#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ + mutex_exit(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) + +/* + * Macros to encode/decode ZFS stored time values from/to struct timespec + */ +#define ZFS_TIME_ENCODE(tp, stmp) \ +{ \ + stmp[0] = (uint64_t)(tp)->tv_sec; \ + stmp[1] = (uint64_t)(tp)->tv_nsec; \ +} + +#define ZFS_TIME_DECODE(tp, stmp) \ +{ \ + (tp)->tv_sec = (time_t)stmp[0]; \ + (tp)->tv_nsec = (long)stmp[1]; \ +} + +/* + * Timestamp defines + */ +#define ACCESSED (AT_ATIME) +#define STATE_CHANGED (AT_CTIME) +#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) + +#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ + if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ + zfs_time_stamper(zp, ACCESSED, NULL) + +extern int zfs_init_fs(zfsvfs_t *, znode_t **, cred_t *); +extern void zfs_set_dataprop(objset_t *); +extern void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx); +extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *); +extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *); +extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); +extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); +extern void zfs_znode_init(void); +extern void zfs_znode_fini(void); +extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); +extern void zfs_zinactive(znode_t *); +extern void zfs_znode_delete(znode_t *, dmu_tx_t *); +extern void zfs_znode_free(znode_t *); +extern void zfs_remove_op_tables(); +extern int zfs_create_op_tables(); +extern dev_t zfs_cmpldev(uint64_t); + +extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, znode_t *zp, char *name); +extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, char *name); +extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, znode_t *zp, char *name); +extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, znode_t *zp, char *name, char *link); +extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); +extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, offset_t off, ssize_t len, int ioflag); +extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t off, uint64_t len); +extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, vattr_t *vap, uint_t mask_applied); +#ifndef ZFS_NO_ACL +extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, int aclcnt, ace_t *z_ace); +#endif + +extern zil_get_data_t zfs_get_data; +extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; +extern int zfsfstype; + +#endif /* _KERNEL */ + +extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZNODE_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h new file mode 100644 index 0000000..947ba9f --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h @@ -0,0 +1,276 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIL_H +#define _SYS_ZIL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Intent log format: + * + * Each objset has its own intent log. The log header (zil_header_t) + * for objset N's intent log is kept in the Nth object of the SPA's + * intent_log objset. The log header points to a chain of log blocks, + * each of which contains log records (i.e., transactions) followed by + * a log block trailer (zil_trailer_t). The format of a log record + * depends on the record (or transaction) type, but all records begin + * with a common structure that defines the type, length, and txg. + */ + +/* + * Intent log header - this on disk structure holds fields to manage + * the log. All fields are 64 bit to easily handle cross architectures. + */ +typedef struct zil_header { + uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ + uint64_t zh_replay_seq; /* highest replayed sequence number */ + blkptr_t zh_log; /* log chain */ + uint64_t zh_claim_seq; /* highest claimed sequence number */ + uint64_t zh_pad[5]; +} zil_header_t; + +/* + * Log block trailer - structure at the end of the header and each log block + * + * The zit_bt contains a zbt_cksum which for the intent log is + * the sequence number of this log block. A seq of 0 is invalid. + * The zbt_cksum is checked by the SPA against the sequence + * number passed in the blk_cksum field of the blkptr_t + */ +typedef struct zil_trailer { + uint64_t zit_pad; + blkptr_t zit_next_blk; /* next block in chain */ + uint64_t zit_nused; /* bytes in log block used */ + zio_block_tail_t zit_bt; /* block trailer */ +} zil_trailer_t; + +#define ZIL_MIN_BLKSZ 4096ULL +#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE +#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t)) + +/* + * The words of a log block checksum. + */ +#define ZIL_ZC_GUID_0 0 +#define ZIL_ZC_GUID_1 1 +#define ZIL_ZC_OBJSET 2 +#define ZIL_ZC_SEQ 3 + +/* + * Intent log transaction types and record structures + */ +#define TX_CREATE 1 /* Create file */ +#define TX_MKDIR 2 /* Make directory */ +#define TX_MKXATTR 3 /* Make XATTR directory */ +#define TX_SYMLINK 4 /* Create symbolic link to a file */ +#define TX_REMOVE 5 /* Remove file */ +#define TX_RMDIR 6 /* Remove directory */ +#define TX_LINK 7 /* Create hard link to a file */ +#define TX_RENAME 8 /* Rename a file */ +#define TX_WRITE 9 /* File write */ +#define TX_TRUNCATE 10 /* Truncate a file */ +#define TX_SETATTR 11 /* Set file attributes */ +#define TX_ACL 12 /* Set acl */ +#define TX_MAX_TYPE 13 /* Max transaction type */ + +/* + * Format of log records. + * The fields are carefully defined to allow them to be aligned + * and sized the same on sparc & intel architectures. + * Each log record has a common structure at the beginning. + * + * Note, lrc_seq holds two different sequence numbers. Whilst in memory + * it contains the transaction sequence number. The log record on + * disk holds the sequence number of all log records which is used to + * ensure we don't replay the same record. The two sequence numbers are + * different because the transactions can now be pushed out of order. + */ +typedef struct { /* common log record header */ + uint64_t lrc_txtype; /* intent log transaction type */ + uint64_t lrc_reclen; /* transaction record length */ + uint64_t lrc_txg; /* dmu transaction group number */ + uint64_t lrc_seq; /* see comment above */ +} lr_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* object id of directory */ + uint64_t lr_foid; /* object id of created file object */ + uint64_t lr_mode; /* mode of object */ + uint64_t lr_uid; /* uid of object */ + uint64_t lr_gid; /* gid of object */ + uint64_t lr_gen; /* generation (txg of creation) */ + uint64_t lr_crtime[2]; /* creation time */ + uint64_t lr_rdev; /* rdev of object to create */ + /* name of object to create follows this */ + /* for symlinks, link content follows name */ +} lr_create_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* obj id of directory */ + /* name of object to remove follows this */ +} lr_remove_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* obj id of directory */ + uint64_t lr_link_obj; /* obj id of link */ + /* name of object to link follows this */ +} lr_link_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_sdoid; /* obj id of source directory */ + uint64_t lr_tdoid; /* obj id of target directory */ + /* 2 strings: names of source and destination follow this */ +} lr_rename_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to write */ + uint64_t lr_offset; /* offset to write to */ + uint64_t lr_length; /* user data length to write */ + uint64_t lr_blkoff; /* offset represented by lr_blkptr */ + blkptr_t lr_blkptr; /* spa block pointer for replay */ + /* write data will follow for small writes */ +} lr_write_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* object id of file to truncate */ + uint64_t lr_offset; /* offset to truncate from */ + uint64_t lr_length; /* length to truncate */ +} lr_truncate_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to change attributes */ + uint64_t lr_mask; /* mask of attributes to set */ + uint64_t lr_mode; /* mode to set */ + uint64_t lr_uid; /* uid to set */ + uint64_t lr_gid; /* gid to set */ + uint64_t lr_size; /* size to set */ + uint64_t lr_atime[2]; /* access time */ + uint64_t lr_mtime[2]; /* modification time */ +} lr_setattr_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* obj id of file */ + uint64_t lr_aclcnt; /* number of acl entries */ + /* lr_aclcnt number of ace_t entries follow this */ +} lr_acl_t; + +/* + * ZIL structure definitions, interface function prototype and globals. + */ + +/* + * ZFS intent log transaction structure + */ +typedef enum { + WR_INDIRECT, /* indirect - a large write (dmu_sync() data */ + /* and put blkptr in log, rather than actual data) */ + WR_COPIED, /* immediate - data is copied into lr_write_t */ + WR_NEED_COPY, /* immediate - data needs to be copied if pushed */ +} itx_wr_state_t; + +typedef struct itx { + list_node_t itx_node; /* linkage on zl_itx_list */ + void *itx_private; /* type-specific opaque data */ + itx_wr_state_t itx_wr_state; /* write state */ + uint8_t itx_sync; /* synchronous transaction */ + lr_t itx_lr; /* common part of log record */ + /* followed by type-specific part of lr_xx_t and its immediate data */ +} itx_t; + + +/* + * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done() + * to handle the cleanup of the dmu_sync() buffer write + */ +typedef struct { + zilog_t *zgd_zilog; /* zilog */ + blkptr_t *zgd_bp; /* block pointer */ + struct rl *zgd_rl; /* range lock */ +} zgd_t; + + +typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, + uint64_t txg); +typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, + uint64_t txg); +typedef int zil_replay_func_t(); +typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); + +extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, + zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); + +extern void zil_init(void); +extern void zil_fini(void); + +extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys); +extern void zil_free(zilog_t *zilog); + +extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); +extern void zil_close(zilog_t *zilog); + +extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp, + zil_replay_func_t *replay_func[TX_MAX_TYPE]); +extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); + +extern itx_t *zil_itx_create(int txtype, size_t lrsize); +extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); + +extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid); + +extern int zil_claim(char *osname, void *txarg); +extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); +extern void zil_clean(zilog_t *zilog); +extern int zil_is_committed(zilog_t *zilog); + +extern int zil_suspend(zilog_t *zilog); +extern void zil_resume(zilog_t *zilog); + +extern void zil_add_vdev(zilog_t *zilog, uint64_t vdev); + +extern int zil_disable; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h new file mode 100644 index 0000000..3ecf4e4 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h @@ -0,0 +1,111 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIL_IMPL_H +#define _SYS_ZIL_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zil.h> +#include <sys/dmu_objset.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Log write buffer. + */ +typedef struct lwb { + zilog_t *lwb_zilog; /* back pointer to log struct */ + blkptr_t lwb_blk; /* on disk address of this log blk */ + int lwb_nused; /* # used bytes in buffer */ + int lwb_sz; /* size of block and buffer */ + char *lwb_buf; /* log write buffer */ + zio_t *lwb_zio; /* zio for this buffer */ + uint64_t lwb_max_txg; /* highest txg in this lwb */ + txg_handle_t lwb_txgh; /* txg handle for txg_exit() */ + list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ +} lwb_t; + +/* + * Vdev flushing: We use a bit map of size ZIL_VDEV_BMAP bytes. + * Any vdev numbers beyond that use a linked list of zil_vdev_t structures. + */ + +#define ZIL_VDEV_BMSZ 16 /* 16 * 8 = 128 vdevs */ +typedef struct zil_vdev { + uint64_t vdev; /* device written */ + list_node_t vdev_seq_node; /* zilog->zl_vdev_list linkage */ +} zil_vdev_t; + +/* + * Stable storage intent log management structure. One per dataset. + */ +struct zilog { + kmutex_t zl_lock; /* protects most zilog_t fields */ + struct dsl_pool *zl_dmu_pool; /* DSL pool */ + spa_t *zl_spa; /* handle for read/write log */ + const zil_header_t *zl_header; /* log header buffer */ + objset_t *zl_os; /* object set we're logging */ + zil_get_data_t *zl_get_data; /* callback to get object content */ + zio_t *zl_root_zio; /* log writer root zio */ + uint64_t zl_itx_seq; /* next itx sequence number */ + uint64_t zl_commit_seq; /* committed upto this number */ + uint64_t zl_lr_seq; /* log record sequence number */ + uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ + uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */ + uint32_t zl_suspend; /* log suspend count */ + kcondvar_t zl_cv_writer; /* log writer thread completion */ + kcondvar_t zl_cv_suspend; /* log suspend completion */ + uint8_t zl_suspending; /* log is currently suspending */ + uint8_t zl_keep_first; /* keep first log block in destroy */ + uint8_t zl_stop_replay; /* don't replay any further */ + uint8_t zl_stop_sync; /* for debugging */ + uint8_t zl_writer; /* boolean: write setup in progress */ + uint8_t zl_log_error; /* boolean: log write error */ + list_t zl_itx_list; /* in-memory itx list */ + uint64_t zl_itx_list_sz; /* total size of records on list */ + uint64_t zl_cur_used; /* current commit log size used */ + uint64_t zl_prev_used; /* previous commit log size used */ + list_t zl_lwb_list; /* in-flight log write list */ + list_t zl_vdev_list; /* list of [vdev, seq] pairs */ + uint8_t zl_vdev_bmap[ZIL_VDEV_BMSZ]; /* bitmap of vdevs */ + taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */ + avl_tree_t zl_dva_tree; /* track DVAs during log parse */ + clock_t zl_replay_time; /* lbolt of when replay started */ + uint64_t zl_replay_blks; /* number of log blocks replayed */ +}; + +typedef struct zil_dva_node { + dva_t zn_dva; + avl_node_t zn_node; +} zil_dva_node_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIL_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h new file mode 100644 index 0000000..b026ae6 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -0,0 +1,366 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZIO_H +#define _ZIO_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/avl.h> +#include <sys/dkio.h> +#include <sys/fs/zfs.h> +#include <sys/zio_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */ + +typedef struct zio_block_tail { + uint64_t zbt_magic; /* for validation, endianness */ + zio_cksum_t zbt_cksum; /* 256-bit checksum */ +} zio_block_tail_t; + +/* + * Gang block headers are self-checksumming and contain an array + * of block pointers. + */ +#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE +#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ + sizeof (zio_block_tail_t)) / sizeof (blkptr_t)) +#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ + sizeof (zio_block_tail_t) - \ + (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ + sizeof (uint64_t)) + +#define ZIO_GET_IOSIZE(zio) \ + (BP_IS_GANG((zio)->io_bp) ? \ + SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp)) + +typedef struct zio_gbh { + blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; + uint64_t zg_filler[SPA_GBH_FILLER]; + zio_block_tail_t zg_tail; +} zio_gbh_phys_t; + +enum zio_checksum { + ZIO_CHECKSUM_INHERIT = 0, + ZIO_CHECKSUM_ON, + ZIO_CHECKSUM_OFF, + ZIO_CHECKSUM_LABEL, + ZIO_CHECKSUM_GANG_HEADER, + ZIO_CHECKSUM_ZILOG, + ZIO_CHECKSUM_FLETCHER_2, + ZIO_CHECKSUM_FLETCHER_4, + ZIO_CHECKSUM_SHA256, + ZIO_CHECKSUM_FUNCTIONS +}; + +#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2 +#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON + +enum zio_compress { + ZIO_COMPRESS_INHERIT = 0, + ZIO_COMPRESS_ON, + ZIO_COMPRESS_OFF, + ZIO_COMPRESS_LZJB, + ZIO_COMPRESS_EMPTY, + ZIO_COMPRESS_GZIP_1, + ZIO_COMPRESS_GZIP_2, + ZIO_COMPRESS_GZIP_3, + ZIO_COMPRESS_GZIP_4, + ZIO_COMPRESS_GZIP_5, + ZIO_COMPRESS_GZIP_6, + ZIO_COMPRESS_GZIP_7, + ZIO_COMPRESS_GZIP_8, + ZIO_COMPRESS_GZIP_9, + ZIO_COMPRESS_FUNCTIONS +}; + +#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB +#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF + +#define ZIO_PRIORITY_NOW (zio_priority_table[0]) +#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) +#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) +#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3]) +#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4]) +#define ZIO_PRIORITY_FREE (zio_priority_table[5]) +#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6]) +#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7]) +#define ZIO_PRIORITY_RESILVER (zio_priority_table[8]) +#define ZIO_PRIORITY_SCRUB (zio_priority_table[9]) +#define ZIO_PRIORITY_TABLE_SIZE 10 + +#define ZIO_FLAG_MUSTSUCCEED 0x00000 +#define ZIO_FLAG_CANFAIL 0x00001 +#define ZIO_FLAG_FAILFAST 0x00002 +#define ZIO_FLAG_CONFIG_HELD 0x00004 +#define ZIO_FLAG_CONFIG_GRABBED 0x00008 + +#define ZIO_FLAG_DONT_CACHE 0x00010 +#define ZIO_FLAG_DONT_QUEUE 0x00020 +#define ZIO_FLAG_DONT_PROPAGATE 0x00040 +#define ZIO_FLAG_DONT_RETRY 0x00080 + +#define ZIO_FLAG_PHYSICAL 0x00100 +#define ZIO_FLAG_IO_BYPASS 0x00200 +#define ZIO_FLAG_IO_REPAIR 0x00400 +#define ZIO_FLAG_SPECULATIVE 0x00800 + +#define ZIO_FLAG_RESILVER 0x01000 +#define ZIO_FLAG_SCRUB 0x02000 +#define ZIO_FLAG_SCRUB_THREAD 0x04000 +#define ZIO_FLAG_SUBBLOCK 0x08000 + +#define ZIO_FLAG_NOBOOKMARK 0x10000 +#define ZIO_FLAG_USER 0x20000 + +#define ZIO_FLAG_METADATA 0x40000 + +#define ZIO_FLAG_GANG_INHERIT \ + (ZIO_FLAG_CANFAIL | \ + ZIO_FLAG_FAILFAST | \ + ZIO_FLAG_CONFIG_HELD | \ + ZIO_FLAG_DONT_RETRY | \ + ZIO_FLAG_IO_REPAIR | \ + ZIO_FLAG_SPECULATIVE | \ + ZIO_FLAG_RESILVER | \ + ZIO_FLAG_SCRUB | \ + ZIO_FLAG_SCRUB_THREAD) + +#define ZIO_FLAG_VDEV_INHERIT \ + (ZIO_FLAG_GANG_INHERIT | \ + ZIO_FLAG_DONT_CACHE | \ + ZIO_FLAG_PHYSICAL) + +/* + * We'll take the EILSEQ (Illegal byte sequence) errno + * to indicate checksum errors. + */ +#define ECKSUM EILSEQ + +typedef struct zio zio_t; +typedef void zio_done_func_t(zio_t *zio); + +extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; +extern char *zio_type_name[ZIO_TYPES]; + +/* + * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely + * identifies any block in the pool. By convention, the meta-objset (MOS) + * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is + * level -1 of the meta-dnode, and intent log blocks (which are chained + * off the root block) have blkid == sequence number. In summary: + * + * mos is objset 0 + * meta-dnode is object 0 + * root block is <objset, 0, -1, 0> + * intent log is <objset, 0, -1, ZIL sequence number> + * + * Note: this structure is called a bookmark because its first purpose was + * to remember where to resume a pool-wide traverse. The absolute ordering + * for block visitation during traversal is defined in compare_bookmark(). + * + * Note: this structure is passed between userland and the kernel. + * Therefore it must not change size or alignment between 32/64 bit + * compilation options. + */ +typedef struct zbookmark { + uint64_t zb_objset; + uint64_t zb_object; + int64_t zb_level; + uint64_t zb_blkid; +} zbookmark_t; + +struct zio { + /* Core information about this I/O */ + zio_t *io_parent; + zio_t *io_root; + spa_t *io_spa; + zbookmark_t io_bookmark; + enum zio_checksum io_checksum; + enum zio_compress io_compress; + int io_ndvas; + uint64_t io_txg; + blkptr_t *io_bp; + blkptr_t io_bp_copy; + zio_t *io_child; + zio_t *io_sibling_prev; + zio_t *io_sibling_next; + zio_transform_t *io_transform_stack; + zio_t *io_logical; + + /* Callback info */ + zio_done_func_t *io_ready; + zio_done_func_t *io_done; + void *io_private; + blkptr_t io_bp_orig; + + /* Data represented by this I/O */ + void *io_data; + uint64_t io_size; + + /* Stuff for the vdev stack */ + vdev_t *io_vd; + void *io_vsd; + uint64_t io_offset; + uint64_t io_deadline; + uint64_t io_timestamp; + avl_node_t io_offset_node; + avl_node_t io_deadline_node; + avl_tree_t *io_vdev_tree; + zio_t *io_delegate_list; + zio_t *io_delegate_next; + + /* Internal pipeline state */ + int io_flags; + enum zio_type io_type; + enum zio_stage io_stage; + uint8_t io_stalled; + uint8_t io_priority; + struct dk_callback io_dk_callback; + int io_cmd; + int io_retries; + int io_error; + uint32_t io_numerrors; + uint32_t io_pipeline; + uint32_t io_async_stages; + uint64_t io_children_notready; + uint64_t io_children_notdone; + void *io_waiter; + kmutex_t io_lock; + kcondvar_t io_cv; + + /* FMA state */ + uint64_t io_ena; +}; + +extern zio_t *zio_null(zio_t *pio, spa_t *spa, + zio_done_func_t *done, void *private, int flags); + +extern zio_t *zio_root(spa_t *spa, + zio_done_func_t *done, void *private, int flags); + +extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, + uint64_t size, zio_done_func_t *done, void *private, + int priority, int flags, zbookmark_t *zb); + +extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, + int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, + zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, + int flags, zbookmark_t *zb); + +extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum, + uint64_t txg, blkptr_t *bp, void *data, uint64_t size, + zio_done_func_t *done, void *private, int priority, int flags, + zbookmark_t *zb); + +extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private); + +extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private); + +extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, + zio_done_func_t *done, void *private, int priority, int flags); + +extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, + uint64_t size, void *data, int checksum, + zio_done_func_t *done, void *private, int priority, int flags); + +extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, + uint64_t size, void *data, int checksum, + zio_done_func_t *done, void *private, int priority, int flags); + +extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, + blkptr_t *old_bp, uint64_t txg); +extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg); + +extern int zio_wait(zio_t *zio); +extern void zio_nowait(zio_t *zio); + +extern void *zio_buf_alloc(size_t size); +extern void zio_buf_free(void *buf, size_t size); +extern void *zio_data_buf_alloc(size_t size); +extern void zio_data_buf_free(void *buf, size_t size); + +/* + * Move an I/O to the next stage of the pipeline and execute that stage. + * There's no locking on io_stage because there's no legitimate way for + * multiple threads to be attempting to process the same I/O. + */ +extern void zio_next_stage(zio_t *zio); +extern void zio_next_stage_async(zio_t *zio); +extern void zio_wait_children_done(zio_t *zio); + +/* + * Delegate I/O to a child vdev. + */ +extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, + uint64_t offset, void *data, uint64_t size, int type, int priority, + int flags, zio_done_func_t *done, void *private); + +extern void zio_vdev_io_bypass(zio_t *zio); +extern void zio_vdev_io_reissue(zio_t *zio); +extern void zio_vdev_io_redone(zio_t *zio); + +extern void zio_checksum_verified(zio_t *zio); +extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp); + +extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); +extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); + +boolean_t zio_should_retry(zio_t *zio); + +/* + * Initial setup and teardown. + */ +extern void zio_init(void); +extern void zio_fini(void); + +/* + * Fault injection + */ +struct zinject_record; +extern uint32_t zio_injection_enabled; +extern int zio_inject_fault(char *name, int flags, int *id, + struct zinject_record *record); +extern int zio_inject_list_next(int *id, char *name, size_t buflen, + struct zinject_record *record); +extern int zio_clear_fault(int id); +extern int zio_handle_fault_injection(zio_t *zio, int error); +extern int zio_handle_device_injection(vdev_t *vd, int error); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h new file mode 100644 index 0000000..bb7bd41 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIO_CHECKSUM_H +#define _SYS_ZIO_CHECKSUM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Signature for checksum functions. + */ +typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); + +/* + * Information about each checksum function. + */ +typedef struct zio_checksum_info { + zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ + int ci_correctable; /* number of correctable bits */ + int ci_zbt; /* uses zio block tail? */ + char *ci_name; /* descriptive name */ +} zio_checksum_info_t; + +extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; + +/* + * Checksum routines. + */ +extern zio_checksum_t fletcher_2_native; +extern zio_checksum_t fletcher_4_native; +extern zio_checksum_t fletcher_4_incremental_native; + +extern zio_checksum_t fletcher_2_byteswap; +extern zio_checksum_t fletcher_4_byteswap; +extern zio_checksum_t fletcher_4_incremental_byteswap; + +extern zio_checksum_t zio_checksum_SHA256; + +extern void zio_checksum(uint_t checksum, zio_cksum_t *zcp, + void *data, uint64_t size); +extern int zio_checksum_error(zio_t *zio); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIO_CHECKSUM_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h new file mode 100644 index 0000000..66ee8d4 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h @@ -0,0 +1,82 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIO_COMPRESS_H +#define _SYS_ZIO_COMPRESS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Common signature for all zio compress/decompress functions. + */ +typedef size_t zio_compress_func_t(void *src, void *dst, + size_t s_len, size_t d_len, int); +typedef int zio_decompress_func_t(void *src, void *dst, + size_t s_len, size_t d_len, int); + +/* + * Information about each compression function. + */ +typedef struct zio_compress_info { + zio_compress_func_t *ci_compress; /* compression function */ + zio_decompress_func_t *ci_decompress; /* decompression function */ + int ci_level; /* level parameter */ + char *ci_name; /* algorithm name */ +} zio_compress_info_t; + +extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; + +/* + * Compression routines. + */ +extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); + +/* + * Compress and decompress data if necessary. + */ +extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize, + void **destp, uint64_t *destsizep, uint64_t *destbufsizep); +extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, + void *dest, uint64_t destsize); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIO_COMPRESS_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h new file mode 100644 index 0000000..d2ddbc3 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h @@ -0,0 +1,205 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZIO_IMPL_H +#define _ZIO_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * I/O Groups: pipeline stage definitions. + */ + +typedef enum zio_stage { + ZIO_STAGE_OPEN = 0, /* RWFCI */ + ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */ + + ZIO_STAGE_WRITE_COMPRESS, /* -W--- */ + ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ + + ZIO_STAGE_GANG_PIPELINE, /* -WFC- */ + + ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */ + ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */ + ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */ + ZIO_STAGE_CLAIM_GANG_MEMBERS, /* ---C- */ + + ZIO_STAGE_DVA_ALLOCATE, /* -W--- */ + ZIO_STAGE_DVA_FREE, /* --F-- */ + ZIO_STAGE_DVA_CLAIM, /* ---C- */ + + ZIO_STAGE_GANG_CHECKSUM_GENERATE, /* -W--- */ + + ZIO_STAGE_READY, /* RWFCI */ + + ZIO_STAGE_VDEV_IO_START, /* RW--I */ + ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ + ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ + + ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */ + + ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */ + ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */ + ZIO_STAGE_READ_DECOMPRESS, /* R---- */ + + ZIO_STAGE_DONE /* RWFCI */ +} zio_stage_t; + +/* + * The stages for which there's some performance value in going async. + * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well. + */ +#define ZIO_ASYNC_PIPELINE_STAGES \ + ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \ + (1U << ZIO_STAGE_VDEV_IO_DONE) | \ + (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ + (1U << ZIO_STAGE_READ_DECOMPRESS)) + +#define ZIO_VDEV_IO_PIPELINE \ + ((1U << ZIO_STAGE_VDEV_IO_START) | \ + (1U << ZIO_STAGE_VDEV_IO_DONE) | \ + (1U << ZIO_STAGE_VDEV_IO_ASSESS)) + +#define ZIO_READ_PHYS_PIPELINE \ + ((1U << ZIO_STAGE_OPEN) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ + (1U << ZIO_STAGE_READY) | \ + ZIO_VDEV_IO_PIPELINE | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_READ_PIPELINE \ + ZIO_READ_PHYS_PIPELINE + +#define ZIO_WRITE_PHYS_PIPELINE \ + ((1U << ZIO_STAGE_OPEN) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ + (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \ + (1U << ZIO_STAGE_READY) | \ + ZIO_VDEV_IO_PIPELINE | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_WRITE_COMMON_PIPELINE \ + ZIO_WRITE_PHYS_PIPELINE + +#define ZIO_WRITE_PIPELINE \ + ((1U << ZIO_STAGE_WRITE_COMPRESS) | \ + ZIO_WRITE_COMMON_PIPELINE) + +#define ZIO_GANG_STAGES \ + ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ + (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \ + (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \ + (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \ + (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \ + (1U << ZIO_STAGE_READ_GANG_MEMBERS)) + +#define ZIO_REWRITE_PIPELINE \ + ((1U << ZIO_STAGE_GANG_PIPELINE) | \ + (1U << ZIO_STAGE_GET_GANG_HEADER) | \ + (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \ + (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \ + ZIO_WRITE_COMMON_PIPELINE) + +#define ZIO_WRITE_ALLOCATE_PIPELINE \ + ((1U << ZIO_STAGE_DVA_ALLOCATE) | \ + ZIO_WRITE_COMMON_PIPELINE) + +#define ZIO_GANG_FREE_STAGES \ + ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ + (1U << ZIO_STAGE_FREE_GANG_MEMBERS)) + +#define ZIO_FREE_PIPELINE \ + ((1U << ZIO_STAGE_OPEN) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ + (1U << ZIO_STAGE_GANG_PIPELINE) | \ + (1U << ZIO_STAGE_GET_GANG_HEADER) | \ + (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \ + (1U << ZIO_STAGE_DVA_FREE) | \ + (1U << ZIO_STAGE_READY) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_CLAIM_PIPELINE \ + ((1U << ZIO_STAGE_OPEN) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ + (1U << ZIO_STAGE_GANG_PIPELINE) | \ + (1U << ZIO_STAGE_GET_GANG_HEADER) | \ + (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \ + (1U << ZIO_STAGE_DVA_CLAIM) | \ + (1U << ZIO_STAGE_READY) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_IOCTL_PIPELINE \ + ((1U << ZIO_STAGE_OPEN) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ + (1U << ZIO_STAGE_READY) | \ + ZIO_VDEV_IO_PIPELINE | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_WAIT_FOR_CHILDREN_PIPELINE \ + ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ + (1U << ZIO_STAGE_READY) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \ + ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_VDEV_CHILD_PIPELINE \ + (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \ + ZIO_VDEV_IO_PIPELINE) + +#define ZIO_ERROR_PIPELINE_MASK \ + ZIO_WAIT_FOR_CHILDREN_PIPELINE + +typedef struct zio_transform zio_transform_t; +struct zio_transform { + void *zt_data; + uint64_t zt_size; + uint64_t zt_bufsize; + zio_transform_t *zt_next; +}; + +extern void zio_inject_init(void); +extern void zio_inject_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h new file mode 100644 index 0000000..df85824 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h @@ -0,0 +1,68 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZVOL_H +#define _SYS_ZVOL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL +extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); +extern int zvol_check_volblocksize(uint64_t volblocksize); +extern int zvol_get_stats(objset_t *os, nvlist_t *nv); +extern void zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx); +extern int zvol_create_minor(const char *, dev_t); +extern int zvol_remove_minor(const char *); +extern int zvol_set_volsize(const char *, dev_t, uint64_t); +extern int zvol_set_volblocksize(const char *, uint64_t); + +extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); +extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr); +#ifndef __FreeBSD__ +extern int zvol_strategy(buf_t *bp); +extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr); +extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr); +extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr); +extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr); +#endif +extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, + int *rvalp); +extern int zvol_busy(void); +extern void zvol_init(void); +extern void zvol_fini(void); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZVOL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c new file mode 100644 index 0000000..844beb6 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c @@ -0,0 +1,611 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/txg_impl.h> +#include <sys/dmu_impl.h> +#include <sys/dsl_pool.h> +#include <sys/callb.h> + +/* + * Pool-wide transaction groups. + */ + +static void txg_sync_thread(void *arg); +static void txg_quiesce_thread(void *arg); +static void txg_timelimit_thread(void *arg); + +int txg_time = 5; /* max 5 seconds worth of delta per txg */ + +/* + * Prepare the txg subsystem. + */ +void +txg_init(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + int c, i; + bzero(tx, sizeof (tx_state_t)); + + tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); + for (c = 0; c < max_ncpus; c++) { + mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); + for (i = 0; i < TXG_SIZE; i++) + cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL); + } + + rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL); + mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_timeout_exit_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); + + tx->tx_open_txg = txg; +} + +/* + * Close down the txg subsystem. + */ +void +txg_fini(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + int c, i; + + ASSERT(tx->tx_threads == 0); + + cv_destroy(&tx->tx_exit_cv); + cv_destroy(&tx->tx_timeout_exit_cv); + cv_destroy(&tx->tx_quiesce_done_cv); + cv_destroy(&tx->tx_quiesce_more_cv); + cv_destroy(&tx->tx_sync_done_cv); + cv_destroy(&tx->tx_sync_more_cv); + rw_destroy(&tx->tx_suspend); + mutex_destroy(&tx->tx_sync_lock); + + for (c = 0; c < max_ncpus; c++) { + for (i = 0; i < TXG_SIZE; i++) + cv_destroy(&tx->tx_cpu[c].tc_cv[i]); + mutex_destroy(&tx->tx_cpu[c].tc_lock); + } + + kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); + + bzero(tx, sizeof (tx_state_t)); +} + +/* + * Start syncing transaction groups. + */ +void +txg_sync_start(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + mutex_enter(&tx->tx_sync_lock); + + dprintf("pool %p\n", dp); + + ASSERT(tx->tx_threads == 0); + + tx->tx_threads = 3; + + tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, + dp, 0, &p0, TS_RUN, minclsyspri); + + tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread, + dp, 0, &p0, TS_RUN, minclsyspri); + + tx->tx_timelimit_thread = thread_create(NULL, 0, txg_timelimit_thread, + dp, 0, &p0, TS_RUN, minclsyspri); + + mutex_exit(&tx->tx_sync_lock); +} + +static void +txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) +{ + CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); + mutex_enter(&tx->tx_sync_lock); +} + +static void +txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) +{ + ASSERT(*tpp != NULL); + *tpp = NULL; + tx->tx_threads--; + cv_broadcast(&tx->tx_exit_cv); + CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ + thread_exit(); +} + +static void +txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, int secmax) +{ + CALLB_CPR_SAFE_BEGIN(cpr); + + if (secmax) + (void) cv_timedwait(cv, &tx->tx_sync_lock, secmax * hz); + else + cv_wait(cv, &tx->tx_sync_lock); + + CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); +} + +/* + * Stop syncing transaction groups. + */ +void +txg_sync_stop(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + dprintf("pool %p\n", dp); + /* + * Finish off any work in progress. + */ + ASSERT(tx->tx_threads == 3); + txg_wait_synced(dp, 0); + + /* + * Wake all 3 sync threads (one per state) and wait for them to die. + */ + mutex_enter(&tx->tx_sync_lock); + + ASSERT(tx->tx_threads == 3); + + tx->tx_exiting = 1; + + cv_broadcast(&tx->tx_quiesce_more_cv); + cv_broadcast(&tx->tx_quiesce_done_cv); + cv_broadcast(&tx->tx_sync_more_cv); + cv_broadcast(&tx->tx_timeout_exit_cv); + + while (tx->tx_threads != 0) + cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); + + tx->tx_exiting = 0; + + mutex_exit(&tx->tx_sync_lock); +} + +uint64_t +txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) +{ + tx_state_t *tx = &dp->dp_tx; + tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID]; + uint64_t txg; + + mutex_enter(&tc->tc_lock); + + txg = tx->tx_open_txg; + tc->tc_count[txg & TXG_MASK]++; + + th->th_cpu = tc; + th->th_txg = txg; + + return (txg); +} + +void +txg_rele_to_quiesce(txg_handle_t *th) +{ + tx_cpu_t *tc = th->th_cpu; + + mutex_exit(&tc->tc_lock); +} + +void +txg_rele_to_sync(txg_handle_t *th) +{ + tx_cpu_t *tc = th->th_cpu; + int g = th->th_txg & TXG_MASK; + + mutex_enter(&tc->tc_lock); + ASSERT(tc->tc_count[g] != 0); + if (--tc->tc_count[g] == 0) + cv_broadcast(&tc->tc_cv[g]); + mutex_exit(&tc->tc_lock); + + th->th_cpu = NULL; /* defensive */ +} + +static void +txg_quiesce(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + int g = txg & TXG_MASK; + int c; + + /* + * Grab all tx_cpu locks so nobody else can get into this txg. + */ + for (c = 0; c < max_ncpus; c++) + mutex_enter(&tx->tx_cpu[c].tc_lock); + + ASSERT(txg == tx->tx_open_txg); + tx->tx_open_txg++; + + /* + * Now that we've incremented tx_open_txg, we can let threads + * enter the next transaction group. + */ + for (c = 0; c < max_ncpus; c++) + mutex_exit(&tx->tx_cpu[c].tc_lock); + + /* + * Quiesce the transaction group by waiting for everyone to txg_exit(). + */ + for (c = 0; c < max_ncpus; c++) { + tx_cpu_t *tc = &tx->tx_cpu[c]; + mutex_enter(&tc->tc_lock); + while (tc->tc_count[g] != 0) + cv_wait(&tc->tc_cv[g], &tc->tc_lock); + mutex_exit(&tc->tc_lock); + } +} + +static void +txg_sync_thread(void *arg) +{ + dsl_pool_t *dp = arg; + tx_state_t *tx = &dp->dp_tx; + callb_cpr_t cpr; + + txg_thread_enter(tx, &cpr); + + for (;;) { + uint64_t txg; + + /* + * We sync when there's someone waiting on us, or the + * quiesce thread has handed off a txg to us. + */ + while (!tx->tx_exiting && + tx->tx_synced_txg >= tx->tx_sync_txg_waiting && + tx->tx_quiesced_txg == 0) { + dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", + tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); + txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, 0); + } + + /* + * Wait until the quiesce thread hands off a txg to us, + * prompting it to do so if necessary. + */ + while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { + if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) + tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; + cv_broadcast(&tx->tx_quiesce_more_cv); + txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); + } + + if (tx->tx_exiting) + txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); + + rw_enter(&tx->tx_suspend, RW_WRITER); + + /* + * Consume the quiesced txg which has been handed off to + * us. This may cause the quiescing thread to now be + * able to quiesce another txg, so we must signal it. + */ + txg = tx->tx_quiesced_txg; + tx->tx_quiesced_txg = 0; + tx->tx_syncing_txg = txg; + cv_broadcast(&tx->tx_quiesce_more_cv); + rw_exit(&tx->tx_suspend); + + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, + tx->tx_sync_txg_waiting); + mutex_exit(&tx->tx_sync_lock); + spa_sync(dp->dp_spa, txg); + mutex_enter(&tx->tx_sync_lock); + rw_enter(&tx->tx_suspend, RW_WRITER); + tx->tx_synced_txg = txg; + tx->tx_syncing_txg = 0; + rw_exit(&tx->tx_suspend); + cv_broadcast(&tx->tx_sync_done_cv); + } +} + +static void +txg_quiesce_thread(void *arg) +{ + dsl_pool_t *dp = arg; + tx_state_t *tx = &dp->dp_tx; + callb_cpr_t cpr; + + txg_thread_enter(tx, &cpr); + + for (;;) { + uint64_t txg; + + /* + * We quiesce when there's someone waiting on us. + * However, we can only have one txg in "quiescing" or + * "quiesced, waiting to sync" state. So we wait until + * the "quiesced, waiting to sync" txg has been consumed + * by the sync thread. + */ + while (!tx->tx_exiting && + (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || + tx->tx_quiesced_txg != 0)) + txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); + + if (tx->tx_exiting) + txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); + + txg = tx->tx_open_txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, + tx->tx_sync_txg_waiting); + mutex_exit(&tx->tx_sync_lock); + txg_quiesce(dp, txg); + mutex_enter(&tx->tx_sync_lock); + + /* + * Hand this txg off to the sync thread. + */ + dprintf("quiesce done, handing off txg %llu\n", txg); + tx->tx_quiesced_txg = txg; + cv_broadcast(&tx->tx_sync_more_cv); + cv_broadcast(&tx->tx_quiesce_done_cv); + } +} + +void +txg_wait_synced(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + + mutex_enter(&tx->tx_sync_lock); + ASSERT(tx->tx_threads == 3); + if (txg == 0) + txg = tx->tx_open_txg; + if (tx->tx_sync_txg_waiting < txg) + tx->tx_sync_txg_waiting = txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + while (tx->tx_synced_txg < txg) { + dprintf("broadcasting sync more " + "tx_synced=%llu waiting=%llu dp=%p\n", + tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); + cv_broadcast(&tx->tx_sync_more_cv); + cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); + } + mutex_exit(&tx->tx_sync_lock); +} + +void +txg_wait_open(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + + mutex_enter(&tx->tx_sync_lock); + ASSERT(tx->tx_threads == 3); + if (txg == 0) + txg = tx->tx_open_txg + 1; + if (tx->tx_quiesce_txg_waiting < txg) + tx->tx_quiesce_txg_waiting = txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + while (tx->tx_open_txg < txg) { + cv_broadcast(&tx->tx_quiesce_more_cv); + cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); + } + mutex_exit(&tx->tx_sync_lock); +} + +static void +txg_timelimit_thread(void *arg) +{ + dsl_pool_t *dp = arg; + tx_state_t *tx = &dp->dp_tx; + callb_cpr_t cpr; + + txg_thread_enter(tx, &cpr); + + while (!tx->tx_exiting) { + uint64_t txg = tx->tx_open_txg + 1; + + txg_thread_wait(tx, &cpr, &tx->tx_timeout_exit_cv, txg_time); + + if (tx->tx_quiesce_txg_waiting < txg) + tx->tx_quiesce_txg_waiting = txg; + + while (!tx->tx_exiting && tx->tx_open_txg < txg) { + dprintf("pushing out %llu\n", txg); + cv_broadcast(&tx->tx_quiesce_more_cv); + txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); + } + } + txg_thread_exit(tx, &cpr, &tx->tx_timelimit_thread); +} + +int +txg_stalled(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); +} + +void +txg_suspend(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + /* XXX some code paths suspend when they are already suspended! */ + rw_enter(&tx->tx_suspend, RW_READER); +} + +void +txg_resume(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + rw_exit(&tx->tx_suspend); +} + +/* + * Per-txg object lists. + */ +void +txg_list_create(txg_list_t *tl, size_t offset) +{ + int t; + + mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); + + tl->tl_offset = offset; + + for (t = 0; t < TXG_SIZE; t++) + tl->tl_head[t] = NULL; +} + +void +txg_list_destroy(txg_list_t *tl) +{ + int t; + + for (t = 0; t < TXG_SIZE; t++) + ASSERT(txg_list_empty(tl, t)); + + mutex_destroy(&tl->tl_lock); +} + +int +txg_list_empty(txg_list_t *tl, uint64_t txg) +{ + return (tl->tl_head[txg & TXG_MASK] == NULL); +} + +/* + * Add an entry to the list. + * Returns 0 if it's a new entry, 1 if it's already there. + */ +int +txg_list_add(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + int already_on_list; + + mutex_enter(&tl->tl_lock); + already_on_list = tn->tn_member[t]; + if (!already_on_list) { + tn->tn_member[t] = 1; + tn->tn_next[t] = tl->tl_head[t]; + tl->tl_head[t] = tn; + } + mutex_exit(&tl->tl_lock); + + return (already_on_list); +} + +/* + * Remove the head of the list and return it. + */ +void * +txg_list_remove(txg_list_t *tl, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn; + void *p = NULL; + + mutex_enter(&tl->tl_lock); + if ((tn = tl->tl_head[t]) != NULL) { + p = (char *)tn - tl->tl_offset; + tl->tl_head[t] = tn->tn_next[t]; + tn->tn_next[t] = NULL; + tn->tn_member[t] = 0; + } + mutex_exit(&tl->tl_lock); + + return (p); +} + +/* + * Remove a specific item from the list and return it. + */ +void * +txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn, **tp; + + mutex_enter(&tl->tl_lock); + + for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { + if ((char *)tn - tl->tl_offset == p) { + *tp = tn->tn_next[t]; + tn->tn_next[t] = NULL; + tn->tn_member[t] = 0; + mutex_exit(&tl->tl_lock); + return (p); + } + } + + mutex_exit(&tl->tl_lock); + + return (NULL); +} + +int +txg_list_member(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + + return (tn->tn_member[t]); +} + +/* + * Walk a txg list -- only safe if you know it's not changing. + */ +void * +txg_list_head(txg_list_t *tl, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = tl->tl_head[t]; + + return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); +} + +void * +txg_list_next(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + + tn = tn->tn_next[t]; + + return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c b/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c new file mode 100644 index 0000000..34d7e0c --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/uberblock_impl.h> +#include <sys/vdev_impl.h> + +int +uberblock_verify(uberblock_t *ub) +{ + if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) + byteswap_uint64_array(ub, sizeof (uberblock_t)); + + if (ub->ub_magic != UBERBLOCK_MAGIC) + return (EINVAL); + + return (0); +} + +/* + * Update the uberblock and return a boolean value indicating whether + * anything changed in this transaction group. + */ +int +uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg) +{ + ASSERT(ub->ub_txg < txg); + + /* + * We explicitly do not set ub_version here, so that older versions + * continue to be written with the previous uberblock version. + */ + ub->ub_magic = UBERBLOCK_MAGIC; + ub->ub_txg = txg; + ub->ub_guid_sum = rvd->vdev_guid_sum; + ub->ub_timestamp = gethrestime_sec(); + + return (ub->ub_rootbp.blk_birth == txg); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c new file mode 100644 index 0000000..b52e729 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c @@ -0,0 +1,107 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/avl.h> +#include <sys/unique.h> + +static avl_tree_t unique_avl; +static kmutex_t unique_mtx; /* Lock never initialized. */ +SX_SYSINIT(unique, &unique_mtx, "unique lock"); + +typedef struct unique { + avl_node_t un_link; + uint64_t un_value; +} unique_t; + +#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1) + +static int +unique_compare(const void *a, const void *b) +{ + const unique_t *una = a; + const unique_t *unb = b; + + if (una->un_value < unb->un_value) + return (-1); + if (una->un_value > unb->un_value) + return (+1); + return (0); +} + +void +unique_init(void) +{ + avl_create(&unique_avl, unique_compare, + sizeof (unique_t), offsetof(unique_t, un_link)); +} + +uint64_t +unique_create(void) +{ + return (unique_insert(0)); +} + +uint64_t +unique_insert(uint64_t value) +{ + avl_index_t idx; + unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP); + + un->un_value = value; + + mutex_enter(&unique_mtx); + while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK || + avl_find(&unique_avl, un, &idx)) { + mutex_exit(&unique_mtx); + (void) random_get_pseudo_bytes((void*)&un->un_value, + sizeof (un->un_value)); + un->un_value &= UNIQUE_MASK; + mutex_enter(&unique_mtx); + } + + avl_insert(&unique_avl, un, idx); + mutex_exit(&unique_mtx); + + return (un->un_value); +} + +void +unique_remove(uint64_t value) +{ + unique_t un_tofind; + unique_t *un; + + un_tofind.un_value = value; + mutex_enter(&unique_mtx); + un = avl_find(&unique_avl, &un_tofind, NULL); + if (un != NULL) { + avl_remove(&unique_avl, un); + kmem_free(un, sizeof (unique_t)); + } + mutex_exit(&unique_mtx); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c new file mode 100644 index 0000000..0fceb8d --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -0,0 +1,1905 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/fm/fs/zfs.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/vdev_impl.h> +#include <sys/uberblock_impl.h> +#include <sys/metaslab.h> +#include <sys/metaslab_impl.h> +#include <sys/space_map.h> +#include <sys/zio.h> +#include <sys/zap.h> +#include <sys/fs/zfs.h> + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); + +/* + * Virtual device management. + */ + +static vdev_ops_t *vdev_ops_table[] = { + &vdev_root_ops, + &vdev_raidz_ops, + &vdev_mirror_ops, + &vdev_replacing_ops, + &vdev_spare_ops, +#ifdef _KERNEL + &vdev_geom_ops, +#else + &vdev_disk_ops, + &vdev_file_ops, +#endif + &vdev_missing_ops, + NULL +}; + +/* maximum scrub/resilver I/O queue */ +int zfs_scrub_limit = 70; + +/* + * Given a vdev type, return the appropriate ops vector. + */ +static vdev_ops_t * +vdev_getops(const char *type) +{ + vdev_ops_t *ops, **opspp; + + for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) + if (strcmp(ops->vdev_op_type, type) == 0) + break; + + return (ops); +} + +/* + * Default asize function: return the MAX of psize with the asize of + * all children. This is what's used by anything other than RAID-Z. + */ +uint64_t +vdev_default_asize(vdev_t *vd, uint64_t psize) +{ + uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); + uint64_t csize; + uint64_t c; + + for (c = 0; c < vd->vdev_children; c++) { + csize = vdev_psize_to_asize(vd->vdev_child[c], psize); + asize = MAX(asize, csize); + } + + return (asize); +} + +/* + * Get the replaceable or attachable device size. + * If the parent is a mirror or raidz, the replaceable size is the minimum + * psize of all its children. For the rest, just return our own psize. + * + * e.g. + * psize rsize + * root - - + * mirror/raidz - - + * disk1 20g 20g + * disk2 40g 20g + * disk3 80g 80g + */ +uint64_t +vdev_get_rsize(vdev_t *vd) +{ + vdev_t *pvd, *cvd; + uint64_t c, rsize; + + pvd = vd->vdev_parent; + + /* + * If our parent is NULL or the root, just return our own psize. + */ + if (pvd == NULL || pvd->vdev_parent == NULL) + return (vd->vdev_psize); + + rsize = 0; + + for (c = 0; c < pvd->vdev_children; c++) { + cvd = pvd->vdev_child[c]; + rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1; + } + + return (rsize); +} + +vdev_t * +vdev_lookup_top(spa_t *spa, uint64_t vdev) +{ + vdev_t *rvd = spa->spa_root_vdev; + + if (vdev < rvd->vdev_children) + return (rvd->vdev_child[vdev]); + + return (NULL); +} + +vdev_t * +vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) +{ + int c; + vdev_t *mvd; + + if (vd->vdev_guid == guid) + return (vd); + + for (c = 0; c < vd->vdev_children; c++) + if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != + NULL) + return (mvd); + + return (NULL); +} + +void +vdev_add_child(vdev_t *pvd, vdev_t *cvd) +{ + size_t oldsize, newsize; + uint64_t id = cvd->vdev_id; + vdev_t **newchild; + + ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); + ASSERT(cvd->vdev_parent == NULL); + + cvd->vdev_parent = pvd; + + if (pvd == NULL) + return; + + ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); + + oldsize = pvd->vdev_children * sizeof (vdev_t *); + pvd->vdev_children = MAX(pvd->vdev_children, id + 1); + newsize = pvd->vdev_children * sizeof (vdev_t *); + + newchild = kmem_zalloc(newsize, KM_SLEEP); + if (pvd->vdev_child != NULL) { + bcopy(pvd->vdev_child, newchild, oldsize); + kmem_free(pvd->vdev_child, oldsize); + } + + pvd->vdev_child = newchild; + pvd->vdev_child[id] = cvd; + + cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); + ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); + + /* + * Walk up all ancestors to update guid sum. + */ + for (; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += cvd->vdev_guid_sum; + + if (cvd->vdev_ops->vdev_op_leaf) + cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; +} + +void +vdev_remove_child(vdev_t *pvd, vdev_t *cvd) +{ + int c; + uint_t id = cvd->vdev_id; + + ASSERT(cvd->vdev_parent == pvd); + + if (pvd == NULL) + return; + + ASSERT(id < pvd->vdev_children); + ASSERT(pvd->vdev_child[id] == cvd); + + pvd->vdev_child[id] = NULL; + cvd->vdev_parent = NULL; + + for (c = 0; c < pvd->vdev_children; c++) + if (pvd->vdev_child[c]) + break; + + if (c == pvd->vdev_children) { + kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); + pvd->vdev_child = NULL; + pvd->vdev_children = 0; + } + + /* + * Walk up all ancestors to update guid sum. + */ + for (; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum -= cvd->vdev_guid_sum; + + if (cvd->vdev_ops->vdev_op_leaf) + cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; +} + +/* + * Remove any holes in the child array. + */ +void +vdev_compact_children(vdev_t *pvd) +{ + vdev_t **newchild, *cvd; + int oldc = pvd->vdev_children; + int newc, c; + + ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER)); + + for (c = newc = 0; c < oldc; c++) + if (pvd->vdev_child[c]) + newc++; + + newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); + + for (c = newc = 0; c < oldc; c++) { + if ((cvd = pvd->vdev_child[c]) != NULL) { + newchild[newc] = cvd; + cvd->vdev_id = newc++; + } + } + + kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); + pvd->vdev_child = newchild; + pvd->vdev_children = newc; +} + +/* + * Allocate and minimally initialize a vdev_t. + */ +static vdev_t * +vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) +{ + vdev_t *vd; + + vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); + + if (spa->spa_root_vdev == NULL) { + ASSERT(ops == &vdev_root_ops); + spa->spa_root_vdev = vd; + } + + if (guid == 0) { + if (spa->spa_root_vdev == vd) { + /* + * The root vdev's guid will also be the pool guid, + * which must be unique among all pools. + */ + while (guid == 0 || spa_guid_exists(guid, 0)) + guid = spa_get_random(-1ULL); + } else { + /* + * Any other vdev's guid must be unique within the pool. + */ + while (guid == 0 || + spa_guid_exists(spa_guid(spa), guid)) + guid = spa_get_random(-1ULL); + } + ASSERT(!spa_guid_exists(spa_guid(spa), guid)); + } + + vd->vdev_spa = spa; + vd->vdev_id = id; + vd->vdev_guid = guid; + vd->vdev_guid_sum = guid; + vd->vdev_ops = ops; + vd->vdev_state = VDEV_STATE_CLOSED; + + mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); + space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); + space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); + txg_list_create(&vd->vdev_ms_list, + offsetof(struct metaslab, ms_txg_node)); + txg_list_create(&vd->vdev_dtl_list, + offsetof(struct vdev, vdev_dtl_node)); + vd->vdev_stat.vs_timestamp = gethrtime(); + + return (vd); +} + +/* + * Free a vdev_t that has been removed from service. + */ +static void +vdev_free_common(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + if (vd->vdev_path) + spa_strfree(vd->vdev_path); + if (vd->vdev_devid) + spa_strfree(vd->vdev_devid); + + if (vd->vdev_isspare) + spa_spare_remove(vd); + + txg_list_destroy(&vd->vdev_ms_list); + txg_list_destroy(&vd->vdev_dtl_list); + mutex_enter(&vd->vdev_dtl_lock); + space_map_unload(&vd->vdev_dtl_map); + space_map_destroy(&vd->vdev_dtl_map); + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + space_map_destroy(&vd->vdev_dtl_scrub); + mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_stat_lock); + + if (vd == spa->spa_root_vdev) + spa->spa_root_vdev = NULL; + + kmem_free(vd, sizeof (vdev_t)); +} + +/* + * Allocate a new vdev. The 'alloctype' is used to control whether we are + * creating a new vdev or loading an existing one - the behavior is slightly + * different for each case. + */ +int +vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, + int alloctype) +{ + vdev_ops_t *ops; + char *type; + uint64_t guid = 0; + vdev_t *vd; + + ASSERT(spa_config_held(spa, RW_WRITER)); + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (EINVAL); + + if ((ops = vdev_getops(type)) == NULL) + return (EINVAL); + + /* + * If this is a load, get the vdev guid from the nvlist. + * Otherwise, vdev_alloc_common() will generate one for us. + */ + if (alloctype == VDEV_ALLOC_LOAD) { + uint64_t label_id; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || + label_id != id) + return (EINVAL); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (EINVAL); + } else if (alloctype == VDEV_ALLOC_SPARE) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (EINVAL); + } + + /* + * The first allocated vdev must be of type 'root'. + */ + if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) + return (EINVAL); + + vd = vdev_alloc_common(spa, id, guid, ops); + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) + vd->vdev_path = spa_strdup(vd->vdev_path); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) + vd->vdev_devid = spa_strdup(vd->vdev_devid); + + /* + * Set the nparity propery for RAID-Z vdevs. + */ + if (ops == &vdev_raidz_ops) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, + &vd->vdev_nparity) == 0) { + /* + * Currently, we can only support 2 parity devices. + */ + if (vd->vdev_nparity > 2) + return (EINVAL); + /* + * Older versions can only support 1 parity device. + */ + if (vd->vdev_nparity == 2 && + spa_version(spa) < ZFS_VERSION_RAID6) + return (ENOTSUP); + + } else { + /* + * We require the parity to be specified for SPAs that + * support multiple parity levels. + */ + if (spa_version(spa) >= ZFS_VERSION_RAID6) + return (EINVAL); + + /* + * Otherwise, we default to 1 parity device for RAID-Z. + */ + vd->vdev_nparity = 1; + } + } else { + vd->vdev_nparity = 0; + } + + /* + * Set the whole_disk property. If it's not specified, leave the value + * as -1. + */ + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + &vd->vdev_wholedisk) != 0) + vd->vdev_wholedisk = -1ULL; + + /* + * Look for the 'not present' flag. This will only be set if the device + * was not present at the time of import. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &vd->vdev_not_present); + + /* + * Get the alignment requirement. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); + + /* + * If we're a top-level vdev, try to load the allocation parameters. + */ + if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, + &vd->vdev_ms_array); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, + &vd->vdev_ms_shift); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, + &vd->vdev_asize); + } + + /* + * If we're a leaf vdev, try to load the DTL object and offline state. + */ + if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, + &vd->vdev_dtl.smo_object); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, + &vd->vdev_offline); + } + + /* + * Add ourselves to the parent's list of children. + */ + vdev_add_child(parent, vd); + + *vdp = vd; + + return (0); +} + +void +vdev_free(vdev_t *vd) +{ + int c; + + /* + * vdev_free() implies closing the vdev first. This is simpler than + * trying to ensure complicated semantics for all callers. + */ + vdev_close(vd); + + ASSERT(!list_link_active(&vd->vdev_dirty_node)); + + /* + * Free all children. + */ + for (c = 0; c < vd->vdev_children; c++) + vdev_free(vd->vdev_child[c]); + + ASSERT(vd->vdev_child == NULL); + ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + + /* + * Discard allocation state. + */ + if (vd == vd->vdev_top) + vdev_metaslab_fini(vd); + + ASSERT3U(vd->vdev_stat.vs_space, ==, 0); + ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); + ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); + + /* + * Remove this vdev from its parent's child list. + */ + vdev_remove_child(vd->vdev_parent, vd); + + ASSERT(vd->vdev_parent == NULL); + + vdev_free_common(vd); +} + +/* + * Transfer top-level vdev state from svd to tvd. + */ +static void +vdev_top_transfer(vdev_t *svd, vdev_t *tvd) +{ + spa_t *spa = svd->vdev_spa; + metaslab_t *msp; + vdev_t *vd; + int t; + + ASSERT(tvd == tvd->vdev_top); + + tvd->vdev_ms_array = svd->vdev_ms_array; + tvd->vdev_ms_shift = svd->vdev_ms_shift; + tvd->vdev_ms_count = svd->vdev_ms_count; + + svd->vdev_ms_array = 0; + svd->vdev_ms_shift = 0; + svd->vdev_ms_count = 0; + + tvd->vdev_mg = svd->vdev_mg; + tvd->vdev_ms = svd->vdev_ms; + + svd->vdev_mg = NULL; + svd->vdev_ms = NULL; + + if (tvd->vdev_mg != NULL) + tvd->vdev_mg->mg_vd = tvd; + + tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; + tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; + tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; + + svd->vdev_stat.vs_alloc = 0; + svd->vdev_stat.vs_space = 0; + svd->vdev_stat.vs_dspace = 0; + + for (t = 0; t < TXG_SIZE; t++) { + while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) + (void) txg_list_add(&tvd->vdev_ms_list, msp, t); + while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) + (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); + if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) + (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); + } + + if (list_link_active(&svd->vdev_dirty_node)) { + vdev_config_clean(svd); + vdev_config_dirty(tvd); + } + + tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted; + svd->vdev_reopen_wanted = 0; + + tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; + svd->vdev_deflate_ratio = 0; +} + +static void +vdev_top_update(vdev_t *tvd, vdev_t *vd) +{ + int c; + + if (vd == NULL) + return; + + vd->vdev_top = tvd; + + for (c = 0; c < vd->vdev_children; c++) + vdev_top_update(tvd, vd->vdev_child[c]); +} + +/* + * Add a mirror/replacing vdev above an existing vdev. + */ +vdev_t * +vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) +{ + spa_t *spa = cvd->vdev_spa; + vdev_t *pvd = cvd->vdev_parent; + vdev_t *mvd; + + ASSERT(spa_config_held(spa, RW_WRITER)); + + mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); + + mvd->vdev_asize = cvd->vdev_asize; + mvd->vdev_ashift = cvd->vdev_ashift; + mvd->vdev_state = cvd->vdev_state; + + vdev_remove_child(pvd, cvd); + vdev_add_child(pvd, mvd); + cvd->vdev_id = mvd->vdev_children; + vdev_add_child(mvd, cvd); + vdev_top_update(cvd->vdev_top, cvd->vdev_top); + + if (mvd == mvd->vdev_top) + vdev_top_transfer(cvd, mvd); + + return (mvd); +} + +/* + * Remove a 1-way mirror/replacing vdev from the tree. + */ +void +vdev_remove_parent(vdev_t *cvd) +{ + vdev_t *mvd = cvd->vdev_parent; + vdev_t *pvd = mvd->vdev_parent; + + ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); + + ASSERT(mvd->vdev_children == 1); + ASSERT(mvd->vdev_ops == &vdev_mirror_ops || + mvd->vdev_ops == &vdev_replacing_ops || + mvd->vdev_ops == &vdev_spare_ops); + cvd->vdev_ashift = mvd->vdev_ashift; + + vdev_remove_child(mvd, cvd); + vdev_remove_child(pvd, mvd); + cvd->vdev_id = mvd->vdev_id; + vdev_add_child(pvd, cvd); + /* + * If we created a new toplevel vdev, then we need to change the child's + * vdev GUID to match the old toplevel vdev. Otherwise, we could have + * detached an offline device, and when we go to import the pool we'll + * think we have two toplevel vdevs, instead of a different version of + * the same toplevel vdev. + */ + if (cvd->vdev_top == cvd) { + pvd->vdev_guid_sum -= cvd->vdev_guid; + cvd->vdev_guid_sum -= cvd->vdev_guid; + cvd->vdev_guid = mvd->vdev_guid; + cvd->vdev_guid_sum += mvd->vdev_guid; + pvd->vdev_guid_sum += cvd->vdev_guid; + } + vdev_top_update(cvd->vdev_top, cvd->vdev_top); + + if (cvd == cvd->vdev_top) + vdev_top_transfer(mvd, cvd); + + ASSERT(mvd->vdev_children == 0); + vdev_free(mvd); +} + +int +vdev_metaslab_init(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + metaslab_class_t *mc = spa_metaslab_class_select(spa); + uint64_t m; + uint64_t oldc = vd->vdev_ms_count; + uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; + metaslab_t **mspp; + int error; + + if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ + return (0); + + dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc); + + ASSERT(oldc <= newc); + + if (vd->vdev_mg == NULL) + vd->vdev_mg = metaslab_group_create(mc, vd); + + mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); + + if (oldc != 0) { + bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); + kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); + } + + vd->vdev_ms = mspp; + vd->vdev_ms_count = newc; + + for (m = oldc; m < newc; m++) { + space_map_obj_t smo = { 0, 0, 0 }; + if (txg == 0) { + uint64_t object = 0; + error = dmu_read(mos, vd->vdev_ms_array, + m * sizeof (uint64_t), sizeof (uint64_t), &object); + if (error) + return (error); + if (object != 0) { + dmu_buf_t *db; + error = dmu_bonus_hold(mos, object, FTAG, &db); + if (error) + return (error); + ASSERT3U(db->db_size, ==, sizeof (smo)); + bcopy(db->db_data, &smo, db->db_size); + ASSERT3U(smo.smo_object, ==, object); + dmu_buf_rele(db, FTAG); + } + } + vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo, + m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); + } + + return (0); +} + +void +vdev_metaslab_fini(vdev_t *vd) +{ + uint64_t m; + uint64_t count = vd->vdev_ms_count; + + if (vd->vdev_ms != NULL) { + for (m = 0; m < count; m++) + if (vd->vdev_ms[m] != NULL) + metaslab_fini(vd->vdev_ms[m]); + kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); + vd->vdev_ms = NULL; + } +} + +/* + * Prepare a virtual device for access. + */ +int +vdev_open(vdev_t *vd) +{ + int error; + int c; + uint64_t osize = 0; + uint64_t asize, psize; + uint64_t ashift = 0; + + ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || + vd->vdev_state == VDEV_STATE_CANT_OPEN || + vd->vdev_state == VDEV_STATE_OFFLINE); + + if (vd->vdev_fault_mode == VDEV_FAULT_COUNT) + vd->vdev_fault_arg >>= 1; + else + vd->vdev_fault_mode = VDEV_FAULT_NONE; + + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + + if (vd->vdev_ops->vdev_op_leaf) { + vdev_cache_init(vd); + vdev_queue_init(vd); + vd->vdev_cache_active = B_TRUE; + } + + if (vd->vdev_offline) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); + return (ENXIO); + } + + error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); + + if (zio_injection_enabled && error == 0) + error = zio_handle_device_injection(vd, ENXIO); + + dprintf("%s = %d, osize %llu, state = %d\n", + vdev_description(vd), error, osize, vd->vdev_state); + + if (error) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + vd->vdev_stat.vs_aux); + return (error); + } + + vd->vdev_state = VDEV_STATE_HEALTHY; + + for (c = 0; c < vd->vdev_children; c++) + if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_NONE); + break; + } + + osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); + + if (vd->vdev_children == 0) { + if (osize < SPA_MINDEVSIZE) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_TOO_SMALL); + return (EOVERFLOW); + } + psize = osize; + asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); + } else { + if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - + (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_TOO_SMALL); + return (EOVERFLOW); + } + psize = 0; + asize = osize; + } + + vd->vdev_psize = psize; + + if (vd->vdev_asize == 0) { + /* + * This is the first-ever open, so use the computed values. + * For testing purposes, a higher ashift can be requested. + */ + vd->vdev_asize = asize; + vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); + } else { + /* + * Make sure the alignment requirement hasn't increased. + */ + if (ashift > vd->vdev_top->vdev_ashift) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (EINVAL); + } + + /* + * Make sure the device hasn't shrunk. + */ + if (asize < vd->vdev_asize) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (EINVAL); + } + + /* + * If all children are healthy and the asize has increased, + * then we've experienced dynamic LUN growth. + */ + if (vd->vdev_state == VDEV_STATE_HEALTHY && + asize > vd->vdev_asize) { + vd->vdev_asize = asize; + } + } + + /* + * If this is a top-level vdev, compute the raidz-deflation + * ratio. Note, we hard-code in 128k (1<<17) because it is the + * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE + * changes, this algorithm must never change, or we will + * inconsistently account for existing bp's. + */ + if (vd->vdev_top == vd) { + vd->vdev_deflate_ratio = (1<<17) / + (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT); + } + + /* + * This allows the ZFS DE to close cases appropriately. If a device + * goes away and later returns, we want to close the associated case. + * But it's not enough to simply post this only when a device goes from + * CANT_OPEN -> HEALTHY. If we reboot the system and the device is + * back, we also need to close the case (otherwise we will try to replay + * it). So we have to post this notifier every time. Since this only + * occurs during pool open or error recovery, this should not be an + * issue. + */ + zfs_post_ok(vd->vdev_spa, vd); + + return (0); +} + +/* + * Called once the vdevs are all opened, this routine validates the label + * contents. This needs to be done before vdev_load() so that we don't + * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen() + * won't succeed if the device has been changed underneath. + * + * This function will only return failure if one of the vdevs indicates that it + * has since been destroyed or exported. This is only possible if + * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state + * will be updated but the function will return 0. + */ +int +vdev_validate(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + int c; + nvlist_t *label; + uint64_t guid; + uint64_t state; + + for (c = 0; c < vd->vdev_children; c++) + if (vdev_validate(vd->vdev_child[c]) != 0) + return (-1); + + /* + * If the device has already failed, or was marked offline, don't do + * any further validation. Otherwise, label I/O will fail and we will + * overwrite the previous state. + */ + if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) { + + if ((label = vdev_label_read_config(vd)) == NULL) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (0); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, + &guid) != 0 || guid != spa_guid(spa)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + return (0); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, + &guid) != 0 || guid != vd->vdev_guid) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + return (0); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, + &state) != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + return (0); + } + + nvlist_free(label); + + if (spa->spa_load_state == SPA_LOAD_OPEN && + state != POOL_STATE_ACTIVE) + return (-1); + } + + /* + * If we were able to open and validate a vdev that was previously + * marked permanently unavailable, clear that state now. + */ + if (vd->vdev_not_present) + vd->vdev_not_present = 0; + + return (0); +} + +/* + * Close a virtual device. + */ +void +vdev_close(vdev_t *vd) +{ + vd->vdev_ops->vdev_op_close(vd); + + if (vd->vdev_cache_active) { + vdev_cache_fini(vd); + vdev_queue_fini(vd); + vd->vdev_cache_active = B_FALSE; + } + + /* + * We record the previous state before we close it, so that if we are + * doing a reopen(), we don't generate FMA ereports if we notice that + * it's still faulted. + */ + vd->vdev_prevstate = vd->vdev_state; + + if (vd->vdev_offline) + vd->vdev_state = VDEV_STATE_OFFLINE; + else + vd->vdev_state = VDEV_STATE_CLOSED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; +} + +void +vdev_reopen(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, RW_WRITER)); + + vdev_close(vd); + (void) vdev_open(vd); + + /* + * Call vdev_validate() here to make sure we have the same device. + * Otherwise, a device with an invalid label could be successfully + * opened in response to vdev_reopen(). + * + * The downside to this is that if the user is simply experimenting by + * overwriting an entire disk, we'll fault the device rather than + * demonstrate self-healing capabilities. On the other hand, with + * proper FMA integration, the series of errors we'd see from the device + * would result in a faulted device anyway. Given that this doesn't + * model any real-world corruption, it's better to catch this here and + * correctly identify that the device has either changed beneath us, or + * is corrupted beyond recognition. + */ + (void) vdev_validate(vd); + + /* + * Reassess root vdev's health. + */ + vdev_propagate_state(spa->spa_root_vdev); +} + +int +vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) +{ + int error; + + /* + * Normally, partial opens (e.g. of a mirror) are allowed. + * For a create, however, we want to fail the request if + * there are any components we can't open. + */ + error = vdev_open(vd); + + if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { + vdev_close(vd); + return (error ? error : ENXIO); + } + + /* + * Recursively initialize all labels. + */ + if ((error = vdev_label_init(vd, txg, isreplacing ? + VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { + vdev_close(vd); + return (error); + } + + return (0); +} + +/* + * The is the latter half of vdev_create(). It is distinct because it + * involves initiating transactions in order to do metaslab creation. + * For creation, we want to try to create all vdevs at once and then undo it + * if anything fails; this is much harder if we have pending transactions. + */ +void +vdev_init(vdev_t *vd, uint64_t txg) +{ + /* + * Aim for roughly 200 metaslabs per vdev. + */ + vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); + vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); + + /* + * Initialize the vdev's metaslabs. This can't fail because + * there's nothing to read when creating all new metaslabs. + */ + VERIFY(vdev_metaslab_init(vd, txg) == 0); +} + +void +vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) +{ + ASSERT(vd == vd->vdev_top); + ASSERT(ISP2(flags)); + + if (flags & VDD_METASLAB) + (void) txg_list_add(&vd->vdev_ms_list, arg, txg); + + if (flags & VDD_DTL) + (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); + + (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); +} + +void +vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size) +{ + mutex_enter(sm->sm_lock); + if (!space_map_contains(sm, txg, size)) + space_map_add(sm, txg, size); + mutex_exit(sm->sm_lock); +} + +int +vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) +{ + int dirty; + + /* + * Quick test without the lock -- covers the common case that + * there are no dirty time segments. + */ + if (sm->sm_space == 0) + return (0); + + mutex_enter(sm->sm_lock); + dirty = space_map_contains(sm, txg, size); + mutex_exit(sm->sm_lock); + + return (dirty); +} + +/* + * Reassess DTLs after a config change or scrub completion. + */ +void +vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) +{ + spa_t *spa = vd->vdev_spa; + int c; + + ASSERT(spa_config_held(spa, RW_WRITER)); + + if (vd->vdev_children == 0) { + mutex_enter(&vd->vdev_dtl_lock); + /* + * We're successfully scrubbed everything up to scrub_txg. + * Therefore, excise all old DTLs up to that point, then + * fold in the DTLs for everything we couldn't scrub. + */ + if (scrub_txg != 0) { + space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); + space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); + } + if (scrub_done) + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + mutex_exit(&vd->vdev_dtl_lock); + if (txg != 0) + vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); + return; + } + + /* + * Make sure the DTLs are always correct under the scrub lock. + */ + if (vd == spa->spa_root_vdev) + mutex_enter(&spa->spa_scrub_lock); + + mutex_enter(&vd->vdev_dtl_lock); + space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + mutex_exit(&vd->vdev_dtl_lock); + + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done); + mutex_enter(&vd->vdev_dtl_lock); + space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map); + space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); + mutex_exit(&vd->vdev_dtl_lock); + } + + if (vd == spa->spa_root_vdev) + mutex_exit(&spa->spa_scrub_lock); +} + +static int +vdev_dtl_load(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + space_map_obj_t *smo = &vd->vdev_dtl; + objset_t *mos = spa->spa_meta_objset; + dmu_buf_t *db; + int error; + + ASSERT(vd->vdev_children == 0); + + if (smo->smo_object == 0) + return (0); + + if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) + return (error); + + ASSERT3U(db->db_size, ==, sizeof (*smo)); + bcopy(db->db_data, smo, db->db_size); + dmu_buf_rele(db, FTAG); + + mutex_enter(&vd->vdev_dtl_lock); + error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos); + mutex_exit(&vd->vdev_dtl_lock); + + return (error); +} + +void +vdev_dtl_sync(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + space_map_obj_t *smo = &vd->vdev_dtl; + space_map_t *sm = &vd->vdev_dtl_map; + objset_t *mos = spa->spa_meta_objset; + space_map_t smsync; + kmutex_t smlock; + dmu_buf_t *db; + dmu_tx_t *tx; + + dprintf("%s in txg %llu pass %d\n", + vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + if (vd->vdev_detached) { + if (smo->smo_object != 0) { + int err = dmu_object_free(mos, smo->smo_object, tx); + ASSERT3U(err, ==, 0); + smo->smo_object = 0; + } + dmu_tx_commit(tx); + dprintf("detach %s committed in txg %llu\n", + vdev_description(vd), txg); + return; + } + + if (smo->smo_object == 0) { + ASSERT(smo->smo_objsize == 0); + ASSERT(smo->smo_alloc == 0); + smo->smo_object = dmu_object_alloc(mos, + DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, + DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); + ASSERT(smo->smo_object != 0); + vdev_config_dirty(vd->vdev_top); + } + + bzero(&smlock, sizeof(smlock)); + mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); + + space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift, + &smlock); + + mutex_enter(&smlock); + + mutex_enter(&vd->vdev_dtl_lock); + space_map_walk(sm, space_map_add, &smsync); + mutex_exit(&vd->vdev_dtl_lock); + + space_map_truncate(smo, mos, tx); + space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); + + space_map_destroy(&smsync); + + mutex_exit(&smlock); + mutex_destroy(&smlock); + + VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + ASSERT3U(db->db_size, ==, sizeof (*smo)); + bcopy(smo, db->db_data, db->db_size); + dmu_buf_rele(db, FTAG); + + dmu_tx_commit(tx); +} + +void +vdev_load(vdev_t *vd) +{ + int c; + + /* + * Recursively load all children. + */ + for (c = 0; c < vd->vdev_children; c++) + vdev_load(vd->vdev_child[c]); + + /* + * If this is a top-level vdev, initialize its metaslabs. + */ + if (vd == vd->vdev_top && + (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || + vdev_metaslab_init(vd, 0) != 0)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + + /* + * If this is a leaf vdev, load its DTL. + */ + if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); +} + +/* + * This special case of vdev_spare() is used for hot spares. It's sole purpose + * it to set the vdev state for the associated vdev. To do this, we make sure + * that we can open the underlying device, then try to read the label, and make + * sure that the label is sane and that it hasn't been repurposed to another + * pool. + */ +int +vdev_validate_spare(vdev_t *vd) +{ + nvlist_t *label; + uint64_t guid, version; + uint64_t state; + + if ((label = vdev_label_read_config(vd)) == NULL) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + return (-1); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || + version > ZFS_VERSION || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || + guid != vd->vdev_guid || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + return (-1); + } + + spa_spare_add(vd); + + /* + * We don't actually check the pool state here. If it's in fact in + * use by another pool, we update this fact on the fly when requested. + */ + nvlist_free(label); + return (0); +} + +void +vdev_sync_done(vdev_t *vd, uint64_t txg) +{ + metaslab_t *msp; + + dprintf("%s txg %llu\n", vdev_description(vd), txg); + + while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) + metaslab_sync_done(msp, txg); +} + +void +vdev_sync(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *lvd; + metaslab_t *msp; + dmu_tx_t *tx; + + dprintf("%s txg %llu pass %d\n", + vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); + + if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { + ASSERT(vd == vd->vdev_top); + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); + ASSERT(vd->vdev_ms_array != 0); + vdev_config_dirty(vd); + dmu_tx_commit(tx); + } + + while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { + metaslab_sync(msp, txg); + (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); + } + + while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) + vdev_dtl_sync(lvd, txg); + + (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); +} + +uint64_t +vdev_psize_to_asize(vdev_t *vd, uint64_t psize) +{ + return (vd->vdev_ops->vdev_op_asize(vd, psize)); +} + +void +vdev_io_start(zio_t *zio) +{ + zio->io_vd->vdev_ops->vdev_op_io_start(zio); +} + +void +vdev_io_done(zio_t *zio) +{ + zio->io_vd->vdev_ops->vdev_op_io_done(zio); +} + +const char * +vdev_description(vdev_t *vd) +{ + if (vd == NULL || vd->vdev_ops == NULL) + return ("<unknown>"); + + if (vd->vdev_path != NULL) + return (vd->vdev_path); + + if (vd->vdev_parent == NULL) + return (spa_name(vd->vdev_spa)); + + return (vd->vdev_ops->vdev_op_type); +} + +int +vdev_online(spa_t *spa, uint64_t guid) +{ + vdev_t *rvd, *vd; + uint64_t txg; + + txg = spa_vdev_enter(spa); + + rvd = spa->spa_root_vdev; + + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + dprintf("ONLINE: %s\n", vdev_description(vd)); + + vd->vdev_offline = B_FALSE; + vd->vdev_tmpoffline = B_FALSE; + vdev_reopen(vd->vdev_top); + + vdev_config_dirty(vd->vdev_top); + + (void) spa_vdev_exit(spa, NULL, txg, 0); + + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + + return (0); +} + +int +vdev_offline(spa_t *spa, uint64_t guid, int istmp) +{ + vdev_t *rvd, *vd; + uint64_t txg; + + txg = spa_vdev_enter(spa); + + rvd = spa->spa_root_vdev; + + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + dprintf("OFFLINE: %s\n", vdev_description(vd)); + + /* + * If the device isn't already offline, try to offline it. + */ + if (!vd->vdev_offline) { + /* + * If this device's top-level vdev has a non-empty DTL, + * don't allow the device to be offlined. + * + * XXX -- make this more precise by allowing the offline + * as long as the remaining devices don't have any DTL holes. + */ + if (vd->vdev_top->vdev_dtl_map.sm_space != 0) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + + /* + * Offline this device and reopen its top-level vdev. + * If this action results in the top-level vdev becoming + * unusable, undo it and fail the request. + */ + vd->vdev_offline = B_TRUE; + vdev_reopen(vd->vdev_top); + if (vdev_is_dead(vd->vdev_top)) { + vd->vdev_offline = B_FALSE; + vdev_reopen(vd->vdev_top); + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + } + } + + vd->vdev_tmpoffline = istmp; + + vdev_config_dirty(vd->vdev_top); + + return (spa_vdev_exit(spa, NULL, txg, 0)); +} + +/* + * Clear the error counts associated with this vdev. Unlike vdev_online() and + * vdev_offline(), we assume the spa config is locked. We also clear all + * children. If 'vd' is NULL, then the user wants to clear all vdevs. + */ +void +vdev_clear(spa_t *spa, vdev_t *vd) +{ + int c; + + if (vd == NULL) + vd = spa->spa_root_vdev; + + vd->vdev_stat.vs_read_errors = 0; + vd->vdev_stat.vs_write_errors = 0; + vd->vdev_stat.vs_checksum_errors = 0; + + for (c = 0; c < vd->vdev_children; c++) + vdev_clear(spa, vd->vdev_child[c]); +} + +int +vdev_is_dead(vdev_t *vd) +{ + return (vd->vdev_state <= VDEV_STATE_CANT_OPEN); +} + +int +vdev_error_inject(vdev_t *vd, zio_t *zio) +{ + int error = 0; + + if (vd->vdev_fault_mode == VDEV_FAULT_NONE) + return (0); + + if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0) + return (0); + + switch (vd->vdev_fault_mode) { + case VDEV_FAULT_RANDOM: + if (spa_get_random(vd->vdev_fault_arg) == 0) + error = EIO; + break; + + case VDEV_FAULT_COUNT: + if ((int64_t)--vd->vdev_fault_arg <= 0) + vd->vdev_fault_mode = VDEV_FAULT_NONE; + error = EIO; + break; + } + + if (error != 0) { + dprintf("returning %d for type %d on %s state %d offset %llx\n", + error, zio->io_type, vdev_description(vd), + vd->vdev_state, zio->io_offset); + } + + return (error); +} + +/* + * Get statistics for the given vdev. + */ +void +vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) +{ + vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + int c, t; + + mutex_enter(&vd->vdev_stat_lock); + bcopy(&vd->vdev_stat, vs, sizeof (*vs)); + vs->vs_timestamp = gethrtime() - vs->vs_timestamp; + vs->vs_state = vd->vdev_state; + vs->vs_rsize = vdev_get_rsize(vd); + mutex_exit(&vd->vdev_stat_lock); + + /* + * If we're getting stats on the root vdev, aggregate the I/O counts + * over all top-level vdevs (i.e. the direct children of the root). + */ + if (vd == rvd) { + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *cvd = rvd->vdev_child[c]; + vdev_stat_t *cvs = &cvd->vdev_stat; + + mutex_enter(&vd->vdev_stat_lock); + for (t = 0; t < ZIO_TYPES; t++) { + vs->vs_ops[t] += cvs->vs_ops[t]; + vs->vs_bytes[t] += cvs->vs_bytes[t]; + } + vs->vs_read_errors += cvs->vs_read_errors; + vs->vs_write_errors += cvs->vs_write_errors; + vs->vs_checksum_errors += cvs->vs_checksum_errors; + vs->vs_scrub_examined += cvs->vs_scrub_examined; + vs->vs_scrub_errors += cvs->vs_scrub_errors; + mutex_exit(&vd->vdev_stat_lock); + } + } +} + +void +vdev_stat_update(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *pvd; + uint64_t txg = zio->io_txg; + vdev_stat_t *vs = &vd->vdev_stat; + zio_type_t type = zio->io_type; + int flags = zio->io_flags; + + if (zio->io_error == 0) { + if (!(flags & ZIO_FLAG_IO_BYPASS)) { + mutex_enter(&vd->vdev_stat_lock); + vs->vs_ops[type]++; + vs->vs_bytes[type] += zio->io_size; + mutex_exit(&vd->vdev_stat_lock); + } + if ((flags & ZIO_FLAG_IO_REPAIR) && + zio->io_delegate_list == NULL) { + mutex_enter(&vd->vdev_stat_lock); + if (flags & ZIO_FLAG_SCRUB_THREAD) + vs->vs_scrub_repaired += zio->io_size; + else + vs->vs_self_healed += zio->io_size; + mutex_exit(&vd->vdev_stat_lock); + } + return; + } + + if (flags & ZIO_FLAG_SPECULATIVE) + return; + + if (!vdev_is_dead(vd)) { + mutex_enter(&vd->vdev_stat_lock); + if (type == ZIO_TYPE_READ) { + if (zio->io_error == ECKSUM) + vs->vs_checksum_errors++; + else + vs->vs_read_errors++; + } + if (type == ZIO_TYPE_WRITE) + vs->vs_write_errors++; + mutex_exit(&vd->vdev_stat_lock); + } + + if (type == ZIO_TYPE_WRITE) { + if (txg == 0 || vd->vdev_children != 0) + return; + if (flags & ZIO_FLAG_SCRUB_THREAD) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); + } + if (!(flags & ZIO_FLAG_IO_REPAIR)) { + if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1)) + return; + vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); + for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1); + } + } +} + +void +vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) +{ + int c; + vdev_stat_t *vs = &vd->vdev_stat; + + for (c = 0; c < vd->vdev_children; c++) + vdev_scrub_stat_update(vd->vdev_child[c], type, complete); + + mutex_enter(&vd->vdev_stat_lock); + + if (type == POOL_SCRUB_NONE) { + /* + * Update completion and end time. Leave everything else alone + * so we can report what happened during the previous scrub. + */ + vs->vs_scrub_complete = complete; + vs->vs_scrub_end = gethrestime_sec(); + } else { + vs->vs_scrub_type = type; + vs->vs_scrub_complete = 0; + vs->vs_scrub_examined = 0; + vs->vs_scrub_repaired = 0; + vs->vs_scrub_errors = 0; + vs->vs_scrub_start = gethrestime_sec(); + vs->vs_scrub_end = 0; + } + + mutex_exit(&vd->vdev_stat_lock); +} + +/* + * Update the in-core space usage stats for this vdev and the root vdev. + */ +void +vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta) +{ + ASSERT(vd == vd->vdev_top); + int64_t dspace_delta = space_delta; + + do { + if (vd->vdev_ms_count) { + /* + * If this is a top-level vdev, apply the + * inverse of its psize-to-asize (ie. RAID-Z) + * space-expansion factor. We must calculate + * this here and not at the root vdev because + * the root vdev's psize-to-asize is simply the + * max of its childrens', thus not accurate + * enough for us. + */ + ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); + dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * + vd->vdev_deflate_ratio; + } + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_space += space_delta; + vd->vdev_stat.vs_alloc += alloc_delta; + vd->vdev_stat.vs_dspace += dspace_delta; + mutex_exit(&vd->vdev_stat_lock); + } while ((vd = vd->vdev_parent) != NULL); +} + +/* + * Mark a top-level vdev's config as dirty, placing it on the dirty list + * so that it will be written out next time the vdev configuration is synced. + * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. + */ +void +vdev_config_dirty(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + int c; + + /* + * The dirty list is protected by the config lock. The caller must + * either hold the config lock as writer, or must be the sync thread + * (which holds the lock as reader). There's only one sync thread, + * so this is sufficient to ensure mutual exclusion. + */ + ASSERT(spa_config_held(spa, RW_WRITER) || + dsl_pool_sync_context(spa_get_dsl(spa))); + + if (vd == rvd) { + for (c = 0; c < rvd->vdev_children; c++) + vdev_config_dirty(rvd->vdev_child[c]); + } else { + ASSERT(vd == vd->vdev_top); + + if (!list_link_active(&vd->vdev_dirty_node)) + list_insert_head(&spa->spa_dirty_list, vd); + } +} + +void +vdev_config_clean(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, RW_WRITER) || + dsl_pool_sync_context(spa_get_dsl(spa))); + + ASSERT(list_link_active(&vd->vdev_dirty_node)); + list_remove(&spa->spa_dirty_list, vd); +} + +void +vdev_propagate_state(vdev_t *vd) +{ + vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + int degraded = 0, faulted = 0; + int corrupted = 0; + int c; + vdev_t *child; + + for (c = 0; c < vd->vdev_children; c++) { + child = vd->vdev_child[c]; + if (child->vdev_state <= VDEV_STATE_CANT_OPEN) + faulted++; + else if (child->vdev_state == VDEV_STATE_DEGRADED) + degraded++; + + if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) + corrupted++; + } + + vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + + /* + * Root special: if there is a toplevel vdev that cannot be + * opened due to corrupted metadata, then propagate the root + * vdev's aux state as 'corrupt' rather than 'insufficient + * replicas'. + */ + if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) + vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); +} + +/* + * Set a vdev's state. If this is during an open, we don't update the parent + * state, because we're in the process of opening children depth-first. + * Otherwise, we propagate the change to the parent. + * + * If this routine places a device in a faulted state, an appropriate ereport is + * generated. + */ +void +vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) +{ + uint64_t save_state; + + if (state == vd->vdev_state) { + vd->vdev_stat.vs_aux = aux; + return; + } + + save_state = vd->vdev_state; + + vd->vdev_state = state; + vd->vdev_stat.vs_aux = aux; + + if (state == VDEV_STATE_CANT_OPEN) { + /* + * If we fail to open a vdev during an import, we mark it as + * "not available", which signifies that it was never there to + * begin with. Failure to open such a device is not considered + * an error. + */ + if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT && + vd->vdev_ops->vdev_op_leaf) + vd->vdev_not_present = 1; + + /* + * Post the appropriate ereport. If the 'prevstate' field is + * set to something other than VDEV_STATE_UNKNOWN, it indicates + * that this is part of a vdev_reopen(). In this case, we don't + * want to post the ereport if the device was already in the + * CANT_OPEN state beforehand. + */ + if (vd->vdev_prevstate != state && !vd->vdev_not_present && + vd != vd->vdev_spa->spa_root_vdev) { + const char *class; + + switch (aux) { + case VDEV_AUX_OPEN_FAILED: + class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; + break; + case VDEV_AUX_CORRUPT_DATA: + class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; + break; + case VDEV_AUX_NO_REPLICAS: + class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; + break; + case VDEV_AUX_BAD_GUID_SUM: + class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; + break; + case VDEV_AUX_TOO_SMALL: + class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; + break; + case VDEV_AUX_BAD_LABEL: + class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; + break; + default: + class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; + } + + zfs_ereport_post(class, vd->vdev_spa, + vd, NULL, save_state, 0); + } + } + + if (isopen) + return; + + if (vd->vdev_parent != NULL) + vdev_propagate_state(vd->vdev_parent); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c new file mode 100644 index 0000000..b4fb960 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c @@ -0,0 +1,394 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> + +/* + * Virtual device read-ahead caching. + * + * This file implements a simple LRU read-ahead cache. When the DMU reads + * a given block, it will often want other, nearby blocks soon thereafter. + * We take advantage of this by reading a larger disk region and caching + * the result. In the best case, this can turn 256 back-to-back 512-byte + * reads into a single 128k read followed by 255 cache hits; this reduces + * latency dramatically. In the worst case, it can turn an isolated 512-byte + * read into a 128k read, which doesn't affect latency all that much but is + * terribly wasteful of bandwidth. A more intelligent version of the cache + * could keep track of access patterns and not do read-ahead unless it sees + * at least two temporally close I/Os to the same region. It could also + * take advantage of semantic information about the I/O. And it could use + * something faster than an AVL tree; that was chosen solely for convenience. + * + * There are five cache operations: allocate, fill, read, write, evict. + * + * (1) Allocate. This reserves a cache entry for the specified region. + * We separate the allocate and fill operations so that multiple threads + * don't generate I/O for the same cache miss. + * + * (2) Fill. When the I/O for a cache miss completes, the fill routine + * places the data in the previously allocated cache entry. + * + * (3) Read. Read data from the cache. + * + * (4) Write. Update cache contents after write completion. + * + * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry + * if the total cache size exceeds zfs_vdev_cache_size. + */ + +/* + * These tunables are for performance analysis. + */ +/* + * All i/os smaller than zfs_vdev_cache_max will be turned into + * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software + * track buffer. At most zfs_vdev_cache_size bytes will be kept in each + * vdev's vdev_cache. + */ +int zfs_vdev_cache_max = 1<<14; +int zfs_vdev_cache_size = 10ULL << 20; +int zfs_vdev_cache_bshift = 16; + +SYSCTL_DECL(_vfs_zfs_vdev); +SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); +TUNABLE_INT("vfs.zfs.vdev.cache.max", &zfs_vdev_cache_max); +SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN, + &zfs_vdev_cache_max, 0, "Maximum I/O request size that increase read size"); +TUNABLE_INT("vfs.zfs.vdev.cache.size", &zfs_vdev_cache_size); +SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN, + &zfs_vdev_cache_size, 0, "Size of VDEV cache"); + +#define VCBS (1 << zfs_vdev_cache_bshift) + +static int +vdev_cache_offset_compare(const void *a1, const void *a2) +{ + const vdev_cache_entry_t *ve1 = a1; + const vdev_cache_entry_t *ve2 = a2; + + if (ve1->ve_offset < ve2->ve_offset) + return (-1); + if (ve1->ve_offset > ve2->ve_offset) + return (1); + return (0); +} + +static int +vdev_cache_lastused_compare(const void *a1, const void *a2) +{ + const vdev_cache_entry_t *ve1 = a1; + const vdev_cache_entry_t *ve2 = a2; + + if (ve1->ve_lastused < ve2->ve_lastused) + return (-1); + if (ve1->ve_lastused > ve2->ve_lastused) + return (1); + + /* + * Among equally old entries, sort by offset to ensure uniqueness. + */ + return (vdev_cache_offset_compare(a1, a2)); +} + +/* + * Evict the specified entry from the cache. + */ +static void +vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) +{ + ASSERT(MUTEX_HELD(&vc->vc_lock)); + ASSERT(ve->ve_fill_io == NULL); + ASSERT(ve->ve_data != NULL); + + dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n", + vc, ve->ve_offset, ve->ve_lastused, lbolt - ve->ve_lastused, + ve->ve_hits, ve->ve_missed_update); + + avl_remove(&vc->vc_lastused_tree, ve); + avl_remove(&vc->vc_offset_tree, ve); + zio_buf_free(ve->ve_data, VCBS); + kmem_free(ve, sizeof (vdev_cache_entry_t)); +} + +/* + * Allocate an entry in the cache. At the point we don't have the data, + * we're just creating a placeholder so that multiple threads don't all + * go off and read the same blocks. + */ +static vdev_cache_entry_t * +vdev_cache_allocate(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + uint64_t offset = P2ALIGN(zio->io_offset, VCBS); + vdev_cache_entry_t *ve; + + ASSERT(MUTEX_HELD(&vc->vc_lock)); + + if (zfs_vdev_cache_size == 0) + return (NULL); + + /* + * If adding a new entry would exceed the cache size, + * evict the oldest entry (LRU). + */ + if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > + zfs_vdev_cache_size) { + ve = avl_first(&vc->vc_lastused_tree); + if (ve->ve_fill_io != NULL) { + dprintf("can't evict in %p, still filling\n", vc); + return (NULL); + } + ASSERT(ve->ve_hits != 0); + vdev_cache_evict(vc, ve); + } + + ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); + ve->ve_offset = offset; + ve->ve_lastused = lbolt; + ve->ve_data = zio_buf_alloc(VCBS); + + avl_add(&vc->vc_offset_tree, ve); + avl_add(&vc->vc_lastused_tree, ve); + + return (ve); +} + +static void +vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) +{ + uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); + + ASSERT(MUTEX_HELD(&vc->vc_lock)); + ASSERT(ve->ve_fill_io == NULL); + + if (ve->ve_lastused != lbolt) { + avl_remove(&vc->vc_lastused_tree, ve); + ve->ve_lastused = lbolt; + avl_add(&vc->vc_lastused_tree, ve); + } + + ve->ve_hits++; + bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); +} + +/* + * Fill a previously allocated cache entry with data. + */ +static void +vdev_cache_fill(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve = zio->io_private; + zio_t *dio; + + ASSERT(zio->io_size == VCBS); + + /* + * Add data to the cache. + */ + mutex_enter(&vc->vc_lock); + + ASSERT(ve->ve_fill_io == zio); + ASSERT(ve->ve_offset == zio->io_offset); + ASSERT(ve->ve_data == zio->io_data); + + ve->ve_fill_io = NULL; + + /* + * Even if this cache line was invalidated by a missed write update, + * any reads that were queued up before the missed update are still + * valid, so we can satisfy them from this line before we evict it. + */ + for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next) + vdev_cache_hit(vc, ve, dio); + + if (zio->io_error || ve->ve_missed_update) + vdev_cache_evict(vc, ve); + + mutex_exit(&vc->vc_lock); + + while ((dio = zio->io_delegate_list) != NULL) { + zio->io_delegate_list = dio->io_delegate_next; + dio->io_delegate_next = NULL; + dio->io_error = zio->io_error; + zio_next_stage(dio); + } +} + +/* + * Read data from the cache. Returns 0 on cache hit, errno on a miss. + */ +int +vdev_cache_read(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + vdev_cache_entry_t *ve, ve_search; + uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); + uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); + zio_t *fio; + + ASSERT(zio->io_type == ZIO_TYPE_READ); + + if (zio->io_flags & ZIO_FLAG_DONT_CACHE) + return (EINVAL); + + if (zio->io_size > zfs_vdev_cache_max) + return (EOVERFLOW); + + /* + * If the I/O straddles two or more cache blocks, don't cache it. + */ + if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS)) + return (EXDEV); + + ASSERT(cache_phase + zio->io_size <= VCBS); + + mutex_enter(&vc->vc_lock); + + ve_search.ve_offset = cache_offset; + ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); + + if (ve != NULL) { + if (ve->ve_missed_update) { + mutex_exit(&vc->vc_lock); + return (ESTALE); + } + + if ((fio = ve->ve_fill_io) != NULL) { + zio->io_delegate_next = fio->io_delegate_list; + fio->io_delegate_list = zio; + zio_vdev_io_bypass(zio); + mutex_exit(&vc->vc_lock); + return (0); + } + + vdev_cache_hit(vc, ve, zio); + zio_vdev_io_bypass(zio); + + mutex_exit(&vc->vc_lock); + zio_next_stage(zio); + return (0); + } + + ve = vdev_cache_allocate(zio); + + if (ve == NULL) { + mutex_exit(&vc->vc_lock); + return (ENOMEM); + } + + fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset, + ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK, + vdev_cache_fill, ve); + + ve->ve_fill_io = fio; + fio->io_delegate_list = zio; + zio_vdev_io_bypass(zio); + + mutex_exit(&vc->vc_lock); + zio_nowait(fio); + + return (0); +} + +/* + * Update cache contents upon write completion. + */ +void +vdev_cache_write(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + vdev_cache_entry_t *ve, ve_search; + uint64_t io_start = zio->io_offset; + uint64_t io_end = io_start + zio->io_size; + uint64_t min_offset = P2ALIGN(io_start, VCBS); + uint64_t max_offset = P2ROUNDUP(io_end, VCBS); + avl_index_t where; + + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + + mutex_enter(&vc->vc_lock); + + ve_search.ve_offset = min_offset; + ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); + + if (ve == NULL) + ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); + + while (ve != NULL && ve->ve_offset < max_offset) { + uint64_t start = MAX(ve->ve_offset, io_start); + uint64_t end = MIN(ve->ve_offset + VCBS, io_end); + + if (ve->ve_fill_io != NULL) { + ve->ve_missed_update = 1; + } else { + bcopy((char *)zio->io_data + start - io_start, + ve->ve_data + start - ve->ve_offset, end - start); + } + ve = AVL_NEXT(&vc->vc_offset_tree, ve); + } + mutex_exit(&vc->vc_lock); +} + +void +vdev_cache_init(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + + mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, + sizeof (vdev_cache_entry_t), + offsetof(struct vdev_cache_entry, ve_offset_node)); + + avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, + sizeof (vdev_cache_entry_t), + offsetof(struct vdev_cache_entry, ve_lastused_node)); +} + +void +vdev_cache_fini(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve; + + mutex_enter(&vc->vc_lock); + while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) + vdev_cache_evict(vc, ve); + mutex_exit(&vc->vc_lock); + + avl_destroy(&vc->vc_offset_tree); + avl_destroy(&vc->vc_lastused_tree); + + mutex_destroy(&vc->vc_lock); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c new file mode 100644 index 0000000..b965b1c --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c @@ -0,0 +1,363 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_disk.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/sunldi.h> + +/* + * Virtual device vector for disks. + */ + +extern ldi_ident_t zfs_li; + +typedef struct vdev_disk_buf { + buf_t vdb_buf; + zio_t *vdb_io; +} vdev_disk_buf_t; + +static int +vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + vdev_disk_t *dvd; + struct dk_minfo dkm; + int error; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); + + /* + * When opening a disk device, we want to preserve the user's original + * intent. We always want to open the device by the path the user gave + * us, even if it is one of multiple paths to the save device. But we + * also want to be able to survive disks being removed/recabled. + * Therefore the sequence of opening devices is: + * + * 1. Try opening the device by path. For legacy pools without the + * 'whole_disk' property, attempt to fix the path by appending 's0'. + * + * 2. If the devid of the device matches the stored value, return + * success. + * + * 3. Otherwise, the device may have moved. Try opening the device + * by the devid instead. + * + */ + if (vd->vdev_devid != NULL) { + if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, + &dvd->vd_minor) != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + } + + error = EINVAL; /* presume failure */ + + if (vd->vdev_path != NULL) { + ddi_devid_t devid; + + if (vd->vdev_wholedisk == -1ULL) { + size_t len = strlen(vd->vdev_path) + 3; + char *buf = kmem_alloc(len, KM_SLEEP); + ldi_handle_t lh; + + (void) snprintf(buf, len, "%ss0", vd->vdev_path); + + if (ldi_open_by_name(buf, spa_mode, kcred, + &lh, zfs_li) == 0) { + spa_strfree(vd->vdev_path); + vd->vdev_path = buf; + vd->vdev_wholedisk = 1ULL; + (void) ldi_close(lh, spa_mode, kcred); + } else { + kmem_free(buf, len); + } + } + + error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, + &dvd->vd_lh, zfs_li); + + /* + * Compare the devid to the stored value. + */ + if (error == 0 && vd->vdev_devid != NULL && + ldi_get_devid(dvd->vd_lh, &devid) == 0) { + if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { + error = EINVAL; + (void) ldi_close(dvd->vd_lh, spa_mode, kcred); + dvd->vd_lh = NULL; + } + ddi_devid_free(devid); + } + + /* + * If we succeeded in opening the device, but 'vdev_wholedisk' + * is not yet set, then this must be a slice. + */ + if (error == 0 && vd->vdev_wholedisk == -1ULL) + vd->vdev_wholedisk = 0; + } + + /* + * If we were unable to open by path, or the devid check fails, open by + * devid instead. + */ + if (error != 0 && vd->vdev_devid != NULL) + error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, + spa_mode, kcred, &dvd->vd_lh, zfs_li); + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + /* + * Determine the actual size of the device. + */ + if (ldi_get_size(dvd->vd_lh, psize) != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (EINVAL); + } + + /* + * If we own the whole disk, try to enable disk write caching. + * We ignore errors because it's OK if we can't do it. + */ + if (vd->vdev_wholedisk == 1) { + int wce = 1; + (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, + FKIOCTL, kcred, NULL); + } + + /* + * Determine the device's minimum transfer size. + * If the ioctl isn't supported, assume DEV_BSIZE. + */ + if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm, + FKIOCTL, kcred, NULL) != 0) + dkm.dki_lbsize = DEV_BSIZE; + + *ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1; + + /* + * Clear the nowritecache bit, so that on a vdev_reopen() we will + * try again. + */ + vd->vdev_nowritecache = B_FALSE; + + return (0); +} + +static void +vdev_disk_close(vdev_t *vd) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + if (dvd == NULL) + return; + + dprintf("removing disk %s, devid %s\n", + vd->vdev_path ? vd->vdev_path : "<none>", + vd->vdev_devid ? vd->vdev_devid : "<none>"); + + if (dvd->vd_minor != NULL) + ddi_devid_str_free(dvd->vd_minor); + + if (dvd->vd_devid != NULL) + ddi_devid_free(dvd->vd_devid); + + if (dvd->vd_lh != NULL) + (void) ldi_close(dvd->vd_lh, spa_mode, kcred); + + kmem_free(dvd, sizeof (vdev_disk_t)); + vd->vdev_tsd = NULL; +} + +static void +vdev_disk_io_intr(buf_t *bp) +{ + vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp; + zio_t *zio = vdb->vdb_io; + + if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0) + zio->io_error = EIO; + + kmem_free(vdb, sizeof (vdev_disk_buf_t)); + + zio_next_stage_async(zio); +} + +static void +vdev_disk_ioctl_done(void *zio_arg, int error) +{ + zio_t *zio = zio_arg; + + zio->io_error = error; + + zio_next_stage_async(zio); +} + +static void +vdev_disk_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_buf_t *vdb; + buf_t *bp; + int flags, error; + + if (zio->io_type == ZIO_TYPE_IOCTL) { + zio_vdev_io_bypass(zio); + + /* XXPOLICY */ + if (vdev_is_dead(vd)) { + zio->io_error = ENXIO; + zio_next_stage_async(zio); + return; + } + + switch (zio->io_cmd) { + + case DKIOCFLUSHWRITECACHE: + + if (zfs_nocacheflush) + break; + + if (vd->vdev_nowritecache) { + zio->io_error = ENOTSUP; + break; + } + + zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done; + zio->io_dk_callback.dkc_cookie = zio; + + error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, + (uintptr_t)&zio->io_dk_callback, + FKIOCTL, kcred, NULL); + + if (error == 0) { + /* + * The ioctl will be done asychronously, + * and will call vdev_disk_ioctl_done() + * upon completion. + */ + return; + } else if (error == ENOTSUP) { + /* + * If we get ENOTSUP, we know that no future + * attempts will ever succeed. In this case we + * set a persistent bit so that we don't bother + * with the ioctl in the future. + */ + vd->vdev_nowritecache = B_TRUE; + } + zio->io_error = error; + + break; + + default: + zio->io_error = ENOTSUP; + } + + zio_next_stage_async(zio); + return; + } + + if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) + return; + + if ((zio = vdev_queue_io(zio)) == NULL) + return; + + flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); + flags |= B_BUSY | B_NOCACHE; + if (zio->io_flags & ZIO_FLAG_FAILFAST) + flags |= B_FAILFAST; + + vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP); + + vdb->vdb_io = zio; + bp = &vdb->vdb_buf; + + bioinit(bp); + bp->b_flags = flags; + bp->b_bcount = zio->io_size; + bp->b_un.b_addr = zio->io_data; + bp->b_lblkno = lbtodb(zio->io_offset); + bp->b_bufsize = zio->io_size; + bp->b_iodone = (int (*)())vdev_disk_io_intr; + + /* XXPOLICY */ + error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); + if (error) { + zio->io_error = error; + bioerror(bp, error); + bp->b_resid = bp->b_bcount; + bp->b_iodone(bp); + return; + } + + error = ldi_strategy(dvd->vd_lh, bp); + /* ldi_strategy() will return non-zero only on programming errors */ + ASSERT(error == 0); +} + +static void +vdev_disk_io_done(zio_t *zio) +{ + vdev_queue_io_done(zio); + + if (zio->io_type == ZIO_TYPE_WRITE) + vdev_cache_write(zio); + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + + zio_next_stage(zio); +} + +vdev_ops_t vdev_disk_ops = { + vdev_disk_open, + vdev_disk_close, + vdev_default_asize, + vdev_disk_io_start, + vdev_disk_io_done, + NULL, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c new file mode 100644 index 0000000..b8e79f8 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c @@ -0,0 +1,225 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_file.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> + +/* + * Virtual device vector for files. + */ + +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + vdev_file_t *vf; + vnode_t *vp; + vattr_t vattr; + int error; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); + + /* + * We always open the files from the root of the global zone, even if + * we're in a local zone. If the user has gotten to this point, the + * administrator has already decided that the pool should be available + * to local zone users, so the underlying devices should be as well. + */ + ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); + error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX, + 0, &vp, 0, 0, rootdir); + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + vf->vf_vnode = vp; + +#ifdef _KERNEL + /* + * Make sure it's a regular file. + */ + if (vp->v_type != VREG) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (ENODEV); + } +#endif + + /* + * Determine the physical size of the file. + */ + vattr.va_mask = AT_SIZE; + error = VOP_GETATTR(vp, &vattr, 0, kcred); + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + *psize = vattr.va_size; + *ashift = SPA_MINBLOCKSHIFT; + + return (0); +} + +static void +vdev_file_close(vdev_t *vd) +{ + vdev_file_t *vf = vd->vdev_tsd; + + if (vf == NULL) + return; + + if (vf->vf_vnode != NULL) { + (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred); + (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred); + VN_RELE(vf->vf_vnode); + } + + kmem_free(vf, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; +} + +static void +vdev_file_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + ssize_t resid; + int error; + + if (zio->io_type == ZIO_TYPE_IOCTL) { + zio_vdev_io_bypass(zio); + + /* XXPOLICY */ + if (vdev_is_dead(vd)) { + zio->io_error = ENXIO; + zio_next_stage_async(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, + kcred); + dprintf("fsync(%s) = %d\n", vdev_description(vd), + zio->io_error); + break; + default: + zio->io_error = ENOTSUP; + } + + zio_next_stage_async(zio); + return; + } + + /* + * In the kernel, don't bother double-caching, but in userland, + * we want to test the vdev_cache code. + */ +#ifndef _KERNEL + if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) + return; +#endif + + if ((zio = vdev_queue_io(zio)) == NULL) + return; + + /* XXPOLICY */ + error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); + if (error) { + zio->io_error = error; + zio_next_stage_async(zio); + return; + } + + zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? + UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, + zio->io_size, zio->io_offset, UIO_SYSSPACE, + 0, RLIM64_INFINITY, kcred, &resid); + + if (resid != 0 && zio->io_error == 0) + zio->io_error = ENOSPC; + + zio_next_stage_async(zio); +} + +static void +vdev_file_io_done(zio_t *zio) +{ + vdev_queue_io_done(zio); + +#ifndef _KERNEL + if (zio->io_type == ZIO_TYPE_WRITE) + vdev_cache_write(zio); +#endif + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + + zio_next_stage(zio); +} + +vdev_ops_t vdev_file_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + VDEV_TYPE_FILE, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +/* + * From userland we access disks just like files. + */ +#ifndef _KERNEL + +vdev_ops_t vdev_disk_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +#endif diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c new file mode 100644 index 0000000..9699171 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -0,0 +1,432 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/bio.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <geom/geom.h> + +/* + * Virtual device vector for GEOM. + */ + +struct g_class zfs_vdev_class = { + .name = "ZFS::VDEV", + .version = G_VERSION, +}; + +DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); + +typedef struct vdev_geom_ctx { + struct g_consumer *gc_consumer; + int gc_state; + struct bio_queue_head gc_queue; + struct mtx gc_queue_mtx; +} vdev_geom_ctx_t; + +static void +vdev_geom_release(vdev_t *vd) +{ + vdev_geom_ctx_t *ctx; + + ctx = vd->vdev_tsd; + vd->vdev_tsd = NULL; + + mtx_lock(&ctx->gc_queue_mtx); + ctx->gc_state = 1; + wakeup_one(&ctx->gc_queue); + while (ctx->gc_state != 2) + msleep(&ctx->gc_state, &ctx->gc_queue_mtx, 0, "vgeom:w", 0); + mtx_unlock(&ctx->gc_queue_mtx); + mtx_destroy(&ctx->gc_queue_mtx); + kmem_free(ctx, sizeof(*ctx)); +} + +static void +vdev_geom_orphan(struct g_consumer *cp) +{ + struct g_geom *gp; + vdev_t *vd; + int error; + + g_topology_assert(); + + vd = cp->private; + gp = cp->geom; + error = cp->provider->error; + + ZFS_LOG(1, "Closing access to %s.", cp->provider->name); + g_access(cp, -cp->acr, -cp->acw, -cp->ace); + g_detach(cp); + ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name); + g_destroy_consumer(cp); + /* Destroy geom if there are no consumers left. */ + if (LIST_EMPTY(&gp->consumer)) { + ZFS_LOG(1, "Destroyed geom %s.", gp->name); + g_wither_geom(cp->geom, error); + } + vdev_geom_release(vd); + /* Both methods below work, but in a bit different way. */ +#if 0 + vd->vdev_reopen_wanted = 1; +#else + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); +#endif +} + +static struct g_consumer * +vdev_geom_attach(struct g_provider *pp, int write) +{ + struct g_geom *gp; + struct g_consumer *cp; + + g_topology_assert(); + + ZFS_LOG(1, "Attaching to %s.", pp->name); + /* Do we have geom already? No? Create one. */ + LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { + if (!(gp->flags & G_GEOM_WITHER)) + break; + } + if (gp == NULL) { + gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); + gp->orphan = vdev_geom_orphan; + cp = g_new_consumer(gp); + if (g_attach(cp, pp) != 0) { + g_wither_geom(gp, ENXIO); + return (NULL); + } + if (g_access(cp, 1, write, 1) != 0) { + g_wither_geom(gp, ENXIO); + return (NULL); + } + ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); + } else { + /* Check if we are already connected to this provider. */ + LIST_FOREACH(cp, &gp->consumer, consumer) { + if (cp->provider == pp) { + ZFS_LOG(1, "Found consumer for %s.", pp->name); + break; + } + } + if (cp == NULL) { + cp = g_new_consumer(gp); + if (g_attach(cp, pp) != 0) { + g_destroy_consumer(cp); + return (NULL); + } + if (g_access(cp, 1, write, 1) != 0) { + g_detach(cp); + g_destroy_consumer(cp); + return (NULL); + } + ZFS_LOG(1, "Created consumer for %s.", pp->name); + } else { + if (g_access(cp, 1, cp->acw > 0 ? 0 : write, 1) != 0) + return (NULL); + ZFS_LOG(1, "Used existing consumer for %s.", pp->name); + } + } + return (cp); +} + +static void +vdev_geom_detach(void *arg, int flag __unused) +{ + struct g_geom *gp; + struct g_consumer *cp; + + g_topology_assert(); + cp = arg; + gp = cp->geom; + + ZFS_LOG(1, "Closing access to %s.", cp->provider->name); + g_access(cp, -1, 0, -1); + /* Destroy consumer on last close. */ + if (cp->acr == 0 && cp->ace == 0) { + ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name); + if (cp->acw > 0) + g_access(cp, 0, -cp->acw, 0); + g_detach(cp); + g_destroy_consumer(cp); + } + /* Destroy geom if there are no consumers left. */ + if (LIST_EMPTY(&gp->consumer)) { + ZFS_LOG(1, "Destroyed geom %s.", gp->name); + g_wither_geom(gp, ENXIO); + } +} + +static void +vdev_geom_worker(void *arg) +{ + vdev_geom_ctx_t *ctx; + zio_t *zio; + struct bio *bp; + + ctx = arg; + for (;;) { + mtx_lock(&ctx->gc_queue_mtx); + bp = bioq_takefirst(&ctx->gc_queue); + if (bp == NULL) { + if (ctx->gc_state == 1) { + ctx->gc_state = 2; + wakeup_one(&ctx->gc_state); + mtx_unlock(&ctx->gc_queue_mtx); + kthread_exit(0); + } + msleep(&ctx->gc_queue, &ctx->gc_queue_mtx, + PRIBIO | PDROP, "vgeom:io", 0); + continue; + } + mtx_unlock(&ctx->gc_queue_mtx); + zio = bp->bio_caller1; + zio->io_error = bp->bio_error; + if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == ENOTSUP) { + vdev_t *vd; + + /* + * If we get ENOTSUP, we know that no future + * attempts will ever succeed. In this case we + * set a persistent bit so that we don't bother + * with the ioctl in the future. + */ + vd = zio->io_vd; + vd->vdev_nowritecache = B_TRUE; + } + g_destroy_bio(bp); + zio_next_stage_async(zio); + } +} + +static int +vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + vdev_geom_ctx_t *ctx; + struct g_provider *pp; + struct g_consumer *cp; + int owned; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + if ((owned = mtx_owned(&Giant))) + mtx_unlock(&Giant); + g_topology_lock(); + pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1); + if (pp == NULL) { + g_topology_unlock(); + if (owned) + mtx_lock(&Giant); + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + cp = vdev_geom_attach(pp, !!(spa_mode & FWRITE)); + g_topology_unlock(); + if (owned) + mtx_lock(&Giant); + if (cp == NULL) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (EACCES); + } + + /* + * Determine the actual size of the device. + */ + *psize = pp->mediasize; + + /* + * Determine the device's minimum transfer size. + */ + *ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; + + /* + * Clear the nowritecache bit, so that on a vdev_reopen() we will + * try again. + */ + vd->vdev_nowritecache = B_FALSE; + + cp->private = vd; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP); + bioq_init(&ctx->gc_queue); + mtx_init(&ctx->gc_queue_mtx, "zfs:vdev:geom:queue", NULL, MTX_DEF); + ctx->gc_consumer = cp; + ctx->gc_state = 0; + + vd->vdev_tsd = ctx; + + kthread_create(vdev_geom_worker, ctx, NULL, 0, 0, "vdev:worker %s", + pp->name); + + return (0); +} + +static void +vdev_geom_close(vdev_t *vd) +{ + vdev_geom_ctx_t *ctx; + struct g_consumer *cp; + + if ((ctx = vd->vdev_tsd) == NULL) + return; + if ((cp = ctx->gc_consumer) == NULL) + return; + vdev_geom_release(vd); + g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL); +} + +static void +vdev_geom_io_intr(struct bio *bp) +{ + vdev_geom_ctx_t *ctx; + zio_t *zio; + + zio = bp->bio_caller1; + ctx = zio->io_vd->vdev_tsd; + + mtx_lock(&ctx->gc_queue_mtx); + bioq_insert_tail(&ctx->gc_queue, bp); + wakeup_one(&ctx->gc_queue); + mtx_unlock(&ctx->gc_queue_mtx); +} + +static void +vdev_geom_io_start(zio_t *zio) +{ + vdev_t *vd; + vdev_geom_ctx_t *ctx; + struct g_consumer *cp; + struct bio *bp; + int error; + + cp = NULL; + + vd = zio->io_vd; + ctx = vd->vdev_tsd; + if (ctx != NULL) + cp = ctx->gc_consumer; + + if (zio->io_type == ZIO_TYPE_IOCTL) { + zio_vdev_io_bypass(zio); + + /* XXPOLICY */ + if (vdev_is_dead(vd)) { + zio->io_error = ENXIO; + zio_next_stage_async(zio); + return; + } + + switch (zio->io_cmd) { + + case DKIOCFLUSHWRITECACHE: + if (vd->vdev_nowritecache) { + zio->io_error = ENOTSUP; + break; + } + + goto sendreq; + default: + zio->io_error = ENOTSUP; + } + + zio_next_stage_async(zio); + return; + } + + if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) + return; + + if ((zio = vdev_queue_io(zio)) == NULL) + return; + +sendreq: + + error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); + if (error == 0 && cp == NULL) + error = ENXIO; + if (error) { + zio->io_error = error; + zio_next_stage_async(zio); + return; + } + + bp = g_alloc_bio(); + bp->bio_caller1 = zio; + switch (zio->io_type) { + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE; + bp->bio_data = zio->io_data; + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + break; + case ZIO_TYPE_IOCTL: + bp->bio_cmd = BIO_FLUSH; + bp->bio_data = NULL; + bp->bio_offset = cp->provider->mediasize; + bp->bio_length = 0; + break; + } + bp->bio_done = vdev_geom_io_intr; + + g_io_request(bp, cp); +} + +static void +vdev_geom_io_done(zio_t *zio) +{ + vdev_queue_io_done(zio); + + if (zio->io_type == ZIO_TYPE_WRITE) + vdev_cache_write(zio); + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + + zio_next_stage(zio); +} + +vdev_ops_t vdev_geom_ops = { + vdev_geom_open, + vdev_geom_close, + vdev_default_asize, + vdev_geom_io_start, + vdev_geom_io_done, + NULL, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c new file mode 100644 index 0000000..9d9f555 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -0,0 +1,1011 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Virtual Device Labels + * --------------------- + * + * The vdev label serves several distinct purposes: + * + * 1. Uniquely identify this device as part of a ZFS pool and confirm its + * identity within the pool. + * + * 2. Verify that all the devices given in a configuration are present + * within the pool. + * + * 3. Determine the uberblock for the pool. + * + * 4. In case of an import operation, determine the configuration of the + * toplevel vdev of which it is a part. + * + * 5. If an import operation cannot find all the devices in the pool, + * provide enough information to the administrator to determine which + * devices are missing. + * + * It is important to note that while the kernel is responsible for writing the + * label, it only consumes the information in the first three cases. The + * latter information is only consumed in userland when determining the + * configuration to import a pool. + * + * + * Label Organization + * ------------------ + * + * Before describing the contents of the label, it's important to understand how + * the labels are written and updated with respect to the uberblock. + * + * When the pool configuration is altered, either because it was newly created + * or a device was added, we want to update all the labels such that we can deal + * with fatal failure at any point. To this end, each disk has two labels which + * are updated before and after the uberblock is synced. Assuming we have + * labels and an uberblock with the following transacation groups: + * + * L1 UB L2 + * +------+ +------+ +------+ + * | | | | | | + * | t10 | | t10 | | t10 | + * | | | | | | + * +------+ +------+ +------+ + * + * In this stable state, the labels and the uberblock were all updated within + * the same transaction group (10). Each label is mirrored and checksummed, so + * that we can detect when we fail partway through writing the label. + * + * In order to identify which labels are valid, the labels are written in the + * following manner: + * + * 1. For each vdev, update 'L1' to the new label + * 2. Update the uberblock + * 3. For each vdev, update 'L2' to the new label + * + * Given arbitrary failure, we can determine the correct label to use based on + * the transaction group. If we fail after updating L1 but before updating the + * UB, we will notice that L1's transaction group is greater than the uberblock, + * so L2 must be valid. If we fail after writing the uberblock but before + * writing L2, we will notice that L2's transaction group is less than L1, and + * therefore L1 is valid. + * + * Another added complexity is that not every label is updated when the config + * is synced. If we add a single device, we do not want to have to re-write + * every label for every device in the pool. This means that both L1 and L2 may + * be older than the pool uberblock, because the necessary information is stored + * on another vdev. + * + * + * On-disk Format + * -------------- + * + * The vdev label consists of two distinct parts, and is wrapped within the + * vdev_label_t structure. The label includes 8k of padding to permit legacy + * VTOC disk labels, but is otherwise ignored. + * + * The first half of the label is a packed nvlist which contains pool wide + * properties, per-vdev properties, and configuration information. It is + * described in more detail below. + * + * The latter half of the label consists of a redundant array of uberblocks. + * These uberblocks are updated whenever a transaction group is committed, + * or when the configuration is updated. When a pool is loaded, we scan each + * vdev for the 'best' uberblock. + * + * + * Configuration Information + * ------------------------- + * + * The nvlist describing the pool and vdev contains the following elements: + * + * version ZFS on-disk version + * name Pool name + * state Pool state + * txg Transaction group in which this label was written + * pool_guid Unique identifier for this pool + * vdev_tree An nvlist describing vdev tree. + * + * Each leaf device label also contains the following: + * + * top_guid Unique ID for top-level vdev in which this is contained + * guid Unique ID for the leaf vdev + * + * The 'vs' configuration follows the format described in 'spa_config.c'. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/uberblock_impl.h> +#include <sys/metaslab.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> + +/* + * Basic routines to read and write from a vdev label. + * Used throughout the rest of this file. + */ +uint64_t +vdev_label_offset(uint64_t psize, int l, uint64_t offset) +{ + ASSERT(offset < sizeof (vdev_label_t)); + + return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? + 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); +} + +static void +vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private) +{ + ASSERT(vd->vdev_children == 0); + + zio_nowait(zio_read_phys(zio, vd, + vdev_label_offset(vd->vdev_psize, l, offset), + size, buf, ZIO_CHECKSUM_LABEL, done, private, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE)); +} + +static void +vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private) +{ + ASSERT(vd->vdev_children == 0); + + zio_nowait(zio_write_phys(zio, vd, + vdev_label_offset(vd->vdev_psize, l, offset), + size, buf, ZIO_CHECKSUM_LABEL, done, private, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL)); +} + +/* + * Generate the nvlist representing this vdev's config. + */ +nvlist_t * +vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, + boolean_t isspare) +{ + nvlist_t *nv = NULL; + + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, + vd->vdev_ops->vdev_op_type) == 0); + if (!isspare) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) + == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); + + if (vd->vdev_path != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, + vd->vdev_path) == 0); + + if (vd->vdev_devid != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, + vd->vdev_devid) == 0); + + if (vd->vdev_nparity != 0) { + ASSERT(strcmp(vd->vdev_ops->vdev_op_type, + VDEV_TYPE_RAIDZ) == 0); + + /* + * Make sure someone hasn't managed to sneak a fancy new vdev + * into a crufty old storage pool. + */ + ASSERT(vd->vdev_nparity == 1 || + (vd->vdev_nparity == 2 && + spa_version(spa) >= ZFS_VERSION_RAID6)); + + /* + * Note that we'll add the nparity tag even on storage pools + * that only support a single parity device -- older software + * will just ignore it. + */ + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, + vd->vdev_nparity) == 0); + } + + if (vd->vdev_wholedisk != -1ULL) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + vd->vdev_wholedisk) == 0); + + if (vd->vdev_not_present) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0); + + if (vd->vdev_isspare) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0); + + if (!isspare && vd == vd->vdev_top) { + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, + vd->vdev_ms_array) == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, + vd->vdev_ms_shift) == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, + vd->vdev_ashift) == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, + vd->vdev_asize) == 0); + } + + if (vd->vdev_dtl.smo_object != 0) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, + vd->vdev_dtl.smo_object) == 0); + + if (getstats) { + vdev_stat_t vs; + vdev_get_stats(vd, &vs); + VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS, + (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0); + } + + if (!vd->vdev_ops->vdev_op_leaf) { + nvlist_t **child; + int c; + + child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), + KM_SLEEP); + + for (c = 0; c < vd->vdev_children; c++) + child[c] = vdev_config_generate(spa, vd->vdev_child[c], + getstats, isspare); + + VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + child, vd->vdev_children) == 0); + + for (c = 0; c < vd->vdev_children; c++) + nvlist_free(child[c]); + + kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); + + } else { + if (vd->vdev_offline && !vd->vdev_tmpoffline) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, + B_TRUE) == 0); + else + (void) nvlist_remove(nv, ZPOOL_CONFIG_OFFLINE, + DATA_TYPE_UINT64); + } + + return (nv); +} + +nvlist_t * +vdev_label_read_config(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + nvlist_t *config = NULL; + vdev_phys_t *vp; + zio_t *zio; + int l; + + ASSERT(spa_config_held(spa, RW_READER)); + + if (vdev_is_dead(vd)) + return (NULL); + + vp = zio_buf_alloc(sizeof (vdev_phys_t)); + + for (l = 0; l < VDEV_LABELS; l++) { + + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CONFIG_HELD); + + vdev_label_read(zio, vd, l, vp, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), NULL, NULL); + + if (zio_wait(zio) == 0 && + nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), + &config, 0) == 0) + break; + + if (config != NULL) { + nvlist_free(config); + config = NULL; + } + } + + zio_buf_free(vp, sizeof (vdev_phys_t)); + + return (config); +} + +/* + * Determine if a device is in use. The 'spare_guid' parameter will be filled + * in with the device guid if this spare is active elsewhere on the system. + */ +static boolean_t +vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, + uint64_t *spare_guid) +{ + spa_t *spa = vd->vdev_spa; + uint64_t state, pool_guid, device_guid, txg, spare_pool; + uint64_t vdtxg = 0; + nvlist_t *label; + + if (spare_guid) + *spare_guid = 0ULL; + + /* + * Read the label, if any, and perform some basic sanity checks. + */ + if ((label = vdev_label_read_config(vd)) == NULL) + return (B_FALSE); + + (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, + &vdtxg); + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, + &device_guid) != 0) { + nvlist_free(label); + return (B_FALSE); + } + + if (state != POOL_STATE_SPARE && + (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) != 0 || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0)) { + nvlist_free(label); + return (B_FALSE); + } + + nvlist_free(label); + + /* + * Check to see if this device indeed belongs to the pool it claims to + * be a part of. The only way this is allowed is if the device is a hot + * spare (which we check for later on). + */ + if (state != POOL_STATE_SPARE && + !spa_guid_exists(pool_guid, device_guid) && + !spa_spare_exists(device_guid, NULL)) + return (B_FALSE); + + /* + * If the transaction group is zero, then this an initialized (but + * unused) label. This is only an error if the create transaction + * on-disk is the same as the one we're using now, in which case the + * user has attempted to add the same vdev multiple times in the same + * transaction. + */ + if (state != POOL_STATE_SPARE && txg == 0 && vdtxg == crtxg) + return (B_TRUE); + + /* + * Check to see if this is a spare device. We do an explicit check for + * spa_has_spare() here because it may be on our pending list of spares + * to add. + */ + if (spa_spare_exists(device_guid, &spare_pool) || + spa_has_spare(spa, device_guid)) { + if (spare_guid) + *spare_guid = device_guid; + + switch (reason) { + case VDEV_LABEL_CREATE: + return (B_TRUE); + + case VDEV_LABEL_REPLACE: + return (!spa_has_spare(spa, device_guid) || + spare_pool != 0ULL); + + case VDEV_LABEL_SPARE: + return (spa_has_spare(spa, device_guid)); + } + } + + /* + * If the device is marked ACTIVE, then this device is in use by another + * pool on the system. + */ + return (state == POOL_STATE_ACTIVE); +} + +/* + * Initialize a vdev label. We check to make sure each leaf device is not in + * use, and writable. We put down an initial label which we will later + * overwrite with a complete label. Note that it's important to do this + * sequentially, not in parallel, so that we catch cases of multiple use of the + * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with + * itself. + */ +int +vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) +{ + spa_t *spa = vd->vdev_spa; + nvlist_t *label; + vdev_phys_t *vp; + vdev_boot_header_t *vb; + uberblock_t *ub; + zio_t *zio; + int l, c, n; + char *buf; + size_t buflen; + int error; + uint64_t spare_guid; + + ASSERT(spa_config_held(spa, RW_WRITER)); + + for (c = 0; c < vd->vdev_children; c++) + if ((error = vdev_label_init(vd->vdev_child[c], + crtxg, reason)) != 0) + return (error); + + if (!vd->vdev_ops->vdev_op_leaf) + return (0); + + /* + * Dead vdevs cannot be initialized. + */ + if (vdev_is_dead(vd)) + return (EIO); + + /* + * Determine if the vdev is in use. + */ + if (reason != VDEV_LABEL_REMOVE && + vdev_inuse(vd, crtxg, reason, &spare_guid)) + return (EBUSY); + + ASSERT(reason != VDEV_LABEL_REMOVE || + vdev_inuse(vd, crtxg, reason, NULL)); + + /* + * If this is a request to add or replace a spare that is in use + * elsewhere on the system, then we must update the guid (which was + * initialized to a random value) to reflect the actual GUID (which is + * shared between multiple pools). + */ + if (reason != VDEV_LABEL_REMOVE && spare_guid != 0ULL) { + vdev_t *pvd = vd->vdev_parent; + + for (; pvd != NULL; pvd = pvd->vdev_parent) { + pvd->vdev_guid_sum -= vd->vdev_guid; + pvd->vdev_guid_sum += spare_guid; + } + + vd->vdev_guid = vd->vdev_guid_sum = spare_guid; + + /* + * If this is a replacement, then we want to fallthrough to the + * rest of the code. If we're adding a spare, then it's already + * labelled appropriately and we can just return. + */ + if (reason == VDEV_LABEL_SPARE) + return (0); + ASSERT(reason == VDEV_LABEL_REPLACE); + } + + /* + * Initialize its label. + */ + vp = zio_buf_alloc(sizeof (vdev_phys_t)); + bzero(vp, sizeof (vdev_phys_t)); + + /* + * Generate a label describing the pool and our top-level vdev. + * We mark it as being from txg 0 to indicate that it's not + * really part of an active pool just yet. The labels will + * be written again with a meaningful txg by spa_sync(). + */ + if (reason == VDEV_LABEL_SPARE || + (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { + /* + * For inactive hot spares, we generate a special label that + * identifies as a mutually shared hot spare. We write the + * label if we are adding a hot spare, or if we are removing an + * active hot spare (in which case we want to revert the + * labels). + */ + VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, + POOL_STATE_SPARE) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + } else { + label = spa_config_generate(spa, vd, 0ULL, B_FALSE); + + /* + * Add our creation time. This allows us to detect multiple + * vdev uses as described above, and automatically expires if we + * fail. + */ + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, + crtxg) == 0); + } + + buf = vp->vp_nvlist; + buflen = sizeof (vp->vp_nvlist); + + error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); + if (error != 0) { + nvlist_free(label); + zio_buf_free(vp, sizeof (vdev_phys_t)); + /* EFAULT means nvlist_pack ran out of room */ + return (error == EFAULT ? ENAMETOOLONG : EINVAL); + } + + /* + * Initialize boot block header. + */ + vb = zio_buf_alloc(sizeof (vdev_boot_header_t)); + bzero(vb, sizeof (vdev_boot_header_t)); + vb->vb_magic = VDEV_BOOT_MAGIC; + vb->vb_version = VDEV_BOOT_VERSION; + vb->vb_offset = VDEV_BOOT_OFFSET; + vb->vb_size = VDEV_BOOT_SIZE; + + /* + * Initialize uberblock template. + */ + ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); + bzero(ub, VDEV_UBERBLOCK_SIZE(vd)); + *ub = spa->spa_uberblock; + ub->ub_txg = 0; + + /* + * Write everything in parallel. + */ + zio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + + for (l = 0; l < VDEV_LABELS; l++) { + + vdev_label_write(zio, vd, l, vp, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), NULL, NULL); + + vdev_label_write(zio, vd, l, vb, + offsetof(vdev_label_t, vl_boot_header), + sizeof (vdev_boot_header_t), NULL, NULL); + + for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + vdev_label_write(zio, vd, l, ub, + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd), NULL, NULL); + } + } + + error = zio_wait(zio); + + nvlist_free(label); + zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd)); + zio_buf_free(vb, sizeof (vdev_boot_header_t)); + zio_buf_free(vp, sizeof (vdev_phys_t)); + + /* + * If this vdev hasn't been previously identified as a spare, then we + * mark it as such only if a) we are labelling it as a spare, or b) it + * exists as a spare elsewhere in the system. + */ + if (error == 0 && !vd->vdev_isspare && + (reason == VDEV_LABEL_SPARE || + spa_spare_exists(vd->vdev_guid, NULL))) + spa_spare_add(vd); + + return (error); +} + +/* + * ========================================================================== + * uberblock load/sync + * ========================================================================== + */ + +/* + * Consider the following situation: txg is safely synced to disk. We've + * written the first uberblock for txg + 1, and then we lose power. When we + * come back up, we fail to see the uberblock for txg + 1 because, say, + * it was on a mirrored device and the replica to which we wrote txg + 1 + * is now offline. If we then make some changes and sync txg + 1, and then + * the missing replica comes back, then for a new seconds we'll have two + * conflicting uberblocks on disk with the same txg. The solution is simple: + * among uberblocks with equal txg, choose the one with the latest timestamp. + */ +static int +vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) +{ + if (ub1->ub_txg < ub2->ub_txg) + return (-1); + if (ub1->ub_txg > ub2->ub_txg) + return (1); + + if (ub1->ub_timestamp < ub2->ub_timestamp) + return (-1); + if (ub1->ub_timestamp > ub2->ub_timestamp) + return (1); + + return (0); +} + +static void +vdev_uberblock_load_done(zio_t *zio) +{ + uberblock_t *ub = zio->io_data; + uberblock_t *ubbest = zio->io_private; + spa_t *spa = zio->io_spa; + + ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd)); + + if (zio->io_error == 0 && uberblock_verify(ub) == 0) { + mutex_enter(&spa->spa_uberblock_lock); + if (vdev_uberblock_compare(ub, ubbest) > 0) + *ubbest = *ub; + mutex_exit(&spa->spa_uberblock_lock); + } + + zio_buf_free(zio->io_data, zio->io_size); +} + +void +vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) +{ + int l, c, n; + + for (c = 0; c < vd->vdev_children; c++) + vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (vdev_is_dead(vd)) + return; + + for (l = 0; l < VDEV_LABELS; l++) { + for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + vdev_label_read(zio, vd, l, + zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd), + vdev_uberblock_load_done, ubbest); + } + } +} + +/* + * Write the uberblock to both labels of all leaves of the specified vdev. + * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). + */ +static void +vdev_uberblock_sync_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_root->io_private; + + if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) + atomic_add_64(good_writes, 1); +} + +static void +vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, uint64_t txg) +{ + int l, c, n; + + for (c = 0; c < vd->vdev_children; c++) + vdev_uberblock_sync(zio, ub, vd->vdev_child[c], txg); + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (vdev_is_dead(vd)) + return; + + n = txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); + + ASSERT(ub->ub_txg == txg); + + for (l = 0; l < VDEV_LABELS; l++) + vdev_label_write(zio, vd, l, ub, + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd), + vdev_uberblock_sync_done, NULL); + + dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg); +} + +static int +vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *vd, uint64_t txg) +{ + uberblock_t *ubbuf; + size_t size = vd->vdev_top ? VDEV_UBERBLOCK_SIZE(vd) : SPA_MAXBLOCKSIZE; + uint64_t *good_writes; + zio_t *zio; + int error; + + ubbuf = zio_buf_alloc(size); + bzero(ubbuf, size); + *ubbuf = *ub; + + good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + + zio = zio_root(spa, NULL, good_writes, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + + vdev_uberblock_sync(zio, ubbuf, vd, txg); + + error = zio_wait(zio); + + if (error && *good_writes != 0) { + dprintf("partial success: good_writes = %llu\n", *good_writes); + error = 0; + } + + /* + * It's possible to have no good writes and no error if every vdev is in + * the CANT_OPEN state. + */ + if (*good_writes == 0 && error == 0) + error = EIO; + + kmem_free(good_writes, sizeof (uint64_t)); + zio_buf_free(ubbuf, size); + + return (error); +} + +/* + * Sync out an individual vdev. + */ +static void +vdev_sync_label_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_root->io_private; + + if (zio->io_error == 0) + atomic_add_64(good_writes, 1); +} + +static void +vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg) +{ + nvlist_t *label; + vdev_phys_t *vp; + char *buf; + size_t buflen; + int c; + + for (c = 0; c < vd->vdev_children; c++) + vdev_sync_label(zio, vd->vdev_child[c], l, txg); + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (vdev_is_dead(vd)) + return; + + /* + * Generate a label describing the top-level config to which we belong. + */ + label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); + + vp = zio_buf_alloc(sizeof (vdev_phys_t)); + bzero(vp, sizeof (vdev_phys_t)); + + buf = vp->vp_nvlist; + buflen = sizeof (vp->vp_nvlist); + + if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) + vdev_label_write(zio, vd, l, vp, + offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), + vdev_sync_label_done, NULL); + + zio_buf_free(vp, sizeof (vdev_phys_t)); + nvlist_free(label); + + dprintf("%s label %d txg %llu\n", vdev_description(vd), l, txg); +} + +static int +vdev_sync_labels(vdev_t *vd, int l, uint64_t txg) +{ + uint64_t *good_writes; + zio_t *zio; + int error; + + ASSERT(vd == vd->vdev_top); + + good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + + zio = zio_root(vd->vdev_spa, NULL, good_writes, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + + /* + * Recursively kick off writes to all labels. + */ + vdev_sync_label(zio, vd, l, txg); + + error = zio_wait(zio); + + if (error && *good_writes != 0) { + dprintf("partial success: good_writes = %llu\n", *good_writes); + error = 0; + } + + if (*good_writes == 0 && error == 0) + error = ENODEV; + + kmem_free(good_writes, sizeof (uint64_t)); + + return (error); +} + +/* + * Sync the entire vdev configuration. + * + * The order of operations is carefully crafted to ensure that + * if the system panics or loses power at any time, the state on disk + * is still transactionally consistent. The in-line comments below + * describe the failure semantics at each stage. + * + * Moreover, it is designed to be idempotent: if spa_sync_labels() fails + * at any time, you can just call it again, and it will resume its work. + */ +int +vdev_config_sync(vdev_t *uvd, uint64_t txg) +{ + spa_t *spa = uvd->vdev_spa; + uberblock_t *ub = &spa->spa_uberblock; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd; + zio_t *zio; + int l, error; + + ASSERT(ub->ub_txg <= txg); + + /* + * If this isn't a resync due to I/O errors, and nothing changed + * in this transaction group, and the vdev configuration hasn't changed, + * then there's nothing to do. + */ + if (ub->ub_txg < txg && uberblock_update(ub, rvd, txg) == B_FALSE && + list_is_empty(&spa->spa_dirty_list)) { + dprintf("nothing to sync in %s in txg %llu\n", + spa_name(spa), txg); + return (0); + } + + if (txg > spa_freeze_txg(spa)) + return (0); + + ASSERT(txg <= spa->spa_final_txg); + + dprintf("syncing %s txg %llu\n", spa_name(spa), txg); + + /* + * Flush the write cache of every disk that's been written to + * in this transaction group. This ensures that all blocks + * written in this txg will be committed to stable storage + * before any uberblock that references them. + */ + zio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd; + vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) { + zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, + NULL, NULL, ZIO_PRIORITY_NOW, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); + } + (void) zio_wait(zio); + + /* + * Sync out the even labels (L0, L2) for every dirty vdev. If the + * system dies in the middle of this process, that's OK: all of the + * even labels that made it to disk will be newer than any uberblock, + * and will therefore be considered invalid. The odd labels (L1, L3), + * which have not yet been touched, will still be valid. + */ + for (vd = list_head(&spa->spa_dirty_list); vd != NULL; + vd = list_next(&spa->spa_dirty_list, vd)) { + for (l = 0; l < VDEV_LABELS; l++) { + if (l & 1) + continue; + if ((error = vdev_sync_labels(vd, l, txg)) != 0) + return (error); + } + } + + /* + * Flush the new labels to disk. This ensures that all even-label + * updates are committed to stable storage before the uberblock update. + */ + zio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + for (vd = list_head(&spa->spa_dirty_list); vd != NULL; + vd = list_next(&spa->spa_dirty_list, vd)) { + zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, + NULL, NULL, ZIO_PRIORITY_NOW, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); + } + (void) zio_wait(zio); + + /* + * Sync the uberblocks to all vdevs in the tree specified by uvd. + * If the system dies in the middle of this step, there are two cases + * to consider, and the on-disk state is consistent either way: + * + * (1) If none of the new uberblocks made it to disk, then the + * previous uberblock will be the newest, and the odd labels + * (which had not yet been touched) will be valid with respect + * to that uberblock. + * + * (2) If one or more new uberblocks made it to disk, then they + * will be the newest, and the even labels (which had all + * been successfully committed) will be valid with respect + * to the new uberblocks. + */ + if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0) + return (error); + + /* + * Flush the uberblocks to disk. This ensures that the odd labels + * are no longer needed (because the new uberblocks and the even + * labels are safely on disk), so it is safe to overwrite them. + */ + (void) zio_wait(zio_ioctl(NULL, spa, uvd, DKIOCFLUSHWRITECACHE, + NULL, NULL, ZIO_PRIORITY_NOW, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); + + /* + * Sync out odd labels for every dirty vdev. If the system dies + * in the middle of this process, the even labels and the new + * uberblocks will suffice to open the pool. The next time + * the pool is opened, the first thing we'll do -- before any + * user data is modified -- is mark every vdev dirty so that + * all labels will be brought up to date. + */ + for (vd = list_head(&spa->spa_dirty_list); vd != NULL; + vd = list_next(&spa->spa_dirty_list, vd)) { + for (l = 0; l < VDEV_LABELS; l++) { + if ((l & 1) == 0) + continue; + if ((error = vdev_sync_labels(vd, l, txg)) != 0) + return (error); + } + } + + /* + * Flush the new labels to disk. This ensures that all odd-label + * updates are committed to stable storage before the next + * transaction group begins. + */ + zio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + for (vd = list_head(&spa->spa_dirty_list); vd != NULL; + vd = list_next(&spa->spa_dirty_list, vd)) { + zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, + NULL, NULL, ZIO_PRIORITY_NOW, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); + } + (void) zio_wait(zio); + + return (0); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c new file mode 100644 index 0000000..73d1a83 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c @@ -0,0 +1,495 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> + +/* + * Virtual device vector for mirroring. + */ + +typedef struct mirror_child { + vdev_t *mc_vd; + uint64_t mc_offset; + int mc_error; + short mc_tried; + short mc_skipped; +} mirror_child_t; + +typedef struct mirror_map { + int mm_children; + int mm_replacing; + int mm_preferred; + int mm_root; + mirror_child_t mm_child[1]; +} mirror_map_t; + +int vdev_mirror_shift = 21; + +static mirror_map_t * +vdev_mirror_map_alloc(zio_t *zio) +{ + mirror_map_t *mm = NULL; + mirror_child_t *mc; + vdev_t *vd = zio->io_vd; + int c, d; + + if (vd == NULL) { + dva_t *dva = zio->io_bp->blk_dva; + spa_t *spa = zio->io_spa; + + c = BP_GET_NDVAS(zio->io_bp); + + mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); + mm->mm_children = c; + mm->mm_replacing = B_FALSE; + mm->mm_preferred = spa_get_random(c); + mm->mm_root = B_TRUE; + + /* + * Check the other, lower-index DVAs to see if they're on + * the same vdev as the child we picked. If they are, use + * them since they are likely to have been allocated from + * the primary metaslab in use at the time, and hence are + * more likely to have locality with single-copy data. + */ + for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { + if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) + mm->mm_preferred = d; + } + + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + + mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); + mc->mc_offset = DVA_GET_OFFSET(&dva[c]); + } + } else { + c = vd->vdev_children; + + mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); + mm->mm_children = c; + mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); + mm->mm_preferred = mm->mm_replacing ? 0 : + (zio->io_offset >> vdev_mirror_shift) % c; + mm->mm_root = B_FALSE; + + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + mc->mc_vd = vd->vdev_child[c]; + mc->mc_offset = zio->io_offset; + } + } + + zio->io_vsd = mm; + return (mm); +} + +static void +vdev_mirror_map_free(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + + kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); + zio->io_vsd = NULL; +} + +static int +vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +{ + vdev_t *cvd; + uint64_t c; + int numerrors = 0; + int ret, lasterror = 0; + + if (vd->vdev_children == 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if ((ret = vdev_open(cvd)) != 0) { + lasterror = ret; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *ashift = MAX(*ashift, cvd->vdev_ashift); + } + + if (numerrors == vd->vdev_children) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +static void +vdev_mirror_close(vdev_t *vd) +{ + uint64_t c; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static void +vdev_mirror_child_done(zio_t *zio) +{ + mirror_child_t *mc = zio->io_private; + + mc->mc_error = zio->io_error; + mc->mc_tried = 1; + mc->mc_skipped = 0; +} + +static void +vdev_mirror_scrub_done(zio_t *zio) +{ + mirror_child_t *mc = zio->io_private; + + if (zio->io_error == 0) { + zio_t *pio = zio->io_parent; + mutex_enter(&pio->io_lock); + ASSERT3U(zio->io_size, >=, pio->io_size); + bcopy(zio->io_data, pio->io_data, pio->io_size); + mutex_exit(&pio->io_lock); + } + + zio_buf_free(zio->io_data, zio->io_size); + + mc->mc_error = zio->io_error; + mc->mc_tried = 1; + mc->mc_skipped = 0; +} + +static void +vdev_mirror_repair_done(zio_t *zio) +{ + ASSERT(zio->io_private == zio->io_parent); + vdev_mirror_map_free(zio->io_private); +} + +/* + * Try to find a child whose DTL doesn't contain the block we want to read. + * If we can't, try the read on any vdev we haven't already tried. + */ +static int +vdev_mirror_child_select(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + mirror_child_t *mc; + uint64_t txg = zio->io_txg; + int i, c; + + ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg); + + /* + * Try to find a child whose DTL doesn't contain the block to read. + * If a child is known to be completely inaccessible (indicated by + * vdev_is_dead() returning B_TRUE), don't even try. + */ + for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { + if (c >= mm->mm_children) + c = 0; + mc = &mm->mm_child[c]; + if (mc->mc_tried || mc->mc_skipped) + continue; + if (vdev_is_dead(mc->mc_vd)) { + mc->mc_error = ENXIO; + mc->mc_tried = 1; /* don't even try */ + mc->mc_skipped = 1; + continue; + } + if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1)) + return (c); + mc->mc_error = ESTALE; + mc->mc_skipped = 1; + } + + /* + * Every device is either missing or has this txg in its DTL. + * Look for any child we haven't already tried before giving up. + */ + for (c = 0; c < mm->mm_children; c++) + if (!mm->mm_child[c].mc_tried) + return (c); + + /* + * Every child failed. There's no place left to look. + */ + return (-1); +} + +static void +vdev_mirror_io_start(zio_t *zio) +{ + mirror_map_t *mm; + mirror_child_t *mc; + int c, children; + + mm = vdev_mirror_map_alloc(zio); + + if (zio->io_type == ZIO_TYPE_READ) { + if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { + /* + * For scrubbing reads we need to allocate a read + * buffer for each child and issue reads to all + * children. If any child succeeds, it will copy its + * data into zio->io_data in vdev_mirror_scrub_done. + */ + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, + zio_buf_alloc(zio->io_size), zio->io_size, + zio->io_type, zio->io_priority, + ZIO_FLAG_CANFAIL, + vdev_mirror_scrub_done, mc)); + } + zio_wait_children_done(zio); + return; + } + /* + * For normal reads just pick one child. + */ + c = vdev_mirror_child_select(zio); + children = (c >= 0); + } else { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + + /* + * If this is a resilvering I/O to a replacing vdev, + * only the last child should be written -- unless the + * first child happens to have a DTL entry here as well. + * All other writes go to all children. + */ + if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing && + !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map, + zio->io_txg, 1)) { + c = mm->mm_children - 1; + children = 1; + } else { + c = 0; + children = mm->mm_children; + } + } + + while (children--) { + mc = &mm->mm_child[c]; + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, + zio->io_data, zio->io_size, zio->io_type, zio->io_priority, + ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc)); + c++; + } + + zio_wait_children_done(zio); +} + +static void +vdev_mirror_io_done(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + mirror_child_t *mc; + int c; + int good_copies = 0; + int unexpected_errors = 0; + + zio->io_error = 0; + zio->io_numerrors = 0; + + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + + if (mc->mc_tried && mc->mc_error == 0) { + good_copies++; + continue; + } + + /* + * We preserve any EIOs because those may be worth retrying; + * whereas ECKSUM and ENXIO are more likely to be persistent. + */ + if (mc->mc_error) { + if (zio->io_error != EIO) + zio->io_error = mc->mc_error; + if (!mc->mc_skipped) + unexpected_errors++; + zio->io_numerrors++; + } + } + + if (zio->io_type == ZIO_TYPE_WRITE) { + /* + * XXX -- for now, treat partial writes as success. + * XXX -- For a replacing vdev, we need to make sure the + * new child succeeds. + */ + /* XXPOLICY */ + if (good_copies != 0) + zio->io_error = 0; + vdev_mirror_map_free(zio); + zio_next_stage(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + + /* + * If we don't have a good copy yet, keep trying other children. + */ + /* XXPOLICY */ + if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { + ASSERT(c >= 0 && c < mm->mm_children); + mc = &mm->mm_child[c]; + dprintf("retrying i/o (err=%d) on child %s\n", + zio->io_error, vdev_description(mc->mc_vd)); + zio->io_error = 0; + zio_vdev_io_redone(zio); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, + ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL, + vdev_mirror_child_done, mc)); + zio_wait_children_done(zio); + return; + } + + /* XXPOLICY */ + if (good_copies) + zio->io_error = 0; + else + ASSERT(zio->io_error != 0); + + if (good_copies && (spa_mode & FWRITE) && + (unexpected_errors || + (zio->io_flags & ZIO_FLAG_RESILVER) || + ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { + zio_t *rio; + + /* + * Use the good data we have in hand to repair damaged children. + * + * We issue all repair I/Os as children of 'rio' to arrange + * that vdev_mirror_map_free(zio) will be invoked after all + * repairs complete, but before we advance to the next stage. + */ + rio = zio_null(zio, zio->io_spa, + vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL); + + for (c = 0; c < mm->mm_children; c++) { + /* + * Don't rewrite known good children. + * Not only is it unnecessary, it could + * actually be harmful: if the system lost + * power while rewriting the only good copy, + * there would be no good copies left! + */ + mc = &mm->mm_child[c]; + + if (mc->mc_error == 0) { + if (mc->mc_tried) + continue; + if (!(zio->io_flags & ZIO_FLAG_SCRUB) && + !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, + zio->io_txg, 1)) + continue; + mc->mc_error = ESTALE; + } + + dprintf("resilvered %s @ 0x%llx error %d\n", + vdev_description(mc->mc_vd), mc->mc_offset, + mc->mc_error); + + zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd, + mc->mc_offset, zio->io_data, zio->io_size, + ZIO_TYPE_WRITE, zio->io_priority, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE, NULL, NULL)); + } + + zio_nowait(rio); + zio_wait_children_done(zio); + return; + } + + vdev_mirror_map_free(zio); + zio_next_stage(zio); +} + +static void +vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (faulted == vd->vdev_children) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +vdev_ops_t vdev_mirror_ops = { + vdev_mirror_open, + vdev_mirror_close, + vdev_default_asize, + vdev_mirror_io_start, + vdev_mirror_io_done, + vdev_mirror_state_change, + VDEV_TYPE_MIRROR, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; + +vdev_ops_t vdev_replacing_ops = { + vdev_mirror_open, + vdev_mirror_close, + vdev_default_asize, + vdev_mirror_io_start, + vdev_mirror_io_done, + vdev_mirror_state_change, + VDEV_TYPE_REPLACING, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; + +vdev_ops_t vdev_spare_ops = { + vdev_mirror_open, + vdev_mirror_close, + vdev_default_asize, + vdev_mirror_io_start, + vdev_mirror_io_done, + vdev_mirror_state_change, + VDEV_TYPE_SPARE, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c new file mode 100644 index 0000000..b35f4a5 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * The 'missing' vdev is a special vdev type used only during import. It + * signifies a placeholder in the root vdev for some vdev that we know is + * missing. We pass it down to the kernel to allow the rest of the + * configuration to parsed and an attempt made to open all available devices. + * Because its GUID is always 0, we know that the guid sum will mismatch and we + * won't be able to open the pool anyway. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> + +/* ARGSUSED */ +static int +vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + /* + * Really this should just fail. But then the root vdev will be in the + * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is + * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we + * will fail the GUID sum check before ever trying to open the pool. + */ + *psize = SPA_MINDEVSIZE; + *ashift = SPA_MINBLOCKSHIFT; + return (0); +} + +/* ARGSUSED */ +static void +vdev_missing_close(vdev_t *vd) +{ +} + +/* ARGSUSED */ +static void +vdev_missing_io_start(zio_t *zio) +{ + zio->io_error = ENOTSUP; + zio_next_stage_async(zio); +} + +/* ARGSUSED */ +static void +vdev_missing_io_done(zio_t *zio) +{ + zio_next_stage(zio); +} + +vdev_ops_t vdev_missing_ops = { + vdev_missing_open, + vdev_missing_close, + vdev_default_asize, + vdev_missing_io_start, + vdev_missing_io_done, + NULL, + VDEV_TYPE_MISSING, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c new file mode 100644 index 0000000..7e99c1f --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -0,0 +1,323 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/avl.h> + +/* + * These tunables are for performance analysis. + */ +/* + * zfs_vdev_max_pending is the maximum number of i/os concurrently + * pending to each device. zfs_vdev_min_pending is the initial number + * of i/os pending to each device (before it starts ramping up to + * max_pending). + */ +int zfs_vdev_max_pending = 35; +int zfs_vdev_min_pending = 4; + +/* deadline = pri + (lbolt >> time_shift) */ +int zfs_vdev_time_shift = 6; + +/* exponential I/O issue ramp-up rate */ +int zfs_vdev_ramp_rate = 2; + +/* + * i/os will be aggregated into a single large i/o up to + * zfs_vdev_aggregation_limit bytes long. + */ +int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; + +/* + * Virtual device vector for disk I/O scheduling. + */ +int +vdev_queue_deadline_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_deadline < z2->io_deadline) + return (-1); + if (z1->io_deadline > z2->io_deadline) + return (1); + + if (z1->io_offset < z2->io_offset) + return (-1); + if (z1->io_offset > z2->io_offset) + return (1); + + if (z1 < z2) + return (-1); + if (z1 > z2) + return (1); + + return (0); +} + +int +vdev_queue_offset_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_offset < z2->io_offset) + return (-1); + if (z1->io_offset > z2->io_offset) + return (1); + + if (z1 < z2) + return (-1); + if (z1 > z2) + return (1); + + return (0); +} + +void +vdev_queue_init(vdev_t *vd) +{ + vdev_queue_t *vq = &vd->vdev_queue; + + mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, + sizeof (zio_t), offsetof(struct zio, io_deadline_node)); + + avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, + sizeof (zio_t), offsetof(struct zio, io_offset_node)); + + avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, + sizeof (zio_t), offsetof(struct zio, io_offset_node)); + + avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, + sizeof (zio_t), offsetof(struct zio, io_offset_node)); +} + +void +vdev_queue_fini(vdev_t *vd) +{ + vdev_queue_t *vq = &vd->vdev_queue; + + avl_destroy(&vq->vq_deadline_tree); + avl_destroy(&vq->vq_read_tree); + avl_destroy(&vq->vq_write_tree); + avl_destroy(&vq->vq_pending_tree); + + mutex_destroy(&vq->vq_lock); +} + +static void +vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) +{ + avl_add(&vq->vq_deadline_tree, zio); + avl_add(zio->io_vdev_tree, zio); +} + +static void +vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) +{ + avl_remove(&vq->vq_deadline_tree, zio); + avl_remove(zio->io_vdev_tree, zio); +} + +static void +vdev_queue_agg_io_done(zio_t *aio) +{ + zio_t *dio; + uint64_t offset = 0; + + while ((dio = aio->io_delegate_list) != NULL) { + if (aio->io_type == ZIO_TYPE_READ) + bcopy((char *)aio->io_data + offset, dio->io_data, + dio->io_size); + offset += dio->io_size; + aio->io_delegate_list = dio->io_delegate_next; + dio->io_delegate_next = NULL; + dio->io_error = aio->io_error; + zio_next_stage(dio); + } + ASSERT3U(offset, ==, aio->io_size); + + zio_buf_free(aio->io_data, aio->io_size); +} + +#define IS_ADJACENT(io, nio) \ + ((io)->io_offset + (io)->io_size == (nio)->io_offset) + +typedef void zio_issue_func_t(zio_t *); + +static zio_t * +vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, + zio_issue_func_t **funcp) +{ + zio_t *fio, *lio, *aio, *dio; + avl_tree_t *tree; + uint64_t size; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + *funcp = NULL; + + if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || + avl_numnodes(&vq->vq_deadline_tree) == 0) + return (NULL); + + fio = lio = avl_first(&vq->vq_deadline_tree); + + tree = fio->io_vdev_tree; + size = fio->io_size; + + while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && + size + dio->io_size <= zfs_vdev_aggregation_limit) { + dio->io_delegate_next = fio; + fio = dio; + size += dio->io_size; + } + + while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && + size + dio->io_size <= zfs_vdev_aggregation_limit) { + lio->io_delegate_next = dio; + lio = dio; + size += dio->io_size; + } + + if (fio != lio) { + char *buf = zio_buf_alloc(size); + uint64_t offset = 0; + int nagg = 0; + + ASSERT(size <= zfs_vdev_aggregation_limit); + + aio = zio_vdev_child_io(fio, NULL, fio->io_vd, + fio->io_offset, buf, size, fio->io_type, + ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_NOBOOKMARK, + vdev_queue_agg_io_done, NULL); + + aio->io_delegate_list = fio; + + for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { + ASSERT(dio->io_type == aio->io_type); + ASSERT(dio->io_vdev_tree == tree); + if (dio->io_type == ZIO_TYPE_WRITE) + bcopy(dio->io_data, buf + offset, dio->io_size); + offset += dio->io_size; + vdev_queue_io_remove(vq, dio); + zio_vdev_io_bypass(dio); + nagg++; + } + + ASSERT(offset == size); + + dprintf("%5s T=%llu off=%8llx agg=%3d " + "old=%5llx new=%5llx\n", + zio_type_name[fio->io_type], + fio->io_deadline, fio->io_offset, nagg, fio->io_size, size); + + avl_add(&vq->vq_pending_tree, aio); + + *funcp = zio_nowait; + return (aio); + } + + ASSERT(fio->io_vdev_tree == tree); + vdev_queue_io_remove(vq, fio); + + avl_add(&vq->vq_pending_tree, fio); + + *funcp = zio_next_stage; + + return (fio); +} + +zio_t * +vdev_queue_io(zio_t *zio) +{ + vdev_queue_t *vq = &zio->io_vd->vdev_queue; + zio_t *nio; + zio_issue_func_t *func; + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + + if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) + return (zio); + + zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; + + if (zio->io_type == ZIO_TYPE_READ) + zio->io_vdev_tree = &vq->vq_read_tree; + else + zio->io_vdev_tree = &vq->vq_write_tree; + + mutex_enter(&vq->vq_lock); + + zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) + + zio->io_priority; + + vdev_queue_io_add(vq, zio); + + nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func); + + mutex_exit(&vq->vq_lock); + + if (nio == NULL || func != zio_nowait) + return (nio); + + func(nio); + return (NULL); +} + +void +vdev_queue_io_done(zio_t *zio) +{ + vdev_queue_t *vq = &zio->io_vd->vdev_queue; + zio_t *nio; + zio_issue_func_t *func; + int i; + + mutex_enter(&vq->vq_lock); + + avl_remove(&vq->vq_pending_tree, zio); + + for (i = 0; i < zfs_vdev_ramp_rate; i++) { + nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func); + if (nio == NULL) + break; + mutex_exit(&vq->vq_lock); + if (func == zio_next_stage) + zio_vdev_io_reissue(nio); + func(nio); + mutex_enter(&vq->vq_lock); + } + + mutex_exit(&vq->vq_lock); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c new file mode 100644 index 0000000..08df7e0 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -0,0 +1,1223 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/fs/zfs.h> +#include <sys/fm/fs/zfs.h> + +/* + * Virtual device vector for RAID-Z. + * + * This vdev supports both single and double parity. For single parity, we + * use a simple XOR of all the data columns. For double parity, we use both + * the simple XOR as well as a technique described in "The mathematics of + * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), + * over the integers expressable in a single byte. Briefly, the operations on + * the field are defined as follows: + * + * o addition (+) is represented by a bitwise XOR + * o subtraction (-) is therefore identical to addition: A + B = A - B + * o multiplication of A by 2 is defined by the following bitwise expression: + * (A * 2)_7 = A_6 + * (A * 2)_6 = A_5 + * (A * 2)_5 = A_4 + * (A * 2)_4 = A_3 + A_7 + * (A * 2)_3 = A_2 + A_7 + * (A * 2)_2 = A_1 + A_7 + * (A * 2)_1 = A_0 + * (A * 2)_0 = A_7 + * + * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). + * + * Observe that any number in the field (except for 0) can be expressed as a + * power of 2 -- a generator for the field. We store a table of the powers of + * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can + * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather + * than field addition). The inverse of a field element A (A^-1) is A^254. + * + * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, + * can be expressed by field operations: + * + * P = D_0 + D_1 + ... + D_n-2 + D_n-1 + * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 + * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 + * + * See the reconstruction code below for how P and Q can used individually or + * in concert to recover missing data columns. + */ + +typedef struct raidz_col { + uint64_t rc_devidx; /* child device index for I/O */ + uint64_t rc_offset; /* device offset */ + uint64_t rc_size; /* I/O size */ + void *rc_data; /* I/O data */ + int rc_error; /* I/O error for this device */ + uint8_t rc_tried; /* Did we attempt this I/O column? */ + uint8_t rc_skipped; /* Did we skip this I/O column? */ +} raidz_col_t; + +typedef struct raidz_map { + uint64_t rm_cols; /* Column count */ + uint64_t rm_bigcols; /* Number of oversized columns */ + uint64_t rm_asize; /* Actual total I/O size */ + uint64_t rm_missingdata; /* Count of missing data devices */ + uint64_t rm_missingparity; /* Count of missing parity devices */ + uint64_t rm_firstdatacol; /* First data column/parity count */ + raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ +} raidz_map_t; + +#define VDEV_RAIDZ_P 0 +#define VDEV_RAIDZ_Q 1 + +#define VDEV_RAIDZ_MAXPARITY 2 + +#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) + +/* + * These two tables represent powers and logs of 2 in the Galois field defined + * above. These values were computed by repeatedly multiplying by 2 as above. + */ +static const uint8_t vdev_raidz_pow2[256] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, + 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, + 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, + 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, + 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, + 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, + 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, + 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, + 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, + 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, + 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, + 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, + 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, + 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, + 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, + 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, + 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, + 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, + 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, + 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, + 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, + 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, + 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, + 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, + 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, + 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, + 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, + 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, + 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, + 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, + 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 +}; +static const uint8_t vdev_raidz_log2[256] = { + 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, + 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, + 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, + 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, + 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, + 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, + 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, + 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, + 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, + 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, + 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, + 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, + 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, + 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, + 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, + 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, + 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, + 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, + 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, + 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, + 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, + 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, + 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, + 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, + 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, + 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, + 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, + 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, + 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, + 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, + 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, + 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, +}; + +/* + * Multiply a given number by 2 raised to the given power. + */ +static uint8_t +vdev_raidz_exp2(uint_t a, int exp) +{ + if (a == 0) + return (0); + + ASSERT(exp >= 0); + ASSERT(vdev_raidz_log2[a] > 0 || a == 1); + + exp += vdev_raidz_log2[a]; + if (exp > 255) + exp -= 255; + + return (vdev_raidz_pow2[exp]); +} + +static raidz_map_t * +vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, + uint64_t nparity) +{ + raidz_map_t *rm; + uint64_t b = zio->io_offset >> unit_shift; + uint64_t s = zio->io_size >> unit_shift; + uint64_t f = b % dcols; + uint64_t o = (b / dcols) << unit_shift; + uint64_t q, r, c, bc, col, acols, coff, devidx; + + q = s / (dcols - nparity); + r = s - q * (dcols - nparity); + bc = (r == 0 ? 0 : r + nparity); + + acols = (q == 0 ? bc : dcols); + + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); + + rm->rm_cols = acols; + rm->rm_bigcols = bc; + rm->rm_asize = 0; + rm->rm_missingdata = 0; + rm->rm_missingparity = 0; + rm->rm_firstdatacol = nparity; + + for (c = 0; c < acols; c++) { + col = f + c; + coff = o; + if (col >= dcols) { + col -= dcols; + coff += 1ULL << unit_shift; + } + rm->rm_col[c].rc_devidx = col; + rm->rm_col[c].rc_offset = coff; + rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; + rm->rm_col[c].rc_data = NULL; + rm->rm_col[c].rc_error = 0; + rm->rm_col[c].rc_tried = 0; + rm->rm_col[c].rc_skipped = 0; + rm->rm_asize += rm->rm_col[c].rc_size; + } + + rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); + + for (c = 0; c < rm->rm_firstdatacol; c++) + rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); + + rm->rm_col[c].rc_data = zio->io_data; + + for (c = c + 1; c < acols; c++) + rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + + rm->rm_col[c - 1].rc_size; + + /* + * If all data stored spans all columns, there's a danger that parity + * will always be on the same device and, since parity isn't read + * during normal operation, that that device's I/O bandwidth won't be + * used effectively. We therefore switch the parity every 1MB. + * + * ... at least that was, ostensibly, the theory. As a practical + * matter unless we juggle the parity between all devices evenly, we + * won't see any benefit. Further, occasional writes that aren't a + * multiple of the LCM of the number of children and the minimum + * stripe width are sufficient to avoid pessimal behavior. + * Unfortunately, this decision created an implicit on-disk format + * requirement that we need to support for all eternity, but only + * for single-parity RAID-Z. + */ + ASSERT(rm->rm_cols >= 2); + ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); + + if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { + devidx = rm->rm_col[0].rc_devidx; + o = rm->rm_col[0].rc_offset; + rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; + rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; + rm->rm_col[1].rc_devidx = devidx; + rm->rm_col[1].rc_offset = o; + } + + zio->io_vsd = rm; + return (rm); +} + +static void +vdev_raidz_map_free(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + int c; + + for (c = 0; c < rm->rm_firstdatacol; c++) + zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + + kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); + zio->io_vsd = NULL; +} + +static void +vdev_raidz_generate_parity_p(raidz_map_t *rm) +{ + uint64_t *p, *src, pcount, ccount, i; + int c; + + pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccount == pcount); + for (i = 0; i < ccount; i++, p++, src++) { + *p = *src; + } + } else { + ASSERT(ccount <= pcount); + for (i = 0; i < ccount; i++, p++, src++) { + *p ^= *src; + } + } + } +} + +static void +vdev_raidz_generate_parity_pq(raidz_map_t *rm) +{ + uint64_t *q, *p, *src, pcount, ccount, mask, i; + int c; + + pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccount == pcount || ccount == 0); + for (i = 0; i < ccount; i++, p++, q++, src++) { + *q = *src; + *p = *src; + } + for (; i < pcount; i++, p++, q++, src++) { + *q = 0; + *p = 0; + } + } else { + ASSERT(ccount <= pcount); + + /* + * Rather than multiplying each byte individually (as + * described above), we are able to handle 8 at once + * by generating a mask based on the high bit in each + * byte and using that to conditionally XOR in 0x1d. + */ + for (i = 0; i < ccount; i++, p++, q++, src++) { + mask = *q & 0x8080808080808080ULL; + mask = (mask << 1) - (mask >> 7); + *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ + (mask & 0x1d1d1d1d1d1d1d1dULL); + *q ^= *src; + *p ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + */ + for (; i < pcount; i++, q++) { + mask = *q & 0x8080808080808080ULL; + mask = (mask << 1) - (mask >> 7); + *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ + (mask & 0x1d1d1d1d1d1d1d1dULL); + } + } + } +} + +static void +vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) +{ + uint64_t *dst, *src, xcount, ccount, count, i; + int c; + + xcount = rm->rm_col[x].rc_size / sizeof (src[0]); + ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); + ASSERT(xcount > 0); + + src = rm->rm_col[VDEV_RAIDZ_P].rc_data; + dst = rm->rm_col[x].rc_data; + for (i = 0; i < xcount; i++, dst++, src++) { + *dst = *src; + } + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + dst = rm->rm_col[x].rc_data; + + if (c == x) + continue; + + ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + count = MIN(ccount, xcount); + + for (i = 0; i < count; i++, dst++, src++) { + *dst ^= *src; + } + } +} + +static void +vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) +{ + uint64_t *dst, *src, xcount, ccount, count, mask, i; + uint8_t *b; + int c, j, exp; + + xcount = rm->rm_col[x].rc_size / sizeof (src[0]); + ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + dst = rm->rm_col[x].rc_data; + + if (c == x) + ccount = 0; + else + ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + count = MIN(ccount, xcount); + + if (c == rm->rm_firstdatacol) { + for (i = 0; i < count; i++, dst++, src++) { + *dst = *src; + } + for (; i < xcount; i++, dst++) { + *dst = 0; + } + + } else { + /* + * For an explanation of this, see the comment in + * vdev_raidz_generate_parity_pq() above. + */ + for (i = 0; i < count; i++, dst++, src++) { + mask = *dst & 0x8080808080808080ULL; + mask = (mask << 1) - (mask >> 7); + *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ + (mask & 0x1d1d1d1d1d1d1d1dULL); + *dst ^= *src; + } + + for (; i < xcount; i++, dst++) { + mask = *dst & 0x8080808080808080ULL; + mask = (mask << 1) - (mask >> 7); + *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ + (mask & 0x1d1d1d1d1d1d1d1dULL); + } + } + } + + src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + dst = rm->rm_col[x].rc_data; + exp = 255 - (rm->rm_cols - 1 - x); + + for (i = 0; i < xcount; i++, dst++, src++) { + *dst ^= *src; + for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { + *b = vdev_raidz_exp2(*b, exp); + } + } +} + +static void +vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) +{ + uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; + void *pdata, *qdata; + uint64_t xsize, ysize, i; + + ASSERT(x < y); + ASSERT(x >= rm->rm_firstdatacol); + ASSERT(y < rm->rm_cols); + + ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); + + /* + * Move the parity data aside -- we're going to compute parity as + * though columns x and y were full of zeros -- Pxy and Qxy. We want to + * reuse the parity generation mechanism without trashing the actual + * parity so we make those columns appear to be full of zeros by + * setting their lengths to zero. + */ + pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; + qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + xsize = rm->rm_col[x].rc_size; + ysize = rm->rm_col[y].rc_size; + + rm->rm_col[VDEV_RAIDZ_P].rc_data = + zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); + rm->rm_col[VDEV_RAIDZ_Q].rc_data = + zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); + rm->rm_col[x].rc_size = 0; + rm->rm_col[y].rc_size = 0; + + vdev_raidz_generate_parity_pq(rm); + + rm->rm_col[x].rc_size = xsize; + rm->rm_col[y].rc_size = ysize; + + p = pdata; + q = qdata; + pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; + qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + xd = rm->rm_col[x].rc_data; + yd = rm->rm_col[y].rc_data; + + /* + * We now have: + * Pxy = P + D_x + D_y + * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y + * + * We can then solve for D_x: + * D_x = A * (P + Pxy) + B * (Q + Qxy) + * where + * A = 2^(x - y) * (2^(x - y) + 1)^-1 + * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 + * + * With D_x in hand, we can easily solve for D_y: + * D_y = P + Pxy + D_x + */ + + a = vdev_raidz_pow2[255 + x - y]; + b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; + tmp = 255 - vdev_raidz_log2[a ^ 1]; + + aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; + bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; + + for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { + *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ + vdev_raidz_exp2(*q ^ *qxy, bexp); + + if (i < ysize) + *yd = *p ^ *pxy ^ *xd; + } + + zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, + rm->rm_col[VDEV_RAIDZ_P].rc_size); + zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + + /* + * Restore the saved parity data. + */ + rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; + rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; +} + + +static int +vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +{ + vdev_t *cvd; + uint64_t nparity = vd->vdev_nparity; + int c, error; + int lasterror = 0; + int numerrors = 0; + + ASSERT(nparity > 0); + + if (nparity > VDEV_RAIDZ_MAXPARITY || + vd->vdev_children < nparity + 1) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if ((error = vdev_open(cvd)) != 0) { + lasterror = error; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *ashift = MAX(*ashift, cvd->vdev_ashift); + } + + *asize *= vd->vdev_children; + + if (numerrors > nparity) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +static void +vdev_raidz_close(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static uint64_t +vdev_raidz_asize(vdev_t *vd, uint64_t psize) +{ + uint64_t asize; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t cols = vd->vdev_children; + uint64_t nparity = vd->vdev_nparity; + + asize = ((psize - 1) >> ashift) + 1; + asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); + asize = roundup(asize, nparity + 1) << ashift; + + return (asize); +} + +static void +vdev_raidz_child_done(zio_t *zio) +{ + raidz_col_t *rc = zio->io_private; + + rc->rc_error = zio->io_error; + rc->rc_tried = 1; + rc->rc_skipped = 0; +} + +static void +vdev_raidz_repair_done(zio_t *zio) +{ + ASSERT(zio->io_private == zio->io_parent); + vdev_raidz_map_free(zio->io_private); +} + +static void +vdev_raidz_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + vdev_t *cvd; + blkptr_t *bp = zio->io_bp; + raidz_map_t *rm; + raidz_col_t *rc; + int c; + + rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, + vd->vdev_nparity); + + ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); + + if (zio->io_type == ZIO_TYPE_WRITE) { + /* + * Generate RAID parity in the first virtual columns. + */ + if (rm->rm_firstdatacol == 1) + vdev_raidz_generate_parity_p(rm); + else + vdev_raidz_generate_parity_pq(rm); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_data, rc->rc_size, + zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, + vdev_raidz_child_done, rc)); + } + zio_wait_children_done(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last -- any errors along the way will force us to read the parity + * data. + */ + for (c = rm->rm_cols - 1; c >= 0; c--) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + if (vdev_is_dead(cvd)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = ENXIO; + rc->rc_tried = 1; /* don't even try */ + rc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = ESTALE; + rc->rc_skipped = 1; + continue; + } + if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || + (zio->io_flags & ZIO_FLAG_SCRUB)) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_data, rc->rc_size, + zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, + vdev_raidz_child_done, rc)); + } + } + + zio_wait_children_done(zio); +} + +/* + * Report a checksum error for a child of a RAID-Z device. + */ +static void +raidz_checksum_error(zio_t *zio, raidz_col_t *rc) +{ + vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; + dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", + vdev_description(vd)); + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + } + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) + zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, + zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); +} + +/* + * Generate the parity from the data columns. If we tried and were able to + * read the parity without error, verify that the generated parity matches the + * data we read. If it doesn't, we fire off a checksum error. Return the + * number such failures. + */ +static int +raidz_parity_verify(zio_t *zio, raidz_map_t *rm) +{ + void *orig[VDEV_RAIDZ_MAXPARITY]; + int c, ret = 0; + raidz_col_t *rc; + + for (c = 0; c < rm->rm_firstdatacol; c++) { + rc = &rm->rm_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + continue; + orig[c] = zio_buf_alloc(rc->rc_size); + bcopy(rc->rc_data, orig[c], rc->rc_size); + } + + if (rm->rm_firstdatacol == 1) + vdev_raidz_generate_parity_p(rm); + else + vdev_raidz_generate_parity_pq(rm); + + for (c = 0; c < rm->rm_firstdatacol; c++) { + rc = &rm->rm_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + continue; + if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { + raidz_checksum_error(zio, rc); + rc->rc_error = ECKSUM; + ret++; + } + zio_buf_free(orig[c], rc->rc_size); + } + + return (ret); +} + +static uint64_t raidz_corrected_p; +static uint64_t raidz_corrected_q; +static uint64_t raidz_corrected_pq; + +static void +vdev_raidz_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *cvd; + raidz_map_t *rm = zio->io_vsd; + raidz_col_t *rc, *rc1; + int unexpected_errors = 0; + int parity_errors = 0; + int parity_untried = 0; + int data_errors = 0; + int n, c, c1; + + ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ + + zio->io_error = 0; + zio->io_numerrors = 0; + + ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); + ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + + /* + * We preserve any EIOs because those may be worth retrying; + * whereas ECKSUM and ENXIO are more likely to be persistent. + */ + if (rc->rc_error) { + if (zio->io_error != EIO) + zio->io_error = rc->rc_error; + + if (c < rm->rm_firstdatacol) + parity_errors++; + else + data_errors++; + + if (!rc->rc_skipped) + unexpected_errors++; + + zio->io_numerrors++; + } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { + parity_untried++; + } + } + + if (zio->io_type == ZIO_TYPE_WRITE) { + /* + * If this is not a failfast write, and we were able to + * write enough columns to reconstruct the data, good enough. + */ + /* XXPOLICY */ + if (zio->io_numerrors <= rm->rm_firstdatacol && + !(zio->io_flags & ZIO_FLAG_FAILFAST)) + zio->io_error = 0; + + vdev_raidz_map_free(zio); + zio_next_stage(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + /* + * There are three potential phases for a read: + * 1. produce valid data from the columns read + * 2. read all disks and try again + * 3. perform combinatorial reconstruction + * + * Each phase is progressively both more expensive and less likely to + * occur. If we encounter more errors than we can repair or all phases + * fail, we have no choice but to return an error. + */ + + /* + * If the number of errors we saw was correctable -- less than or equal + * to the number of parity disks read -- attempt to produce data that + * has a valid checksum. Naturally, this case applies in the absence of + * any errors. + */ + if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) { + switch (data_errors) { + case 0: + if (zio_checksum_error(zio) == 0) { + zio->io_error = 0; + if (parity_errors + parity_untried < + rm->rm_firstdatacol) { + n = raidz_parity_verify(zio, rm); + unexpected_errors += n; + ASSERT(parity_errors + n <= + rm->rm_firstdatacol); + } + goto done; + } + break; + + case 1: + /* + * We either attempt to read all the parity columns or + * none of them. If we didn't try to read parity, we + * wouldn't be here in the correctable case. There must + * also have been fewer parity errors than parity + * columns or, again, we wouldn't be in this code path. + */ + ASSERT(parity_untried == 0); + ASSERT(parity_errors < rm->rm_firstdatacol); + + /* + * Find the column that reported the error. + */ + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + if (rc->rc_error != 0) + break; + } + ASSERT(c != rm->rm_cols); + ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || + rc->rc_error == ESTALE); + + if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { + vdev_raidz_reconstruct_p(rm, c); + } else { + ASSERT(rm->rm_firstdatacol > 1); + vdev_raidz_reconstruct_q(rm, c); + } + + if (zio_checksum_error(zio) == 0) { + zio->io_error = 0; + if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) + atomic_inc_64(&raidz_corrected_p); + else + atomic_inc_64(&raidz_corrected_q); + + /* + * If there's more than one parity disk that + * was successfully read, confirm that the + * other parity disk produced the correct data. + * This routine is suboptimal in that it + * regenerates both the parity we wish to test + * as well as the parity we just used to + * perform the reconstruction, but this should + * be a relatively uncommon case, and can be + * optimized if it becomes a problem. + */ + if (parity_errors < rm->rm_firstdatacol - 1) { + n = raidz_parity_verify(zio, rm); + unexpected_errors += n; + ASSERT(parity_errors + n <= + rm->rm_firstdatacol); + } + + goto done; + } + break; + + case 2: + /* + * Two data column errors require double parity. + */ + ASSERT(rm->rm_firstdatacol == 2); + + /* + * Find the two columns that reported errors. + */ + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + if (rc->rc_error != 0) + break; + } + ASSERT(c != rm->rm_cols); + ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || + rc->rc_error == ESTALE); + + for (c1 = c++; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + if (rc->rc_error != 0) + break; + } + ASSERT(c != rm->rm_cols); + ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || + rc->rc_error == ESTALE); + + vdev_raidz_reconstruct_pq(rm, c1, c); + + if (zio_checksum_error(zio) == 0) { + zio->io_error = 0; + atomic_inc_64(&raidz_corrected_pq); + + goto done; + } + break; + + default: + ASSERT(rm->rm_firstdatacol <= 2); + ASSERT(0); + } + } + + /* + * This isn't a typical situation -- either we got a read error or + * a child silently returned bad data. Read every block so we can + * try again with as much data and parity as we can track down. If + * we've already been through once before, all children will be marked + * as tried so we'll proceed to combinatorial reconstruction. + */ + unexpected_errors = 1; + rm->rm_missingdata = 0; + rm->rm_missingparity = 0; + + for (c = 0; c < rm->rm_cols; c++) { + if (rm->rm_col[c].rc_tried) + continue; + + zio->io_error = 0; + zio_vdev_io_redone(zio); + do { + rc = &rm->rm_col[c]; + if (rc->rc_tried) + continue; + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], + rc->rc_offset, rc->rc_data, rc->rc_size, + zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, + vdev_raidz_child_done, rc)); + } while (++c < rm->rm_cols); + dprintf("rereading\n"); + zio_wait_children_done(zio); + return; + } + + /* + * At this point we've attempted to reconstruct the data given the + * errors we detected, and we've attempted to read all columns. There + * must, therefore, be one or more additional problems -- silent errors + * resulting in invalid data rather than explicit I/O errors resulting + * in absent data. Before we attempt combinatorial reconstruction make + * sure we have a chance of coming up with the right answer. + */ + if (zio->io_numerrors >= rm->rm_firstdatacol) { + ASSERT(zio->io_error != 0); + goto done; + } + + if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { + /* + * Attempt to reconstruct the data from parity P. + */ + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + void *orig; + rc = &rm->rm_col[c]; + + orig = zio_buf_alloc(rc->rc_size); + bcopy(rc->rc_data, orig, rc->rc_size); + vdev_raidz_reconstruct_p(rm, c); + + if (zio_checksum_error(zio) == 0) { + zio_buf_free(orig, rc->rc_size); + zio->io_error = 0; + atomic_inc_64(&raidz_corrected_p); + + /* + * If this child didn't know that it returned + * bad data, inform it. + */ + if (rc->rc_tried && rc->rc_error == 0) + raidz_checksum_error(zio, rc); + rc->rc_error = ECKSUM; + goto done; + } + + bcopy(orig, rc->rc_data, rc->rc_size); + zio_buf_free(orig, rc->rc_size); + } + } + + if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { + /* + * Attempt to reconstruct the data from parity Q. + */ + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + void *orig; + rc = &rm->rm_col[c]; + + orig = zio_buf_alloc(rc->rc_size); + bcopy(rc->rc_data, orig, rc->rc_size); + vdev_raidz_reconstruct_q(rm, c); + + if (zio_checksum_error(zio) == 0) { + zio_buf_free(orig, rc->rc_size); + zio->io_error = 0; + atomic_inc_64(&raidz_corrected_q); + + /* + * If this child didn't know that it returned + * bad data, inform it. + */ + if (rc->rc_tried && rc->rc_error == 0) + raidz_checksum_error(zio, rc); + rc->rc_error = ECKSUM; + goto done; + } + + bcopy(orig, rc->rc_data, rc->rc_size); + zio_buf_free(orig, rc->rc_size); + } + } + + if (rm->rm_firstdatacol > 1 && + rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && + rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { + /* + * Attempt to reconstruct the data from both P and Q. + */ + for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { + void *orig, *orig1; + rc = &rm->rm_col[c]; + + orig = zio_buf_alloc(rc->rc_size); + bcopy(rc->rc_data, orig, rc->rc_size); + + for (c1 = c + 1; c1 < rm->rm_cols; c1++) { + rc1 = &rm->rm_col[c1]; + + orig1 = zio_buf_alloc(rc1->rc_size); + bcopy(rc1->rc_data, orig1, rc1->rc_size); + + vdev_raidz_reconstruct_pq(rm, c, c1); + + if (zio_checksum_error(zio) == 0) { + zio_buf_free(orig, rc->rc_size); + zio_buf_free(orig1, rc1->rc_size); + zio->io_error = 0; + atomic_inc_64(&raidz_corrected_pq); + + /* + * If these children didn't know they + * returned bad data, inform them. + */ + if (rc->rc_tried && rc->rc_error == 0) + raidz_checksum_error(zio, rc); + if (rc1->rc_tried && rc1->rc_error == 0) + raidz_checksum_error(zio, rc1); + + rc->rc_error = ECKSUM; + rc1->rc_error = ECKSUM; + + goto done; + } + + bcopy(orig1, rc1->rc_data, rc1->rc_size); + zio_buf_free(orig1, rc1->rc_size); + } + + bcopy(orig, rc->rc_data, rc->rc_size); + zio_buf_free(orig, rc->rc_size); + } + } + + /* + * All combinations failed to checksum. Generate checksum ereports for + * all children. + */ + zio->io_error = ECKSUM; + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, + zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, + rc->rc_offset, rc->rc_size); + } + } + +done: + zio_checksum_verified(zio); + + if (zio->io_error == 0 && (spa_mode & FWRITE) && + (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { + zio_t *rio; + + /* + * Use the good data we have in hand to repair damaged children. + * + * We issue all repair I/Os as children of 'rio' to arrange + * that vdev_raidz_map_free(zio) will be invoked after all + * repairs complete, but before we advance to the next stage. + */ + rio = zio_null(zio, zio->io_spa, + vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error == 0) + continue; + + dprintf("%s resilvered %s @ 0x%llx error %d\n", + vdev_description(vd), + vdev_description(cvd), + zio->io_offset, rc->rc_error); + + zio_nowait(zio_vdev_child_io(rio, NULL, cvd, + rc->rc_offset, rc->rc_data, rc->rc_size, + ZIO_TYPE_WRITE, zio->io_priority, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_CANFAIL, NULL, NULL)); + } + + zio_nowait(rio); + zio_wait_children_done(zio); + return; + } + + vdev_raidz_map_free(zio); + zio_next_stage(zio); +} + +static void +vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (faulted > vd->vdev_nparity) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +vdev_ops_t vdev_raidz_ops = { + vdev_raidz_open, + vdev_raidz_close, + vdev_raidz_asize, + vdev_raidz_io_start, + vdev_raidz_io_done, + vdev_raidz_state_change, + VDEV_TYPE_RAIDZ, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c new file mode 100644 index 0000000..0e8752c --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c @@ -0,0 +1,118 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> + +/* + * Virtual device vector for the pool's root vdev. + */ + +/* + * We should be able to tolerate one failure with absolutely no damage + * to our metadata. Two failures will take out space maps, a bunch of + * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy + * place to live. When we get smarter, we can liberalize this policy. + * e.g. If we haven't lost two consecutive top-level vdevs, then we are + * probably fine. Adding bean counters during alloc/free can make this + * future guesswork more accurate. + */ +/*ARGSUSED*/ +static int +too_many_errors(vdev_t *vd, int numerrors) +{ + return (numerrors > 0); +} + +static int +vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +{ + vdev_t *cvd; + int c, error; + int lasterror = 0; + int numerrors = 0; + + if (vd->vdev_children == 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if ((error = vdev_open(cvd)) != 0) { + lasterror = error; + numerrors++; + continue; + } + } + + if (too_many_errors(vd, numerrors)) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + *asize = 0; + *ashift = 0; + + return (0); +} + +static void +vdev_root_close(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static void +vdev_root_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (too_many_errors(vd, faulted)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +vdev_ops_t vdev_root_ops = { + vdev_root_open, + vdev_root_close, + vdev_default_asize, + NULL, /* io_start - not applicable to the root */ + NULL, /* io_done - not applicable to the root */ + vdev_root_state_change, + VDEV_TYPE_ROOT, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c new file mode 100644 index 0000000..533431f --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -0,0 +1,1070 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + + +/* + * This file contains the top half of the zfs directory structure + * implementation. The bottom half is in zap_leaf.c. + * + * The zdir is an extendable hash data structure. There is a table of + * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are + * each a constant size and hold a variable number of directory entries. + * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. + * + * The pointer table holds a power of 2 number of pointers. + * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to + * by the pointer at index i in the table holds entries whose hash value + * has a zd_prefix_len - bit prefix + */ + +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/zfs_context.h> +#include <sys/zap.h> +#include <sys/refcount.h> +#include <sys/zap_impl.h> +#include <sys/zap_leaf.h> + +int fzap_default_block_shift = 14; /* 16k blocksize */ + +static void zap_leaf_pageout(dmu_buf_t *db, void *vl); +static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); + + +void +fzap_byteswap(void *vbuf, size_t size) +{ + uint64_t block_type; + + block_type = *(uint64_t *)vbuf; + + if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) + zap_leaf_byteswap(vbuf, size); + else { + /* it's a ptrtbl block */ + byteswap_uint64_array(vbuf, size); + } +} + +void +fzap_upgrade(zap_t *zap, dmu_tx_t *tx) +{ + dmu_buf_t *db; + zap_leaf_t *l; + int i; + zap_phys_t *zp; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + zap->zap_ismicro = FALSE; + + (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, + &zap->zap_f.zap_phys, zap_evict); + + mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, MUTEX_DEFAULT, 0); + zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1; + + zp = zap->zap_f.zap_phys; + /* + * explicitly zero it since it might be coming from an + * initialized microzap + */ + bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); + zp->zap_block_type = ZBT_HEADER; + zp->zap_magic = ZAP_MAGIC; + + zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); + + zp->zap_freeblk = 2; /* block 1 will be the first leaf */ + zp->zap_num_leafs = 1; + zp->zap_num_entries = 0; + zp->zap_salt = zap->zap_salt; + + /* block 1 will be the first leaf */ + for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++) + ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; + + /* + * set up block 1 - the first leaf + */ + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db)); + dmu_buf_will_dirty(db, tx); + + l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + l->l_dbuf = db; + l->l_phys = db->db_data; + + zap_leaf_init(l); + + kmem_free(l, sizeof (zap_leaf_t)); + dmu_buf_rele(db, FTAG); +} + +static int +zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) +{ + if (RW_WRITE_HELD(&zap->zap_rwlock)) + return (1); + if (rw_tryupgrade(&zap->zap_rwlock)) { + dmu_buf_will_dirty(zap->zap_dbuf, tx); + return (1); + } + return (0); +} + +/* + * Generic routines for dealing with the pointer & cookie tables. + */ + +static int +zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, + void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), + dmu_tx_t *tx) +{ + uint64_t b, newblk; + dmu_buf_t *db_old, *db_new; + int err; + int bs = FZAP_BLOCK_SHIFT(zap); + int hepb = 1<<(bs-4); + /* hepb = half the number of entries in a block */ + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT(tbl->zt_blk != 0); + ASSERT(tbl->zt_numblks > 0); + + if (tbl->zt_nextblk != 0) { + newblk = tbl->zt_nextblk; + } else { + newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); + tbl->zt_nextblk = newblk; + ASSERT3U(tbl->zt_blks_copied, ==, 0); + dmu_prefetch(zap->zap_objset, zap->zap_object, + tbl->zt_blk << bs, tbl->zt_numblks << bs); + } + + /* + * Copy the ptrtbl from the old to new location. + */ + + b = tbl->zt_blks_copied; + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + b) << bs, FTAG, &db_old); + if (err) + return (err); + + /* first half of entries in old[b] go to new[2*b+0] */ + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + (newblk + 2*b+0) << bs, FTAG, &db_new)); + dmu_buf_will_dirty(db_new, tx); + transfer_func(db_old->db_data, db_new->db_data, hepb); + dmu_buf_rele(db_new, FTAG); + + /* second half of entries in old[b] go to new[2*b+1] */ + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + (newblk + 2*b+1) << bs, FTAG, &db_new)); + dmu_buf_will_dirty(db_new, tx); + transfer_func((uint64_t *)db_old->db_data + hepb, + db_new->db_data, hepb); + dmu_buf_rele(db_new, FTAG); + + dmu_buf_rele(db_old, FTAG); + + tbl->zt_blks_copied++; + + dprintf("copied block %llu of %llu\n", + tbl->zt_blks_copied, tbl->zt_numblks); + + if (tbl->zt_blks_copied == tbl->zt_numblks) { + (void) dmu_free_range(zap->zap_objset, zap->zap_object, + tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); + + tbl->zt_blk = newblk; + tbl->zt_numblks *= 2; + tbl->zt_shift++; + tbl->zt_nextblk = 0; + tbl->zt_blks_copied = 0; + + dprintf("finished; numblocks now %llu (%lluk entries)\n", + tbl->zt_numblks, 1<<(tbl->zt_shift-10)); + } + + return (0); +} + +static int +zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, + dmu_tx_t *tx) +{ + int err; + uint64_t blk, off; + int bs = FZAP_BLOCK_SHIFT(zap); + dmu_buf_t *db; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT(tbl->zt_blk != 0); + + dprintf("storing %llx at index %llx\n", val, idx); + + blk = idx >> (bs-3); + off = idx & ((1<<(bs-3))-1); + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + blk) << bs, FTAG, &db); + if (err) + return (err); + dmu_buf_will_dirty(db, tx); + + if (tbl->zt_nextblk != 0) { + uint64_t idx2 = idx * 2; + uint64_t blk2 = idx2 >> (bs-3); + uint64_t off2 = idx2 & ((1<<(bs-3))-1); + dmu_buf_t *db2; + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_nextblk + blk2) << bs, FTAG, &db2); + if (err) { + dmu_buf_rele(db, FTAG); + return (err); + } + dmu_buf_will_dirty(db2, tx); + ((uint64_t *)db2->db_data)[off2] = val; + ((uint64_t *)db2->db_data)[off2+1] = val; + dmu_buf_rele(db2, FTAG); + } + + ((uint64_t *)db->db_data)[off] = val; + dmu_buf_rele(db, FTAG); + + return (0); +} + +static int +zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) +{ + uint64_t blk, off; + int err; + dmu_buf_t *db; + int bs = FZAP_BLOCK_SHIFT(zap); + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + blk = idx >> (bs-3); + off = idx & ((1<<(bs-3))-1); + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + blk) << bs, FTAG, &db); + if (err) + return (err); + *valp = ((uint64_t *)db->db_data)[off]; + dmu_buf_rele(db, FTAG); + + if (tbl->zt_nextblk != 0) { + /* + * read the nextblk for the sake of i/o error checking, + * so that zap_table_load() will catch errors for + * zap_table_store. + */ + blk = (idx*2) >> (bs-3); + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_nextblk + blk) << bs, FTAG, &db); + dmu_buf_rele(db, FTAG); + } + return (err); +} + +/* + * Routines for growing the ptrtbl. + */ + +static void +zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) +{ + int i; + for (i = 0; i < n; i++) { + uint64_t lb = src[i]; + dst[2*i+0] = lb; + dst[2*i+1] = lb; + } +} + +static int +zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) +{ + /* In case things go horribly wrong. */ + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2) + return (ENOSPC); + + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + /* + * We are outgrowing the "embedded" ptrtbl (the one + * stored in the header block). Give it its own entire + * block, which will double the size of the ptrtbl. + */ + uint64_t newblk; + dmu_buf_t *db_new; + int err; + + ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, + ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0); + + newblk = zap_allocate_blocks(zap, 1); + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new); + if (err) + return (err); + dmu_buf_will_dirty(db_new, tx); + zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), + db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + dmu_buf_rele(db_new, FTAG); + + zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk; + zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1; + zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++; + + ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, + zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << + (FZAP_BLOCK_SHIFT(zap)-3)); + + return (0); + } else { + return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + zap_ptrtbl_transfer, tx)); + } +} + +static void +zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) +{ + dmu_buf_will_dirty(zap->zap_dbuf, tx); + mutex_enter(&zap->zap_f.zap_num_entries_mtx); + ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta); + zap->zap_f.zap_phys->zap_num_entries += delta; + mutex_exit(&zap->zap_f.zap_num_entries_mtx); +} + +static uint64_t +zap_allocate_blocks(zap_t *zap, int nblocks) +{ + uint64_t newblk; + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + newblk = zap->zap_f.zap_phys->zap_freeblk; + zap->zap_f.zap_phys->zap_freeblk += nblocks; + return (newblk); +} + +static zap_leaf_t * +zap_create_leaf(zap_t *zap, dmu_tx_t *tx) +{ + void *winner; + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0); + rw_enter(&l->l_rwlock, RW_WRITER); + l->l_blkid = zap_allocate_blocks(zap, 1); + l->l_dbuf = NULL; + l->l_phys = NULL; + + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf)); + winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout); + ASSERT(winner == NULL); + dmu_buf_will_dirty(l->l_dbuf, tx); + + zap_leaf_init(l); + + zap->zap_f.zap_phys->zap_num_leafs++; + + return (l); +} + +int +fzap_count(zap_t *zap, uint64_t *count) +{ + ASSERT(!zap->zap_ismicro); + mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ + *count = zap->zap_f.zap_phys->zap_num_entries; + mutex_exit(&zap->zap_f.zap_num_entries_mtx); + return (0); +} + +/* + * Routines for obtaining zap_leaf_t's + */ + +void +zap_put_leaf(zap_leaf_t *l) +{ + rw_exit(&l->l_rwlock); + dmu_buf_rele(l->l_dbuf, NULL); +} + +_NOTE(ARGSUSED(0)) +static void +zap_leaf_pageout(dmu_buf_t *db, void *vl) +{ + zap_leaf_t *l = vl; + + rw_destroy(&l->l_rwlock); + kmem_free(l, sizeof (zap_leaf_t)); +} + +static zap_leaf_t * +zap_open_leaf(uint64_t blkid, dmu_buf_t *db) +{ + zap_leaf_t *l, *winner; + + ASSERT(blkid != 0); + + l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0); + rw_enter(&l->l_rwlock, RW_WRITER); + l->l_blkid = blkid; + l->l_bs = highbit(db->db_size)-1; + l->l_dbuf = db; + l->l_phys = NULL; + + winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout); + + rw_exit(&l->l_rwlock); + if (winner != NULL) { + /* someone else set it first */ + zap_leaf_pageout(NULL, l); + l = winner; + } + + /* + * lhr_pad was previously used for the next leaf in the leaf + * chain. There should be no chained leafs (as we have removed + * support for them). + */ + ASSERT3U(l->l_phys->l_hdr.lh_pad1, ==, 0); + + /* + * There should be more hash entries than there can be + * chunks to put in the hash table + */ + ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); + + /* The chunks should begin at the end of the hash table */ + ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, + &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); + + /* The chunks should end at the end of the block */ + ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - + (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size); + + return (l); +} + +static int +zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, + zap_leaf_t **lp) +{ + dmu_buf_t *db; + zap_leaf_t *l; + int bs = FZAP_BLOCK_SHIFT(zap); + int err; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + blkid << bs, NULL, &db); + if (err) + return (err); + + ASSERT3U(db->db_object, ==, zap->zap_object); + ASSERT3U(db->db_offset, ==, blkid << bs); + ASSERT3U(db->db_size, ==, 1 << bs); + ASSERT(blkid != 0); + + l = dmu_buf_get_user(db); + + if (l == NULL) + l = zap_open_leaf(blkid, db); + + rw_enter(&l->l_rwlock, lt); + /* + * Must lock before dirtying, otherwise l->l_phys could change, + * causing ASSERT below to fail. + */ + if (lt == RW_WRITER) + dmu_buf_will_dirty(db, tx); + ASSERT3U(l->l_blkid, ==, blkid); + ASSERT3P(l->l_dbuf, ==, db); + ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data); + ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF); + ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + + *lp = l; + return (0); +} + +static int +zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) +{ + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + ASSERT3U(idx, <, + (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift)); + *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); + return (0); + } else { + return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + idx, valp)); + } +} + +static int +zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) +{ + ASSERT(tx != NULL); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) { + ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; + return (0); + } else { + return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + idx, blk, tx)); + } +} + +static int +zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) +{ + uint64_t idx, blk; + int err; + + ASSERT(zap->zap_dbuf == NULL || + zap->zap_f.zap_phys == zap->zap_dbuf->db_data); + ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC); + idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + err = zap_idx_to_blk(zap, idx, &blk); + if (err != 0) + return (err); + err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); + + ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) == + (*lp)->l_phys->l_hdr.lh_prefix); + return (err); +} + +static int +zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx, + zap_leaf_t **lp) +{ + zap_leaf_t *nl; + int prefix_diff, i, err; + uint64_t sibling; + int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len; + + ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, + l->l_phys->l_hdr.lh_prefix); + + if (zap_tryupgradedir(zap, tx) == 0 || + old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { + /* We failed to upgrade, or need to grow the pointer table */ + objset_t *os = zap->zap_objset; + uint64_t object = zap->zap_object; + + zap_put_leaf(l); + zap_unlockdir(zap); + err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap); + if (err) + return (err); + ASSERT(!zap->zap_ismicro); + + while (old_prefix_len == + zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { + err = zap_grow_ptrtbl(zap, tx); + if (err) + return (err); + } + + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err) + return (err); + + if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) { + /* it split while our locks were down */ + *lp = l; + return (0); + } + } + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, + l->l_phys->l_hdr.lh_prefix); + + prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - + (old_prefix_len + 1); + sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; + + /* check for i/o errors before doing zap_leaf_split */ + for (i = 0; i < (1ULL<<prefix_diff); i++) { + uint64_t blk; + err = zap_idx_to_blk(zap, sibling+i, &blk); + if (err) + return (err); + ASSERT3U(blk, ==, l->l_blkid); + } + + nl = zap_create_leaf(zap, tx); + zap_leaf_split(l, nl); + + /* set sibling pointers */ + for (i = 0; i < (1ULL<<prefix_diff); i++) { + err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx); + ASSERT3U(err, ==, 0); /* we checked for i/o errors above */ + } + + if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) { + /* we want the sibling */ + zap_put_leaf(l); + *lp = nl; + } else { + zap_put_leaf(nl); + *lp = l; + } + + return (0); +} + +static void +zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx) +{ + int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; + int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift && + l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); + + zap_put_leaf(l); + + if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) { + int err; + + /* + * We are in the middle of growing the pointer table, or + * this leaf will soon make us grow it. + */ + if (zap_tryupgradedir(zap, tx) == 0) { + objset_t *os = zap->zap_objset; + uint64_t zapobj = zap->zap_object; + + zap_unlockdir(zap); + err = zap_lockdir(os, zapobj, tx, + RW_WRITER, FALSE, &zap); + if (err) + return; + } + + /* could have finished growing while our locks were down */ + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift) + (void) zap_grow_ptrtbl(zap, tx); + } +} + + +static int +fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers) +{ + if (name && strlen(name) > ZAP_MAXNAMELEN) + return (E2BIG); + + /* Only integer sizes supported by C */ + switch (integer_size) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return (EINVAL); + } + + if (integer_size * num_integers > ZAP_MAXVALUELEN) + return (E2BIG); + + return (0); +} + +/* + * Routines for maniplulating attributes. + */ +int +fzap_lookup(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) +{ + zap_leaf_t *l; + int err; + uint64_t hash; + zap_entry_handle_t zeh; + + err = fzap_checksize(name, integer_size, num_integers); + if (err != 0) + return (err); + + hash = zap_hash(zap, name); + err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, name, hash, &zeh); + if (err == 0) + err = zap_entry_read(&zeh, integer_size, num_integers, buf); + + zap_put_leaf(l); + return (err); +} + +int +fzap_add_cd(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, dmu_tx_t *tx) +{ + zap_leaf_t *l; + uint64_t hash; + int err; + zap_entry_handle_t zeh; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT(!zap->zap_ismicro); + ASSERT(fzap_checksize(name, integer_size, num_integers) == 0); + + hash = zap_hash(zap, name); + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); +retry: + err = zap_leaf_lookup(l, name, hash, &zeh); + if (err == 0) { + err = EEXIST; + goto out; + } + if (err != ENOENT) + goto out; + + err = zap_entry_create(l, name, hash, cd, + integer_size, num_integers, val, &zeh); + + if (err == 0) { + zap_increment_num_entries(zap, 1, tx); + } else if (err == EAGAIN) { + err = zap_expand_leaf(zap, l, hash, tx, &l); + if (err == 0) + goto retry; + } + +out: + zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); + return (err); +} + +int +fzap_add(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + int err = fzap_checksize(name, integer_size, num_integers); + if (err != 0) + return (err); + + return (fzap_add_cd(zap, name, integer_size, num_integers, + val, ZAP_MAXCD, tx)); +} + +int +fzap_update(zap_t *zap, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_leaf_t *l; + uint64_t hash; + int err, create; + zap_entry_handle_t zeh; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + err = fzap_checksize(name, integer_size, num_integers); + if (err != 0) + return (err); + + hash = zap_hash(zap, name); + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); +retry: + err = zap_leaf_lookup(l, name, hash, &zeh); + create = (err == ENOENT); + ASSERT(err == 0 || err == ENOENT); + + /* XXX If this leaf is chained, split it if we can. */ + + if (create) { + err = zap_entry_create(l, name, hash, ZAP_MAXCD, + integer_size, num_integers, val, &zeh); + if (err == 0) + zap_increment_num_entries(zap, 1, tx); + } else { + err = zap_entry_update(&zeh, integer_size, num_integers, val); + } + + if (err == EAGAIN) { + err = zap_expand_leaf(zap, l, hash, tx, &l); + if (err == 0) + goto retry; + } + + zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); + return (err); +} + +int +fzap_length(zap_t *zap, const char *name, + uint64_t *integer_size, uint64_t *num_integers) +{ + zap_leaf_t *l; + int err; + uint64_t hash; + zap_entry_handle_t zeh; + + hash = zap_hash(zap, name); + err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, name, hash, &zeh); + if (err != 0) + goto out; + + if (integer_size) + *integer_size = zeh.zeh_integer_size; + if (num_integers) + *num_integers = zeh.zeh_num_integers; +out: + zap_put_leaf(l); + return (err); +} + +int +fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx) +{ + zap_leaf_t *l; + uint64_t hash; + int err; + zap_entry_handle_t zeh; + + hash = zap_hash(zap, name); + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, name, hash, &zeh); + if (err == 0) { + zap_entry_remove(&zeh); + zap_increment_num_entries(zap, -1, tx); + } + zap_put_leaf(l); + dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n", + zap->zap_objset, zap->zap_object, name, err); + return (err); +} + +int +zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name) +{ + zap_cursor_t zc; + zap_attribute_t *za; + int err; + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, os, zapobj); + (err = zap_cursor_retrieve(&zc, za)) == 0; + zap_cursor_advance(&zc)) { + if (za->za_first_integer == value) { + (void) strcpy(name, za->za_name); + break; + } + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (zap_attribute_t)); + return (err); +} + + +/* + * Routines for iterating over the attributes. + */ + +int +fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) +{ + int err = ENOENT; + zap_entry_handle_t zeh; + zap_leaf_t *l; + + /* retrieve the next entry at or after zc_hash/zc_cd */ + /* if no entry, return ENOENT */ + + if (zc->zc_leaf && + (ZAP_HASH_IDX(zc->zc_hash, + zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) != + zc->zc_leaf->l_phys->l_hdr.lh_prefix)) { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } + +again: + if (zc->zc_leaf == NULL) { + err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, + &zc->zc_leaf); + if (err != 0) + return (err); + } else { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + } + l = zc->zc_leaf; + + err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); + + if (err == ENOENT) { + uint64_t nocare = + (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1; + zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; + zc->zc_cd = 0; + if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) { + zc->zc_hash = -1ULL; + } else { + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + goto again; + } + } + + if (err == 0) { + zc->zc_hash = zeh.zeh_hash; + zc->zc_cd = zeh.zeh_cd; + za->za_integer_length = zeh.zeh_integer_size; + za->za_num_integers = zeh.zeh_num_integers; + if (zeh.zeh_num_integers == 0) { + za->za_first_integer = 0; + } else { + err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); + ASSERT(err == 0 || err == EOVERFLOW); + } + err = zap_entry_read_name(&zeh, + sizeof (za->za_name), za->za_name); + ASSERT(err == 0); + } + rw_exit(&zc->zc_leaf->l_rwlock); + return (err); +} + + +static void +zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) +{ + int i, err; + uint64_t lastblk = 0; + + /* + * NB: if a leaf has more pointers than an entire ptrtbl block + * can hold, then it'll be accounted for more than once, since + * we won't have lastblk. + */ + for (i = 0; i < len; i++) { + zap_leaf_t *l; + + if (tbl[i] == lastblk) + continue; + lastblk = tbl[i]; + + err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); + if (err == 0) { + zap_leaf_stats(zap, l, zs); + zap_put_leaf(l); + } + } +} + +void +fzap_get_stats(zap_t *zap, zap_stats_t *zs) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + zs->zs_blocksize = 1ULL << bs; + + /* + * Set zap_phys_t fields + */ + zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs; + zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries; + zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk; + zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type; + zs->zs_magic = zap->zap_f.zap_phys->zap_magic; + zs->zs_salt = zap->zap_f.zap_phys->zap_salt; + + /* + * Set zap_ptrtbl fields + */ + zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; + zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk; + zs->zs_ptrtbl_blks_copied = + zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied; + zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk; + zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; + zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; + + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + /* the ptrtbl is entirely in the header block. */ + zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), + 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); + } else { + int b; + + dmu_prefetch(zap->zap_objset, zap->zap_object, + zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs, + zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs); + + for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; + b++) { + dmu_buf_t *db; + int err; + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs, + FTAG, &db); + if (err == 0) { + zap_stats_ptrtbl(zap, db->db_data, + 1<<(bs-3), zs); + dmu_buf_rele(db, FTAG); + } + } + } +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c new file mode 100644 index 0000000..5dff514 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c @@ -0,0 +1,741 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * The 512-byte leaf is broken into 32 16-byte chunks. + * chunk number n means l_chunk[n], even though the header precedes it. + * the names are stored null-terminated. + */ + +#include <sys/zfs_context.h> +#include <sys/zap.h> +#include <sys/zap_impl.h> +#include <sys/zap_leaf.h> +#include <sys/spa.h> +#include <sys/dmu.h> + +#define CHAIN_END 0xffff /* end of the chunk chain */ + +/* half the (current) minimum block size */ +#define MAX_ARRAY_BYTES (8<<10) + +#define LEAF_HASH(l, h) \ + ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ + ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len))) + +#define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)]) + + +static void +zap_memset(void *a, int c, size_t n) +{ + char *cp = a; + char *cpend = cp + n; + + while (cp < cpend) + *cp++ = c; +} + +static void +stv(int len, void *addr, uint64_t value) +{ + switch (len) { + case 1: + *(uint8_t *)addr = value; + return; + case 2: + *(uint16_t *)addr = value; + return; + case 4: + *(uint32_t *)addr = value; + return; + case 8: + *(uint64_t *)addr = value; + return; + } + ASSERT(!"bad int len"); +} + +static uint64_t +ldv(int len, const void *addr) +{ + switch (len) { + case 1: + return (*(uint8_t *)addr); + case 2: + return (*(uint16_t *)addr); + case 4: + return (*(uint32_t *)addr); + case 8: + return (*(uint64_t *)addr); + } + ASSERT(!"bad int len"); + return (0xFEEDFACEDEADBEEFULL); +} + +void +zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) +{ + int i; + zap_leaf_t l; + l.l_bs = highbit(size)-1; + l.l_phys = buf; + + buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type); + buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix); + buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic); + buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree); + buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries); + buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); + buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); + + for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) + buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); + + for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { + zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); + struct zap_leaf_entry *le; + + switch (lc->l_free.lf_type) { + case ZAP_CHUNK_ENTRY: + le = &lc->l_entry; + + le->le_type = BSWAP_8(le->le_type); + le->le_int_size = BSWAP_8(le->le_int_size); + le->le_next = BSWAP_16(le->le_next); + le->le_name_chunk = BSWAP_16(le->le_name_chunk); + le->le_name_length = BSWAP_16(le->le_name_length); + le->le_value_chunk = BSWAP_16(le->le_value_chunk); + le->le_value_length = BSWAP_16(le->le_value_length); + le->le_cd = BSWAP_32(le->le_cd); + le->le_hash = BSWAP_64(le->le_hash); + break; + case ZAP_CHUNK_FREE: + lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type); + lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next); + break; + case ZAP_CHUNK_ARRAY: + lc->l_array.la_type = BSWAP_8(lc->l_array.la_type); + lc->l_array.la_next = BSWAP_16(lc->l_array.la_next); + /* la_array doesn't need swapping */ + break; + default: + ASSERT(!"bad leaf type"); + } + } +} + +void +zap_leaf_init(zap_leaf_t *l) +{ + int i; + + l->l_bs = highbit(l->l_dbuf->db_size)-1; + zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header)); + zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); + for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; + ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; + } + ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END; + l->l_phys->l_hdr.lh_block_type = ZBT_LEAF; + l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC; + l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); +} + +/* + * Routines which manipulate leaf chunks (l_chunk[]). + */ + +static uint16_t +zap_leaf_chunk_alloc(zap_leaf_t *l) +{ + int chunk; + + ASSERT(l->l_phys->l_hdr.lh_nfree > 0); + + chunk = l->l_phys->l_hdr.lh_freelist; + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); + + l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; + + l->l_phys->l_hdr.lh_nfree--; + + return (chunk); +} + +static void +zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) +{ + struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free; + ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT(zlf->lf_type != ZAP_CHUNK_FREE); + + zlf->lf_type = ZAP_CHUNK_FREE; + zlf->lf_next = l->l_phys->l_hdr.lh_freelist; + bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */ + l->l_phys->l_hdr.lh_freelist = chunk; + + l->l_phys->l_hdr.lh_nfree++; +} + +/* + * Routines which manipulate leaf arrays (zap_leaf_array type chunks). + */ + +static uint16_t +zap_leaf_array_create(zap_leaf_t *l, const char *buf, + int integer_size, int num_integers) +{ + uint16_t chunk_head; + uint16_t *chunkp = &chunk_head; + int byten = 0; + uint64_t value; + int shift = (integer_size-1)*8; + int len = num_integers; + + ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES); + + while (len > 0) { + uint16_t chunk = zap_leaf_chunk_alloc(l); + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + int i; + + la->la_type = ZAP_CHUNK_ARRAY; + for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { + if (byten == 0) + value = ldv(integer_size, buf); + la->la_array[i] = value >> shift; + value <<= 8; + if (++byten == integer_size) { + byten = 0; + buf += integer_size; + if (--len == 0) + break; + } + } + + *chunkp = chunk; + chunkp = &la->la_next; + } + *chunkp = CHAIN_END; + + return (chunk_head); +} + +static void +zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) +{ + uint16_t chunk = *chunkp; + + *chunkp = CHAIN_END; + + while (chunk != CHAIN_END) { + int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; + ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, + ZAP_CHUNK_ARRAY); + zap_leaf_chunk_free(l, chunk); + chunk = nextchunk; + } +} + +/* array_len and buf_len are in integers, not bytes */ +static void +zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, + int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, + char *buf) +{ + int len = MIN(array_len, buf_len); + int byten = 0; + uint64_t value = 0; + + ASSERT3U(array_int_len, <=, buf_int_len); + + /* Fast path for one 8-byte integer */ + if (array_int_len == 8 && buf_int_len == 8 && len == 1) { + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + uint8_t *ip = la->la_array; + uint64_t *buf64 = (uint64_t *)buf; + + *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | + (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | + (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 | + (uint64_t)ip[6] << 8 | (uint64_t)ip[7]; + return; + } + + /* Fast path for an array of 1-byte integers (eg. the entry name) */ + if (array_int_len == 1 && buf_int_len == 1 && + buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) { + while (chunk != CHAIN_END) { + struct zap_leaf_array *la = + &ZAP_LEAF_CHUNK(l, chunk).l_array; + bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES); + buf += ZAP_LEAF_ARRAY_BYTES; + chunk = la->la_next; + } + return; + } + + while (len > 0) { + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + int i; + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { + value = (value << 8) | la->la_array[i]; + byten++; + if (byten == array_int_len) { + stv(buf_int_len, buf, value); + byten = 0; + len--; + if (len == 0) + return; + buf += buf_int_len; + } + } + chunk = la->la_next; + } +} + +/* + * Only to be used on 8-bit arrays. + * array_len is actual len in bytes (not encoded le_value_length). + * buf is null-terminated. + */ +static int +zap_leaf_array_equal(zap_leaf_t *l, int chunk, + int array_len, const char *buf) +{ + int bseen = 0; + + while (bseen < array_len) { + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES); + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + if (bcmp(la->la_array, buf + bseen, toread)) + break; + chunk = la->la_next; + bseen += toread; + } + return (bseen == array_len); +} + +/* + * Routines which manipulate leaf entries. + */ + +int +zap_leaf_lookup(zap_leaf_t *l, + const char *name, uint64_t h, zap_entry_handle_t *zeh) +{ + uint16_t *chunkp; + struct zap_leaf_entry *le; + + ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + + for (chunkp = LEAF_HASH_ENTPTR(l, h); + *chunkp != CHAIN_END; chunkp = &le->le_next) { + uint16_t chunk = *chunkp; + le = ZAP_LEAF_ENTRY(l, chunk); + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + if (le->le_hash != h) + continue; + + if (zap_leaf_array_equal(l, le->le_name_chunk, + le->le_name_length, name)) { + zeh->zeh_num_integers = le->le_value_length; + zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_chunkp = chunkp; + zeh->zeh_leaf = l; + return (0); + } + } + + return (ENOENT); +} + +/* Return (h1,cd1 >= h2,cd2) */ +#define HCD_GTEQ(h1, cd1, h2, cd2) \ + ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE)) + +int +zap_leaf_lookup_closest(zap_leaf_t *l, + uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) +{ + uint16_t chunk; + uint64_t besth = -1ULL; + uint32_t bestcd = ZAP_MAXCD; + uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; + uint16_t lh; + struct zap_leaf_entry *le; + + ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + + for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { + for (chunk = l->l_phys->l_hash[lh]; + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(l, chunk); + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) && + HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) { + ASSERT3U(bestlh, >=, lh); + bestlh = lh; + besth = le->le_hash; + bestcd = le->le_cd; + + zeh->zeh_num_integers = le->le_value_length; + zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_fakechunk = chunk; + zeh->zeh_chunkp = &zeh->zeh_fakechunk; + zeh->zeh_leaf = l; + } + } + } + + return (bestcd == ZAP_MAXCD ? ENOENT : 0); +} + +int +zap_entry_read(const zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, void *buf) +{ + struct zap_leaf_entry *le = + ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + if (le->le_int_size > integer_size) + return (EINVAL); + + zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_int_size, + le->le_value_length, integer_size, num_integers, buf); + + if (zeh->zeh_num_integers > num_integers) + return (EOVERFLOW); + return (0); + +} + +int +zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf) +{ + struct zap_leaf_entry *le = + ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, + le->le_name_length, 1, buflen, buf); + if (le->le_name_length > buflen) + return (EOVERFLOW); + return (0); +} + +int +zap_entry_update(zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, const void *buf) +{ + int delta_chunks; + zap_leaf_t *l = zeh->zeh_leaf; + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); + + delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size); + + if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks) + return (EAGAIN); + + /* + * We should search other chained leaves (via + * zap_entry_remove,create?) otherwise returning EAGAIN will + * just send us into an infinite loop if we have to chain + * another leaf block, rather than being able to split this + * block. + */ + + zap_leaf_array_free(l, &le->le_value_chunk); + le->le_value_chunk = + zap_leaf_array_create(l, buf, integer_size, num_integers); + le->le_value_length = num_integers; + le->le_int_size = integer_size; + return (0); +} + +void +zap_entry_remove(zap_entry_handle_t *zeh) +{ + uint16_t entry_chunk; + struct zap_leaf_entry *le; + zap_leaf_t *l = zeh->zeh_leaf; + + ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); + + entry_chunk = *zeh->zeh_chunkp; + le = ZAP_LEAF_ENTRY(l, entry_chunk); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + zap_leaf_array_free(l, &le->le_name_chunk); + zap_leaf_array_free(l, &le->le_value_chunk); + + *zeh->zeh_chunkp = le->le_next; + zap_leaf_chunk_free(l, entry_chunk); + + l->l_phys->l_hdr.lh_nentries--; +} + +int +zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, + uint8_t integer_size, uint64_t num_integers, const void *buf, + zap_entry_handle_t *zeh) +{ + uint16_t chunk; + uint16_t *chunkp; + struct zap_leaf_entry *le; + uint64_t namelen, valuelen; + int numchunks; + + valuelen = integer_size * num_integers; + namelen = strlen(name) + 1; + ASSERT(namelen >= 2); + + numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) + + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); + if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) + return (E2BIG); + + if (cd == ZAP_MAXCD) { + for (cd = 0; cd < ZAP_MAXCD; cd++) { + for (chunk = *LEAF_HASH_ENTPTR(l, h); + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(l, chunk); + if (le->le_hash == h && + le->le_cd == cd) { + break; + } + } + /* If this cd is not in use, we are good. */ + if (chunk == CHAIN_END) + break; + } + /* If we tried all the cd's, we lose. */ + if (cd == ZAP_MAXCD) + return (ENOSPC); + } + + if (l->l_phys->l_hdr.lh_nfree < numchunks) + return (EAGAIN); + + /* make the entry */ + chunk = zap_leaf_chunk_alloc(l); + le = ZAP_LEAF_ENTRY(l, chunk); + le->le_type = ZAP_CHUNK_ENTRY; + le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen); + le->le_name_length = namelen; + le->le_value_chunk = + zap_leaf_array_create(l, buf, integer_size, num_integers); + le->le_value_length = num_integers; + le->le_int_size = integer_size; + le->le_hash = h; + le->le_cd = cd; + + /* link it into the hash chain */ + chunkp = LEAF_HASH_ENTPTR(l, h); + le->le_next = *chunkp; + *chunkp = chunk; + + l->l_phys->l_hdr.lh_nentries++; + + zeh->zeh_leaf = l; + zeh->zeh_num_integers = num_integers; + zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_chunkp = chunkp; + + return (0); +} + +/* + * Routines for transferring entries between leafs. + */ + +static void +zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) +{ + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); + uint16_t *ptr = LEAF_HASH_ENTPTR(l, le->le_hash); + le->le_next = *ptr; + *ptr = entry; +} + +static uint16_t +zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) +{ + uint16_t new_chunk; + uint16_t *nchunkp = &new_chunk; + + while (chunk != CHAIN_END) { + uint16_t nchunk = zap_leaf_chunk_alloc(nl); + struct zap_leaf_array *nla = + &ZAP_LEAF_CHUNK(nl, nchunk).l_array; + struct zap_leaf_array *la = + &ZAP_LEAF_CHUNK(l, chunk).l_array; + int nextchunk = la->la_next; + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); + + *nla = *la; /* structure assignment */ + + zap_leaf_chunk_free(l, chunk); + chunk = nextchunk; + *nchunkp = nchunk; + nchunkp = &nla->la_next; + } + *nchunkp = CHAIN_END; + return (new_chunk); +} + +static void +zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) +{ + struct zap_leaf_entry *le, *nle; + uint16_t chunk; + + le = ZAP_LEAF_ENTRY(l, entry); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + chunk = zap_leaf_chunk_alloc(nl); + nle = ZAP_LEAF_ENTRY(nl, chunk); + *nle = *le; /* structure assignment */ + + zap_leaf_rehash_entry(nl, chunk); + + nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); + nle->le_value_chunk = + zap_leaf_transfer_array(l, le->le_value_chunk, nl); + + zap_leaf_chunk_free(l, entry); + + l->l_phys->l_hdr.lh_nentries--; + nl->l_phys->l_hdr.lh_nentries++; +} + +/* + * Transfer the entries whose hash prefix ends in 1 to the new leaf. + */ +void +zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl) +{ + int i; + int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len; + + /* set new prefix and prefix_len */ + l->l_phys->l_hdr.lh_prefix <<= 1; + l->l_phys->l_hdr.lh_prefix_len++; + nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1; + nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len; + + /* break existing hash chains */ + zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); + + /* + * Transfer entries whose hash bit 'bit' is set to nl; rehash + * the remaining entries + * + * NB: We could find entries via the hashtable instead. That + * would be O(hashents+numents) rather than O(numblks+numents), + * but this accesses memory more sequentially, and when we're + * called, the block is usually pretty full. + */ + for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); + if (le->le_type != ZAP_CHUNK_ENTRY) + continue; + + if (le->le_hash & (1ULL << bit)) + zap_leaf_transfer_entry(l, i, nl); + else + zap_leaf_rehash_entry(l, i); + } +} + +void +zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) +{ + int i, n; + + n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - + l->l_phys->l_hdr.lh_prefix_len; + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_leafs_with_2n_pointers[n]++; + + + n = l->l_phys->l_hdr.lh_nentries/5; + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_blocks_with_n5_entries[n]++; + + n = ((1<<FZAP_BLOCK_SHIFT(zap)) - + l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / + (1<<FZAP_BLOCK_SHIFT(zap)); + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_blocks_n_tenths_full[n]++; + + for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { + int nentries = 0; + int chunk = l->l_phys->l_hash[i]; + + while (chunk != CHAIN_END) { + struct zap_leaf_entry *le = + ZAP_LEAF_ENTRY(l, chunk); + + n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) + + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * + le->le_int_size); + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_entries_using_n_chunks[n]++; + + chunk = le->le_next; + nentries++; + } + + n = nentries; + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_buckets_with_n_entries[n]++; + } +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c new file mode 100644 index 0000000..9b7e23c --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c @@ -0,0 +1,855 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/zfs_context.h> +#include <sys/zap.h> +#include <sys/refcount.h> +#include <sys/zap_impl.h> +#include <sys/zap_leaf.h> +#include <sys/avl.h> + + +static void mzap_upgrade(zap_t *zap, dmu_tx_t *tx); + + +static void +mzap_byteswap(mzap_phys_t *buf, size_t size) +{ + int i, max; + buf->mz_block_type = BSWAP_64(buf->mz_block_type); + buf->mz_salt = BSWAP_64(buf->mz_salt); + max = (size / MZAP_ENT_LEN) - 1; + for (i = 0; i < max; i++) { + buf->mz_chunk[i].mze_value = + BSWAP_64(buf->mz_chunk[i].mze_value); + buf->mz_chunk[i].mze_cd = + BSWAP_32(buf->mz_chunk[i].mze_cd); + } +} + +void +zap_byteswap(void *buf, size_t size) +{ + uint64_t block_type; + + block_type = *(uint64_t *)buf; + + if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { + /* ASSERT(magic == ZAP_LEAF_MAGIC); */ + mzap_byteswap(buf, size); + } else { + fzap_byteswap(buf, size); + } +} + +static int +mze_compare(const void *arg1, const void *arg2) +{ + const mzap_ent_t *mze1 = arg1; + const mzap_ent_t *mze2 = arg2; + + if (mze1->mze_hash > mze2->mze_hash) + return (+1); + if (mze1->mze_hash < mze2->mze_hash) + return (-1); + if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd) + return (+1); + if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd) + return (-1); + return (0); +} + +static void +mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) +{ + mzap_ent_t *mze; + + ASSERT(zap->zap_ismicro); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT(mzep->mze_cd < ZAP_MAXCD); + ASSERT3U(zap_hash(zap, mzep->mze_name), ==, hash); + + mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); + mze->mze_chunkid = chunkid; + mze->mze_hash = hash; + mze->mze_phys = *mzep; + avl_add(&zap->zap_m.zap_avl, mze); +} + +static mzap_ent_t * +mze_find(zap_t *zap, const char *name, uint64_t hash) +{ + mzap_ent_t mze_tofind; + mzap_ent_t *mze; + avl_index_t idx; + avl_tree_t *avl = &zap->zap_m.zap_avl; + + ASSERT(zap->zap_ismicro); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT3U(zap_hash(zap, name), ==, hash); + + if (strlen(name) >= sizeof (mze_tofind.mze_phys.mze_name)) + return (NULL); + + mze_tofind.mze_hash = hash; + mze_tofind.mze_phys.mze_cd = 0; + + mze = avl_find(avl, &mze_tofind, &idx); + if (mze == NULL) + mze = avl_nearest(avl, idx, AVL_AFTER); + for (; mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + if (strcmp(name, mze->mze_phys.mze_name) == 0) + return (mze); + } + return (NULL); +} + +static uint32_t +mze_find_unused_cd(zap_t *zap, uint64_t hash) +{ + mzap_ent_t mze_tofind; + mzap_ent_t *mze; + avl_index_t idx; + avl_tree_t *avl = &zap->zap_m.zap_avl; + uint32_t cd; + + ASSERT(zap->zap_ismicro); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + mze_tofind.mze_hash = hash; + mze_tofind.mze_phys.mze_cd = 0; + + cd = 0; + for (mze = avl_find(avl, &mze_tofind, &idx); + mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + if (mze->mze_phys.mze_cd != cd) + break; + cd++; + } + + return (cd); +} + +static void +mze_remove(zap_t *zap, mzap_ent_t *mze) +{ + ASSERT(zap->zap_ismicro); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + avl_remove(&zap->zap_m.zap_avl, mze); + kmem_free(mze, sizeof (mzap_ent_t)); +} + +static void +mze_destroy(zap_t *zap) +{ + mzap_ent_t *mze; + void *avlcookie = NULL; + + while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) + kmem_free(mze, sizeof (mzap_ent_t)); + avl_destroy(&zap->zap_m.zap_avl); +} + +static zap_t * +mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) +{ + zap_t *winner; + zap_t *zap; + int i; + + ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); + + zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); + rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, 0); + rw_enter(&zap->zap_rwlock, RW_WRITER); + zap->zap_objset = os; + zap->zap_object = obj; + zap->zap_dbuf = db; + + if (((uint64_t *)db->db_data)[0] != ZBT_MICRO) { + mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, + MUTEX_DEFAULT, 0); + zap->zap_f.zap_block_shift = highbit(db->db_size) - 1; + } else { + zap->zap_ismicro = TRUE; + } + + /* + * Make sure that zap_ismicro is set before we let others see + * it, because zap_lockdir() checks zap_ismicro without the lock + * held. + */ + winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict); + + if (winner != NULL) { + if (!zap->zap_ismicro) + mutex_destroy(&zap->zap_f.zap_num_entries_mtx); + kmem_free(zap, sizeof (zap_t)); + return (winner); + } + + if (zap->zap_ismicro) { + zap->zap_salt = zap->zap_m.zap_phys->mz_salt; + zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; + avl_create(&zap->zap_m.zap_avl, mze_compare, + sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); + + for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { + mzap_ent_phys_t *mze = + &zap->zap_m.zap_phys->mz_chunk[i]; + if (mze->mze_name[0]) { + zap->zap_m.zap_num_entries++; + mze_insert(zap, i, + zap_hash(zap, mze->mze_name), mze); + } + } + } else { + zap->zap_salt = zap->zap_f.zap_phys->zap_salt; + + ASSERT3U(sizeof (struct zap_leaf_header), ==, + 2*ZAP_LEAF_CHUNKSIZE); + + /* + * The embedded pointer table should not overlap the + * other members. + */ + ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, + &zap->zap_f.zap_phys->zap_salt); + + /* + * The embedded pointer table should end at the end of + * the block + */ + ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, + 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) - + (uintptr_t)zap->zap_f.zap_phys, ==, + zap->zap_dbuf->db_size); + } + rw_exit(&zap->zap_rwlock); + return (zap); +} + +int +zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, int fatreader, zap_t **zapp) +{ + zap_t *zap; + dmu_buf_t *db; + krw_t lt; + int err; + + *zapp = NULL; + + err = dmu_buf_hold(os, obj, 0, NULL, &db); + if (err) + return (err); + +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(db, &doi); + ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + } +#endif + + zap = dmu_buf_get_user(db); + if (zap == NULL) + zap = mzap_open(os, obj, db); + + /* + * We're checking zap_ismicro without the lock held, in order to + * tell what type of lock we want. Once we have some sort of + * lock, see if it really is the right type. In practice this + * can only be different if it was upgraded from micro to fat, + * and micro wanted WRITER but fat only needs READER. + */ + lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; + rw_enter(&zap->zap_rwlock, lt); + if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { + /* it was upgraded, now we only need reader */ + ASSERT(lt == RW_WRITER); + ASSERT(RW_READER == + (!zap->zap_ismicro && fatreader) ? RW_READER : lti); + rw_downgrade(&zap->zap_rwlock); + lt = RW_READER; + } + + zap->zap_objset = os; + + if (lt == RW_WRITER) + dmu_buf_will_dirty(db, tx); + + ASSERT3P(zap->zap_dbuf, ==, db); + + ASSERT(!zap->zap_ismicro || + zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); + if (zap->zap_ismicro && tx && + zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { + uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; + if (newsz > MZAP_MAX_BLKSZ) { + dprintf("upgrading obj %llu: num_entries=%u\n", + obj, zap->zap_m.zap_num_entries); + mzap_upgrade(zap, tx); + *zapp = zap; + return (0); + } + err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); + ASSERT3U(err, ==, 0); + zap->zap_m.zap_num_chunks = + db->db_size / MZAP_ENT_LEN - 1; + } + + *zapp = zap; + return (0); +} + +void +zap_unlockdir(zap_t *zap) +{ + rw_exit(&zap->zap_rwlock); + dmu_buf_rele(zap->zap_dbuf, NULL); +} + +static void +mzap_upgrade(zap_t *zap, dmu_tx_t *tx) +{ + mzap_phys_t *mzp; + int i, sz, nchunks, err; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + sz = zap->zap_dbuf->db_size; + mzp = kmem_alloc(sz, KM_SLEEP); + bcopy(zap->zap_dbuf->db_data, mzp, sz); + nchunks = zap->zap_m.zap_num_chunks; + + err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, + 1ULL << fzap_default_block_shift, 0, tx); + ASSERT(err == 0); + + dprintf("upgrading obj=%llu with %u chunks\n", + zap->zap_object, nchunks); + mze_destroy(zap); + + fzap_upgrade(zap, tx); + + for (i = 0; i < nchunks; i++) { + int err; + mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; + if (mze->mze_name[0] == 0) + continue; + dprintf("adding %s=%llu\n", + mze->mze_name, mze->mze_value); + err = fzap_add_cd(zap, + mze->mze_name, 8, 1, &mze->mze_value, + mze->mze_cd, tx); + ASSERT3U(err, ==, 0); + } + kmem_free(mzp, sz); +} + +uint64_t +zap_hash(zap_t *zap, const char *name) +{ + const uint8_t *cp; + uint8_t c; + uint64_t crc = zap->zap_salt; + + ASSERT(crc != 0); + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; + + /* + * Only use 28 bits, since we need 4 bits in the cookie for the + * collision differentiator. We MUST use the high bits, since + * those are the onces that we first pay attention to when + * chosing the bucket. + */ + crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); + + return (crc); +} + + +static void +mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + dmu_buf_t *db; + mzap_phys_t *zp; + + VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db)); + +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(db, &doi); + ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + } +#endif + + dmu_buf_will_dirty(db, tx); + zp = db->db_data; + zp->mz_block_type = ZBT_MICRO; + zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; + ASSERT(zp->mz_salt != 0); + dmu_buf_rele(db, FTAG); +} + +int +zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + int err; + + err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); + if (err != 0) + return (err); + mzap_create_impl(os, obj, tx); + return (0); +} + +uint64_t +zap_create(objset_t *os, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); + + mzap_create_impl(os, obj, tx); + return (obj); +} + +int +zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) +{ + /* + * dmu_object_free will free the object number and free the + * data. Freeing the data will cause our pageout function to be + * called, which will destroy our data (zap_leaf_t's and zap_t). + */ + + return (dmu_object_free(os, zapobj, tx)); +} + +_NOTE(ARGSUSED(0)) +void +zap_evict(dmu_buf_t *db, void *vzap) +{ + zap_t *zap = vzap; + + rw_destroy(&zap->zap_rwlock); + + if (zap->zap_ismicro) + mze_destroy(zap); + else + mutex_destroy(&zap->zap_f.zap_num_entries_mtx); + + kmem_free(zap, sizeof (zap_t)); +} + +int +zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) +{ + zap_t *zap; + int err; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + if (err) + return (err); + if (!zap->zap_ismicro) { + err = fzap_count(zap, count); + } else { + *count = zap->zap_m.zap_num_entries; + } + zap_unlockdir(zap); + return (err); +} + +/* + * Routines for maniplulating attributes. + */ + +int +zap_lookup(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) +{ + zap_t *zap; + int err; + mzap_ent_t *mze; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + if (err) + return (err); + if (!zap->zap_ismicro) { + err = fzap_lookup(zap, name, + integer_size, num_integers, buf); + } else { + mze = mze_find(zap, name, zap_hash(zap, name)); + if (mze == NULL) { + err = ENOENT; + } else { + if (num_integers < 1) + err = EOVERFLOW; + else if (integer_size != 8) + err = EINVAL; + else + *(uint64_t *)buf = mze->mze_phys.mze_value; + } + } + zap_unlockdir(zap); + return (err); +} + +int +zap_length(objset_t *os, uint64_t zapobj, const char *name, + uint64_t *integer_size, uint64_t *num_integers) +{ + zap_t *zap; + int err; + mzap_ent_t *mze; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + if (err) + return (err); + if (!zap->zap_ismicro) { + err = fzap_length(zap, name, integer_size, num_integers); + } else { + mze = mze_find(zap, name, zap_hash(zap, name)); + if (mze == NULL) { + err = ENOENT; + } else { + if (integer_size) + *integer_size = 8; + if (num_integers) + *num_integers = 1; + } + } + zap_unlockdir(zap); + return (err); +} + +static void +mzap_addent(zap_t *zap, const char *name, uint64_t hash, uint64_t value) +{ + int i; + int start = zap->zap_m.zap_alloc_next; + uint32_t cd; + + dprintf("obj=%llu %s=%llu\n", zap->zap_object, name, value); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + +#ifdef ZFS_DEBUG + for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { + mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; + ASSERT(strcmp(name, mze->mze_name) != 0); + } +#endif + + cd = mze_find_unused_cd(zap, hash); + /* given the limited size of the microzap, this can't happen */ + ASSERT(cd != ZAP_MAXCD); + +again: + for (i = start; i < zap->zap_m.zap_num_chunks; i++) { + mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; + if (mze->mze_name[0] == 0) { + mze->mze_value = value; + mze->mze_cd = cd; + (void) strcpy(mze->mze_name, name); + zap->zap_m.zap_num_entries++; + zap->zap_m.zap_alloc_next = i+1; + if (zap->zap_m.zap_alloc_next == + zap->zap_m.zap_num_chunks) + zap->zap_m.zap_alloc_next = 0; + mze_insert(zap, i, hash, mze); + return; + } + } + if (start != 0) { + start = 0; + goto again; + } + ASSERT(!"out of entries!"); +} + +int +zap_add(objset_t *os, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + mzap_ent_t *mze; + const uint64_t *intval = val; + uint64_t hash; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); + if (err) + return (err); + if (!zap->zap_ismicro) { + err = fzap_add(zap, name, integer_size, num_integers, val, tx); + } else if (integer_size != 8 || num_integers != 1 || + strlen(name) >= MZAP_NAME_LEN) { + dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", + zapobj, integer_size, num_integers, name); + mzap_upgrade(zap, tx); + err = fzap_add(zap, name, integer_size, num_integers, val, tx); + } else { + hash = zap_hash(zap, name); + mze = mze_find(zap, name, hash); + if (mze != NULL) { + err = EEXIST; + } else { + mzap_addent(zap, name, hash, *intval); + } + } + zap_unlockdir(zap); + return (err); +} + +int +zap_update(objset_t *os, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + mzap_ent_t *mze; + const uint64_t *intval = val; + uint64_t hash; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); + if (err) + return (err); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + if (!zap->zap_ismicro) { + err = fzap_update(zap, name, + integer_size, num_integers, val, tx); + } else if (integer_size != 8 || num_integers != 1 || + strlen(name) >= MZAP_NAME_LEN) { + dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", + zapobj, integer_size, num_integers, name); + mzap_upgrade(zap, tx); + err = fzap_update(zap, name, + integer_size, num_integers, val, tx); + } else { + hash = zap_hash(zap, name); + mze = mze_find(zap, name, hash); + if (mze != NULL) { + mze->mze_phys.mze_value = *intval; + zap->zap_m.zap_phys->mz_chunk + [mze->mze_chunkid].mze_value = *intval; + } else { + mzap_addent(zap, name, hash, *intval); + } + } + zap_unlockdir(zap); + return (err); +} + +int +zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + mzap_ent_t *mze; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); + if (err) + return (err); + if (!zap->zap_ismicro) { + err = fzap_remove(zap, name, tx); + } else { + mze = mze_find(zap, name, zap_hash(zap, name)); + if (mze == NULL) { + dprintf("fail: %s\n", name); + err = ENOENT; + } else { + dprintf("success: %s\n", name); + zap->zap_m.zap_num_entries--; + bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], + sizeof (mzap_ent_phys_t)); + mze_remove(zap, mze); + } + } + zap_unlockdir(zap); + return (err); +} + + +/* + * Routines for iterating over the attributes. + */ + +/* + * We want to keep the high 32 bits of the cursor zero if we can, so + * that 32-bit programs can access this. So use a small hash value so + * we can fit 4 bits of cd into the 32-bit cursor. + * + * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ] + */ +void +zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized) +{ + zc->zc_objset = os; + zc->zc_zap = NULL; + zc->zc_leaf = NULL; + zc->zc_zapobj = zapobj; + if (serialized == -1ULL) { + zc->zc_hash = -1ULL; + zc->zc_cd = 0; + } else { + zc->zc_hash = serialized << (64-ZAP_HASHBITS); + zc->zc_cd = serialized >> ZAP_HASHBITS; + if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */ + zc->zc_cd = 0; + } +} + +void +zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) +{ + zap_cursor_init_serialized(zc, os, zapobj, 0); +} + +void +zap_cursor_fini(zap_cursor_t *zc) +{ + if (zc->zc_zap) { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); + zap_unlockdir(zc->zc_zap); + zc->zc_zap = NULL; + } + if (zc->zc_leaf) { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } + zc->zc_objset = NULL; +} + +uint64_t +zap_cursor_serialize(zap_cursor_t *zc) +{ + if (zc->zc_hash == -1ULL) + return (-1ULL); + ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0); + ASSERT(zc->zc_cd < ZAP_MAXCD); + return ((zc->zc_hash >> (64-ZAP_HASHBITS)) | + ((uint64_t)zc->zc_cd << ZAP_HASHBITS)); +} + +int +zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) +{ + int err; + avl_index_t idx; + mzap_ent_t mze_tofind; + mzap_ent_t *mze; + + if (zc->zc_hash == -1ULL) + return (ENOENT); + + if (zc->zc_zap == NULL) { + err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, + RW_READER, TRUE, &zc->zc_zap); + if (err) + return (err); + } else { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); + } + if (!zc->zc_zap->zap_ismicro) { + err = fzap_cursor_retrieve(zc->zc_zap, zc, za); + } else { + err = ENOENT; + + mze_tofind.mze_hash = zc->zc_hash; + mze_tofind.mze_phys.mze_cd = zc->zc_cd; + + mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); + ASSERT(mze == NULL || 0 == bcmp(&mze->mze_phys, + &zc->zc_zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], + sizeof (mze->mze_phys))); + if (mze == NULL) { + mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, + idx, AVL_AFTER); + } + if (mze) { + za->za_integer_length = 8; + za->za_num_integers = 1; + za->za_first_integer = mze->mze_phys.mze_value; + (void) strcpy(za->za_name, mze->mze_phys.mze_name); + zc->zc_hash = mze->mze_hash; + zc->zc_cd = mze->mze_phys.mze_cd; + err = 0; + } else { + zc->zc_hash = -1ULL; + } + } + rw_exit(&zc->zc_zap->zap_rwlock); + return (err); +} + +void +zap_cursor_advance(zap_cursor_t *zc) +{ + if (zc->zc_hash == -1ULL) + return; + zc->zc_cd++; + if (zc->zc_cd >= ZAP_MAXCD) { + zc->zc_cd = 0; + zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS); + if (zc->zc_hash == 0) /* EOF */ + zc->zc_hash = -1ULL; + } +} + +int +zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) +{ + int err; + zap_t *zap; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + if (err) + return (err); + + bzero(zs, sizeof (zap_stats_t)); + + if (zap->zap_ismicro) { + zs->zs_blocksize = zap->zap_dbuf->db_size; + zs->zs_num_entries = zap->zap_m.zap_num_entries; + zs->zs_num_blocks = 1; + } else { + fzap_get_stats(zap, zs); + } + zap_unlockdir(zap); + return (0); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf new file mode 100644 index 0000000..0988190 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf @@ -0,0 +1,28 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# +name="zfs" parent="pseudo"; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c new file mode 100644 index 0000000..030424b --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c @@ -0,0 +1,1607 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/sdt.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_vfsops.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <acl/acl_common.h> + +#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE +#define DENY ACE_ACCESS_DENIED_ACE_TYPE + +#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) +#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) +#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS| \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE) + +#define SECURE_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define OGE_PAD 6 /* traditional owner/group/everyone ACES */ + +static int zfs_ace_can_use(znode_t *zp, ace_t *); + +static zfs_acl_t * +zfs_acl_alloc(int slots) +{ + zfs_acl_t *aclp; + + aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); + if (slots != 0) { + aclp->z_acl = kmem_alloc(ZFS_ACL_SIZE(slots), KM_SLEEP); + aclp->z_acl_count = 0; + aclp->z_state = ACL_DATA_ALLOCED; + } else { + aclp->z_state = 0; + } + aclp->z_slots = slots; + return (aclp); +} + +void +zfs_acl_free(zfs_acl_t *aclp) +{ + if (aclp->z_state == ACL_DATA_ALLOCED) { + kmem_free(aclp->z_acl, ZFS_ACL_SIZE(aclp->z_slots)); + } + kmem_free(aclp, sizeof (zfs_acl_t)); +} + +static uint32_t +zfs_v4_to_unix(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + /* + * This is used for mapping v4 permissions into permissions + * that can be passed to secpolicy_vnode_access() + */ + if (access_mask & (ACE_READ_DATA | ACE_LIST_DIRECTORY | + ACE_READ_ATTRIBUTES | ACE_READ_ACL)) + new_mask |= S_IROTH; + if (access_mask & (ACE_WRITE_DATA | ACE_APPEND_DATA | + ACE_WRITE_ATTRIBUTES | ACE_ADD_FILE | ACE_WRITE_NAMED_ATTRS)) + new_mask |= S_IWOTH; + if (access_mask & (ACE_EXECUTE | ACE_READ_NAMED_ATTRS)) + new_mask |= S_IXOTH; + + return (new_mask); +} + +/* + * Convert unix access mask to v4 access mask + */ +static uint32_t +zfs_unix_to_v4(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + if (access_mask & 01) + new_mask |= (ACE_EXECUTE); + if (access_mask & 02) { + new_mask |= (ACE_WRITE_DATA); + } if (access_mask & 04) { + new_mask |= ACE_READ_DATA; + } + return (new_mask); +} + +static void +zfs_set_ace(ace_t *zacep, uint32_t access_mask, int access_type, + uid_t uid, int entry_type) +{ + zacep->a_access_mask = access_mask; + zacep->a_type = access_type; + zacep->a_who = uid; + zacep->a_flags = entry_type; +} + +static uint64_t +zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) +{ + int i; + int entry_type; + mode_t mode = (zp->z_phys->zp_mode & + (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + mode_t seen = 0; + ace_t *acep; + + for (i = 0, acep = aclp->z_acl; + i != aclp->z_acl_count; i++, acep++) { + entry_type = (acep->a_flags & ACE_TYPE_FLAGS); + if (entry_type == ACE_OWNER) { + if ((acep->a_access_mask & ACE_READ_DATA) && + (!(seen & S_IRUSR))) { + seen |= S_IRUSR; + if (acep->a_type == ALLOW) { + mode |= S_IRUSR; + } + } + if ((acep->a_access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWUSR))) { + seen |= S_IWUSR; + if (acep->a_type == ALLOW) { + mode |= S_IWUSR; + } + } + if ((acep->a_access_mask & ACE_EXECUTE) && + (!(seen & S_IXUSR))) { + seen |= S_IXUSR; + if (acep->a_type == ALLOW) { + mode |= S_IXUSR; + } + } + } else if (entry_type == OWNING_GROUP) { + if ((acep->a_access_mask & ACE_READ_DATA) && + (!(seen & S_IRGRP))) { + seen |= S_IRGRP; + if (acep->a_type == ALLOW) { + mode |= S_IRGRP; + } + } + if ((acep->a_access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWGRP))) { + seen |= S_IWGRP; + if (acep->a_type == ALLOW) { + mode |= S_IWGRP; + } + } + if ((acep->a_access_mask & ACE_EXECUTE) && + (!(seen & S_IXGRP))) { + seen |= S_IXGRP; + if (acep->a_type == ALLOW) { + mode |= S_IXGRP; + } + } + } else if (entry_type == ACE_EVERYONE) { + if ((acep->a_access_mask & ACE_READ_DATA)) { + if (!(seen & S_IRUSR)) { + seen |= S_IRUSR; + if (acep->a_type == ALLOW) { + mode |= S_IRUSR; + } + } + if (!(seen & S_IRGRP)) { + seen |= S_IRGRP; + if (acep->a_type == ALLOW) { + mode |= S_IRGRP; + } + } + if (!(seen & S_IROTH)) { + seen |= S_IROTH; + if (acep->a_type == ALLOW) { + mode |= S_IROTH; + } + } + } + if ((acep->a_access_mask & ACE_WRITE_DATA)) { + if (!(seen & S_IWUSR)) { + seen |= S_IWUSR; + if (acep->a_type == ALLOW) { + mode |= S_IWUSR; + } + } + if (!(seen & S_IWGRP)) { + seen |= S_IWGRP; + if (acep->a_type == ALLOW) { + mode |= S_IWGRP; + } + } + if (!(seen & S_IWOTH)) { + seen |= S_IWOTH; + if (acep->a_type == ALLOW) { + mode |= S_IWOTH; + } + } + } + if ((acep->a_access_mask & ACE_EXECUTE)) { + if (!(seen & S_IXUSR)) { + seen |= S_IXUSR; + if (acep->a_type == ALLOW) { + mode |= S_IXUSR; + } + } + if (!(seen & S_IXGRP)) { + seen |= S_IXGRP; + if (acep->a_type == ALLOW) { + mode |= S_IXGRP; + } + } + if (!(seen & S_IXOTH)) { + seen |= S_IXOTH; + if (acep->a_type == ALLOW) { + mode |= S_IXOTH; + } + } + } + } + } + return (mode); +} + +static zfs_acl_t * +zfs_acl_node_read_internal(znode_t *zp) +{ + zfs_acl_t *aclp; + + aclp = zfs_acl_alloc(0); + aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; + aclp->z_acl = &zp->z_phys->zp_acl.z_ace_data[0]; + + return (aclp); +} + +/* + * Read an external acl object. + */ +static int +zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp) +{ + uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj; + zfs_acl_t *aclp; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { + *aclpp = zfs_acl_node_read_internal(zp); + return (0); + } + + aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count); + + error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, + ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl); + if (error != 0) { + zfs_acl_free(aclp); + return (error); + } + + aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; + + *aclpp = aclp; + return (0); +} + +static boolean_t +zfs_acl_valid(znode_t *zp, ace_t *uace, int aclcnt, int *inherit) +{ + ace_t *acep; + int i; + + *inherit = 0; + + if (aclcnt > MAX_ACL_ENTRIES || aclcnt <= 0) { + return (B_FALSE); + } + + for (i = 0, acep = uace; i != aclcnt; i++, acep++) { + + /* + * first check type of entry + */ + + switch (acep->a_flags & ACE_TYPE_FLAGS) { + case ACE_OWNER: + acep->a_who = -1; + break; + case (ACE_IDENTIFIER_GROUP | ACE_GROUP): + case ACE_IDENTIFIER_GROUP: + if (acep->a_flags & ACE_GROUP) { + acep->a_who = -1; + } + break; + case ACE_EVERYONE: + acep->a_who = -1; + break; + } + + /* + * next check inheritance level flags + */ + + if (acep->a_type != ALLOW && acep->a_type != DENY) + return (B_FALSE); + + /* + * Only directories should have inheritance flags. + */ + if (ZTOV(zp)->v_type != VDIR && (acep->a_flags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE| + ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) { + return (B_FALSE); + } + + if (acep->a_flags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)) + *inherit = 1; + + if (acep->a_flags & + (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((acep->a_flags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) { + return (B_FALSE); + } + } + } + + return (B_TRUE); +} +/* + * common code for setting acl's. + * + * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. + * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's + * already checked the acl and knows whether to inherit. + */ +int +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, dmu_tx_t *tx, int *ihp) +{ + int inherit = 0; + int error; + znode_phys_t *zphys = zp->z_phys; + zfs_znode_acl_t *zacl = &zphys->zp_acl; + uint32_t acl_phys_size = ZFS_ACL_SIZE(aclp->z_acl_count); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t aoid = zphys->zp_acl.z_acl_extern_obj; + + ASSERT(MUTEX_HELD(&zp->z_lock)); + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + if (ihp) + inherit = *ihp; /* already determined by caller */ + else if (!zfs_acl_valid(zp, aclp->z_acl, + aclp->z_acl_count, &inherit)) { + return (EINVAL); + } + + dmu_buf_will_dirty(zp->z_dbuf, tx); + + /* + * Will ACL fit internally? + */ + if (aclp->z_acl_count > ACE_SLOT_CNT) { + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + DMU_OT_ACL, acl_phys_size, DMU_OT_NONE, 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, + acl_phys_size, 0, tx); + } + zphys->zp_acl.z_acl_extern_obj = aoid; + zphys->zp_acl.z_acl_count = aclp->z_acl_count; + dmu_write(zfsvfs->z_os, aoid, 0, + acl_phys_size, aclp->z_acl, tx); + } else { + /* + * Migrating back embedded? + */ + if (zphys->zp_acl.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + zp->z_phys->zp_acl.z_acl_extern_obj, tx); + if (error) + return (error); + zphys->zp_acl.z_acl_extern_obj = 0; + } + bcopy(aclp->z_acl, zacl->z_ace_data, + aclp->z_acl_count * sizeof (ace_t)); + zacl->z_acl_count = aclp->z_acl_count; + } + + zp->z_phys->zp_flags &= ~(ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE); + if (inherit) { + zp->z_phys->zp_flags |= ZFS_INHERIT_ACE; + } else if (ace_trivial(zacl->z_ace_data, zacl->z_acl_count) == 0) { + zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; + } + + zphys->zp_mode = zfs_mode_compute(zp, aclp); + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + + return (0); +} + +/* + * Create space for slots_needed ACEs to be append + * to aclp. + */ +static void +zfs_acl_append(zfs_acl_t *aclp, int slots_needed) +{ + ace_t *newacep; + ace_t *oldaclp; + int slot_cnt; + int slots_left = aclp->z_slots - aclp->z_acl_count; + + if (aclp->z_state == ACL_DATA_ALLOCED) + ASSERT(aclp->z_slots >= aclp->z_acl_count); + if (slots_left < slots_needed || aclp->z_state != ACL_DATA_ALLOCED) { + slot_cnt = aclp->z_slots + 1 + (slots_needed - slots_left); + newacep = kmem_alloc(ZFS_ACL_SIZE(slot_cnt), KM_SLEEP); + bcopy(aclp->z_acl, newacep, + ZFS_ACL_SIZE(aclp->z_acl_count)); + oldaclp = aclp->z_acl; + if (aclp->z_state == ACL_DATA_ALLOCED) + kmem_free(oldaclp, ZFS_ACL_SIZE(aclp->z_slots)); + aclp->z_acl = newacep; + aclp->z_slots = slot_cnt; + aclp->z_state = ACL_DATA_ALLOCED; + } +} + +/* + * Remove "slot" ACE from aclp + */ +static void +zfs_ace_remove(zfs_acl_t *aclp, int slot) +{ + if (aclp->z_acl_count > 1) { + (void) memmove(&aclp->z_acl[slot], + &aclp->z_acl[slot +1], sizeof (ace_t) * + (--aclp->z_acl_count - slot)); + } else + aclp->z_acl_count--; +} + +/* + * Update access mask for prepended ACE + * + * This applies the "groupmask" value for aclmode property. + */ +static void +zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner) +{ + + int rmask, wmask, xmask; + int user_ace; + + user_ace = (!(acep->a_flags & + (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP))); + + if (user_ace && (acep->a_who == owner)) { + rmask = S_IRUSR; + wmask = S_IWUSR; + xmask = S_IXUSR; + } else { + rmask = S_IRGRP; + wmask = S_IWGRP; + xmask = S_IXGRP; + } + + if (origacep->a_access_mask & ACE_READ_DATA) { + if (mode & rmask) + acep->a_access_mask &= ~ACE_READ_DATA; + else + acep->a_access_mask |= ACE_READ_DATA; + } + + if (origacep->a_access_mask & ACE_WRITE_DATA) { + if (mode & wmask) + acep->a_access_mask &= ~ACE_WRITE_DATA; + else + acep->a_access_mask |= ACE_WRITE_DATA; + } + + if (origacep->a_access_mask & ACE_APPEND_DATA) { + if (mode & wmask) + acep->a_access_mask &= ~ACE_APPEND_DATA; + else + acep->a_access_mask |= ACE_APPEND_DATA; + } + + if (origacep->a_access_mask & ACE_EXECUTE) { + if (mode & xmask) + acep->a_access_mask &= ~ACE_EXECUTE; + else + acep->a_access_mask |= ACE_EXECUTE; + } +} + +/* + * Apply mode to canonical six ACEs. + */ +static void +zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode) +{ + int cnt; + ace_t *acep; + + cnt = aclp->z_acl_count -1; + acep = aclp->z_acl; + + /* + * Fixup final ACEs to match the mode + */ + + ASSERT(cnt >= 5); + adjust_ace_pair(&acep[cnt - 1], mode); /* everyone@ */ + adjust_ace_pair(&acep[cnt - 3], (mode & 0070) >> 3); /* group@ */ + adjust_ace_pair(&acep[cnt - 5], (mode & 0700) >> 6); /* owner@ */ +} + + +static int +zfs_acl_ace_match(ace_t *acep, int allow_deny, int type, int mask) +{ + return (acep->a_access_mask == mask && acep->a_type == allow_deny && + ((acep->a_flags & ACE_TYPE_FLAGS) == type)); +} + +/* + * Can prepended ACE be reused? + */ +static int +zfs_reuse_deny(ace_t *acep, int i) +{ + int okay_masks; + + if (i < 1) + return (B_FALSE); + + if (acep[i-1].a_type != DENY) + return (B_FALSE); + + if (acep[i-1].a_flags != (acep[i].a_flags & ACE_IDENTIFIER_GROUP)) + return (B_FALSE); + + okay_masks = (acep[i].a_access_mask & OKAY_MASK_BITS); + + if (acep[i-1].a_access_mask & ~okay_masks) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Create space to prepend an ACE + */ +static void +zfs_acl_prepend(zfs_acl_t *aclp, int i) +{ + ace_t *oldaclp = NULL; + ace_t *to, *from; + int slots_left = aclp->z_slots - aclp->z_acl_count; + int oldslots; + int need_free = 0; + + if (aclp->z_state == ACL_DATA_ALLOCED) + ASSERT(aclp->z_slots >= aclp->z_acl_count); + + if (slots_left == 0 || aclp->z_state != ACL_DATA_ALLOCED) { + + to = kmem_alloc(ZFS_ACL_SIZE(aclp->z_acl_count + + OGE_PAD), KM_SLEEP); + if (aclp->z_state == ACL_DATA_ALLOCED) + need_free++; + from = aclp->z_acl; + oldaclp = aclp->z_acl; + (void) memmove(to, from, + sizeof (ace_t) * aclp->z_acl_count); + aclp->z_state = ACL_DATA_ALLOCED; + } else { + from = aclp->z_acl; + to = aclp->z_acl; + } + + + (void) memmove(&to[i + 1], &from[i], + sizeof (ace_t) * (aclp->z_acl_count - i)); + + if (oldaclp) { + aclp->z_acl = to; + oldslots = aclp->z_slots; + aclp->z_slots = aclp->z_acl_count + OGE_PAD; + if (need_free) + kmem_free(oldaclp, ZFS_ACL_SIZE(oldslots)); + } + +} + +/* + * Prepend deny ACE + */ +static void +zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, int i, + mode_t mode) +{ + ace_t *acep; + + zfs_acl_prepend(aclp, i); + + acep = aclp->z_acl; + zfs_set_ace(&acep[i], 0, DENY, acep[i + 1].a_who, + (acep[i + 1].a_flags & ACE_TYPE_FLAGS)); + zfs_acl_prepend_fixup(&acep[i], &acep[i+1], mode, zp->z_phys->zp_uid); + aclp->z_acl_count++; +} + +/* + * Split an inherited ACE into inherit_only ACE + * and original ACE with inheritance flags stripped off. + */ +static void +zfs_acl_split_ace(zfs_acl_t *aclp, int i) +{ + ace_t *acep = aclp->z_acl; + + zfs_acl_prepend(aclp, i); + acep = aclp->z_acl; + acep[i] = acep[i + 1]; + acep[i].a_flags |= ACE_INHERIT_ONLY_ACE; + acep[i + 1].a_flags &= ~ALL_INHERIT; + aclp->z_acl_count++; +} + +/* + * Are ACES started at index i, the canonical six ACES? + */ +static int +zfs_have_canonical_six(zfs_acl_t *aclp, int i) +{ + ace_t *acep = aclp->z_acl; + + if ((zfs_acl_ace_match(&acep[i], + DENY, ACE_OWNER, 0) && + zfs_acl_ace_match(&acep[i + 1], ALLOW, ACE_OWNER, + OWNER_ALLOW_MASK) && zfs_acl_ace_match(&acep[i + 2], + DENY, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 3], + ALLOW, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 4], + DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) && + zfs_acl_ace_match(&acep[i + 5], ALLOW, ACE_EVERYONE, + EVERYONE_ALLOW_MASK))) { + return (1); + } else { + return (0); + } +} + +/* + * Apply step 1g, to group entries + * + * Need to deal with corner case where group may have + * greater permissions than owner. If so then limit + * group permissions, based on what extra permissions + * group has. + */ +static void +zfs_fixup_group_entries(ace_t *acep, mode_t mode) +{ + mode_t extramode = (mode >> 3) & 07; + mode_t ownermode = (mode >> 6); + + if (acep[0].a_flags & ACE_IDENTIFIER_GROUP) { + + extramode &= ~ownermode; + + if (extramode) { + if (extramode & 04) { + acep[0].a_access_mask &= ~ACE_READ_DATA; + acep[1].a_access_mask &= ~ACE_READ_DATA; + } + if (extramode & 02) { + acep[0].a_access_mask &= + ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + acep[1].a_access_mask &= + ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + } + if (extramode & 01) { + acep[0].a_access_mask &= ~ACE_EXECUTE; + acep[1].a_access_mask &= ~ACE_EXECUTE; + } + } + } +} + +/* + * Apply the chmod algorithm as described + * in PSARC/2002/240 + */ +static int +zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp, + dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + ace_t *acep; + int i; + int error; + int entry_type; + int reuse_deny; + int need_canonical_six = 1; + int inherit = 0; + int iflags; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + ASSERT(MUTEX_HELD(&zp->z_lock)); + + i = 0; + while (i < aclp->z_acl_count) { + acep = aclp->z_acl; + entry_type = (acep[i].a_flags & ACE_TYPE_FLAGS); + iflags = (acep[i].a_flags & ALL_INHERIT); + + if ((acep[i].a_type != ALLOW && acep[i].a_type != DENY) || + (iflags & ACE_INHERIT_ONLY_ACE)) { + i++; + if (iflags) + inherit = 1; + continue; + } + + + if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) { + zfs_ace_remove(aclp, i); + continue; + } + + /* + * Need to split ace into two? + */ + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) && + (!(iflags & ACE_INHERIT_ONLY_ACE))) { + zfs_acl_split_ace(aclp, i); + i++; + inherit = 1; + continue; + } + + if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || + (entry_type == OWNING_GROUP)) { + acep[i].a_access_mask &= ~OGE_CLEAR; + i++; + continue; + + } else { + if (acep[i].a_type == ALLOW) { + + /* + * Check preceding ACE if any, to see + * if we need to prepend a DENY ACE. + * This is only applicable when the acl_mode + * property == groupmask. + */ + if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) { + + reuse_deny = zfs_reuse_deny(acep, i); + + if (reuse_deny == B_FALSE) { + zfs_acl_prepend_deny(zp, aclp, + i, mode); + i++; + acep = aclp->z_acl; + } else { + zfs_acl_prepend_fixup( + &acep[i - 1], + &acep[i], mode, + zp->z_phys->zp_uid); + } + zfs_fixup_group_entries(&acep[i - 1], + mode); + } + } + i++; + } + } + + /* + * Check out last six aces, if we have six. + */ + + if (aclp->z_acl_count >= 6) { + i = aclp->z_acl_count - 6; + + if (zfs_have_canonical_six(aclp, i)) { + need_canonical_six = 0; + } + } + + if (need_canonical_six) { + + zfs_acl_append(aclp, 6); + i = aclp->z_acl_count; + acep = aclp->z_acl; + zfs_set_ace(&acep[i++], 0, DENY, -1, ACE_OWNER); + zfs_set_ace(&acep[i++], OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER); + zfs_set_ace(&acep[i++], 0, DENY, -1, OWNING_GROUP); + zfs_set_ace(&acep[i++], 0, ALLOW, -1, OWNING_GROUP); + zfs_set_ace(&acep[i++], EVERYONE_DENY_MASK, + DENY, -1, ACE_EVERYONE); + zfs_set_ace(&acep[i++], EVERYONE_ALLOW_MASK, + ALLOW, -1, ACE_EVERYONE); + aclp->z_acl_count += 6; + } + + zfs_acl_fixup_canonical_six(aclp, mode); + + zp->z_phys->zp_mode = mode; + error = zfs_aclset_common(zp, aclp, tx, &inherit); + return (error); +} + + +int +zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx) +{ + zfs_acl_t *aclp = NULL; + int error; + + ASSERT(MUTEX_HELD(&zp->z_lock)); + mutex_enter(&zp->z_acl_lock); + error = zfs_acl_node_read(zp, &aclp); + if (error == 0) + error = zfs_acl_chmod(zp, mode, aclp, tx); + mutex_exit(&zp->z_acl_lock); + if (aclp) + zfs_acl_free(aclp); + return (error); +} + +/* + * strip off write_owner and write_acl + */ +static void +zfs_securemode_update(zfsvfs_t *zfsvfs, ace_t *acep) +{ + if ((zfsvfs->z_acl_inherit == ZFS_ACL_SECURE) && + (acep->a_type == ALLOW)) + acep->a_access_mask &= ~SECURE_CLEAR; +} + +/* + * inherit inheritable ACEs from parent + */ +static zfs_acl_t * +zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + ace_t *pacep; + ace_t *acep; + int ace_cnt = 0; + int pace_cnt; + int i, j; + zfs_acl_t *aclp = NULL; + + i = j = 0; + pace_cnt = paclp->z_acl_count; + pacep = paclp->z_acl; + if (zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) { + for (i = 0; i != pace_cnt; i++) { + + if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW && + pacep[i].a_type == ALLOW) + continue; + + if (zfs_ace_can_use(zp, &pacep[i])) { + ace_cnt++; + if (!(pacep[i].a_flags & + ACE_NO_PROPAGATE_INHERIT_ACE)) + ace_cnt++; + } + } + } + + aclp = zfs_acl_alloc(ace_cnt + OGE_PAD); + if (ace_cnt && zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) { + acep = aclp->z_acl; + pacep = paclp->z_acl; + for (i = 0; i != pace_cnt; i++) { + + if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW && + pacep[i].a_type == ALLOW) + continue; + + if (zfs_ace_can_use(zp, &pacep[i])) { + + /* + * Now create entry for inherited ace + */ + + acep[j] = pacep[i]; + + /* + * When AUDIT/ALARM a_types are supported + * they should be inherited here. + */ + + if ((pacep[i].a_flags & + ACE_NO_PROPAGATE_INHERIT_ACE) || + (ZTOV(zp)->v_type != VDIR)) { + acep[j].a_flags &= ~ALL_INHERIT; + zfs_securemode_update(zfsvfs, &acep[j]); + j++; + continue; + } + + ASSERT(ZTOV(zp)->v_type == VDIR); + + /* + * If we are inheriting an ACE targeted for + * only files, then make sure inherit_only + * is on for future propagation. + */ + if ((pacep[i].a_flags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) != + ACE_FILE_INHERIT_ACE) { + j++; + acep[j] = acep[j-1]; + acep[j-1].a_flags |= + ACE_INHERIT_ONLY_ACE; + acep[j].a_flags &= ~ALL_INHERIT; + } else { + acep[j].a_flags |= ACE_INHERIT_ONLY_ACE; + } + zfs_securemode_update(zfsvfs, &acep[j]); + j++; + } + } + } + aclp->z_acl_count = j; + ASSERT(aclp->z_slots >= aclp->z_acl_count); + + return (aclp); +} + +/* + * Create file system object initial permissions + * including inheritable ACEs. + */ +void +zfs_perm_init(znode_t *zp, znode_t *parent, int flag, + vattr_t *vap, dmu_tx_t *tx, cred_t *cr) +{ + uint64_t mode; + uid_t uid; + gid_t gid; + int error; + int pull_down; + zfs_acl_t *aclp, *paclp; + + mode = MAKEIMODE(vap->va_type, vap->va_mode); + + /* + * Determine uid and gid. + */ + if ((flag & (IS_ROOT_NODE | IS_REPLAY)) || + ((flag & IS_XATTR) && (vap->va_type == VDIR))) { + uid = vap->va_uid; + gid = vap->va_gid; + } else { + uid = crgetuid(cr); + if ((vap->va_mask & AT_GID) && + ((vap->va_gid == parent->z_phys->zp_gid) || + groupmember(vap->va_gid, cr) || + secpolicy_vnode_create_gid(cr) == 0)) + gid = vap->va_gid; + else +#ifdef __FreeBSD__ + gid = parent->z_phys->zp_gid; +#else + gid = (parent->z_phys->zp_mode & S_ISGID) ? + parent->z_phys->zp_gid : crgetgid(cr); +#endif + } + + /* + * If we're creating a directory, and the parent directory has the + * set-GID bit set, set in on the new directory. + * Otherwise, if the user is neither privileged nor a member of the + * file's new group, clear the file's set-GID bit. + */ + + if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) + mode |= S_ISGID; + else { + if ((mode & S_ISGID) && + secpolicy_vnode_setids_setgids(cr, gid) != 0) + mode &= ~S_ISGID; + } + + zp->z_phys->zp_uid = uid; + zp->z_phys->zp_gid = gid; + zp->z_phys->zp_mode = mode; + + mutex_enter(&parent->z_lock); + pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE); + if (pull_down) { + mutex_enter(&parent->z_acl_lock); + VERIFY(0 == zfs_acl_node_read(parent, &paclp)); + mutex_exit(&parent->z_acl_lock); + aclp = zfs_acl_inherit(zp, paclp); + zfs_acl_free(paclp); + } else { + aclp = zfs_acl_alloc(6); + } + mutex_exit(&parent->z_lock); + mutex_enter(&zp->z_lock); + mutex_enter(&zp->z_acl_lock); + error = zfs_acl_chmod(zp, mode, aclp, tx); + mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); + ASSERT3U(error, ==, 0); + zfs_acl_free(aclp); +} + +/* + * Should ACE be inherited? + */ +static int +zfs_ace_can_use(znode_t *zp, ace_t *acep) +{ + int vtype = ZTOV(zp)->v_type; + + int iflags = (acep->a_flags & 0xf); + + if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!((vtype == VDIR) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); +} + +#ifdef TODO +/* + * Retrieve a files ACL + */ +int +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) +{ + zfs_acl_t *aclp; + ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + int error; + + if (error = zfs_zaccess(zp, ACE_READ_ACL, cr)) { + /* + * If owner of file then allow reading of the + * ACL. + */ + if (crgetuid(cr) != zp->z_phys->zp_uid) + return (error); + } + + if (mask == 0) + return (ENOSYS); + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, &aclp); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + + if (mask & VSA_ACECNT) { + vsecp->vsa_aclcnt = aclp->z_acl_count; + } + + if (mask & VSA_ACE) { + vsecp->vsa_aclentp = kmem_alloc(aclp->z_acl_count * + sizeof (ace_t), KM_SLEEP); + bcopy(aclp->z_acl, vsecp->vsa_aclentp, + aclp->z_acl_count * sizeof (ace_t)); + } + + mutex_exit(&zp->z_acl_lock); + + zfs_acl_free(aclp); + + return (0); +} +#endif /* TODO */ + +#ifdef TODO +/* + * Set a files ACL + */ +int +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + ace_t *acep = vsecp->vsa_aclentp; + int aclcnt = vsecp->vsa_aclcnt; + ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + dmu_tx_t *tx; + int error; + int inherit; + zfs_acl_t *aclp; + + if (mask == 0) + return (EINVAL); + + if (!zfs_acl_valid(zp, acep, aclcnt, &inherit)) + return (EINVAL); +top: + error = zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr); + if (error == EACCES || error == ACCESS_UNDETERMINED) { + if ((error = secpolicy_vnode_setdac(cr, + zp->z_phys->zp_uid)) != 0) { + return (error); + } + } else if (error) { + return (error == EROFS ? error : EPERM); + } + + mutex_enter(&zp->z_lock); + mutex_enter(&zp->z_acl_lock); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + + if (zp->z_phys->zp_acl.z_acl_extern_obj) { + dmu_tx_hold_write(tx, zp->z_phys->zp_acl.z_acl_extern_obj, + 0, ZFS_ACL_SIZE(aclcnt)); + } else if (aclcnt > ACE_SLOT_CNT) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ZFS_ACL_SIZE(aclcnt)); + } + + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + mutex_exit(&zp->z_acl_lock); + mutex_exit(&zp->z_lock); + + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + return (error); + } + + aclp = zfs_acl_alloc(aclcnt); + bcopy(acep, aclp->z_acl, sizeof (ace_t) * aclcnt); + aclp->z_acl_count = aclcnt; + error = zfs_aclset_common(zp, aclp, tx, &inherit); + ASSERT(error == 0); + + zfs_acl_free(aclp); + zfs_log_acl(zilog, tx, TX_ACL, zp, aclcnt, acep); + dmu_tx_commit(tx); +done: + mutex_exit(&zp->z_acl_lock); + mutex_exit(&zp->z_lock); + + return (error); +} +#endif /* TODO */ + +static int +zfs_ace_access(ace_t *zacep, int *working_mode) +{ + if (*working_mode == 0) { + return (0); + } + + if (zacep->a_access_mask & *working_mode) { + if (zacep->a_type == ALLOW) { + *working_mode &= + ~(*working_mode & zacep->a_access_mask); + if (*working_mode == 0) + return (0); + } else if (zacep->a_type == DENY) { + return (EACCES); + } + } + + /* + * haven't been specifcally denied at this point + * so return UNDETERMINED. + */ + + return (ACCESS_UNDETERMINED); +} + + +static int +zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr) +{ + zfs_acl_t *aclp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + ace_t *zacep; + gid_t gid; + int cnt; + int i; + int error; + int access_deny = ACCESS_UNDETERMINED; + uint_t entry_type; + uid_t uid = crgetuid(cr); + + if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ + *working_mode = 0; + return (0); + } + + *working_mode = v4_mode; + + if ((v4_mode & WRITE_MASK) && + (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && + (!IS_DEVVP(ZTOV(zp)))) { + return (EROFS); + } + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, &aclp); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + + zacep = aclp->z_acl; + cnt = aclp->z_acl_count; + + for (i = 0; i != cnt; i++) { + + DTRACE_PROBE2(zfs__access__common, + ace_t *, &zacep[i], int, *working_mode); + + if (zacep[i].a_flags & ACE_INHERIT_ONLY_ACE) + continue; + + entry_type = (zacep[i].a_flags & ACE_TYPE_FLAGS); + switch (entry_type) { + case ACE_OWNER: + if (uid == zp->z_phys->zp_uid) { + access_deny = zfs_ace_access(&zacep[i], + working_mode); + } + break; + case (ACE_IDENTIFIER_GROUP | ACE_GROUP): + case ACE_IDENTIFIER_GROUP: + /* + * Owning group gid is in znode not ACL + */ + if (entry_type == (ACE_IDENTIFIER_GROUP | ACE_GROUP)) + gid = zp->z_phys->zp_gid; + else + gid = zacep[i].a_who; + + if (groupmember(gid, cr)) { + access_deny = zfs_ace_access(&zacep[i], + working_mode); + } + break; + case ACE_EVERYONE: + access_deny = zfs_ace_access(&zacep[i], working_mode); + break; + + /* USER Entry */ + default: + if (entry_type == 0) { + if (uid == zacep[i].a_who) { + access_deny = zfs_ace_access(&zacep[i], + working_mode); + } + break; + } + zfs_acl_free(aclp); + mutex_exit(&zp->z_acl_lock); + return (EIO); + } + + if (access_deny != ACCESS_UNDETERMINED) + break; + } + + mutex_exit(&zp->z_acl_lock); + zfs_acl_free(aclp); + + return (access_deny); +} + + +/* + * Determine whether Access should be granted/denied, invoking least + * priv subsytem when a deny is determined. + */ +int +zfs_zaccess(znode_t *zp, int mode, cred_t *cr) +{ + int working_mode; + int error; + int is_attr; + znode_t *xzp; + znode_t *check_zp = zp; + + is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) && + (ZTOV(zp)->v_type == VDIR)); + + /* + * If attribute then validate against base file + */ + if (is_attr) { + if ((error = zfs_zget(zp->z_zfsvfs, + zp->z_phys->zp_parent, &xzp)) != 0) { + return (error); + } + check_zp = xzp; + /* + * fixup mode to map to xattr perms + */ + + if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { + mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mode |= ACE_WRITE_NAMED_ATTRS; + } + + if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { + mode &= ~(ACE_READ_DATA|ACE_EXECUTE); + mode |= ACE_READ_NAMED_ATTRS; + } + } + + error = zfs_zaccess_common(check_zp, mode, &working_mode, cr); + + if (error == EROFS) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (error); + } + + if (error || working_mode) { + working_mode = (zfs_v4_to_unix(working_mode) << 6); + error = secpolicy_vnode_access(cr, ZTOV(check_zp), + check_zp->z_phys->zp_uid, working_mode); + } + + if (is_attr) + VN_RELE(ZTOV(xzp)); + + return (error); +} + +/* + * Special zaccess function to check for special nfsv4 perm. + * doesn't call secpolicy_vnode_access() for failure, since that + * would probably be the wrong policy function to call. + * instead its up to the caller to handle that situation. + */ + +int +zfs_zaccess_v4_perm(znode_t *zp, int mode, cred_t *cr) +{ + int working_mode = 0; + return (zfs_zaccess_common(zp, mode, &working_mode, cr)); +} + +/* + * Translate tradition unix VREAD/VWRITE/VEXEC mode into + * native ACL format and call zfs_zaccess() + */ +int +zfs_zaccess_rwx(znode_t *zp, mode_t mode, cred_t *cr) +{ + int v4_mode = zfs_unix_to_v4(mode >> 6); + + return (zfs_zaccess(zp, v4_mode, cr)); +} + +static int +zfs_delete_final_check(znode_t *zp, znode_t *dzp, cred_t *cr) +{ + int error; + + error = secpolicy_vnode_access(cr, ZTOV(zp), + dzp->z_phys->zp_uid, S_IWRITE|S_IEXEC); + + if (error == 0) + error = zfs_sticky_remove_access(dzp, zp, cr); + + return (error); +} + +/* + * Determine whether Access should be granted/deny, without + * consulting least priv subsystem. + * + * + * The following chart is the recommended NFSv4 enforcement for + * ability to delete an object. + * + * ------------------------------------------------------- + * | Parent Dir | Target Object Permissions | + * | permissions | | + * ------------------------------------------------------- + * | | ACL Allows | ACL Denies| Delete | + * | | Delete | Delete | unspecified| + * ------------------------------------------------------- + * | ACL Allows | Permit | Permit | Permit | + * | DELETE_CHILD | | + * ------------------------------------------------------- + * | ACL Denies | Permit | Deny | Deny | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL specifies | | | | + * | only allow | Permit | Permit | Permit | + * | write and | | | | + * | execute | | | | + * ------------------------------------------------------- + * | ACL denies | | | | + * | write and | Permit | Deny | Deny | + * | execute | | | | + * ------------------------------------------------------- + * ^ + * | + * No search privilege, can't even look up file? + * + */ +int +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) +{ + int dzp_working_mode = 0; + int zp_working_mode = 0; + int dzp_error, zp_error; + + /* + * Arghh, this check is going to require a couple of questions + * to be asked. We want specific DELETE permissions to + * take precedence over WRITE/EXECUTE. We don't + * want an ACL such as this to mess us up. + * user:joe:write_data:deny,user:joe:delete:allow + * + * However, deny permissions may ultimately be overridden + * by secpolicy_vnode_access(). + */ + + dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, + &dzp_working_mode, cr); + zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, cr); + + if (dzp_error == EROFS || zp_error == EROFS) + return (dzp_error); + + /* + * First check the first row. + * We only need to see if parent Allows delete_child + */ + if ((dzp_working_mode & ACE_DELETE_CHILD) == 0) + return (0); + + /* + * Second row + * we already have the necessary information in + * zp_working_mode, zp_error and dzp_error. + */ + + if ((zp_working_mode & ACE_DELETE) == 0) + return (0); + + /* + * Now zp_error should either be EACCES which indicates + * a "deny" delete entry or ACCESS_UNDETERMINED if the "delete" + * entry exists on the target. + * + * dzp_error should be either EACCES which indicates a "deny" + * entry for delete_child or ACCESS_UNDETERMINED if no delete_child + * entry exists. If value is EACCES then we are done + * and zfs_delete_final_check() will make the final decision + * regarding to allow the delete. + */ + + ASSERT(zp_error != 0 && dzp_error != 0); + if (dzp_error == EACCES) + return (zfs_delete_final_check(zp, dzp, cr)); + + /* + * Third Row + * Only need to check for write/execute on parent + */ + + dzp_error = zfs_zaccess_common(dzp, ACE_WRITE_DATA|ACE_EXECUTE, + &dzp_working_mode, cr); + + if (dzp_error == EROFS) + return (dzp_error); + + if ((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) == 0) + return (zfs_sticky_remove_access(dzp, zp, cr)); + + /* + * Fourth Row + */ + + if (((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) != 0) && + ((zp_working_mode & ACE_DELETE) == 0)) + return (zfs_sticky_remove_access(dzp, zp, cr)); + + return (zfs_delete_final_check(zp, dzp, cr)); +} + +int +zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, + znode_t *tzp, cred_t *cr) +{ + int add_perm; + int error; + + add_perm = (ZTOV(szp)->v_type == VDIR) ? + ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; + + /* + * Rename permissions are combination of delete permission + + * add file/subdir permission. + */ + + /* + * first make sure we do the delete portion. + * + * If that succeeds then check for add_file/add_subdir permissions + */ + + if (error = zfs_zaccess_delete(sdzp, szp, cr)) + return (error); + + /* + * If we have a tzp, see if we can delete it? + */ + if (tzp) { + if (error = zfs_zaccess_delete(tdzp, tzp, cr)) + return (error); + } + + /* + * Now check for add permissions + */ + error = zfs_zaccess(tdzp, add_perm, cr); + + return (error); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c new file mode 100644 index 0000000..c8450d4 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c @@ -0,0 +1,99 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/vfs.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_acl.h> + +void +zfs_ace_byteswap(ace_t *ace, int ace_cnt) +{ + int i; + + for (i = 0; i != ace_cnt; i++, ace++) { + ace->a_who = BSWAP_32(ace->a_who); + ace->a_access_mask = BSWAP_32(ace->a_access_mask); + ace->a_flags = BSWAP_16(ace->a_flags); + ace->a_type = BSWAP_16(ace->a_type); + } +} + +/* ARGSUSED */ +void +zfs_acl_byteswap(void *buf, size_t size) +{ + int cnt; + + /* + * Arggh, since we don't know how many ACEs are in + * the array, we have to swap the entire block + */ + + cnt = size / sizeof (ace_t); + + zfs_ace_byteswap((ace_t *)buf, cnt); +} + +void +zfs_znode_byteswap(void *buf, size_t size) +{ + znode_phys_t *zp = buf; + + ASSERT(size >= sizeof (znode_phys_t)); + + zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]); + zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]); + zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]); + zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]); + zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]); + zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]); + zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]); + zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]); + zp->zp_gen = BSWAP_64(zp->zp_gen); + zp->zp_mode = BSWAP_64(zp->zp_mode); + zp->zp_size = BSWAP_64(zp->zp_size); + zp->zp_parent = BSWAP_64(zp->zp_parent); + zp->zp_links = BSWAP_64(zp->zp_links); + zp->zp_xattr = BSWAP_64(zp->zp_xattr); + zp->zp_rdev = BSWAP_64(zp->zp_rdev); + zp->zp_flags = BSWAP_64(zp->zp_flags); + zp->zp_uid = BSWAP_64(zp->zp_uid); + zp->zp_gid = BSWAP_64(zp->zp_gid); + zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]); + zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]); + zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]); + zp->zp_pad[3] = BSWAP_64(zp->zp_pad[3]); + + zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj); + zp->zp_acl.z_acl_count = BSWAP_32(zp->zp_acl.z_acl_count); + zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version); + zp->zp_acl.z_acl_pad = BSWAP_16(zp->zp_acl.z_acl_pad); + zfs_ace_byteswap(&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c new file mode 100644 index 0000000..c759962 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c @@ -0,0 +1,1120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ZFS control directory (a.k.a. ".zfs") + * + * This directory provides a common location for all ZFS meta-objects. + * Currently, this is only the 'snapshot' directory, but this may expand in the + * future. The elements are built using the GFS primitives, as the hierarchy + * does not actually exist on disk. + * + * For 'snapshot', we don't want to have all snapshots always mounted, because + * this would take up a huge amount of space in /etc/mnttab. We have three + * types of objects: + * + * ctldir ------> snapshotdir -------> snapshot + * | + * | + * V + * mounted fs + * + * The 'snapshot' node contains just enough information to lookup '..' and act + * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we + * perform an automount of the underlying filesystem and return the + * corresponding vnode. + * + * All mounts are handled automatically by the kernel, but unmounts are + * (currently) handled from user land. The main reason is that there is no + * reliable way to auto-unmount the filesystem when it's "no longer in use". + * When the user unmounts a filesystem, we call zfsctl_unmount(), which + * unmounts any snapshots within the snapshot directory. + */ + +#include <sys/zfs_context.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_vfsops.h> +#include <sys/namei.h> +#include <sys/gfs.h> +#include <sys/stat.h> +#include <sys/dmu.h> +#include <sys/mount.h> + +typedef struct { + char *se_name; + vnode_t *se_root; + avl_node_t se_node; +} zfs_snapentry_t; + +static int +snapentry_compare(const void *a, const void *b) +{ + const zfs_snapentry_t *sa = a; + const zfs_snapentry_t *sb = b; + int ret = strcmp(sa->se_name, sb->se_name); + + if (ret < 0) + return (-1); + else if (ret > 0) + return (1); + else + return (0); +} + +static struct vop_vector zfsctl_ops_root; +static struct vop_vector zfsctl_ops_snapdir; +static struct vop_vector zfsctl_ops_snapshot; + +static vnode_t *zfsctl_mknode_snapdir(vnode_t *); +static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); + +typedef struct zfsctl_node { + gfs_dir_t zc_gfs_private; + uint64_t zc_id; + timestruc_t zc_cmtime; /* ctime and mtime, always the same */ +} zfsctl_node_t; + +typedef struct zfsctl_snapdir { + zfsctl_node_t sd_node; + kmutex_t sd_lock; + avl_tree_t sd_snaps; +} zfsctl_snapdir_t; + +/* + * Root directory elements. We have only a single static entry, 'snapshot'. + */ +static gfs_dirent_t zfsctl_root_entries[] = { + { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE }, + { NULL } +}; + +/* include . and .. in the calculation */ +#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \ + sizeof (gfs_dirent_t)) + 1) + + +/* + * Initialize the various GFS pieces we'll need to create and manipulate .zfs + * directories. This is called from the ZFS init routine, and initializes the + * vnode ops vectors that we'll be using. + */ +void +zfsctl_init(void) +{ +} + +void +zfsctl_fini(void) +{ +} + +/* + * Return the inode number associated with the 'snapshot' directory. + */ +/* ARGSUSED */ +static ino64_t +zfsctl_root_inode_cb(vnode_t *vp, int index) +{ + ASSERT(index == 0); + return (ZFSCTL_INO_SNAPDIR); +} + +/* + * Create the '.zfs' directory. This directory is cached as part of the VFS + * structure. This results in a hold on the vfs_t. The code in zfs_umount() + * therefore checks against a vfs_count of 2 instead of 1. This reference + * is removed when the ctldir is destroyed in the unmount. + */ +void +zfsctl_create(zfsvfs_t *zfsvfs) +{ + vnode_t *vp, *rvp; + zfsctl_node_t *zcp; + + ASSERT(zfsvfs->z_ctldir == NULL); + + vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs, + &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries, + zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL); + zcp = vp->v_data; + zcp->zc_id = ZFSCTL_INO_ROOT; + + VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp, curthread) == 0); + ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime); + VN_URELE(rvp); + + /* + * We're only faking the fact that we have a root of a filesystem for + * the sake of the GFS interfaces. Undo the flag manipulation it did + * for us. + */ + vp->v_vflag &= ~VV_ROOT; + + zfsvfs->z_ctldir = vp; +} + +/* + * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. + * There might still be more references if we were force unmounted, but only + * new zfs_inactive() calls can occur and they don't reference .zfs + */ +void +zfsctl_destroy(zfsvfs_t *zfsvfs) +{ + VN_RELE(zfsvfs->z_ctldir); + zfsvfs->z_ctldir = NULL; +} + +/* + * Given a root znode, retrieve the associated .zfs directory. + * Add a hold to the vnode and return it. + */ +vnode_t * +zfsctl_root(znode_t *zp) +{ + ASSERT(zfs_has_ctldir(zp)); + VN_HOLD(zp->z_zfsvfs->z_ctldir); + return (zp->z_zfsvfs->z_ctldir); +} + +/* + * Common open routine. Disallow any write access. + */ +/* ARGSUSED */ +static int +zfsctl_common_open(struct vop_open_args *ap) +{ + int flags = ap->a_mode; + + if (flags & FWRITE) + return (EACCES); + + return (0); +} + +/* + * Common close routine. Nothing to do here. + */ +/* ARGSUSED */ +static int +zfsctl_common_close(struct vop_close_args *ap) +{ + return (0); +} + +/* + * Common access routine. Disallow writes. + */ +/* ARGSUSED */ +static int +zfsctl_common_access(ap) + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + int mode = ap->a_mode; + + if (mode & VWRITE) + return (EACCES); + + return (0); +} + +/* + * Common getattr function. Fill in basic information. + */ +static void +zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) +{ + zfsctl_node_t *zcp = vp->v_data; + timestruc_t now; + + vap->va_uid = 0; + vap->va_gid = 0; + vap->va_rdev = 0; + /* + * We are a purly virtual object, so we have no + * blocksize or allocated blocks. + */ + vap->va_blksize = 0; + vap->va_nblocks = 0; + vap->va_seq = 0; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; + vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | + S_IROTH | S_IXOTH; + vap->va_type = VDIR; + /* + * We live in the now (for atime). + */ + gethrestime(&now); + vap->va_atime = now; + vap->va_mtime = vap->va_ctime = vap->va_birthtime = zcp->zc_cmtime; + /* FreeBSD: Reset chflags(2) flags. */ + vap->va_flags = 0; +} + +static int +zfsctl_common_fid(ap) + struct vop_fid_args /* { + struct vnode *a_vp; + struct fid *a_fid; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + fid_t *fidp = (void *)ap->a_fid; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + zfsctl_node_t *zcp = vp->v_data; + uint64_t object = zcp->zc_id; + zfid_short_t *zfid; + int i; + + ZFS_ENTER(zfsvfs); + + fidp->fid_len = SHORT_FID_LEN; + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = SHORT_FID_LEN; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* .zfs znodes always have a generation number of 0 */ + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = 0; + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfsctl_common_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + + /* + * Destroy the vm object and flush associated pages. + */ + vnode_destroy_vobject(vp); + VI_LOCK(vp); + vp->v_data = NULL; + VI_UNLOCK(vp); + return (0); +} + +/* + * .zfs inode namespace + * + * We need to generate unique inode numbers for all files and directories + * within the .zfs pseudo-filesystem. We use the following scheme: + * + * ENTRY ZFSCTL_INODE + * .zfs 1 + * .zfs/snapshot 2 + * .zfs/snapshot/<snap> objectid(snap) + */ + +#define ZFSCTL_INO_SNAP(id) (id) + +/* + * Get root directory attributes. + */ +/* ARGSUSED */ +static int +zfsctl_root_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct vattr *vap = ap->a_vap; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + + ZFS_ENTER(zfsvfs); + vap->va_nodeid = ZFSCTL_INO_ROOT; + vap->va_nlink = vap->va_size = NROOT_ENTRIES; + + zfsctl_common_getattr(vp, vap); + ZFS_EXIT(zfsvfs); + + return (0); +} + +/* + * Special case the handling of "..". + */ +/* ARGSUSED */ +int +zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr) +{ + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + int err; + + ZFS_ENTER(zfsvfs); + + if (strcmp(nm, "..") == 0) { + err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp, curthread); + if (err == 0) + VOP_UNLOCK(*vpp, 0, curthread); + } else { + err = gfs_dir_lookup(dvp, nm, vpp); + } + + ZFS_EXIT(zfsvfs); + + return (err); +} + +/* + * Special case the handling of "..". + */ +/* ARGSUSED */ +int +zfsctl_root_lookup_vop(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + cred_t *cr = ap->a_cnp->cn_cred; + int flags = ap->a_cnp->cn_flags; + int nameiop = ap->a_cnp->cn_nameiop; + char nm[NAME_MAX + 1]; + int err; + + if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE)) + return (EOPNOTSUPP); + + ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); + strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); + + err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr); + if (err == 0 && (nm[0] != '.' || nm[1] != '\0')) + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread); + + return (err); +} + +static struct vop_vector zfsctl_ops_root = { + .vop_default = &default_vnodeops, + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_ioctl = VOP_EINVAL, + .vop_getattr = zfsctl_root_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = gfs_vop_readdir, + .vop_lookup = zfsctl_root_lookup_vop, + .vop_inactive = gfs_vop_inactive, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, +}; + +static int +zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) +{ + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + + dmu_objset_name(os, zname); + if (strlen(zname) + 1 + strlen(name) >= len) + return (ENAMETOOLONG); + (void) strcat(zname, "@"); + (void) strcat(zname, name); + return (0); +} + +static int +zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr) +{ + zfsctl_snapdir_t *sdp = dvp->v_data; + zfs_snapentry_t search, *sep; + struct vop_inactive_args ap; + avl_index_t where; + int err; + + ASSERT(MUTEX_HELD(&sdp->sd_lock)); + + search.se_name = (char *)name; + if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) + return (ENOENT); + + ASSERT(vn_ismntpt(sep->se_root)); + + /* this will be dropped by dounmount() */ + if ((err = vn_vfswlock(sep->se_root)) != 0) + return (err); + + err = dounmount(vn_mountedvfs(sep->se_root), force, curthread); + if (err) + return (err); + ASSERT(sep->se_root->v_count == 1); + ap.a_vp = sep->se_root; + gfs_vop_inactive(&ap); + + avl_remove(&sdp->sd_snaps, sep); + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + kmem_free(sep, sizeof (zfs_snapentry_t)); + + return (0); +} + +#if 0 +static void +zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) +{ + avl_index_t where; + vfs_t *vfsp; + refstr_t *pathref; + char newpath[MAXNAMELEN]; + char *tail; + + ASSERT(MUTEX_HELD(&sdp->sd_lock)); + ASSERT(sep != NULL); + + vfsp = vn_mountedvfs(sep->se_root); + ASSERT(vfsp != NULL); + + vfs_lock_wait(vfsp); + + /* + * Change the name in the AVL tree. + */ + avl_remove(&sdp->sd_snaps, sep); + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); + (void) strcpy(sep->se_name, nm); + VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL); + avl_insert(&sdp->sd_snaps, sep, where); + + /* + * Change the current mountpoint info: + * - update the tail of the mntpoint path + * - update the tail of the resource path + */ + pathref = vfs_getmntpoint(vfsp); + (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); + VERIFY((tail = strrchr(newpath, '/')) != NULL); + *(tail+1) = '\0'; + ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); + (void) strcat(newpath, nm); + refstr_rele(pathref); + vfs_setmntpoint(vfsp, newpath); + + pathref = vfs_getresource(vfsp); + (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); + VERIFY((tail = strrchr(newpath, '@')) != NULL); + *(tail+1) = '\0'; + ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); + (void) strcat(newpath, nm); + refstr_rele(pathref); + vfs_setresource(vfsp, newpath); + + vfs_unlock(vfsp); +} +#endif + +#if 0 +static int +zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, + cred_t *cr) +{ + zfsctl_snapdir_t *sdp = sdvp->v_data; + zfs_snapentry_t search, *sep; + avl_index_t where; + char from[MAXNAMELEN], to[MAXNAMELEN]; + int err; + + err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from); + if (err) + return (err); + err = zfs_secpolicy_write(from, cr); + if (err) + return (err); + + /* + * Cannot move snapshots out of the snapdir. + */ + if (sdvp != tdvp) + return (EINVAL); + + if (strcmp(snm, tnm) == 0) + return (0); + + err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to); + if (err) + return (err); + + mutex_enter(&sdp->sd_lock); + + search.se_name = (char *)snm; + if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) { + mutex_exit(&sdp->sd_lock); + return (ENOENT); + } + + err = dmu_objset_rename(from, to); + if (err == 0) + zfsctl_rename_snap(sdp, sep, tnm); + + mutex_exit(&sdp->sd_lock); + + return (err); +} +#endif + +#if 0 +/* ARGSUSED */ +static int +zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) +{ + zfsctl_snapdir_t *sdp = dvp->v_data; + char snapname[MAXNAMELEN]; + int err; + + err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname); + if (err) + return (err); + err = zfs_secpolicy_write(snapname, cr); + if (err) + return (err); + + mutex_enter(&sdp->sd_lock); + + err = zfsctl_unmount_snap(dvp, name, 0, cr); + if (err) { + mutex_exit(&sdp->sd_lock); + return (err); + } + + err = dmu_objset_destroy(snapname); + + mutex_exit(&sdp->sd_lock); + + return (err); +} +#endif + +/* + * Lookup entry point for the 'snapshot' directory. Try to open the + * snapshot if it exist, creating the pseudo filesystem vnode as necessary. + * Perform a mount of the associated dataset on top of the vnode. + */ +/* ARGSUSED */ +int +zfsctl_snapdir_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + char nm[NAME_MAX + 1]; + zfsctl_snapdir_t *sdp = dvp->v_data; + objset_t *snap; + char snapname[MAXNAMELEN]; + char *mountpoint; + zfs_snapentry_t *sep, search; + size_t mountpoint_len; + avl_index_t where; + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + int err; + + ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); + strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); + + ASSERT(dvp->v_type == VDIR); + + if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) + return (0); + + *vpp = NULL; + + /* + * If we get a recursive call, that means we got called + * from the domount() code while it was trying to look up the + * spec (which looks like a local path for zfs). We need to + * add some flag to domount() to tell it not to do this lookup. + */ + if (MUTEX_HELD(&sdp->sd_lock)) + return (ENOENT); + + ZFS_ENTER(zfsvfs); + + mutex_enter(&sdp->sd_lock); + search.se_name = (char *)nm; + if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { + *vpp = sep->se_root; + VN_HOLD(*vpp); + if ((*vpp)->v_mountedhere == NULL) { + /* + * The snapshot was unmounted behind our backs, + * try to remount it. + */ + goto domount; + } + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread); + mutex_exit(&sdp->sd_lock); + ZFS_EXIT(zfsvfs); + return (0); + } + + /* + * The requested snapshot is not currently mounted, look it up. + */ + err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname); + if (err) { + mutex_exit(&sdp->sd_lock); + ZFS_EXIT(zfsvfs); + return (err); + } + if (dmu_objset_open(snapname, DMU_OST_ZFS, + DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) { + mutex_exit(&sdp->sd_lock); + ZFS_EXIT(zfsvfs); + return (ENOENT); + } + + sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP); + sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); + (void) strcpy(sep->se_name, nm); + *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); + VN_HOLD(*vpp); + avl_insert(&sdp->sd_snaps, sep, where); + + dmu_objset_close(snap); +domount: + mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + + strlen("/.zfs/snapshot/") + strlen(nm) + 1; + mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); + (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s", + dvp->v_vfsp->mnt_stat.f_mntonname, nm); + err = domount(curthread, *vpp, "zfs", mountpoint, snapname, 0); + kmem_free(mountpoint, mountpoint_len); + /* FreeBSD: This line was moved from below to avoid a lock recursion. */ + if (err == 0) + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread); + mutex_exit(&sdp->sd_lock); + + /* + * If we had an error, drop our hold on the vnode and + * zfsctl_snapshot_inactive() will clean up. + */ + if (err) { + VN_RELE(*vpp); + *vpp = NULL; + } + return (err); +} + +/* ARGSUSED */ +static int +zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, + offset_t *offp, offset_t *nextp, void *data) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + char snapname[MAXNAMELEN]; + uint64_t id, cookie; + + ZFS_ENTER(zfsvfs); + + cookie = *offp; + if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, + &cookie) == ENOENT) { + *eofp = 1; + ZFS_EXIT(zfsvfs); + return (0); + } + + (void) strcpy(dp->d_name, snapname); + dp->d_ino = ZFSCTL_INO_SNAP(id); + *nextp = cookie; + + ZFS_EXIT(zfsvfs); + + return (0); +} + +vnode_t * +zfsctl_mknode_snapdir(vnode_t *pvp) +{ + vnode_t *vp; + zfsctl_snapdir_t *sdp; + + vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp, + &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN, + zfsctl_snapdir_readdir_cb, NULL); + sdp = vp->v_data; + sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR; + sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; + mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&sdp->sd_snaps, snapentry_compare, + sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node)); + return (vp); +} + +/* ARGSUSED */ +static int +zfsctl_snapdir_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct vattr *vap = ap->a_vap; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + zfsctl_snapdir_t *sdp = vp->v_data; + + ZFS_ENTER(zfsvfs); + zfsctl_common_getattr(vp, vap); + vap->va_nodeid = gfs_file_inode(vp); + vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2; + ZFS_EXIT(zfsvfs); + + return (0); +} + +/* ARGSUSED */ +static int +zfsctl_snapdir_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + zfsctl_snapdir_t *sdp = vp->v_data; + void *private; + + private = gfs_dir_inactive(vp); + if (private != NULL) { + ASSERT(avl_numnodes(&sdp->sd_snaps) == 0); + mutex_destroy(&sdp->sd_lock); + avl_destroy(&sdp->sd_snaps); + kmem_free(private, sizeof (zfsctl_snapdir_t)); + } + return (0); +} + +static struct vop_vector zfsctl_ops_snapdir = { + .vop_default = &default_vnodeops, + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_ioctl = VOP_EINVAL, + .vop_getattr = zfsctl_snapdir_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = gfs_vop_readdir, + .vop_lookup = zfsctl_snapdir_lookup, + .vop_inactive = zfsctl_snapdir_inactive, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, +}; + +static vnode_t * +zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) +{ + vnode_t *vp; + zfsctl_node_t *zcp; + + vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp, + &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); + zcp = vp->v_data; + zcp->zc_id = objset; + + return (vp); +} + +static int +zfsctl_snapshot_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + struct vop_inactive_args iap; + zfsctl_snapdir_t *sdp; + zfs_snapentry_t *sep, *next; + int locked; + vnode_t *dvp; + + VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0); + sdp = dvp->v_data; + VOP_UNLOCK(dvp, 0, ap->a_td); + + if (!(locked = MUTEX_HELD(&sdp->sd_lock))) + mutex_enter(&sdp->sd_lock); + + if (vp->v_count > 1) { + if (!locked) + mutex_exit(&sdp->sd_lock); + return (0); + } + ASSERT(!vn_ismntpt(vp)); + + sep = avl_first(&sdp->sd_snaps); + while (sep != NULL) { + next = AVL_NEXT(&sdp->sd_snaps, sep); + + if (sep->se_root == vp) { + avl_remove(&sdp->sd_snaps, sep); + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + kmem_free(sep, sizeof (zfs_snapentry_t)); + break; + } + sep = next; + } + ASSERT(sep != NULL); + + if (!locked) + mutex_exit(&sdp->sd_lock); + VN_RELE(dvp); + + /* + * Dispose of the vnode for the snapshot mount point. + * This is safe to do because once this entry has been removed + * from the AVL tree, it can't be found again, so cannot become + * "active". If we lookup the same name again we will end up + * creating a new vnode. + */ + iap.a_vp = vp; + return (gfs_vop_inactive(&iap)); +} + +static int +zfsctl_traverse_begin(vnode_t **vpp, kthread_t *td) +{ + int err; + + VN_HOLD(*vpp); + /* Snapshot should be already mounted, but just in case. */ + if (vn_mountedvfs(*vpp) == NULL) + return (ENOENT); + err = traverse(vpp); + if (err == 0) + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); + return (err); +} + +static void +zfsctl_traverse_end(vnode_t *vp, int err) +{ + + if (err == 0) + vput(vp); + else + VN_RELE(vp); +} + +static int +zfsctl_snapshot_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + int err; + + err = zfsctl_traverse_begin(&vp, ap->a_td); + if (err == 0) + err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td); + zfsctl_traverse_end(vp, err); + return (err); +} + +static int +zfsctl_snapshot_fid(ap) + struct vop_fid_args /* { + struct vnode *a_vp; + struct fid *a_fid; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + int err; + + err = zfsctl_traverse_begin(&vp, curthread); + if (err == 0) + err = VOP_VPTOFH(vp, (void *)ap->a_fid); + zfsctl_traverse_end(vp, err); + return (err); +} + +/* + * These VP's should never see the light of day. They should always + * be covered. + */ +static struct vop_vector zfsctl_ops_snapshot = { + .vop_default = &default_vnodeops, + .vop_inactive = zfsctl_snapshot_inactive, + .vop_reclaim = zfsctl_common_reclaim, + .vop_getattr = zfsctl_snapshot_getattr, + .vop_fid = zfsctl_snapshot_fid, +}; + +int +zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + vnode_t *dvp, *vp; + zfsctl_snapdir_t *sdp; + zfsctl_node_t *zcp; + zfs_snapentry_t *sep; + int error; + + ASSERT(zfsvfs->z_ctldir != NULL); + error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, + NULL, 0, NULL, kcred); + if (error != 0) + return (error); + sdp = dvp->v_data; + + mutex_enter(&sdp->sd_lock); + sep = avl_first(&sdp->sd_snaps); + while (sep != NULL) { + vp = sep->se_root; + zcp = vp->v_data; + if (zcp->zc_id == objsetid) + break; + + sep = AVL_NEXT(&sdp->sd_snaps, sep); + } + + if (sep != NULL) { + VN_HOLD(vp); + error = traverse(&vp); + if (error == 0) { + if (vp == sep->se_root) + error = EINVAL; + else + *zfsvfsp = VTOZ(vp)->z_zfsvfs; + } + mutex_exit(&sdp->sd_lock); + VN_RELE(vp); + } else { + error = EINVAL; + mutex_exit(&sdp->sd_lock); + } + + VN_RELE(dvp); + + return (error); +} + +/* + * Unmount any snapshots for the given filesystem. This is called from + * zfs_umount() - if we have a ctldir, then go through and unmount all the + * snapshots. + */ +int +zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) +{ + struct vop_inactive_args ap; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + vnode_t *dvp, *svp; + zfsctl_snapdir_t *sdp; + zfs_snapentry_t *sep, *next; + int error; + + ASSERT(zfsvfs->z_ctldir != NULL); + error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, + NULL, 0, NULL, cr); + if (error != 0) + return (error); + sdp = dvp->v_data; + + mutex_enter(&sdp->sd_lock); + + sep = avl_first(&sdp->sd_snaps); + while (sep != NULL) { + svp = sep->se_root; + next = AVL_NEXT(&sdp->sd_snaps, sep); + + /* + * If this snapshot is not mounted, then it must + * have just been unmounted by somebody else, and + * will be cleaned up by zfsctl_snapdir_inactive(). + */ + if (vn_ismntpt(svp)) { + if ((error = vn_vfswlock(svp)) != 0) + goto out; + + /* + * Increase usecount, so dounmount() won't vrele() it + * to 0 and call zfsctl_snapdir_inactive(). + */ + VN_HOLD(svp); + vfsp = vn_mountedvfs(svp); + mtx_lock(&Giant); + error = dounmount(vfsp, fflags, curthread); + mtx_unlock(&Giant); + if (error != 0) { + VN_RELE(svp); + goto out; + } + + avl_remove(&sdp->sd_snaps, sep); + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + kmem_free(sep, sizeof (zfs_snapentry_t)); + + /* + * We can't use VN_RELE(), as that will try to + * invoke zfsctl_snapdir_inactive(), and that + * would lead to an attempt to re-grab the sd_lock. + */ + ASSERT3U(svp->v_count, ==, 1); + ap.a_vp = svp; + gfs_vop_inactive(&ap); + } + sep = next; + } +out: + mutex_exit(&sdp->sd_lock); + VN_RELE(dvp); + + return (error); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c new file mode 100644 index 0000000..486aa74 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c @@ -0,0 +1,796 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/uio.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/unistd.h> +#include <sys/random.h> +#include <sys/kcondvar.h> +#include <sys/callb.h> +#include <sys/smp.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/fs/zfs.h> +#include <sys/zap.h> +#include <sys/dmu.h> +#include <sys/atomic.h> +#include <sys/zfs_ctldir.h> +#include <sys/dnlc.h> + +/* + * Lock a directory entry. A dirlock on <dzp, name> protects that name + * in dzp's directory zap object. As long as you hold a dirlock, you can + * assume two things: (1) dzp cannot be reaped, and (2) no other thread + * can change the zap entry for (i.e. link or unlink) this name. + * + * Input arguments: + * dzp - znode for directory + * name - name of entry to lock + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. + * ZSHARED: allow concurrent access with other ZSHARED callers. + * ZXATTR: we want dzp's xattr directory + * + * Output arguments: + * zpp - pointer to the znode for the entry (NULL if there isn't one) + * dlpp - pointer to the dirlock for this entry (NULL on error) + * + * Return value: 0 on success or errno on failure. + * + * NOTE: Always checks for, and rejects, '.' and '..'. + */ +int +zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, + int flag) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_dirlock_t *dl; + uint64_t zoid; + int error; + vnode_t *vp; + + *zpp = NULL; + *dlpp = NULL; + + /* + * Verify that we are not trying to lock '.', '..', or '.zfs' + */ + if (name[0] == '.' && + (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) || + zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) + return (EEXIST); + + /* + * Wait until there are no locks on this name. + */ + rw_enter(&dzp->z_name_lock, RW_READER); + mutex_enter(&dzp->z_lock); + for (;;) { + if (dzp->z_unlinked) { + mutex_exit(&dzp->z_lock); + rw_exit(&dzp->z_name_lock); + return (ENOENT); + } + for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) + if (strcmp(name, dl->dl_name) == 0) + break; + if (dl == NULL) { + /* + * Allocate a new dirlock and add it to the list. + */ + dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); + cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); + dl->dl_name = name; + dl->dl_sharecnt = 0; + dl->dl_namesize = 0; + dl->dl_dzp = dzp; + dl->dl_next = dzp->z_dirlocks; + dzp->z_dirlocks = dl; + break; + } + if ((flag & ZSHARED) && dl->dl_sharecnt != 0) + break; + cv_wait(&dl->dl_cv, &dzp->z_lock); + } + + if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { + /* + * We're the second shared reference to dl. Make a copy of + * dl_name in case the first thread goes away before we do. + * Note that we initialize the new name before storing its + * pointer into dl_name, because the first thread may load + * dl->dl_name at any time. He'll either see the old value, + * which is his, or the new shared copy; either is OK. + */ + dl->dl_namesize = strlen(dl->dl_name) + 1; + name = kmem_alloc(dl->dl_namesize, KM_SLEEP); + bcopy(dl->dl_name, name, dl->dl_namesize); + dl->dl_name = name; + } + + mutex_exit(&dzp->z_lock); + + /* + * We have a dirlock on the name. (Note that it is the dirlock, + * not the dzp's z_lock, that protects the name in the zap object.) + * See if there's an object by this name; if so, put a hold on it. + */ + if (flag & ZXATTR) { + zoid = dzp->z_phys->zp_xattr; + error = (zoid == 0 ? ENOENT : 0); + } else { + vp = dnlc_lookup(ZTOV(dzp), name); + if (vp == DNLC_NO_VNODE) { + VN_RELE(vp); + error = ENOENT; + } else if (vp) { + if (flag & ZNEW) { + zfs_dirent_unlock(dl); + VN_RELE(vp); + return (EEXIST); + } + *dlpp = dl; + *zpp = VTOZ(vp); + return (0); + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, + 8, 1, &zoid); + zoid = ZFS_DIRENT_OBJ(zoid); + if (error == ENOENT) + dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE); + } + } + if (error) { + if (error != ENOENT || (flag & ZEXISTS)) { + zfs_dirent_unlock(dl); + return (error); + } + } else { + if (flag & ZNEW) { + zfs_dirent_unlock(dl); + return (EEXIST); + } + error = zfs_zget(zfsvfs, zoid, zpp); + if (error) { + zfs_dirent_unlock(dl); + return (error); + } + if (!(flag & ZXATTR)) + dnlc_update(ZTOV(dzp), name, ZTOV(*zpp)); + } + + *dlpp = dl; + + return (0); +} + +/* + * Unlock this directory entry and wake anyone who was waiting for it. + */ +void +zfs_dirent_unlock(zfs_dirlock_t *dl) +{ + znode_t *dzp = dl->dl_dzp; + zfs_dirlock_t **prev_dl, *cur_dl; + + mutex_enter(&dzp->z_lock); + rw_exit(&dzp->z_name_lock); + if (dl->dl_sharecnt > 1) { + dl->dl_sharecnt--; + mutex_exit(&dzp->z_lock); + return; + } + prev_dl = &dzp->z_dirlocks; + while ((cur_dl = *prev_dl) != dl) + prev_dl = &cur_dl->dl_next; + *prev_dl = dl->dl_next; + cv_broadcast(&dl->dl_cv); + mutex_exit(&dzp->z_lock); + + if (dl->dl_namesize != 0) + kmem_free(dl->dl_name, dl->dl_namesize); + cv_destroy(&dl->dl_cv); + kmem_free(dl, sizeof (*dl)); +} + +/* + * Look up an entry in a directory. + * + * NOTE: '.' and '..' are handled as special cases because + * no directory entries are actually stored for them. If this is + * the root of a filesystem, then '.zfs' is also treated as a + * special pseudo-directory. + */ +int +zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp) +{ + zfs_dirlock_t *dl; + znode_t *zp; + int error = 0; + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + *vpp = ZTOV(dzp); + VN_HOLD(*vpp); + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + /* + * If we are a snapshot mounted under .zfs, return + * the vp for the snapshot directory. + */ + if (dzp->z_phys->zp_parent == dzp->z_id && + zfsvfs->z_parent != zfsvfs) { + error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, + "snapshot", vpp, NULL, 0, NULL, kcred); + return (error); + } + rw_enter(&dzp->z_parent_lock, RW_READER); + error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp); + if (error == 0) + *vpp = ZTOV(zp); + rw_exit(&dzp->z_parent_lock); + } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { + *vpp = zfsctl_root(dzp); + } else { + error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED); + if (error == 0) { + *vpp = ZTOV(zp); + zfs_dirent_unlock(dl); + dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ + } + } + + return (error); +} + +static char * +zfs_unlinked_hexname(char namebuf[17], uint64_t x) +{ + char *name = &namebuf[16]; + const char digits[16] = "0123456789abcdef"; + + *name = '\0'; + do { + *--name = digits[x & 0xf]; + x >>= 4; + } while (x != 0); + + return (name); +} + +/* + * unlinked Set (formerly known as the "delete queue") Error Handling + * + * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we + * don't specify the name of the entry that we will be manipulating. We + * also fib and say that we won't be adding any new entries to the + * unlinked set, even though we might (this is to lower the minimum file + * size that can be deleted in a full filesystem). So on the small + * chance that the nlink list is using a fat zap (ie. has more than + * 2000 entries), we *may* not pre-read a block that's needed. + * Therefore it is remotely possible for some of the assertions + * regarding the unlinked set below to fail due to i/o error. On a + * nondebug system, this will result in the space being leaked. + */ +void +zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + char obj_name[17]; + int error; + + ASSERT(zp->z_unlinked); + ASSERT3U(zp->z_phys->zp_links, ==, 0); + + error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj, + zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx); + ASSERT3U(error, ==, 0); +} + +/* + * Clean up any znodes that had no links when we either crashed or + * (force) umounted the file system. + */ +void +zfs_unlinked_drain(zfsvfs_t *zfsvfs) +{ + zap_cursor_t zc; + zap_attribute_t zap; + dmu_object_info_t doi; + znode_t *zp; + int error; + + /* + * Interate over the contents of the unlinked set. + */ + for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); + zap_cursor_retrieve(&zc, &zap) == 0; + zap_cursor_advance(&zc)) { + + /* + * See what kind of object we have in list + */ + + error = dmu_object_info(zfsvfs->z_os, + zap.za_first_integer, &doi); + if (error != 0) + continue; + + ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || + (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); + /* + * We need to re-mark these list entries for deletion, + * so we pull them back into core and set zp->z_unlinked. + */ + error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); + + /* + * We may pick up znodes that are already marked for deletion. + * This could happen during the purge of an extended attribute + * directory. All we need to do is skip over them, since they + * are already in the system marked z_unlinked. + */ + if (error != 0) + continue; + + zp->z_unlinked = B_TRUE; + VN_RELE(ZTOV(zp)); + } + zap_cursor_fini(&zc); +} + +/* + * Delete the entire contents of a directory. Return a count + * of the number of entries that could not be deleted. + * + * NOTE: this function assumes that the directory is inactive, + * so there is no need to lock its entries before deletion. + * Also, it assumes the directory contents is *only* regular + * files. + */ +static int +zfs_purgedir(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + znode_t *xzp; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_dirlock_t dl; + int skipped = 0; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + error = zfs_zget(zfsvfs, + ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); + ASSERT3U(error, ==, 0); + + ASSERT((ZTOV(xzp)->v_type == VREG) || + (ZTOV(xzp)->v_type == VLNK)); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, dzp->z_id); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); + dmu_tx_hold_bonus(tx, xzp->z_id); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + VN_RELE(ZTOV(xzp)); + skipped += 1; + continue; + } + bzero(&dl, sizeof (dl)); + dl.dl_dzp = dzp; + dl.dl_name = zap.za_name; + + error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); + ASSERT3U(error, ==, 0); + dmu_tx_commit(tx); + + VN_RELE(ZTOV(xzp)); + } + zap_cursor_fini(&zc); + ASSERT(error == ENOENT); + return (skipped); +} + +void +zfs_rmnode(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + znode_t *xzp = NULL; + char obj_name[17]; + dmu_tx_t *tx; + uint64_t acl_obj; + int error; + int vfslocked; + + vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs); + + ASSERT(zp->z_phys->zp_links == 0); + + /* + * If this is an attribute directory, purge its contents. + */ + if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && + (zp->z_phys->zp_flags & ZFS_XATTR)) { + if (zfs_purgedir(zp) != 0) { + /* + * Not enough space to delete some xattrs. + * Leave it on the unlinked set. + */ + VFS_UNLOCK_GIANT(vfslocked); + return; + } + } + + /* + * If the file has extended attributes, we're going to unlink + * the xattr dir. + */ + if (zp->z_phys->zp_xattr) { + error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); + ASSERT(error == 0); + } + + acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; + + /* + * Set up the transaction. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + if (xzp) { + dmu_tx_hold_bonus(tx, xzp->z_id); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); + } + if (acl_obj) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + /* + * Not enough space to delete the file. Leave it in the + * unlinked set, leaking it until the fs is remounted (at + * which point we'll call zfs_unlinked_drain() to process it). + */ + dmu_tx_abort(tx); + VFS_UNLOCK_GIANT(vfslocked); + return; + } + + if (xzp) { + dmu_buf_will_dirty(xzp->z_dbuf, tx); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ + xzp->z_phys->zp_links = 0; /* no more links to it */ + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + } + + /* Remove this znode from the unlinked set */ + error = zap_remove(os, zfsvfs->z_unlinkedobj, + zfs_unlinked_hexname(obj_name, zp->z_id), tx); + ASSERT3U(error, ==, 0); + + zfs_znode_delete(zp, tx); + + dmu_tx_commit(tx); + + if (xzp) + VN_RELE(ZTOV(xzp)); + VFS_UNLOCK_GIANT(vfslocked); +} + +/* + * Link zp into dl. Can only fail if zp has been unlinked. + */ +int +zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) +{ + znode_t *dzp = dl->dl_dzp; + vnode_t *vp = ZTOV(zp); + uint64_t value; + int zp_is_dir = (vp->v_type == VDIR); + int error; + + dmu_buf_will_dirty(zp->z_dbuf, tx); + mutex_enter(&zp->z_lock); + + if (!(flag & ZRENAMING)) { + if (zp->z_unlinked) { /* no new links to unlinked zp */ + ASSERT(!(flag & (ZNEW | ZEXISTS))); + mutex_exit(&zp->z_lock); + return (ENOENT); + } + zp->z_phys->zp_links++; + } + zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */ + + if (!(flag & ZNEW)) + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + mutex_exit(&zp->z_lock); + + dmu_buf_will_dirty(dzp->z_dbuf, tx); + mutex_enter(&dzp->z_lock); + dzp->z_phys->zp_size++; /* one dirent added */ + dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */ + zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); + mutex_exit(&dzp->z_lock); + + /* + * MacOS X will fill in the 4-bit object type here. + */ + value = ZFS_DIRENT_MAKE(IFTODT(zp->z_phys->zp_mode), zp->z_id); + error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, + 8, 1, &value, tx); + ASSERT(error == 0); + + dnlc_update(ZTOV(dzp), dl->dl_name, vp); + + return (0); +} + +/* + * Unlink zp from dl, and mark zp for deletion if this was the last link. + * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). + * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. + * If it's non-NULL, we use it to indicate whether the znode needs deletion, + * and it's the caller's job to do it. + */ +int +zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, + boolean_t *unlinkedp) +{ + znode_t *dzp = dl->dl_dzp; + vnode_t *vp = ZTOV(zp); + int zp_is_dir = (vp->v_type == VDIR); + boolean_t unlinked = B_FALSE; + int error; + + dnlc_remove(ZTOV(dzp), dl->dl_name); + + if (!(flag & ZRENAMING)) { + dmu_buf_will_dirty(zp->z_dbuf, tx); + + if (vn_vfswlock(vp)) /* prevent new mounts on zp */ + return (EBUSY); + + if (vn_ismntpt(vp)) { /* don't remove mount point */ + vn_vfsunlock(vp); + return (EBUSY); + } + + mutex_enter(&zp->z_lock); + if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */ + mutex_exit(&zp->z_lock); + vn_vfsunlock(vp); + return (ENOTEMPTY); + } + if (zp->z_phys->zp_links <= zp_is_dir) { + zfs_panic_recover("zfs: link count on vnode %p is %u, " + "should be at least %u", zp->z_vnode, + (int)zp->z_phys->zp_links, + zp_is_dir + 1); + zp->z_phys->zp_links = zp_is_dir + 1; + } + if (--zp->z_phys->zp_links == zp_is_dir) { + zp->z_unlinked = B_TRUE; + zp->z_phys->zp_links = 0; + unlinked = B_TRUE; + } else { + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + } + mutex_exit(&zp->z_lock); + vn_vfsunlock(vp); + } + + dmu_buf_will_dirty(dzp->z_dbuf, tx); + mutex_enter(&dzp->z_lock); + dzp->z_phys->zp_size--; /* one dirent removed */ + dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */ + zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); + mutex_exit(&dzp->z_lock); + + error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, tx); + ASSERT(error == 0); + + if (unlinkedp != NULL) + *unlinkedp = unlinked; + else if (unlinked) + zfs_unlinked_add(zp, tx); + + return (0); +} + +/* + * Indicate whether the directory is empty. Works with or without z_lock + * held, but can only be consider a hint in the latter case. Returns true + * if only "." and ".." remain and there's no work in progress. + */ +boolean_t +zfs_dirempty(znode_t *dzp) +{ + return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0); +} + +int +zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_t *xzp; + dmu_tx_t *tx; + uint64_t xoid; + int error; + + *xvpp = NULL; + + if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, cr)) + return (error); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) + dmu_tx_wait(tx); + dmu_tx_abort(tx); + return (error); + } + zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0); + ASSERT(xzp->z_id == xoid); + ASSERT(xzp->z_phys->zp_parent == zp->z_id); + dmu_buf_will_dirty(zp->z_dbuf, tx); + zp->z_phys->zp_xattr = xoid; + + (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, ""); + dmu_tx_commit(tx); + + *xvpp = ZTOV(xzp); + + return (0); +} + +/* + * Return a znode for the extended attribute directory for zp. + * ** If the directory does not already exist, it is created ** + * + * IN: zp - znode to obtain attribute directory from + * cr - credentials of caller + * flags - flags from the VOP_LOOKUP call + * + * OUT: xzpp - pointer to extended attribute znode + * + * RETURN: 0 on success + * error number on failure + */ +int +zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_t *xzp; + zfs_dirlock_t *dl; + vattr_t va; + int error; +top: + error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR); + if (error) + return (error); + + if (xzp != NULL) { + *xvpp = ZTOV(xzp); + zfs_dirent_unlock(dl); + return (0); + } + + ASSERT(zp->z_phys->zp_xattr == 0); + +#ifdef TODO + if (!(flags & CREATE_XATTR_DIR)) { + zfs_dirent_unlock(dl); + return (ENOENT); + } +#endif + + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { + zfs_dirent_unlock(dl); + return (EROFS); + } + + /* + * The ability to 'create' files in an attribute + * directory comes from the write_xattr permission on the base file. + * + * The ability to 'search' an attribute directory requires + * read_xattr permission on the base file. + * + * Once in a directory the ability to read/write attributes + * is controlled by the permissions on the attribute file. + */ + va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; + va.va_type = VDIR; + va.va_mode = S_IFDIR | S_ISVTX | 0777; + va.va_uid = (uid_t)zp->z_phys->zp_uid; + va.va_gid = (gid_t)zp->z_phys->zp_gid; + + error = zfs_make_xattrdir(zp, &va, xvpp, cr); + zfs_dirent_unlock(dl); + + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + /* NB: we already did dmu_tx_wait() if necessary */ + goto top; + } + + return (error); +} + +/* + * Decide whether it is okay to remove within a sticky directory. + * + * In sticky directories, write access is not sufficient; + * you can remove entries from a directory only if: + * + * you own the directory, + * you own the entry, + * the entry is a plain file and you have write access, + * or you are privileged (checked in secpolicy...). + * + * The function returns 0 if remove access is granted. + */ +int +zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) +{ + uid_t uid; + + if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ + return (0); + + if ((zdp->z_phys->zp_mode & S_ISVTX) == 0 || + (uid = crgetuid(cr)) == zdp->z_phys->zp_uid || + uid == zp->z_phys->zp_uid || + (ZTOV(zp)->v_type == VREG && + zfs_zaccess(zp, ACE_WRITE_DATA, cr) == 0)) + return (0); + else + return (secpolicy_vnode_remove(cr)); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c new file mode 100644 index 0000000..af765ba --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c @@ -0,0 +1,336 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> + +#include <sys/fm/fs/zfs.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> + +#ifdef _KERNEL +/* Including sys/bus.h is just too hard, so I declare what I need here. */ +extern void devctl_notify(const char *__system, const char *__subsystem, + const char *__type, const char *__data); +#endif + +/* + * This general routine is responsible for generating all the different ZFS + * ereports. The payload is dependent on the class, and which arguments are + * supplied to the function: + * + * EREPORT POOL VDEV IO + * block X X X + * data X X + * device X X + * pool X + * + * If we are in a loading state, all errors are chained together by the same + * SPA-wide ENA. + * + * For isolated I/O requests, we get the ENA from the zio_t. The propagation + * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want + * to chain together all ereports associated with a logical piece of data. For + * read I/Os, there are basically three 'types' of I/O, which form a roughly + * layered diagram: + * + * +---------------+ + * | Aggregate I/O | No associated logical data or device + * +---------------+ + * | + * V + * +---------------+ Reads associated with a piece of logical data. + * | Read I/O | This includes reads on behalf of RAID-Z, + * +---------------+ mirrors, gang blocks, retries, etc. + * | + * V + * +---------------+ Reads associated with a particular device, but + * | Physical I/O | no logical data. Issued as part of vdev caching + * +---------------+ and I/O aggregation. + * + * Note that 'physical I/O' here is not the same terminology as used in the rest + * of ZIO. Typically, 'physical I/O' simply means that there is no attached + * blockpointer. But I/O with no associated block pointer can still be related + * to a logical piece of data (i.e. RAID-Z requests). + * + * Purely physical I/O always have unique ENAs. They are not related to a + * particular piece of logical data, and therefore cannot be chained together. + * We still generate an ereport, but the DE doesn't correlate it with any + * logical piece of data. When such an I/O fails, the delegated I/O requests + * will issue a retry, which will trigger the 'real' ereport with the correct + * ENA. + * + * We keep track of the ENA for a ZIO chain through the 'io_logical' member. + * When a new logical I/O is issued, we set this to point to itself. Child I/Os + * then inherit this pointer, so that when it is first set subsequent failures + * will use the same ENA. If a physical I/O is issued (by passing the + * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a + * unique ENA will be generated. For an aggregate I/O, this pointer is set to + * NULL, and no ereport will be generated (since it doesn't actually correspond + * to any particular device or piece of data). + */ +void +zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, + uint64_t stateoroffset, uint64_t size) +{ +#ifdef _KERNEL + char buf[1024]; + char class[64]; + struct sbuf sb; + struct timespec ts; + + /* + * If we are doing a spa_tryimport(), ignore errors. + */ + if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) + return; + + /* + * If we are in the middle of opening a pool, and the previous attempt + * failed, don't bother logging any new ereports - we're just going to + * get the same diagnosis anyway. + */ + if (spa->spa_load_state != SPA_LOAD_NONE && + spa->spa_last_open_failed) + return; + + /* + * Ignore any errors from I/Os that we are going to retry anyway - we + * only generate errors from the final failure. + */ + if (zio && zio_should_retry(zio)) + return; + + /* + * If this is not a read or write zio, ignore the error. This can occur + * if the DKIOCFLUSHWRITECACHE ioctl fails. + */ + if (zio && zio->io_type != ZIO_TYPE_READ && + zio->io_type != ZIO_TYPE_WRITE) + return; + + nanotime(&ts); + + sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); + sbuf_printf(&sb, "time %ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec); + + /* + * Serialize ereport generation + */ + mutex_enter(&spa->spa_errlist_lock); + +#if 0 + /* + * Determine the ENA to use for this event. If we are in a loading + * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use + * a root zio-wide ENA. Otherwise, simply use a unique ENA. + */ + if (spa->spa_load_state != SPA_LOAD_NONE) { +#if 0 + if (spa->spa_ena == 0) + spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); +#endif + ena = spa->spa_ena; + } else if (zio != NULL && zio->io_logical != NULL) { +#if 0 + if (zio->io_logical->io_ena == 0) + zio->io_logical->io_ena = + fm_ena_generate(0, FM_ENA_FMT1); +#endif + ena = zio->io_logical->io_ena; + } else { +#if 0 + ena = fm_ena_generate(0, FM_ENA_FMT1); +#else + ena = 0; +#endif + } +#endif + + /* + * Construct the full class, detector, and other standard FMA fields. + */ + sbuf_printf(&sb, " ereport_version %u", FM_EREPORT_VERSION); + snprintf(class, sizeof(class), "%s.%s", ZFS_ERROR_CLASS, subclass); + sbuf_printf(&sb, " class %s", class); + + sbuf_printf(&sb, " zfs_scheme_version %u", FM_ZFS_SCHEME_VERSION); + + /* + * Construct the per-ereport payload, depending on which parameters are + * passed in. + */ + + /* + * Generic payload members common to all ereports. + * + * The direct reference to spa_name is used rather than spa_name() + * because of the asynchronous nature of the zio pipeline. spa_name() + * asserts that the config lock is held in some form. This is always + * the case in I/O context, but because the check for RW_WRITER compares + * against 'curthread', we may be in an asynchronous context and blow + * this assert. Rather than loosen this assert, we acknowledge that all + * contexts in which this function is called (pool open, I/O) are safe, + * and dereference the name directly. + */ + sbuf_printf(&sb, " %s %s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa->spa_name); + sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + spa_guid(spa)); + sbuf_printf(&sb, " %s %u", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, + spa->spa_load_state); + + if (vd != NULL) { + vdev_t *pvd = vd->vdev_parent; + + sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + vd->vdev_guid); + sbuf_printf(&sb, " %s %s", FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + vd->vdev_ops->vdev_op_type); + if (vd->vdev_path) + sbuf_printf(&sb, " %s %s", + FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path); + if (vd->vdev_devid) + sbuf_printf(&sb, " %s %s", + FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid); + + if (pvd != NULL) { + sbuf_printf(&sb, " %s %ju", + FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, pvd->vdev_guid); + sbuf_printf(&sb, " %s %s", + FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, + pvd->vdev_ops->vdev_op_type); + if (pvd->vdev_path) + sbuf_printf(&sb, " %s %s", + FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, + pvd->vdev_path); + if (pvd->vdev_devid) + sbuf_printf(&sb, " %s %s", + FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, + pvd->vdev_devid); + } + } + + if (zio != NULL) { + /* + * Payload common to all I/Os. + */ + sbuf_printf(&sb, " %s %u", FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, + zio->io_error); + + /* + * If the 'size' parameter is non-zero, it indicates this is a + * RAID-Z or other I/O where the physical offset and length are + * provided for us, instead of within the zio_t. + */ + if (vd != NULL) { + if (size) { + sbuf_printf(&sb, " %s %ju", + FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, + stateoroffset); + sbuf_printf(&sb, " %s %ju", + FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, size); + } else { + sbuf_printf(&sb, " %s %ju", + FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, + zio->io_offset); + sbuf_printf(&sb, " %s %ju", + FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, + zio->io_size); + } + } + + /* + * Payload for I/Os with corresponding logical information. + */ + if (zio->io_logical != NULL) { + sbuf_printf(&sb, " %s %ju", + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, + zio->io_logical->io_bookmark.zb_object); + sbuf_printf(&sb, " %s %ju", + FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, + zio->io_logical->io_bookmark.zb_level); + sbuf_printf(&sb, " %s %ju", + FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, + zio->io_logical->io_bookmark.zb_blkid); + } + } else if (vd != NULL) { + /* + * If we have a vdev but no zio, this is a device fault, and the + * 'stateoroffset' parameter indicates the previous state of the + * vdev. + */ + sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, + stateoroffset); + } + mutex_exit(&spa->spa_errlist_lock); + + sbuf_finish(&sb); + devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb)); + if (sbuf_overflowed(&sb)) + printf("ZFS WARNING: sbuf overflowed\n"); + sbuf_delete(&sb); +#endif +} + +/* + * The 'resource.fs.zfs.ok' event is an internal signal that the associated + * resource (pool or disk) has been identified by ZFS as healthy. This will + * then trigger the DE to close the associated case, if any. + */ +void +zfs_post_ok(spa_t *spa, vdev_t *vd) +{ +#ifdef _KERNEL + char buf[1024]; + char class[64]; + struct sbuf sb; + struct timespec ts; + + nanotime(&ts); + + sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); + sbuf_printf(&sb, "time %ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec); + + snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE, + ZFS_ERROR_CLASS, FM_RESOURCE_OK); + sbuf_printf(&sb, " %s %hhu", FM_VERSION, FM_RSRC_VERSION); + sbuf_printf(&sb, " %s %s", FM_CLASS, class); + sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + spa_guid(spa)); + if (vd) + sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + vd->vdev_guid); + sbuf_finish(&sb); + devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb)); + if (sbuf_overflowed(&sb)) + printf("ZFS WARNING: sbuf overflowed\n"); + sbuf_delete(&sb); +#endif +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c new file mode 100644 index 0000000..aac1bb1 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -0,0 +1,1811 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/errno.h> +#include <sys/uio.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/dmu.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/nvpair.h> +#include <sys/mount.h> +#include <sys/taskqueue.h> +#include <sys/sdt.h> +#include <sys/varargs.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_ctldir.h> +#include <sys/zvol.h> + +#include "zfs_namecheck.h" +#include "zfs_prop.h" + +CTASSERT(sizeof(zfs_cmd_t) <= PAGE_SIZE); + +static struct cdev *zfsdev; + +extern void zfs_init(void); +extern void zfs_fini(void); + +typedef int zfs_ioc_func_t(zfs_cmd_t *); +typedef int zfs_secpolicy_func_t(const char *, cred_t *); + +typedef struct zfs_ioc_vec { + zfs_ioc_func_t *zvec_func; + zfs_secpolicy_func_t *zvec_secpolicy; + enum { + no_name, + pool_name, + dataset_name + } zvec_namecheck; +} zfs_ioc_vec_t; + +/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ +void +__dprintf(const char *file, const char *func, int line, const char *fmt, ...) +{ + const char *newfile; + char buf[256]; + va_list adx; + + /* + * Get rid of annoying "../common/" prefix to filename. + */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + va_start(adx, fmt); + (void) vsnprintf(buf, sizeof (buf), fmt, adx); + va_end(adx); + + /* + * To get this data, use the zfs-dprintf probe as so: + * dtrace -q -n 'zfs-dprintf \ + * /stringof(arg0) == "dbuf.c"/ \ + * {printf("%s: %s", stringof(arg1), stringof(arg3))}' + * arg0 = file name + * arg1 = function name + * arg2 = line number + * arg3 = message + */ + DTRACE_PROBE4(zfs__dprintf, + char *, newfile, char *, func, int, line, char *, buf); +} + +/* + * Policy for top-level read operations (list pools). Requires no privileges, + * and can be used in the local zone, as there is no associated dataset. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_none(const char *unused1, cred_t *cr) +{ + return (0); +} + +/* + * Policy for dataset read operations (list children, get statistics). Requires + * no privileges, but must be visible in the local zone. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_read(const char *dataset, cred_t *cr) +{ + if (INGLOBALZONE(curproc) || + zone_dataset_visible(dataset, NULL)) + return (0); + + return (ENOENT); +} + +static int +zfs_dozonecheck(const char *dataset, cred_t *cr) +{ + uint64_t zoned; + int writable = 1; + + /* + * The dataset must be visible by this zone -- check this first + * so they don't see EPERM on something they shouldn't know about. + */ + if (!INGLOBALZONE(curproc) && + !zone_dataset_visible(dataset, &writable)) + return (ENOENT); + + if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL)) + return (ENOENT); + + if (INGLOBALZONE(curproc)) { + /* + * If the fs is zoned, only root can access it from the + * global zone. + */ + if (secpolicy_zfs(cr) && zoned) + return (EPERM); + } else { + /* + * If we are in a local zone, the 'zoned' property must be set. + */ + if (!zoned) + return (EPERM); + + /* must be writable by this zone */ + if (!writable) + return (EPERM); + } + return (0); +} + +/* + * Policy for dataset write operations (create children, set properties, etc). + * Requires SYS_MOUNT privilege, and must be writable in the local zone. + */ +int +zfs_secpolicy_write(const char *dataset, cred_t *cr) +{ + int error; + + if (error = zfs_dozonecheck(dataset, cr)) + return (error); + + return (secpolicy_zfs(cr)); +} + +/* + * Policy for operations that want to write a dataset's parent: + * create, destroy, snapshot, clone, restore. + */ +static int +zfs_secpolicy_parent(const char *dataset, cred_t *cr) +{ + char parentname[MAXNAMELEN]; + char *cp; + + /* + * Remove the @bla or /bla from the end of the name to get the parent. + */ + (void) strncpy(parentname, dataset, sizeof (parentname)); + cp = strrchr(parentname, '@'); + if (cp != NULL) { + cp[0] = '\0'; + } else { + cp = strrchr(parentname, '/'); + if (cp == NULL) + return (ENOENT); + cp[0] = '\0'; + + } + + return (zfs_secpolicy_write(parentname, cr)); +} + +/* + * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires + * SYS_CONFIG privilege, which is not available in a local zone. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_config(const char *unused, cred_t *cr) +{ + if (secpolicy_sys_config(cr, B_FALSE) != 0) + return (EPERM); + + return (0); +} + +/* + * Policy for fault injection. Requires all privileges. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_inject(const char *unused, cred_t *cr) +{ + return (secpolicy_zinject(cr)); +} + +/* + * Policy for dataset backup operations (sendbackup). + * Requires SYS_MOUNT privilege, and must be writable in the local zone. + */ +static int +zfs_secpolicy_operator(const char *dataset, cred_t *cr) +{ + int writable = 1; + + if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) + return (ENOENT); + if (secpolicy_zfs(cr) != 0 && !groupmember(GID_OPERATOR, cr)) + return (EPERM); + return (0); +} + +/* + * Returns the nvlist as specified by the user in the zfs_cmd_t. + */ +static int +get_nvlist(zfs_cmd_t *zc, nvlist_t **nvp) +{ + char *packed; + size_t size; + int error; + nvlist_t *config = NULL; + + /* + * Read in and unpack the user-supplied nvlist. + */ + if ((size = zc->zc_nvlist_src_size) == 0) + return (EINVAL); + + packed = kmem_alloc(size, KM_SLEEP); + + if ((error = xcopyin((void *)(uintptr_t)zc->zc_nvlist_src, packed, + size)) != 0) { + kmem_free(packed, size); + return (error); + } + + if ((error = nvlist_unpack(packed, size, &config, 0)) != 0) { + kmem_free(packed, size); + return (error); + } + + kmem_free(packed, size); + + *nvp = config; + return (0); +} + +static int +put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) +{ + char *packed = NULL; + size_t size; + int error; + + VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); + + if (size > zc->zc_nvlist_dst_size) { + /* + * Solaris returns ENOMEM here, because even if an error is + * returned from an ioctl(2), new zc_nvlist_dst_size will be + * passed to the userland. This is not the case for FreeBSD. + * We need to return 0, so the kernel will copy the + * zc_nvlist_dst_size back and the userland can discover that a + * bigger buffer is needed. + */ + error = 0; + } else { + VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, + KM_SLEEP) == 0); + error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, + size); + kmem_free(packed, size); + } + + zc->zc_nvlist_dst_size = size; + return (error); +} + +static int +zfs_ioc_pool_create(zfs_cmd_t *zc) +{ + int error; + nvlist_t *config; + + if ((error = get_nvlist(zc, &config)) != 0) + return (error); + + error = spa_create(zc->zc_name, config, zc->zc_value[0] == '\0' ? + NULL : zc->zc_value); + + nvlist_free(config); + + return (error); +} + +static int +zfs_ioc_pool_destroy(zfs_cmd_t *zc) +{ + return (spa_destroy(zc->zc_name)); +} + +static int +zfs_ioc_pool_import(zfs_cmd_t *zc) +{ + int error; + nvlist_t *config; + uint64_t guid; + + if ((error = get_nvlist(zc, &config)) != 0) + return (error); + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || + guid != zc->zc_guid) + error = EINVAL; + else + error = spa_import(zc->zc_name, config, + zc->zc_value[0] == '\0' ? NULL : zc->zc_value); + + nvlist_free(config); + + return (error); +} + +static int +zfs_ioc_pool_export(zfs_cmd_t *zc) +{ + return (spa_export(zc->zc_name, NULL)); +} + +static int +zfs_ioc_pool_configs(zfs_cmd_t *zc) +{ + nvlist_t *configs; + int error; + + if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) + return (EEXIST); + + error = put_nvlist(zc, configs); + + nvlist_free(configs); + + return (error); +} + +static int +zfs_ioc_pool_stats(zfs_cmd_t *zc) +{ + nvlist_t *config; + int error; + int ret = 0; + + error = spa_get_stats(zc->zc_name, &config, zc->zc_value, + sizeof (zc->zc_value)); + + if (config != NULL) { + ret = put_nvlist(zc, config); + nvlist_free(config); + + /* + * The config may be present even if 'error' is non-zero. + * In this case we return success, and preserve the real errno + * in 'zc_cookie'. + */ + zc->zc_cookie = error; + } else { + ret = error; + } + + return (ret); +} + +/* + * Try to import the given pool, returning pool stats as appropriate so that + * user land knows which devices are available and overall pool health. + */ +static int +zfs_ioc_pool_tryimport(zfs_cmd_t *zc) +{ + nvlist_t *tryconfig, *config; + int error; + + if ((error = get_nvlist(zc, &tryconfig)) != 0) + return (error); + + config = spa_tryimport(tryconfig); + + nvlist_free(tryconfig); + + if (config == NULL) + return (EINVAL); + + error = put_nvlist(zc, config); + nvlist_free(config); + + return (error); +} + +static int +zfs_ioc_pool_scrub(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_scrub(spa, zc->zc_cookie, B_FALSE); + + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_pool_freeze(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error == 0) { + spa_freeze(spa); + spa_close(spa, FTAG); + } + return (error); +} + +static int +zfs_ioc_pool_upgrade(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + spa_upgrade(spa); + + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_pool_get_history(zfs_cmd_t *zc) +{ + spa_t *spa; + char *hist_buf; + uint64_t size; + int error; + + if ((size = zc->zc_history_len) == 0) + return (EINVAL); + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) { + spa_close(spa, FTAG); + return (ENOTSUP); + } + + hist_buf = kmem_alloc(size, KM_SLEEP); + if ((error = spa_history_get(spa, &zc->zc_history_offset, + &zc->zc_history_len, hist_buf)) == 0) { + error = xcopyout(hist_buf, (char *)(uintptr_t)zc->zc_history, + zc->zc_history_len); + } + + spa_close(spa, FTAG); + kmem_free(hist_buf, size); + return (error); +} + +static int +zfs_ioc_pool_log_history(zfs_cmd_t *zc) +{ + spa_t *spa; + char *history_str = NULL; + size_t size; + int error; + + size = zc->zc_history_len; + if (size == 0 || size > HIS_MAX_RECORD_LEN) + return (EINVAL); + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) { + spa_close(spa, FTAG); + return (ENOTSUP); + } + + /* add one for the NULL delimiter */ + size++; + history_str = kmem_alloc(size, KM_SLEEP); + if ((error = xcopyin((void *)(uintptr_t)zc->zc_history, history_str, + size)) != 0) { + spa_close(spa, FTAG); + kmem_free(history_str, size); + return (error); + } + history_str[size - 1] = '\0'; + + error = spa_history_log(spa, history_str, zc->zc_history_offset); + + spa_close(spa, FTAG); + kmem_free(history_str, size); + + return (error); +} + +static int +zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) +{ + int error; + + if (error = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)) + return (error); + + return (0); +} + +static int +zfs_ioc_obj_to_path(zfs_cmd_t *zc) +{ + objset_t *osp; + int error; + + if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS, + DS_MODE_NONE | DS_MODE_READONLY, &osp)) != 0) + return (error); + + error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value, + sizeof (zc->zc_value)); + dmu_objset_close(osp); + + return (error); +} + +static int +zfs_ioc_vdev_add(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + nvlist_t *config; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + /* + * A root pool with concatenated devices is not supported. + * Thus, can not add a device to a root pool with one device. + */ + if (spa->spa_root_vdev->vdev_children == 1 && spa->spa_bootfs != 0) { + spa_close(spa, FTAG); + return (EDOM); + } + + if ((error = get_nvlist(zc, &config)) == 0) { + error = spa_vdev_add(spa, config); + nvlist_free(config); + } + + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_remove(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_online(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + error = vdev_online(spa, zc->zc_guid); + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_offline(zfs_cmd_t *zc) +{ + spa_t *spa; + int istmp = zc->zc_cookie; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + error = vdev_offline(spa, zc->zc_guid, istmp); + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_attach(zfs_cmd_t *zc) +{ + spa_t *spa; + int replacing = zc->zc_cookie; + nvlist_t *config; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if ((error = get_nvlist(zc, &config)) == 0) { + error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); + nvlist_free(config); + } + + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_detach(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE); + + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_setpath(zfs_cmd_t *zc) +{ + spa_t *spa; + char *path = zc->zc_value; + uint64_t guid = zc->zc_guid; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + error = spa_vdev_setpath(spa, guid, path); + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_objset_stats(zfs_cmd_t *zc) +{ + objset_t *os = NULL; + int error; + nvlist_t *nv; + +retry: + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &os); + if (error != 0) { + /* + * This is ugly: dmu_objset_open() can return EBUSY if + * the objset is held exclusively. Fortunately this hold is + * only for a short while, so we retry here. + * This avoids user code having to handle EBUSY, + * for example for a "zfs list". + */ + if (error == EBUSY) { + delay(1); + goto retry; + } + return (error); + } + + dmu_objset_fast_stat(os, &zc->zc_objset_stats); + + if (zc->zc_nvlist_dst != 0 && + (error = dsl_prop_get_all(os, &nv)) == 0) { + dmu_objset_stats(os, nv); + /* + * NB: zvol_get_stats() will read the objset contents, + * which we aren't supposed to do with a + * DS_MODE_STANDARD open, because it could be + * inconsistent. So this is a bit of a workaround... + */ + if (!zc->zc_objset_stats.dds_inconsistent && + dmu_objset_type(os) == DMU_OST_ZVOL) + VERIFY(zvol_get_stats(os, nv) == 0); + error = put_nvlist(zc, nv); + nvlist_free(nv); + } + + spa_altroot(dmu_objset_spa(os), zc->zc_value, sizeof (zc->zc_value)); + + dmu_objset_close(os); + if (error == ENOMEM) + error = 0; + return (error); +} + +static int +zfs_ioc_dataset_list_next(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + char *p; + +retry: + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &os); + if (error != 0) { + /* + * This is ugly: dmu_objset_open() can return EBUSY if + * the objset is held exclusively. Fortunately this hold is + * only for a short while, so we retry here. + * This avoids user code having to handle EBUSY, + * for example for a "zfs list". + */ + if (error == EBUSY) { + delay(1); + goto retry; + } + if (error == ENOENT) + error = ESRCH; + return (error); + } + + p = strrchr(zc->zc_name, '/'); + if (p == NULL || p[1] != '\0') + (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); + p = zc->zc_name + strlen(zc->zc_name); + + do { + error = dmu_dir_list_next(os, + sizeof (zc->zc_name) - (p - zc->zc_name), p, + NULL, &zc->zc_cookie); + if (error == ENOENT) + error = ESRCH; + } while (error == 0 && !INGLOBALZONE(curproc) && + !zone_dataset_visible(zc->zc_name, NULL)); + + /* + * If it's a hidden dataset (ie. with a '$' in its name), don't + * try to get stats for it. Userland will skip over it. + */ + if (error == 0 && strchr(zc->zc_name, '$') == NULL) + error = zfs_ioc_objset_stats(zc); /* fill in the stats */ + + dmu_objset_close(os); + return (error); +} + +static int +zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + +retry: + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &os); + if (error != 0) { + /* + * This is ugly: dmu_objset_open() can return EBUSY if + * the objset is held exclusively. Fortunately this hold is + * only for a short while, so we retry here. + * This avoids user code having to handle EBUSY, + * for example for a "zfs list". + */ + if (error == EBUSY) { + delay(1); + goto retry; + } + if (error == ENOENT) + error = ESRCH; + return (error); + } + + /* + * A dataset name of maximum length cannot have any snapshots, + * so exit immediately. + */ + if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) { + dmu_objset_close(os); + return (ESRCH); + } + + error = dmu_snapshot_list_next(os, + sizeof (zc->zc_name) - strlen(zc->zc_name), + zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie); + if (error == ENOENT) + error = ESRCH; + + if (error == 0) + error = zfs_ioc_objset_stats(zc); /* fill in the stats */ + + dmu_objset_close(os); + return (error); +} + +static int +zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl) +{ + nvpair_t *elem; + int error; + const char *propname; + zfs_prop_t prop; + uint64_t intval; + char *strval; + char buf[MAXNAMELEN]; + const char *p; + spa_t *spa; + + elem = NULL; + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + propname = nvpair_name(elem); + + if ((prop = zfs_name_to_prop(propname)) == + ZFS_PROP_INVAL) { + /* + * If this is a user-defined property, it must be a + * string, and there is no further validation to do. + */ + if (!zfs_prop_user(propname) || + nvpair_type(elem) != DATA_TYPE_STRING) + return (EINVAL); + + VERIFY(nvpair_value_string(elem, &strval) == 0); + error = dsl_prop_set(name, propname, 1, + strlen(strval) + 1, strval); + if (error == 0) + continue; + else + return (error); + } + + /* + * Check permissions for special properties. + */ + switch (prop) { + case ZFS_PROP_ZONED: + /* + * Disallow setting of 'zoned' from within a local zone. + */ + if (!INGLOBALZONE(curproc)) + return (EPERM); + break; + + case ZFS_PROP_QUOTA: + if (error = zfs_dozonecheck(name, cr)) + return (error); + + if (!INGLOBALZONE(curproc)) { + uint64_t zoned; + char setpoint[MAXNAMELEN]; + int dslen; + /* + * Unprivileged users are allowed to modify the + * quota on things *under* (ie. contained by) + * the thing they own. + */ + if (dsl_prop_get_integer(name, "jailed", &zoned, + setpoint)) + return (EPERM); + if (!zoned) /* this shouldn't happen */ + return (EPERM); + dslen = strlen(name); + if (dslen <= strlen(setpoint)) + return (EPERM); + } + break; + + case ZFS_PROP_COMPRESSION: + /* + * If the user specified gzip compression, make sure + * the SPA supports it. We ignore any errors here since + * we'll catch them later. + */ + if (nvpair_type(elem) == DATA_TYPE_UINT64 && + nvpair_value_uint64(elem, &intval) == 0 && + intval >= ZIO_COMPRESS_GZIP_1 && + intval <= ZIO_COMPRESS_GZIP_9) { + if ((p = strchr(name, '/')) == NULL) { + p = name; + } else { + bcopy(name, buf, p - name); + buf[p - name] = '\0'; + p = buf; + } + + if (spa_open(p, &spa, FTAG) == 0) { + if (spa_version(spa) < + ZFS_VERSION_GZIP_COMPRESSION) { + spa_close(spa, FTAG); + return (ENOTSUP); + } + + spa_close(spa, FTAG); + } + } + break; + } + + switch (prop) { + case ZFS_PROP_QUOTA: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = dsl_dir_set_quota(name, + intval)) != 0) + return (error); + break; + + case ZFS_PROP_RESERVATION: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = dsl_dir_set_reservation(name, + intval)) != 0) + return (error); + break; + + case ZFS_PROP_VOLSIZE: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = zvol_set_volsize(name, dev, + intval)) != 0) + return (error); + break; + + case ZFS_PROP_VOLBLOCKSIZE: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = zvol_set_volblocksize(name, + intval)) != 0) + return (error); + break; + + default: + if (nvpair_type(elem) == DATA_TYPE_STRING) { + if (zfs_prop_get_type(prop) != + prop_type_string) + return (EINVAL); + VERIFY(nvpair_value_string(elem, &strval) == 0); + if ((error = dsl_prop_set(name, + nvpair_name(elem), 1, strlen(strval) + 1, + strval)) != 0) + return (error); + } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { + const char *unused; + + VERIFY(nvpair_value_uint64(elem, &intval) == 0); + + switch (zfs_prop_get_type(prop)) { + case prop_type_number: + break; + case prop_type_boolean: + if (intval > 1) + return (EINVAL); + break; + case prop_type_string: + return (EINVAL); + case prop_type_index: + if (zfs_prop_index_to_string(prop, + intval, &unused) != 0) + return (EINVAL); + break; + default: + cmn_err(CE_PANIC, "unknown property " + "type"); + break; + } + + if ((error = dsl_prop_set(name, propname, + 8, 1, &intval)) != 0) + return (error); + } else { + return (EINVAL); + } + break; + } + } + + return (0); +} + +static int +zfs_ioc_set_prop(zfs_cmd_t *zc) +{ + nvlist_t *nvl; + int error; + zfs_prop_t prop; + + /* + * If zc_value is set, then this is an attempt to inherit a value. + * Otherwise, zc_nvlist refers to a list of properties to set. + */ + if (zc->zc_value[0] != '\0') { + if (!zfs_prop_user(zc->zc_value) && + ((prop = zfs_name_to_prop(zc->zc_value)) == + ZFS_PROP_INVAL || + !zfs_prop_inheritable(prop))) + return (EINVAL); + + return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL)); + } + + if ((error = get_nvlist(zc, &nvl)) != 0) + return (error); + + error = zfs_set_prop_nvlist(zc->zc_name, zc->zc_dev, + (cred_t *)(uintptr_t)zc->zc_cred, nvl); + nvlist_free(nvl); + return (error); +} + +static int +zfs_ioc_pool_props_set(zfs_cmd_t *zc) +{ + nvlist_t *nvl; + int error, reset_bootfs = 0; + uint64_t objnum; + zpool_prop_t prop; + nvpair_t *elem; + char *propname, *strval; + spa_t *spa; + vdev_t *rvdev; + char *vdev_type; + objset_t *os; + + if ((error = get_nvlist(zc, &nvl)) != 0) + return (error); + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { + nvlist_free(nvl); + return (error); + } + + if (spa_version(spa) < ZFS_VERSION_BOOTFS) { + nvlist_free(nvl); + spa_close(spa, FTAG); + return (ENOTSUP); + } + + elem = NULL; + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + + propname = nvpair_name(elem); + + if ((prop = zpool_name_to_prop(propname)) == + ZFS_PROP_INVAL) { + nvlist_free(nvl); + spa_close(spa, FTAG); + return (EINVAL); + } + + switch (prop) { + case ZFS_PROP_BOOTFS: + /* + * A bootable filesystem can not be on a RAIDZ pool + * nor a striped pool with more than 1 device. + */ + rvdev = spa->spa_root_vdev; + vdev_type = + rvdev->vdev_child[0]->vdev_ops->vdev_op_type; + if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || + (strcmp(vdev_type, VDEV_TYPE_MIRROR) != 0 && + rvdev->vdev_children > 1)) { + error = ENOTSUP; + break; + } + + reset_bootfs = 1; + + VERIFY(nvpair_value_string(elem, &strval) == 0); + if (strval == NULL || strval[0] == '\0') { + objnum = + zfs_prop_default_numeric(ZFS_PROP_BOOTFS); + break; + } + + if (error = dmu_objset_open(strval, DMU_OST_ZFS, + DS_MODE_STANDARD | DS_MODE_READONLY, &os)) + break; + objnum = dmu_objset_id(os); + dmu_objset_close(os); + break; + + default: + error = EINVAL; + } + + if (error) + break; + } + if (error == 0) { + if (reset_bootfs) { + VERIFY(nvlist_remove(nvl, + zpool_prop_to_name(ZFS_PROP_BOOTFS), + DATA_TYPE_STRING) == 0); + VERIFY(nvlist_add_uint64(nvl, + zpool_prop_to_name(ZFS_PROP_BOOTFS), objnum) == 0); + } + error = spa_set_props(spa, nvl); + } + + nvlist_free(nvl); + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_pool_props_get(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + nvlist_t *nvp = NULL; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_get_props(spa, &nvp); + + if (error == 0 && zc->zc_nvlist_dst != 0) + error = put_nvlist(zc, nvp); + else + error = EFAULT; + + spa_close(spa, FTAG); + + if (nvp) + nvlist_free(nvp); + return (error); +} + +static int +zfs_ioc_create_minor(zfs_cmd_t *zc) +{ + return (zvol_create_minor(zc->zc_name, zc->zc_dev)); +} + +static int +zfs_ioc_remove_minor(zfs_cmd_t *zc) +{ + return (zvol_remove_minor(zc->zc_name)); +} + +/* + * Search the vfs list for a specified resource. Returns a pointer to it + * or NULL if no suitable entry is found. The caller of this routine + * is responsible for releasing the returned vfs pointer. + */ +static vfs_t * +zfs_get_vfs(const char *resource) +{ + vfs_t *vfsp; + + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(vfsp, &mountlist, mnt_list) { + if (strcmp(vfsp->mnt_stat.f_mntfromname, resource) == 0) { + VFS_HOLD(vfsp); + break; + } + } + mtx_unlock(&mountlist_mtx); + return (vfsp); +} + +static void +zfs_create_cb(objset_t *os, void *arg, dmu_tx_t *tx) +{ + zfs_create_data_t *zc = arg; + + zfs_create_fs(os, (cred_t *)(uintptr_t)zc->zc_cred, tx); +} + +static int +zfs_ioc_create(zfs_cmd_t *zc) +{ + objset_t *clone; + int error = 0; + zfs_create_data_t cbdata = { 0 }; + void (*cbfunc)(objset_t *os, void *arg, dmu_tx_t *tx); + dmu_objset_type_t type = zc->zc_objset_type; + + switch (type) { + + case DMU_OST_ZFS: + cbfunc = zfs_create_cb; + break; + + case DMU_OST_ZVOL: + cbfunc = zvol_create_cb; + break; + + default: + cbfunc = NULL; + } + if (strchr(zc->zc_name, '@')) + return (EINVAL); + + if (zc->zc_nvlist_src != 0 && + (error = get_nvlist(zc, &cbdata.zc_props)) != 0) + return (error); + + cbdata.zc_cred = (cred_t *)(uintptr_t)zc->zc_cred; + cbdata.zc_dev = (dev_t)zc->zc_dev; + + if (zc->zc_value[0] != '\0') { + /* + * We're creating a clone of an existing snapshot. + */ + zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) { + nvlist_free(cbdata.zc_props); + return (EINVAL); + } + + error = dmu_objset_open(zc->zc_value, type, + DS_MODE_STANDARD | DS_MODE_READONLY, &clone); + if (error) { + nvlist_free(cbdata.zc_props); + return (error); + } + error = dmu_objset_create(zc->zc_name, type, clone, NULL, NULL); + dmu_objset_close(clone); + } else { + if (cbfunc == NULL) { + nvlist_free(cbdata.zc_props); + return (EINVAL); + } + + if (type == DMU_OST_ZVOL) { + uint64_t volsize, volblocksize; + + if (cbdata.zc_props == NULL || + nvlist_lookup_uint64(cbdata.zc_props, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), + &volsize) != 0) { + nvlist_free(cbdata.zc_props); + return (EINVAL); + } + + if ((error = nvlist_lookup_uint64(cbdata.zc_props, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + &volblocksize)) != 0 && error != ENOENT) { + nvlist_free(cbdata.zc_props); + return (EINVAL); + } + + if (error != 0) + volblocksize = zfs_prop_default_numeric( + ZFS_PROP_VOLBLOCKSIZE); + + if ((error = zvol_check_volblocksize( + volblocksize)) != 0 || + (error = zvol_check_volsize(volsize, + volblocksize)) != 0) { + nvlist_free(cbdata.zc_props); + return (error); + } + } + + error = dmu_objset_create(zc->zc_name, type, NULL, cbfunc, + &cbdata); + } + + /* + * It would be nice to do this atomically. + */ + if (error == 0) { + if ((error = zfs_set_prop_nvlist(zc->zc_name, + zc->zc_dev, (cred_t *)(uintptr_t)zc->zc_cred, + cbdata.zc_props)) != 0) + (void) dmu_objset_destroy(zc->zc_name); + } + + nvlist_free(cbdata.zc_props); + return (error); +} + +static int +zfs_ioc_snapshot(zfs_cmd_t *zc) +{ + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + return (dmu_objset_snapshot(zc->zc_name, + zc->zc_value, zc->zc_cookie)); +} + +static int +zfs_unmount_snap(char *name, void *arg) +{ + char *snapname = arg; + char *cp; + vfs_t *vfsp = NULL; + + /* + * Snapshots (which are under .zfs control) must be unmounted + * before they can be destroyed. + */ + + if (snapname) { + (void) strcat(name, "@"); + (void) strcat(name, snapname); + vfsp = zfs_get_vfs(name); + cp = strchr(name, '@'); + *cp = '\0'; + } else if (strchr(name, '@')) { + vfsp = zfs_get_vfs(name); + } + + if (vfsp) { + /* + * Always force the unmount for snapshots. + */ + int flag = MS_FORCE; + int err; + + if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) { + VFS_RELE(vfsp); + return (err); + } + VFS_RELE(vfsp); + mtx_lock(&Giant); /* dounmount() */ + dounmount(vfsp, flag, curthread); + mtx_unlock(&Giant); /* dounmount() */ + } + return (0); +} + +static int +zfs_ioc_destroy_snaps(zfs_cmd_t *zc) +{ + int err; + + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + err = dmu_objset_find(zc->zc_name, + zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN); + if (err) + return (err); + return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value)); +} + +static int +zfs_ioc_destroy(zfs_cmd_t *zc) +{ + if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { + int err = zfs_unmount_snap(zc->zc_name, NULL); + if (err) + return (err); + } + + return (dmu_objset_destroy(zc->zc_name)); +} + +static int +zfs_ioc_rollback(zfs_cmd_t *zc) +{ + return (dmu_objset_rollback(zc->zc_name)); +} + +static int +zfs_ioc_rename(zfs_cmd_t *zc) +{ + zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + + if (strchr(zc->zc_name, '@') != NULL && + zc->zc_objset_type == DMU_OST_ZFS) { + int err = zfs_unmount_snap(zc->zc_name, NULL); + if (err) + return (err); + } + + return (dmu_objset_rename(zc->zc_name, zc->zc_value)); +} + +static int +zfs_ioc_recvbackup(zfs_cmd_t *zc) +{ + kthread_t *td = curthread; + struct file *fp; + int error; + offset_t new_off; + + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || + strchr(zc->zc_value, '@') == NULL) + return (EINVAL); + + error = fget_read(td, zc->zc_cookie, &fp); + if (error) + return (error); + + error = dmu_recvbackup(zc->zc_value, &zc->zc_begin_record, + &zc->zc_cookie, (boolean_t)zc->zc_guid, fp, + fp->f_offset); + + new_off = fp->f_offset + zc->zc_cookie; + fp->f_offset = new_off; + + fdrop(fp, td); + return (error); +} + +static int +zfs_ioc_sendbackup(zfs_cmd_t *zc) +{ + kthread_t *td = curthread; + struct file *fp; + objset_t *fromsnap = NULL; + objset_t *tosnap; + int error, fd; + + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap); + if (error) + return (error); + + if (zc->zc_value[0] != '\0') { + char buf[MAXPATHLEN]; + char *cp; + + (void) strncpy(buf, zc->zc_name, sizeof (buf)); + cp = strchr(buf, '@'); + if (cp) + *(cp+1) = 0; + (void) strlcat(buf, zc->zc_value, sizeof (buf)); + error = dmu_objset_open(buf, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap); + if (error) { + dmu_objset_close(tosnap); + return (error); + } + } + + fd = zc->zc_cookie; + error = fget_write(td, fd, &fp); + if (error) { + dmu_objset_close(tosnap); + if (fromsnap) + dmu_objset_close(fromsnap); + return (error); + } + + error = dmu_sendbackup(tosnap, fromsnap, fp); + + fdrop(fp, td); + if (fromsnap) + dmu_objset_close(fromsnap); + dmu_objset_close(tosnap); + return (error); +} + +static int +zfs_ioc_inject_fault(zfs_cmd_t *zc) +{ + int id, error; + + error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, + &zc->zc_inject_record); + + if (error == 0) + zc->zc_guid = (uint64_t)id; + + return (error); +} + +static int +zfs_ioc_clear_fault(zfs_cmd_t *zc) +{ + return (zio_clear_fault((int)zc->zc_guid)); +} + +static int +zfs_ioc_inject_list_next(zfs_cmd_t *zc) +{ + int id = (int)zc->zc_guid; + int error; + + error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), + &zc->zc_inject_record); + + zc->zc_guid = id; + + return (error); +} + +static int +zfs_ioc_error_log(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + size_t count = (size_t)zc->zc_nvlist_dst_size; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, + &count); + if (error == 0) + zc->zc_nvlist_dst_size = count; + else + zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); + + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_clear(zfs_cmd_t *zc) +{ + spa_t *spa; + vdev_t *vd; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + spa_config_enter(spa, RW_WRITER, FTAG); + + if (zc->zc_guid == 0) { + vd = NULL; + } else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) { + spa_config_exit(spa, FTAG); + spa_close(spa, FTAG); + return (ENODEV); + } + + vdev_clear(spa, vd); + + spa_config_exit(spa, FTAG); + + spa_close(spa, FTAG); + + return (0); +} + +static int +zfs_ioc_promote(zfs_cmd_t *zc) +{ + char *cp; + + /* + * We don't need to unmount *all* the origin fs's snapshots, but + * it's easier. + */ + cp = strchr(zc->zc_value, '@'); + if (cp) + *cp = '\0'; + (void) dmu_objset_find(zc->zc_value, + zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS); + return (dsl_dataset_promote(zc->zc_name)); +} + +static int +zfs_ioc_jail(zfs_cmd_t *zc) +{ + + return (zone_dataset_attach((cred_t *)(uintptr_t)zc->zc_cred, + zc->zc_name, (int)zc->zc_jailid)); +} + +static int +zfs_ioc_unjail(zfs_cmd_t *zc) +{ + + return (zone_dataset_detach((cred_t *)(uintptr_t)zc->zc_cred, + zc->zc_name, (int)zc->zc_jailid)); +} + +static zfs_ioc_vec_t zfs_ioc_vec[] = { + { zfs_ioc_pool_create, zfs_secpolicy_config, pool_name }, + { zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name }, + { zfs_ioc_pool_import, zfs_secpolicy_config, pool_name }, + { zfs_ioc_pool_export, zfs_secpolicy_config, pool_name }, + { zfs_ioc_pool_configs, zfs_secpolicy_none, no_name }, + { zfs_ioc_pool_stats, zfs_secpolicy_read, pool_name }, + { zfs_ioc_pool_tryimport, zfs_secpolicy_config, no_name }, + { zfs_ioc_pool_scrub, zfs_secpolicy_config, pool_name }, + { zfs_ioc_pool_freeze, zfs_secpolicy_config, no_name }, + { zfs_ioc_pool_upgrade, zfs_secpolicy_config, pool_name }, + { zfs_ioc_pool_get_history, zfs_secpolicy_config, pool_name }, + { zfs_ioc_pool_log_history, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_add, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_remove, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_online, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_offline, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_attach, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_detach, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_setpath, zfs_secpolicy_config, pool_name }, + { zfs_ioc_objset_stats, zfs_secpolicy_read, dataset_name }, + { zfs_ioc_dataset_list_next, zfs_secpolicy_read, dataset_name }, + { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, dataset_name }, + { zfs_ioc_set_prop, zfs_secpolicy_write, dataset_name }, + { zfs_ioc_create_minor, zfs_secpolicy_config, dataset_name }, + { zfs_ioc_remove_minor, zfs_secpolicy_config, dataset_name }, + { zfs_ioc_create, zfs_secpolicy_parent, dataset_name }, + { zfs_ioc_destroy, zfs_secpolicy_parent, dataset_name }, + { zfs_ioc_rollback, zfs_secpolicy_write, dataset_name }, + { zfs_ioc_rename, zfs_secpolicy_write, dataset_name }, + { zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name }, + { zfs_ioc_sendbackup, zfs_secpolicy_operator, dataset_name }, + { zfs_ioc_inject_fault, zfs_secpolicy_inject, no_name }, + { zfs_ioc_clear_fault, zfs_secpolicy_inject, no_name }, + { zfs_ioc_inject_list_next, zfs_secpolicy_inject, no_name }, + { zfs_ioc_error_log, zfs_secpolicy_inject, pool_name }, + { zfs_ioc_clear, zfs_secpolicy_config, pool_name }, + { zfs_ioc_promote, zfs_secpolicy_write, dataset_name }, + { zfs_ioc_destroy_snaps, zfs_secpolicy_write, dataset_name }, + { zfs_ioc_snapshot, zfs_secpolicy_operator, dataset_name }, + { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, pool_name }, + { zfs_ioc_obj_to_path, zfs_secpolicy_config, no_name }, + { zfs_ioc_pool_props_set, zfs_secpolicy_config, pool_name }, + { zfs_ioc_pool_props_get, zfs_secpolicy_read, pool_name }, + { zfs_ioc_jail, zfs_secpolicy_config, dataset_name }, + { zfs_ioc_unjail, zfs_secpolicy_config, dataset_name } +}; + +static int +zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, + struct thread *td) +{ + zfs_cmd_t *zc = (void *)addr; + uint_t vec; + int error; + + vec = ZFS_IOC(cmd); + + if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) + return (EINVAL); + + zc->zc_cred = (uintptr_t)td->td_ucred; + zc->zc_dev = (uintptr_t)dev; + error = zfs_ioc_vec[vec].zvec_secpolicy(zc->zc_name, td->td_ucred); + + /* + * Ensure that all pool/dataset names are valid before we pass down to + * the lower layers. + */ + if (error == 0) { + zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; + switch (zfs_ioc_vec[vec].zvec_namecheck) { + case pool_name: + if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) + error = EINVAL; + break; + + case dataset_name: + if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) + error = EINVAL; + break; + + case no_name: + break; + } + } + + if (error == 0) + error = zfs_ioc_vec[vec].zvec_func(zc); + + return (error); +} + +/* + * OK, so this is a little weird. + * + * /dev/zfs is the control node, i.e. minor 0. + * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0. + * + * /dev/zfs has basically nothing to do except serve up ioctls, + * so most of the standard driver entry points are in zvol.c. + */ +static struct cdevsw zfs_cdevsw = { + .d_version = D_VERSION, + .d_ioctl = zfsdev_ioctl, + .d_name = ZFS_DEV_NAME +}; + +static void +zfsdev_init(void) +{ + zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0660, + ZFS_DEV_NAME); +} + +static void +zfsdev_fini(void) +{ + if (zfsdev != NULL) + destroy_dev(zfsdev); +} + +static struct task zfs_start_task; + +static void +zfs_start(void *context __unused, int pending __unused) +{ + + zfsdev_init(); + spa_init(FREAD | FWRITE); + zfs_init(); + zvol_init(); + printf("ZFS storage pool version " ZFS_VERSION_STRING "\n"); +} + +static int +zfs_modevent(module_t mod, int type, void *unused __unused) +{ + int error; + + error = EOPNOTSUPP; + switch (type) { + case MOD_LOAD: + printf("WARNING: ZFS is considered to be an experimental " + "feature in FreeBSD.\n"); + TASK_INIT(&zfs_start_task, 0, zfs_start, NULL); + taskqueue_enqueue(taskqueue_thread, &zfs_start_task); + error = 0; + break; + case MOD_UNLOAD: + if (spa_busy() || /* zfs_busy() || */ zvol_busy() || + zio_injection_enabled) { + error = EBUSY; + break; + } + zvol_fini(); + zfs_fini(); + spa_fini(); + zfsdev_fini(); + error = 0; + break; + } + return (error); +} + +static moduledata_t zfs_mod = { + "zfsctrl", + zfs_modevent, + 0 +}; +DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_MOUNT_ROOT, SI_ORDER_ANY); diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c new file mode 100644 index 0000000..06cb95a --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c @@ -0,0 +1,348 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/file.h> +#include <sys/vfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_dir.h> +#include <sys/zil.h> +#include <sys/byteorder.h> +#include <sys/stat.h> +#include <sys/acl.h> +#include <sys/dmu.h> +#include <sys/spa.h> + +/* + * All the functions in this file are used to construct the log entries + * to record transactions. They allocate * a intent log transaction + * structure (itx_t) and save within it all the information necessary to + * possibly replay the transaction. The itx is then assigned a sequence + * number and inserted in the in-memory list anchored in the zilog. + */ + +/* + * zfs_log_create() is used to handle TX_CREATE, TX_MKDIR and TX_MKXATTR + * transactions. + */ +void +zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, znode_t *zp, char *name) +{ + itx_t *itx; + uint64_t seq; + lr_create_t *lr; + size_t namesize = strlen(name) + 1; + + if (zilog == NULL) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + lr = (lr_create_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_foid = zp->z_id; + lr->lr_mode = zp->z_phys->zp_mode; + lr->lr_uid = zp->z_phys->zp_uid; + lr->lr_gid = zp->z_phys->zp_gid; + lr->lr_gen = zp->z_phys->zp_gen; + lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; + lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; + lr->lr_rdev = zp->z_phys->zp_rdev; + bcopy(name, (char *)(lr + 1), namesize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; + zp->z_last_itx = seq; +} + +/* + * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions. + */ +void +zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, char *name) +{ + itx_t *itx; + uint64_t seq; + lr_remove_t *lr; + size_t namesize = strlen(name) + 1; + + if (zilog == NULL) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + lr = (lr_remove_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + bcopy(name, (char *)(lr + 1), namesize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; +} + +/* + * zfs_log_link() handles TX_LINK transactions. + */ +void +zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, znode_t *zp, char *name) +{ + itx_t *itx; + uint64_t seq; + lr_link_t *lr; + size_t namesize = strlen(name) + 1; + + if (zilog == NULL) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + lr = (lr_link_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_link_obj = zp->z_id; + bcopy(name, (char *)(lr + 1), namesize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; + zp->z_last_itx = seq; +} + +/* + * zfs_log_symlink() handles TX_SYMLINK transactions. + */ +void +zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, znode_t *zp, char *name, char *link) +{ + itx_t *itx; + uint64_t seq; + lr_create_t *lr; + size_t namesize = strlen(name) + 1; + size_t linksize = strlen(link) + 1; + + if (zilog == NULL) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); + lr = (lr_create_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_foid = zp->z_id; + lr->lr_mode = zp->z_phys->zp_mode; + lr->lr_uid = zp->z_phys->zp_uid; + lr->lr_gid = zp->z_phys->zp_gid; + lr->lr_gen = zp->z_phys->zp_gen; + lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; + lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; + bcopy(name, (char *)(lr + 1), namesize); + bcopy(link, (char *)(lr + 1) + namesize, linksize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; + zp->z_last_itx = seq; +} + +/* + * zfs_log_rename() handles TX_RENAME transactions. + */ +void +zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) +{ + itx_t *itx; + uint64_t seq; + lr_rename_t *lr; + size_t snamesize = strlen(sname) + 1; + size_t dnamesize = strlen(dname) + 1; + + if (zilog == NULL) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); + lr = (lr_rename_t *)&itx->itx_lr; + lr->lr_sdoid = sdzp->z_id; + lr->lr_tdoid = tdzp->z_id; + bcopy(sname, (char *)(lr + 1), snamesize); + bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); + + seq = zil_itx_assign(zilog, itx, tx); + sdzp->z_last_itx = seq; + tdzp->z_last_itx = seq; + szp->z_last_itx = seq; +} + +/* + * zfs_log_write() handles TX_WRITE transactions. + */ +ssize_t zfs_immediate_write_sz = 32768; + +void +zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, offset_t off, ssize_t len, int ioflag) +{ + itx_t *itx; + uint64_t seq; + lr_write_t *lr; + itx_wr_state_t write_state; + int err; + + if (zilog == NULL || zp->z_unlinked) + return; + + /* + * Writes are handled in three different ways: + * + * WR_INDIRECT: + * If the write is greater than zfs_immediate_write_sz then + * later *if* we need to log the write then dmu_sync() is used + * to immediately write the block and it's block pointer is put + * in the log record. + * WR_COPIED: + * If we know we'll immediately be committing the + * transaction (FDSYNC (O_DSYNC)), the we allocate a larger + * log record here for the data and copy the data in. + * WR_NEED_COPY: + * Otherwise we don't allocate a buffer, and *if* we need to + * flush the write later then a buffer is allocated and + * we retrieve the data using the dmu. + */ + if (len > zfs_immediate_write_sz) + write_state = WR_INDIRECT; + else if (ioflag & IO_SYNC) + write_state = WR_COPIED; + else + write_state = WR_NEED_COPY; + + itx = zil_itx_create(txtype, sizeof (*lr) + + (write_state == WR_COPIED ? len : 0)); + lr = (lr_write_t *)&itx->itx_lr; + if (write_state == WR_COPIED) { + err = dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1); + if (err) { + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + write_state = WR_NEED_COPY; + } + } + + itx->itx_wr_state = write_state; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = len; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); + + itx->itx_private = zp->z_zfsvfs; + + itx->itx_sync = (zp->z_sync_cnt != 0); + seq = zil_itx_assign(zilog, itx, tx); + zp->z_last_itx = seq; +} + +/* + * zfs_log_truncate() handles TX_TRUNCATE transactions. + */ +void +zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t off, uint64_t len) +{ + itx_t *itx; + uint64_t seq; + lr_truncate_t *lr; + + if (zilog == NULL || zp->z_unlinked) + return; + + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_truncate_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = len; + + itx->itx_sync = (zp->z_sync_cnt != 0); + seq = zil_itx_assign(zilog, itx, tx); + zp->z_last_itx = seq; +} + +/* + * zfs_log_setattr() handles TX_SETATTR transactions. + */ +void +zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, vattr_t *vap, uint_t mask_applied) +{ + itx_t *itx; + uint64_t seq; + lr_setattr_t *lr; + + if (zilog == NULL || zp->z_unlinked) + return; + + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_setattr_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_mask = (uint64_t)mask_applied; + lr->lr_mode = (uint64_t)vap->va_mode; + lr->lr_uid = (uint64_t)vap->va_uid; + lr->lr_gid = (uint64_t)vap->va_gid; + lr->lr_size = (uint64_t)vap->va_size; + ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); + ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); + + itx->itx_sync = (zp->z_sync_cnt != 0); + seq = zil_itx_assign(zilog, itx, tx); + zp->z_last_itx = seq; +} + +/* + * zfs_log_acl() handles TX_ACL transactions. + */ +void +zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, int aclcnt, ace_t *z_ace) +{ + itx_t *itx; + uint64_t seq; + lr_acl_t *lr; + + if (zilog == NULL || zp->z_unlinked) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + aclcnt * sizeof (ace_t)); + lr = (lr_acl_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_aclcnt = (uint64_t)aclcnt; + bcopy(z_ace, (ace_t *)(lr + 1), aclcnt * sizeof (ace_t)); + + itx->itx_sync = (zp->z_sync_cnt != 0); + seq = zil_itx_assign(zilog, itx, tx); + zp->z_last_itx = seq; +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c new file mode 100644 index 0000000..ad3ad91 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c @@ -0,0 +1,424 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/vfs.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/spa.h> +#include <sys/zil.h> +#include <sys/byteorder.h> +#include <sys/stat.h> +#include <sys/acl.h> +#include <sys/atomic.h> +#include <sys/cred.h> +#include <sys/namei.h> + +/* + * Functions to replay ZFS intent log (ZIL) records + * The functions are called through a function vector (zfs_replay_vector) + * which is indexed by the transaction type. + */ + +static void +zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, + uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) +{ + VATTR_NULL(vap); + vap->va_mask = (uint_t)mask; + vap->va_type = IFTOVT(mode); + vap->va_mode = mode & MODEMASK; + vap->va_uid = (uid_t)uid; + vap->va_gid = (gid_t)gid; + vap->va_rdev = zfs_cmpldev(rdev); + vap->va_nodeid = nodeid; +} + +/* ARGSUSED */ +static int +zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap) +{ + return (ENOTSUP); +} + +static int +zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) +{ + char *name = (char *)(lr + 1); /* name follows lr_create_t */ + char *link; /* symlink content follows name */ + znode_t *dzp; + vnode_t *vp = NULL; + vattr_t va; + struct componentname cn; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID, + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); + + /* + * All forms of zfs create (create, mkdir, mkxattrdir, symlink) + * eventually end up in zfs_mknode(), which assigns the object's + * creation time and generation number. The generic VOP_CREATE() + * doesn't have either concept, so we smuggle the values inside + * the vattr's otherwise unused va_ctime and va_nblocks fields. + */ + ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime); + va.va_nblocks = lr->lr_gen; + + cn.cn_nameptr = name; + cn.cn_cred = kcred; + cn.cn_thread = curthread; + cn.cn_flags = SAVENAME; + + vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread); + switch ((int)lr->lr_common.lrc_txtype) { + case TX_CREATE: + error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &va); + break; + case TX_MKDIR: + error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &va); + break; + case TX_MKXATTR: + error = zfs_make_xattrdir(dzp, &va, &vp, kcred); + break; + case TX_SYMLINK: + link = name + strlen(name) + 1; + error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &va, link); + break; + default: + error = ENOTSUP; + } + VOP_UNLOCK(ZTOV(dzp), 0, curthread); + + if (error == 0 && vp != NULL) { + VOP_UNLOCK(vp, 0, curthread); + VN_RELE(vp); + } + + VN_RELE(ZTOV(dzp)); + + return (error); +} + +static int +zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) +{ + char *name = (char *)(lr + 1); /* name follows lr_remove_t */ + znode_t *dzp; + struct componentname cn; + vnode_t *vp; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + cn.cn_nameptr = name; + cn.cn_namelen = strlen(name); + cn.cn_nameiop = DELETE; + cn.cn_flags = ISLASTCN | SAVENAME; + cn.cn_cred = kcred; + cn.cn_thread = curthread; + vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread); + error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn); + if (error != 0) { + VOP_UNLOCK(ZTOV(dzp), 0, curthread); + goto fail; + } + + switch ((int)lr->lr_common.lrc_txtype) { + case TX_REMOVE: + error = VOP_REMOVE(ZTOV(dzp), vp, &cn); + break; + case TX_RMDIR: + error = VOP_RMDIR(ZTOV(dzp), vp, &cn); + break; + default: + error = ENOTSUP; + } + vput(vp); + VOP_UNLOCK(ZTOV(dzp), 0, curthread); +fail: + VN_RELE(ZTOV(dzp)); + + return (error); +} + +static int +zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) +{ + char *name = (char *)(lr + 1); /* name follows lr_link_t */ + znode_t *dzp, *zp; + struct componentname cn; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { + VN_RELE(ZTOV(dzp)); + return (error); + } + + cn.cn_nameptr = name; + cn.cn_cred = kcred; + cn.cn_thread = curthread; + cn.cn_flags = SAVENAME; + + vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY, curthread); + error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn); + VOP_UNLOCK(ZTOV(zp), 0, curthread); + VOP_UNLOCK(ZTOV(dzp), 0, curthread); + + VN_RELE(ZTOV(zp)); + VN_RELE(ZTOV(dzp)); + + return (error); +} + +static int +zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) +{ + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; + znode_t *sdzp, *tdzp; + struct componentname scn, tcn; + vnode_t *svp, *tvp; + kthread_t *td = curthread; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) + return (error); + + if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { + VN_RELE(ZTOV(sdzp)); + return (error); + } + + svp = tvp = NULL; + + scn.cn_nameptr = sname; + scn.cn_namelen = strlen(sname); + scn.cn_nameiop = DELETE; + scn.cn_flags = ISLASTCN | SAVENAME; + scn.cn_cred = kcred; + scn.cn_thread = td; + vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn); + VOP_UNLOCK(ZTOV(sdzp), 0, td); + if (error != 0) + goto fail; + VOP_UNLOCK(svp, 0, td); + + tcn.cn_nameptr = tname; + tcn.cn_namelen = strlen(tname); + tcn.cn_nameiop = RENAME; + tcn.cn_flags = ISLASTCN | SAVENAME; + tcn.cn_cred = kcred; + tcn.cn_thread = td; + vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn); + if (error == EJUSTRETURN) + tvp = NULL; + else if (error != 0) { + VOP_UNLOCK(ZTOV(tdzp), 0, td); + goto fail; + } + + error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn); + return (error); +fail: + if (svp != NULL) + vrele(svp); + if (tvp != NULL) + vrele(tvp); + VN_RELE(ZTOV(tdzp)); + VN_RELE(ZTOV(sdzp)); + + return (error); +} + +static int +zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) +{ + char *data = (char *)(lr + 1); /* data follows lr_write_t */ + znode_t *zp; + int error; + ssize_t resid; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log writes out of order, it's possible the + * file has been removed. In this case just drop the write + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length, + lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + VN_RELE(ZTOV(zp)); + + return (error); +} + +static int +zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap) +{ + + ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org"); + return (EOPNOTSUPP); +} + +static int +zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) +{ + znode_t *zp; + vattr_t va; + vnode_t *vp; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log setattrs out of order, it's possible the + * file has been removed. In this case just drop the setattr + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode, + lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); + + va.va_size = lr->lr_size; + ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime); + ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime); + + vp = ZTOV(zp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + error = VOP_SETATTR(vp, &va, kcred, curthread); + VOP_UNLOCK(vp, 0, curthread); + VN_RELE(vp); + + return (error); +} + +static int +zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) +{ + ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ +#ifdef TODO + vsecattr_t vsa; +#endif + znode_t *zp; + int error; + + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + zfs_ace_byteswap(ace, lr->lr_aclcnt); + } + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log acls out of order, it's possible the + * file has been removed. In this case just drop the acl + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + +#ifdef TODO + bzero(&vsa, sizeof (vsa)); + vsa.vsa_mask = VSA_ACE | VSA_ACECNT; + vsa.vsa_aclcnt = lr->lr_aclcnt; + vsa.vsa_aclentp = ace; + + error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred); +#else + error = EOPNOTSUPP; +#endif + + VN_RELE(ZTOV(zp)); + + return (error); +} + +/* + * Callback vectors for replaying records + */ +zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { + zfs_replay_error, /* 0 no such transaction type */ + zfs_replay_create, /* TX_CREATE */ + zfs_replay_create, /* TX_MKDIR */ + zfs_replay_create, /* TX_MKXATTR */ + zfs_replay_create, /* TX_SYMLINK */ + zfs_replay_remove, /* TX_REMOVE */ + zfs_replay_remove, /* TX_RMDIR */ + zfs_replay_link, /* TX_LINK */ + zfs_replay_rename, /* TX_RENAME */ + zfs_replay_write, /* TX_WRITE */ + zfs_replay_truncate, /* TX_TRUNCATE */ + zfs_replay_setattr, /* TX_SETATTR */ + zfs_replay_acl, /* TX_ACL */ +}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c new file mode 100644 index 0000000..07ec0f6 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c @@ -0,0 +1,594 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This file contains the code to implement file range locking in + * ZFS, although there isn't much specific to ZFS (all that comes to mind + * support for growing the blocksize). + * + * Interface + * --------- + * Defined in zfs_rlock.h but essentially: + * rl = zfs_range_lock(zp, off, len, lock_type); + * zfs_range_unlock(rl); + * zfs_range_reduce(rl, off, len); + * + * AVL tree + * -------- + * An AVL tree is used to maintain the state of the existing ranges + * that are locked for exclusive (writer) or shared (reader) use. + * The starting range offset is used for searching and sorting the tree. + * + * Common case + * ----------- + * The (hopefully) usual case is of no overlaps or contention for + * locks. On entry to zfs_lock_range() a rl_t is allocated; the tree + * searched that finds no overlap, and *this* rl_t is placed in the tree. + * + * Overlaps/Reference counting/Proxy locks + * --------------------------------------- + * The avl code only allows one node at a particular offset. Also it's very + * inefficient to search through all previous entries looking for overlaps + * (because the very 1st in the ordered list might be at offset 0 but + * cover the whole file). + * So this implementation uses reference counts and proxy range locks. + * Firstly, only reader locks use reference counts and proxy locks, + * because writer locks are exclusive. + * When a reader lock overlaps with another then a proxy lock is created + * for that range and replaces the original lock. If the overlap + * is exact then the reference count of the proxy is simply incremented. + * Otherwise, the proxy lock is split into smaller lock ranges and + * new proxy locks created for non overlapping ranges. + * The reference counts are adjusted accordingly. + * Meanwhile, the orginal lock is kept around (this is the callers handle) + * and its offset and length are used when releasing the lock. + * + * Thread coordination + * ------------------- + * In order to make wakeups efficient and to ensure multiple continuous + * readers on a range don't starve a writer for the same range lock, + * two condition variables are allocated in each rl_t. + * If a writer (or reader) can't get a range it initialises the writer + * (or reader) cv; sets a flag saying there's a writer (or reader) waiting; + * and waits on that cv. When a thread unlocks that range it wakes up all + * writers then all readers before destroying the lock. + * + * Append mode writes + * ------------------ + * Append mode writes need to lock a range at the end of a file. + * The offset of the end of the file is determined under the + * range locking mutex, and the lock type converted from RL_APPEND to + * RL_WRITER and the range locked. + * + * Grow block handling + * ------------------- + * ZFS supports multiple block sizes currently upto 128K. The smallest + * block size is used for the file which is grown as needed. During this + * growth all other writers and readers must be excluded. + * So if the block size needs to be grown then the whole file is + * exclusively locked, then later the caller will reduce the lock + * range to just the range to be written using zfs_reduce_range. + */ + +#include <sys/zfs_rlock.h> + +/* + * Check if a write lock can be grabbed, or wait and recheck until available. + */ +static void +zfs_range_lock_writer(znode_t *zp, rl_t *new) +{ + avl_tree_t *tree = &zp->z_range_avl; + rl_t *rl; + avl_index_t where; + uint64_t end_size; + uint64_t off = new->r_off; + uint64_t len = new->r_len; + + for (;;) { + /* + * Range locking is also used by zvol and uses a + * dummied up znode. However, for zvol, we don't need to + * append or grow blocksize, and besides we don't have + * a z_phys or z_zfsvfs - so skip that processing. + * + * Yes, this is ugly, and would be solved by not handling + * grow or append in range lock code. If that was done then + * we could make the range locking code generically available + * to other non-zfs consumers. + */ + if (zp->z_vnode) { /* caller is ZPL */ + /* + * If in append mode pick up the current end of file. + * This is done under z_range_lock to avoid races. + */ + if (new->r_type == RL_APPEND) + new->r_off = zp->z_phys->zp_size; + + /* + * If we need to grow the block size then grab the whole + * file range. This is also done under z_range_lock to + * avoid races. + */ + end_size = MAX(zp->z_phys->zp_size, new->r_off + len); + if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || + zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { + new->r_off = 0; + new->r_len = UINT64_MAX; + } + } + + /* + * First check for the usual case of no locks + */ + if (avl_numnodes(tree) == 0) { + new->r_type = RL_WRITER; /* convert to writer */ + avl_add(tree, new); + return; + } + + /* + * Look for any locks in the range. + */ + rl = avl_find(tree, new, &where); + if (rl) + goto wait; /* already locked at same offset */ + + rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER); + if (rl && (rl->r_off < new->r_off + new->r_len)) + goto wait; + + rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); + if (rl && rl->r_off + rl->r_len > new->r_off) + goto wait; + + new->r_type = RL_WRITER; /* convert possible RL_APPEND */ + avl_insert(tree, new, where); + return; +wait: + if (!rl->r_write_wanted) { + cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); + rl->r_write_wanted = B_TRUE; + } + cv_wait(&rl->r_wr_cv, &zp->z_range_lock); + + /* reset to original */ + new->r_off = off; + new->r_len = len; + } +} + +/* + * If this is an original (non-proxy) lock then replace it by + * a proxy and return the proxy. + */ +static rl_t * +zfs_range_proxify(avl_tree_t *tree, rl_t *rl) +{ + rl_t *proxy; + + if (rl->r_proxy) + return (rl); /* already a proxy */ + + ASSERT3U(rl->r_cnt, ==, 1); + ASSERT(rl->r_write_wanted == B_FALSE); + ASSERT(rl->r_read_wanted == B_FALSE); + avl_remove(tree, rl); + rl->r_cnt = 0; + + /* create a proxy range lock */ + proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP); + proxy->r_off = rl->r_off; + proxy->r_len = rl->r_len; + proxy->r_cnt = 1; + proxy->r_type = RL_READER; + proxy->r_proxy = B_TRUE; + proxy->r_write_wanted = B_FALSE; + proxy->r_read_wanted = B_FALSE; + avl_add(tree, proxy); + + return (proxy); +} + +/* + * Split the range lock at the supplied offset + * returning the *front* proxy. + */ +static rl_t * +zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) +{ + rl_t *front, *rear; + + ASSERT3U(rl->r_len, >, 1); + ASSERT3U(off, >, rl->r_off); + ASSERT3U(off, <, rl->r_off + rl->r_len); + ASSERT(rl->r_write_wanted == B_FALSE); + ASSERT(rl->r_read_wanted == B_FALSE); + + /* create the rear proxy range lock */ + rear = kmem_alloc(sizeof (rl_t), KM_SLEEP); + rear->r_off = off; + rear->r_len = rl->r_off + rl->r_len - off; + rear->r_cnt = rl->r_cnt; + rear->r_type = RL_READER; + rear->r_proxy = B_TRUE; + rear->r_write_wanted = B_FALSE; + rear->r_read_wanted = B_FALSE; + + front = zfs_range_proxify(tree, rl); + front->r_len = off - rl->r_off; + + avl_insert_here(tree, rear, front, AVL_AFTER); + return (front); +} + +/* + * Create and add a new proxy range lock for the supplied range. + */ +static void +zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) +{ + rl_t *rl; + + ASSERT(len); + rl = kmem_alloc(sizeof (rl_t), KM_SLEEP); + rl->r_off = off; + rl->r_len = len; + rl->r_cnt = 1; + rl->r_type = RL_READER; + rl->r_proxy = B_TRUE; + rl->r_write_wanted = B_FALSE; + rl->r_read_wanted = B_FALSE; + avl_add(tree, rl); +} + +static void +zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) +{ + rl_t *next; + uint64_t off = new->r_off; + uint64_t len = new->r_len; + + /* + * prev arrives either: + * - pointing to an entry at the same offset + * - pointing to the entry with the closest previous offset whose + * range may overlap with the new range + * - null, if there were no ranges starting before the new one + */ + if (prev) { + if (prev->r_off + prev->r_len <= off) { + prev = NULL; + } else if (prev->r_off != off) { + /* + * convert to proxy if needed then + * split this entry and bump ref count + */ + prev = zfs_range_split(tree, prev, off); + prev = AVL_NEXT(tree, prev); /* move to rear range */ + } + } + ASSERT((prev == NULL) || (prev->r_off == off)); + + if (prev) + next = prev; + else + next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); + + if (next == NULL || off + len <= next->r_off) { + /* no overlaps, use the original new rl_t in the tree */ + avl_insert(tree, new, where); + return; + } + + if (off < next->r_off) { + /* Add a proxy for initial range before the overlap */ + zfs_range_new_proxy(tree, off, next->r_off - off); + } + + new->r_cnt = 0; /* will use proxies in tree */ + /* + * We now search forward through the ranges, until we go past the end + * of the new range. For each entry we make it a proxy if it + * isn't already, then bump its reference count. If there's any + * gaps between the ranges then we create a new proxy range. + */ + for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { + if (off + len <= next->r_off) + break; + if (prev && prev->r_off + prev->r_len < next->r_off) { + /* there's a gap */ + ASSERT3U(next->r_off, >, prev->r_off + prev->r_len); + zfs_range_new_proxy(tree, prev->r_off + prev->r_len, + next->r_off - (prev->r_off + prev->r_len)); + } + if (off + len == next->r_off + next->r_len) { + /* exact overlap with end */ + next = zfs_range_proxify(tree, next); + next->r_cnt++; + return; + } + if (off + len < next->r_off + next->r_len) { + /* new range ends in the middle of this block */ + next = zfs_range_split(tree, next, off + len); + next->r_cnt++; + return; + } + ASSERT3U(off + len, >, next->r_off + next->r_len); + next = zfs_range_proxify(tree, next); + next->r_cnt++; + } + + /* Add the remaining end range. */ + zfs_range_new_proxy(tree, prev->r_off + prev->r_len, + (off + len) - (prev->r_off + prev->r_len)); +} + +/* + * Check if a reader lock can be grabbed, or wait and recheck until available. + */ +static void +zfs_range_lock_reader(znode_t *zp, rl_t *new) +{ + avl_tree_t *tree = &zp->z_range_avl; + rl_t *prev, *next; + avl_index_t where; + uint64_t off = new->r_off; + uint64_t len = new->r_len; + + /* + * Look for any writer locks in the range. + */ +retry: + prev = avl_find(tree, new, &where); + if (prev == NULL) + prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); + + /* + * Check the previous range for a writer lock overlap. + */ + if (prev && (off < prev->r_off + prev->r_len)) { + if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) { + if (!prev->r_read_wanted) { + cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL); + prev->r_read_wanted = B_TRUE; + } + cv_wait(&prev->r_rd_cv, &zp->z_range_lock); + goto retry; + } + if (off + len < prev->r_off + prev->r_len) + goto got_lock; + } + + /* + * Search through the following ranges to see if there's + * write lock any overlap. + */ + if (prev) + next = AVL_NEXT(tree, prev); + else + next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); + for (; next; next = AVL_NEXT(tree, next)) { + if (off + len <= next->r_off) + goto got_lock; + if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) { + if (!next->r_read_wanted) { + cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL); + next->r_read_wanted = B_TRUE; + } + cv_wait(&next->r_rd_cv, &zp->z_range_lock); + goto retry; + } + if (off + len <= next->r_off + next->r_len) + goto got_lock; + } + +got_lock: + /* + * Add the read lock, which may involve splitting existing + * locks and bumping ref counts (r_cnt). + */ + zfs_range_add_reader(tree, new, prev, where); +} + +/* + * Lock a range (offset, length) as either shared (RL_READER) + * or exclusive (RL_WRITER). Returns the range lock structure + * for later unlocking or reduce range (if entire file + * previously locked as RL_WRITER). + */ +rl_t * +zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) +{ + rl_t *new; + + ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); + + new = kmem_alloc(sizeof (rl_t), KM_SLEEP); + new->r_zp = zp; + new->r_off = off; + new->r_len = len; + new->r_cnt = 1; /* assume it's going to be in the tree */ + new->r_type = type; + new->r_proxy = B_FALSE; + new->r_write_wanted = B_FALSE; + new->r_read_wanted = B_FALSE; + + mutex_enter(&zp->z_range_lock); + if (type == RL_READER) { + /* + * First check for the usual case of no locks + */ + if (avl_numnodes(&zp->z_range_avl) == 0) + avl_add(&zp->z_range_avl, new); + else + zfs_range_lock_reader(zp, new); + } else + zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */ + mutex_exit(&zp->z_range_lock); + return (new); +} + +/* + * Unlock a reader lock + */ +static void +zfs_range_unlock_reader(znode_t *zp, rl_t *remove) +{ + avl_tree_t *tree = &zp->z_range_avl; + rl_t *rl, *next; + uint64_t len; + + /* + * The common case is when the remove entry is in the tree + * (cnt == 1) meaning there's been no other reader locks overlapping + * with this one. Otherwise the remove entry will have been + * removed from the tree and replaced by proxies (one or + * more ranges mapping to the entire range). + */ + if (remove->r_cnt == 1) { + avl_remove(tree, remove); + if (remove->r_write_wanted) + cv_broadcast(&remove->r_wr_cv); + if (remove->r_read_wanted) + cv_broadcast(&remove->r_rd_cv); + } else { + ASSERT3U(remove->r_cnt, ==, 0); + ASSERT3U(remove->r_write_wanted, ==, 0); + ASSERT3U(remove->r_read_wanted, ==, 0); + /* + * Find start proxy representing this reader lock, + * then decrement ref count on all proxies + * that make up this range, freeing them as needed. + */ + rl = avl_find(tree, remove, NULL); + ASSERT(rl); + ASSERT(rl->r_cnt); + ASSERT(rl->r_type == RL_READER); + for (len = remove->r_len; len != 0; rl = next) { + len -= rl->r_len; + if (len) { + next = AVL_NEXT(tree, rl); + ASSERT(next); + ASSERT(rl->r_off + rl->r_len == next->r_off); + ASSERT(next->r_cnt); + ASSERT(next->r_type == RL_READER); + } + rl->r_cnt--; + if (rl->r_cnt == 0) { + avl_remove(tree, rl); + if (rl->r_write_wanted) + cv_broadcast(&rl->r_wr_cv); + if (rl->r_read_wanted) + cv_broadcast(&rl->r_rd_cv); + kmem_free(rl, sizeof (rl_t)); + } + } + } + kmem_free(remove, sizeof (rl_t)); +} + +/* + * Unlock range and destroy range lock structure. + */ +void +zfs_range_unlock(rl_t *rl) +{ + znode_t *zp = rl->r_zp; + + ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER); + ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0); + ASSERT(!rl->r_proxy); + + mutex_enter(&zp->z_range_lock); + if (rl->r_type == RL_WRITER) { + /* writer locks can't be shared or split */ + avl_remove(&zp->z_range_avl, rl); + mutex_exit(&zp->z_range_lock); + if (rl->r_write_wanted) { + cv_broadcast(&rl->r_wr_cv); + cv_destroy(&rl->r_wr_cv); + } + if (rl->r_read_wanted) { + cv_broadcast(&rl->r_rd_cv); + cv_destroy(&rl->r_rd_cv); + } + kmem_free(rl, sizeof (rl_t)); + } else { + /* + * lock may be shared, let zfs_range_unlock_reader() + * release the lock and free the rl_t + */ + zfs_range_unlock_reader(zp, rl); + mutex_exit(&zp->z_range_lock); + } +} + +/* + * Reduce range locked as RL_WRITER from whole file to specified range. + * Asserts the whole file is exclusivly locked and so there's only one + * entry in the tree. + */ +void +zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) +{ + znode_t *zp = rl->r_zp; + + /* Ensure there are no other locks */ + ASSERT(avl_numnodes(&zp->z_range_avl) == 1); + ASSERT(rl->r_off == 0); + ASSERT(rl->r_type == RL_WRITER); + ASSERT(!rl->r_proxy); + ASSERT3U(rl->r_len, ==, UINT64_MAX); + ASSERT3U(rl->r_cnt, ==, 1); + + mutex_enter(&zp->z_range_lock); + rl->r_off = off; + rl->r_len = len; + mutex_exit(&zp->z_range_lock); + if (rl->r_write_wanted) + cv_broadcast(&rl->r_wr_cv); + if (rl->r_read_wanted) + cv_broadcast(&rl->r_rd_cv); +} + +/* + * AVL comparison function used to order range locks + * Locks are ordered on the start offset of the range. + */ +int +zfs_range_compare(const void *arg1, const void *arg2) +{ + const rl_t *rl1 = arg1; + const rl_t *rl2 = arg2; + + if (rl1->r_off > rl2->r_off) + return (1); + if (rl1->r_off < rl2->r_off) + return (-1); + return (0); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c new file mode 100644 index 0000000..27e00c3 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -0,0 +1,986 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/acl.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/mntent.h> +#include <sys/mount.h> +#include <sys/cmn_err.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_dir.h> +#include <sys/zil.h> +#include <sys/fs/zfs.h> +#include <sys/dmu.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_dataset.h> +#include <sys/spa.h> +#include <sys/zap.h> +#include <sys/varargs.h> +#include <sys/atomic.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_ctldir.h> +#include <sys/dnlc.h> + +struct mtx atomic_mtx; +MTX_SYSINIT(atomic, &atomic_mtx, "atomic", MTX_DEF); + +struct mtx zfs_debug_mtx; +MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); +SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); +int zfs_debug_level = 0; +SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0, + "Debug level"); + +static int zfs_mount(vfs_t *vfsp, kthread_t *td); +static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td); +static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td); +static int zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td); +static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); +static int zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td); +static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp); +static void zfs_objset_close(zfsvfs_t *zfsvfs); +static void zfs_freevfs(vfs_t *vfsp); + +static struct vfsops zfs_vfsops = { + .vfs_mount = zfs_mount, + .vfs_unmount = zfs_umount, + .vfs_root = zfs_root, + .vfs_statfs = zfs_statfs, + .vfs_vget = zfs_vget, + .vfs_sync = zfs_sync, + .vfs_fhtovp = zfs_fhtovp, +}; + +VFS_SET(zfs_vfsops, zfs, VFCF_JAIL); + +/* + * We need to keep a count of active fs's. + * This is necessary to prevent our module + * from being unloaded after a umount -f + */ +static uint32_t zfs_active_fs_count = 0; + +/*ARGSUSED*/ +static int +zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td) +{ + + /* + * Data integrity is job one. We don't want a compromised kernel + * writing to the storage pool, so we never sync during panic. + */ + if (panicstr) + return (0); + + if (vfsp != NULL) { + /* + * Sync a specific filesystem. + */ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + int error; + + error = vfs_stdsync(vfsp, waitfor, td); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, UINT64_MAX, 0); + else + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + ZFS_EXIT(zfsvfs); + } else { + /* + * Sync all ZFS filesystems. This is what happens when you + * run sync(1M). Unlike other filesystems, ZFS honors the + * request by waiting for all pools to commit all dirty data. + */ + spa_sync_allpools(); + } + + return (0); +} + +static void +atime_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == TRUE) { + zfsvfs->z_atime = TRUE; + zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); + } else { + zfsvfs->z_atime = FALSE; + zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); + } +} + +static void +xattr_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == TRUE) { + /* XXX locking on vfs_flag? */ +#ifdef TODO + zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; +#endif + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); + } else { + /* XXX locking on vfs_flag? */ +#ifdef TODO + zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; +#endif + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); + } +} + +static void +blksz_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval < SPA_MINBLOCKSIZE || + newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) + newval = SPA_MAXBLOCKSIZE; + + zfsvfs->z_max_blksz = newval; + zfsvfs->z_vfs->vfs_bsize = newval; +} + +static void +readonly_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval) { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); + } else { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); + } +} + +static void +setuid_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); + } +} + +static void +exec_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); + } +} + +static void +snapdir_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_show_ctldir = newval; +} + +static void +acl_mode_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_mode = newval; +} + +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_inherit = newval; +} + +static int +zfs_refresh_properties(vfs_t *vfsp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + /* + * Remount operations default to "rw" unless "ro" is explicitly + * specified. + */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + readonly_changed_cb(zfsvfs, B_TRUE); + } else { + if (!dmu_objset_is_snapshot(zfsvfs->z_os)) + readonly_changed_cb(zfsvfs, B_FALSE); + else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) + return (EROFS); + } + + if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { + setuid_changed_cb(zfsvfs, B_FALSE); + } else { + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) + setuid_changed_cb(zfsvfs, B_FALSE); + else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) + setuid_changed_cb(zfsvfs, B_TRUE); + } + + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) + exec_changed_cb(zfsvfs, B_FALSE); + else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) + exec_changed_cb(zfsvfs, B_TRUE); + + if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) + atime_changed_cb(zfsvfs, B_TRUE); + else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) + atime_changed_cb(zfsvfs, B_FALSE); + + if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) + xattr_changed_cb(zfsvfs, B_TRUE); + else if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) + xattr_changed_cb(zfsvfs, B_FALSE); + + return (0); +} + +static int +zfs_register_callbacks(vfs_t *vfsp) +{ + struct dsl_dataset *ds = NULL; + objset_t *os = NULL; + zfsvfs_t *zfsvfs = NULL; + int readonly, do_readonly = FALSE; + int setuid, do_setuid = FALSE; + int exec, do_exec = FALSE; + int xattr, do_xattr = FALSE; + int error = 0; + + ASSERT(vfsp); + zfsvfs = vfsp->vfs_data; + ASSERT(zfsvfs); + os = zfsvfs->z_os; + + /* + * The act of registering our callbacks will destroy any mount + * options we may have. In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. + */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + readonly = B_TRUE; + do_readonly = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { + readonly = B_FALSE; + do_readonly = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; + } else { + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { + setuid = B_TRUE; + do_setuid = B_TRUE; + } + } + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { + exec = B_FALSE; + do_exec = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { + exec = B_TRUE; + do_exec = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { + xattr = B_FALSE; + do_xattr = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { + xattr = B_TRUE; + do_xattr = B_TRUE; + } + + /* + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... + */ + ds = dmu_objset_ds(os); + error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "xattr", xattr_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "recordsize", blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "readonly", readonly_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "setuid", setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "exec", exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "snapdir", snapdir_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "aclmode", acl_mode_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "aclinherit", acl_inherit_changed_cb, zfsvfs); + if (error) + goto unregister; + + /* + * Invoke our callbacks to restore temporary mount options. + */ + if (do_readonly) + readonly_changed_cb(zfsvfs, readonly); + if (do_setuid) + setuid_changed_cb(zfsvfs, setuid); + if (do_exec) + exec_changed_cb(zfsvfs, exec); + if (do_xattr) + xattr_changed_cb(zfsvfs, xattr); + + return (0); + +unregister: + /* + * We may attempt to unregister some callbacks that are not + * registered, but this is OK; it will simply return ENOMSG, + * which we will ignore. + */ + (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, + zfsvfs); + return (error); + +} + +static int +zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td) +{ + cred_t *cr = td->td_ucred; + uint64_t recordsize, readonly; + int error = 0; + int mode; + zfsvfs_t *zfsvfs; + znode_t *zp = NULL; + + ASSERT(vfsp); + ASSERT(osname); + + /* + * Initialize the zfs-specific filesystem structure. + * Should probably make this a kmem cache, shuffle fields, + * and just bzero up to z_hold_mtx[]. + */ + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + zfsvfs->z_vfs = vfsp; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_assign = TXG_NOWAIT; + zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL); + + if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, + NULL)) + goto out; + zfsvfs->z_vfs->vfs_bsize = recordsize; + + vfsp->vfs_data = zfsvfs; + vfsp->mnt_flag |= MNT_LOCAL; + vfsp->mnt_kern_flag |= MNTK_MPSAFE; + vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; + + if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) + goto out; + + if (readonly) + mode = DS_MODE_PRIMARY | DS_MODE_READONLY; + else + mode = DS_MODE_PRIMARY; + + error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); + if (error == EROFS) { + /* + * FreeBSD: In Solaris there is DS_MODE_PRIMARY instead of + * DS_MODE_STANDARD, but it doesn't work on FreeBSD and + * I don't know why. It looks like the dataset is opened + * on mount DS_MODE_PRIMARY mode and snapshot cannot open + * the same dataset in DS_MODE_PRIMARY mode again. + */ + mode = DS_MODE_STANDARD | DS_MODE_READONLY; + error = dmu_objset_open(osname, DMU_OST_ZFS, mode, + &zfsvfs->z_os); + } + + if (error) + goto out; + + if (error = zfs_init_fs(zfsvfs, &zp, cr)) + goto out; + + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { + uint64_t xattr; + + ASSERT(mode & DS_MODE_READONLY); + atime_changed_cb(zfsvfs, B_FALSE); + readonly_changed_cb(zfsvfs, B_TRUE); + if (error = dsl_prop_get_integer(osname, "xattr", &xattr, NULL)) + goto out; + xattr_changed_cb(zfsvfs, xattr); + zfsvfs->z_issnap = B_TRUE; + } else { + error = zfs_register_callbacks(vfsp); + if (error) + goto out; + + zfs_unlinked_drain(zfsvfs); + + /* + * Parse and replay the intent log. + */ + zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, + zfs_replay_vector); + + if (!zil_disable) + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + } + + vfs_mountedfrom(vfsp, osname); + + if (!zfsvfs->z_issnap) + zfsctl_create(zfsvfs); +out: + if (error) { + if (zfsvfs->z_os) + dmu_objset_close(zfsvfs->z_os); + rw_destroy(&zfsvfs->z_um_lock); + mutex_destroy(&zfsvfs->z_znodes_lock); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + } else { + atomic_add_32(&zfs_active_fs_count, 1); + } + + return (error); + +} + +void +zfs_unregister_callbacks(zfsvfs_t *zfsvfs) +{ + objset_t *os = zfsvfs->z_os; + struct dsl_dataset *ds; + + /* + * Unregister properties. + */ + if (!dmu_objset_is_snapshot(os)) { + ds = dmu_objset_ds(os); + VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "aclinherit", + acl_inherit_changed_cb, zfsvfs) == 0); + } +} + +/*ARGSUSED*/ +static int +zfs_mount(vfs_t *vfsp, kthread_t *td) +{ + char *from; + int error; + + /* TODO: For now deny user mounts. */ + if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) + return (error); + + /* + * When doing a remount, we simply refresh our temporary properties + * according to those options set in the current VFS options. + */ + if (vfsp->vfs_flag & MS_REMOUNT) + return (zfs_refresh_properties(vfsp)); + + if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&from, NULL)) + return (EINVAL); + + return (zfs_domount(vfsp, from, td)); +} + +static int +zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + uint64_t refdbytes, availbytes, usedobjs, availobjs; + + statp->f_version = STATFS_VERSION; + + ZFS_ENTER(zfsvfs); + + dmu_objset_space(zfsvfs->z_os, + &refdbytes, &availbytes, &usedobjs, &availobjs); + + /* + * The underlying storage pool actually uses multiple block sizes. + * We report the fragsize as the smallest block size we support, + * and we report our blocksize as the filesystem's maximum blocksize. + */ + statp->f_bsize = zfsvfs->z_vfs->vfs_bsize; + statp->f_iosize = zfsvfs->z_vfs->vfs_bsize; + + /* + * The following report "total" blocks of various kinds in the + * file system, but reported in terms of f_frsize - the + * "fragment" size. + */ + + statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize; + statp->f_bfree = availbytes / statp->f_bsize; + statp->f_bavail = statp->f_bfree; /* no root reservation */ + + /* + * statvfs() should really be called statufs(), because it assumes + * static metadata. ZFS doesn't preallocate files, so the best + * we can do is report the max that could possibly fit in f_files, + * and that minus the number actually used in f_ffree. + * For f_ffree, report the smaller of the number of object available + * and the number of blocks (each object will take at least a block). + */ + statp->f_ffree = MIN(availobjs, statp->f_bfree); + statp->f_files = statp->f_ffree + usedobjs; + + /* + * We're a zfs filesystem. + */ + (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); + + strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, + sizeof(statp->f_mntfromname)); + strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, + sizeof(statp->f_mntonname)); + + statp->f_namemax = ZFS_MAXNAMELEN; + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *rootzp; + int error; + + ZFS_ENTER(zfsvfs); + + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) { + *vpp = ZTOV(rootzp); + error = vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); + (*vpp)->v_vflag |= VV_ROOT; + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +static int +zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + cred_t *cr = td->td_ucred; + int ret; + + if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (ret); + + (void) dnlc_purge_vfsp(vfsp, 0); + + /* + * Unmount any snapshots mounted under .zfs before unmounting the + * dataset itself. + */ + if (zfsvfs->z_ctldir != NULL) { + if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) + return (ret); + ret = vflush(vfsp, 0, 0, td); + ASSERT(ret == EBUSY); + if (!(fflag & MS_FORCE)) { + if (zfsvfs->z_ctldir->v_count > 1) + return (EBUSY); + ASSERT(zfsvfs->z_ctldir->v_count == 1); + } + zfsctl_destroy(zfsvfs); + ASSERT(zfsvfs->z_ctldir == NULL); + } + + /* + * Flush all the files. + */ + ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); + if (ret != 0) { + if (!zfsvfs->z_issnap) { + zfsctl_create(zfsvfs); + ASSERT(zfsvfs->z_ctldir != NULL); + } + return (ret); + } + + if (fflag & MS_FORCE) { + MNT_ILOCK(vfsp); + vfsp->mnt_kern_flag |= MNTK_UNMOUNTF; + MNT_IUNLOCK(vfsp); + zfsvfs->z_unmounted1 = B_TRUE; + + /* + * Wait for all zfs threads to leave zfs. + * Grabbing a rwlock as reader in all vops and + * as writer here doesn't work because it too easy to get + * multiple reader enters as zfs can re-enter itself. + * This can lead to deadlock if there is an intervening + * rw_enter as writer. + * So a file system threads ref count (z_op_cnt) is used. + * A polling loop on z_op_cnt may seem inefficient, but + * - this saves all threads on exit from having to grab a + * mutex in order to cv_signal + * - only occurs on forced unmount in the rare case when + * there are outstanding threads within the file system. + */ + while (zfsvfs->z_op_cnt) { + delay(1); + } + } + + zfs_objset_close(zfsvfs); + VFS_RELE(vfsp); + zfs_freevfs(vfsp); + + return (0); +} + +static int +zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + int err; + + ZFS_ENTER(zfsvfs); + err = zfs_zget(zfsvfs, ino, &zp); + if (err == 0 && zp->z_unlinked) { + VN_RELE(ZTOV(zp)); + err = EINVAL; + } + if (err != 0) + *vpp = NULL; + else { + *vpp = ZTOV(zp); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread); + } + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) +{ + kthread_t *td = curthread; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + uint64_t object = 0; + uint64_t fid_gen = 0; + uint64_t gen_mask; + uint64_t zp_gen; + int i, err; + + *vpp = NULL; + + ZFS_ENTER(zfsvfs); + + if (fidp->fid_len == LONG_FID_LEN) { + zfid_long_t *zlfid = (zfid_long_t *)fidp; + uint64_t objsetid = 0; + uint64_t setgen = 0; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); + + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); + + ZFS_EXIT(zfsvfs); + + err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); + if (err) + return (EINVAL); + ZFS_ENTER(zfsvfs); + } + + if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { + zfid_short_t *zfid = (zfid_short_t *)fidp; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); + } else { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* A zero fid_gen means we are in the .zfs control directories */ + if (fid_gen == 0 && + (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { + *vpp = zfsvfs->z_ctldir; + ASSERT(*vpp != NULL); + if (object == ZFSCTL_INO_SNAPDIR) { + VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, + 0, NULL, NULL) == 0); + } else { + VN_HOLD(*vpp); + } + ZFS_EXIT(zfsvfs); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); + return (0); + } + + gen_mask = -1ULL >> (64 - 8 * i); + + dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); + if (err = zfs_zget(zfsvfs, object, &zp)) { + ZFS_EXIT(zfsvfs); + return (err); + } + zp_gen = zp->z_phys->zp_gen & gen_mask; + if (zp_gen == 0) + zp_gen = 1; + if (zp->z_unlinked || zp_gen != fid_gen) { + dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); + VN_RELE(ZTOV(zp)); + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + *vpp = ZTOV(zp); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); + vnode_create_vobject(*vpp, zp->z_phys->zp_size, td); + ZFS_EXIT(zfsvfs); + return (0); +} + +static void +zfs_objset_close(zfsvfs_t *zfsvfs) +{ + znode_t *zp, *nextzp; + objset_t *os = zfsvfs->z_os; + + /* + * For forced unmount, at this point all vops except zfs_inactive + * are erroring EIO. We need to now suspend zfs_inactive threads + * while we are freeing dbufs before switching zfs_inactive + * to use behaviour without a objset. + */ + rw_enter(&zfsvfs->z_um_lock, RW_WRITER); + + /* + * Release all holds on dbufs + * Note, although we have stopped all other vop threads and + * zfs_inactive(), the dmu can callback via znode_pageout_func() + * which can zfs_znode_free() the znode. + * So we lock z_all_znodes; search the list for a held + * dbuf; drop the lock (we know zp can't disappear if we hold + * a dbuf lock; then regrab the lock and restart. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) { + nextzp = list_next(&zfsvfs->z_all_znodes, zp); + if (zp->z_dbuf_held) { + /* dbufs should only be held when force unmounting */ + zp->z_dbuf_held = 0; + mutex_exit(&zfsvfs->z_znodes_lock); + dmu_buf_rele(zp->z_dbuf, NULL); + /* Start again */ + mutex_enter(&zfsvfs->z_znodes_lock); + nextzp = list_head(&zfsvfs->z_all_znodes); + } + } + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * Unregister properties. + */ + if (!dmu_objset_is_snapshot(os)) + zfs_unregister_callbacks(zfsvfs); + + /* + * Switch zfs_inactive to behaviour without an objset. + * It just tosses cached pages and frees the znode & vnode. + * Then re-enable zfs_inactive threads in that new behaviour. + */ + zfsvfs->z_unmounted2 = B_TRUE; + rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */ + + /* + * Close the zil. Can't close the zil while zfs_inactive + * threads are blocked as zil_close can call zfs_inactive. + */ + if (zfsvfs->z_log) { + zil_close(zfsvfs->z_log); + zfsvfs->z_log = NULL; + } + + /* + * Evict all dbufs so that cached znodes will be freed + */ + if (dmu_objset_evict_dbufs(os, 1)) { + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + (void) dmu_objset_evict_dbufs(os, 0); + } + + /* + * Finally close the objset + */ + dmu_objset_close(os); +} + +static void +zfs_freevfs(vfs_t *vfsp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + int i; + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); + rw_destroy(&zfsvfs->z_um_lock); + mutex_destroy(&zfsvfs->z_znodes_lock); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + + atomic_add_32(&zfs_active_fs_count, -1); +} + +void +zfs_init(void) +{ + + printf("ZFS filesystem version " ZFS_VERSION_STRING "\n"); + + /* + * Initialize .zfs directory structures + */ + zfsctl_init(); + + /* + * Initialize znode cache, vnode ops, etc... + */ + zfs_znode_init(); +} + +void +zfs_fini(void) +{ + zfsctl_fini(); + zfs_znode_fini(); +} + +int +zfs_busy(void) +{ + return (zfs_active_fs_count != 0); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c new file mode 100644 index 0000000..6769177 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -0,0 +1,3227 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/taskq.h> +#include <sys/uio.h> +#include <sys/atomic.h> +#include <sys/namei.h> +#include <sys/mman.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_ioctl.h> +#include <sys/fs/zfs.h> +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/dbuf.h> +#include <sys/zap.h> +#include <sys/dirent.h> +#include <sys/filio.h> +#include <sys/zfs_ctldir.h> +#include <sys/dnlc.h> +#include <sys/zfs_rlock.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/sf_buf.h> +#include <sys/sched.h> + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait the the intent log to commit if it's is a synchronous operation. + * Morover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. + * + * (2) VN_RELE() should always be the last thing except for zil_commit() + * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which acquires range locks) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). + * In normal operation, this will be TXG_NOWAIT. During ZIL replay, + * it will be a specific txg. Either way, dmu_tx_assign() never blocks. + * This is critical because we don't want to block while holding locks. + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing to + * use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, + * then drop all locks, call dmu_tx_wait(), and try again. + * + * (5) If the operation succeeded, generate the intent log entry for it + * before dropping locks. This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. + * + * (6) At the end of each vnode op, the DMU tx must always commit, + * regardless of whether there were any errors. + * + * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) + * to ensure that synchronous semantics are provided when necessary. + * + * In general, this is how things should be ordered in each vnode op: + * + * ZFS_ENTER(zfsvfs); // exit if unmounted + * top: + * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) + * rw_enter(...); // grab any other locks you need + * tx = dmu_tx_create(...); // get DMU tx + * dmu_tx_hold_*(); // hold each object you might modify + * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign + * if (error) { + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + * dmu_tx_wait(tx); + * dmu_tx_abort(tx); + * goto top; + * } + * dmu_tx_abort(tx); // abort DMU tx + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // really out of space + * } + * error = do_real_work(); // do whatever this VOP does + * if (error == 0) + * zfs_log_*(...); // on success, make ZIL entry + * dmu_tx_commit(tx); // commit DMU tx -- error or not + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * zil_commit(zilog, seq, foid); // synchronous when necessary + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // done, report error + */ +/* ARGSUSED */ +static int +zfs_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + int flag = ap->a_mode; + + /* Keep a count of the synchronous opens in the znode */ + if (flag & FFSYNC) { + atomic_inc_32(&zp->z_sync_cnt); + ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org"); + } + + vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td); + return (0); +} + +/* ARGSUSED */ +static int +zfs_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + znode_t *zp = VTOZ(ap->a_vp); + int flag = ap->a_fflag; + + /* Decrement the synchronous opens in the znode */ + if (flag & FFSYNC) { + atomic_dec_32(&zp->z_sync_cnt); + ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org"); + } + + return (0); +} + +/* + * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and + * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. + */ +static int +zfs_holey(vnode_t *vp, int cmd, offset_t *off) +{ + znode_t *zp = VTOZ(vp); + uint64_t noff = (uint64_t)*off; /* new offset */ + uint64_t file_sz; + int error; + boolean_t hole; + + file_sz = zp->z_phys->zp_size; + if (noff >= file_sz) { + return (ENXIO); + } + + if (cmd == FIOSEEKHOLE) + hole = B_TRUE; + else + hole = B_FALSE; + + error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); + + /* end of file? */ + if ((error == ESRCH) || (noff > file_sz)) { + /* + * Handle the virtual hole at the end of file. + */ + if (hole) { + *off = file_sz; + return (0); + } + return (ENXIO); + } + + if (noff < *off) + return (error); + *off = noff; + return (error); +} + +/* ARGSUSED */ +static int +zfs_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + u_long a_command; + caddr_t a_data; + int fflag; + struct ucred *cred; + struct thread *td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + int com = ap->a_command; + caddr_t data = ap->a_data; + offset_t off; + zfsvfs_t *zfsvfs; + int error; + + switch (com) { + case FIOSEEKDATA: + case FIOSEEKHOLE: + off = *(offset_t *)data; + + zfsvfs = VTOZ(vp)->z_zfsvfs; + ZFS_ENTER(zfsvfs); + + /* offset parameter is in/out */ + error = zfs_holey(vp, com, &off); + ZFS_EXIT(zfsvfs); + if (error) + return (error); + *(offset_t *)data = off; + return (0); + } + return (ENOTTY); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Write: If we find a memory mapped page, we write to *both* + * the page and the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. + */ +static int +mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) +{ + znode_t *zp = VTOZ(vp); + objset_t *os = zp->z_zfsvfs->z_os; + vm_object_t obj; + vm_page_t m; + struct sf_buf *sf; + int64_t start, off; + int len = nbytes; + int error = 0; + + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + + start = uio->uio_loffset; + off = start & PAGEOFFSET; + VM_OBJECT_LOCK(obj); + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + uint64_t bytes = MIN(PAGESIZE - off, len); + uint64_t woff = uio->uio_loffset; + +again: + if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && + vm_page_is_valid(m, (vm_offset_t)off, bytes)) { + caddr_t va; + + if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb")) + goto again; + vm_page_busy(m); + VM_OBJECT_UNLOCK(obj); + sched_pin(); + sf = sf_buf_alloc(m, SFB_CPUPRIVATE); + va = (caddr_t)sf_buf_kva(sf); + error = uiomove(va+off, bytes, UIO_WRITE, uio); + if (error == 0) + dmu_write(os, zp->z_id, woff, bytes, va+off, tx); + sf_buf_free(sf); + sched_unpin(); + VM_OBJECT_LOCK(obj); + vm_page_wakeup(m); + } else { + VM_OBJECT_UNLOCK(obj); + error = dmu_write_uio(os, zp->z_id, uio, bytes, tx); + VM_OBJECT_LOCK(obj); + } + len -= bytes; + off = 0; + if (error) + break; + } + VM_OBJECT_UNLOCK(obj); + return (error); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Read: We "read" preferentially from memory mapped pages, + * else we default from the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. + */ +static int +mappedread(vnode_t *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + objset_t *os = zp->z_zfsvfs->z_os; + vm_object_t obj; + vm_page_t m; + struct sf_buf *sf; + int64_t start, off; + int len = nbytes; + int error = 0; + + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + + start = uio->uio_loffset; + off = start & PAGEOFFSET; + VM_OBJECT_LOCK(obj); + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + uint64_t bytes = MIN(PAGESIZE - off, len); + +again: + if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && + vm_page_is_valid(m, (vm_offset_t)off, bytes)) { + caddr_t va; + + if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) + goto again; + vm_page_busy(m); + VM_OBJECT_UNLOCK(obj); + sched_pin(); + sf = sf_buf_alloc(m, SFB_CPUPRIVATE); + va = (caddr_t)sf_buf_kva(sf); + error = uiomove(va + off, bytes, UIO_READ, uio); + sf_buf_free(sf); + sched_unpin(); + VM_OBJECT_LOCK(obj); + vm_page_wakeup(m); + } else { + VM_OBJECT_UNLOCK(obj); + error = dmu_read_uio(os, zp->z_id, uio, bytes); + VM_OBJECT_LOCK(obj); + } + len -= bytes; + off = 0; + if (error) + break; + } + VM_OBJECT_UNLOCK(obj); + return (error); +} + +offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ + +/* + * Read bytes from specified file into supplied buffer. + * + * IN: vp - vnode of file to be read from. + * uio - structure supplying read location, range info, + * and return buffer. + * ioflag - SYNC flags; used to provide FRSYNC semantics. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range, buffer filled. + * + * RETURN: 0 if success + * error code if failure + * + * Side Effects: + * vp - atime updated if byte count > 0 + */ +/* ARGSUSED */ +static int +zfs_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + uio_t *uio = ap->a_uio; + int ioflag = ap->a_ioflag; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + ssize_t n, nbytes; + int error; + rl_t *rl; + + ZFS_ENTER(zfsvfs); + + /* + * Validate file offset + */ + if (uio->uio_loffset < (offset_t)0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * Fasttrack empty reads + */ + if (uio->uio_resid == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + /* + * If we're in FRSYNC mode, sync out this znode before reading it. + */ + if (ioflag & IO_SYNC) + zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); + + /* + * Lock the range against changes. + */ + rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); + + /* + * If we are reading past end-of-file we can skip + * to the end; but we might still need to set atime. + */ + if (uio->uio_loffset >= zp->z_phys->zp_size) { + error = 0; + goto out; + } + + ASSERT(uio->uio_loffset < zp->z_phys->zp_size); + n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); + + while (n > 0) { + nbytes = MIN(n, zfs_read_chunk_size - + P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); + + if (vn_has_cached_data(vp)) + error = mappedread(vp, nbytes, uio); + else + error = dmu_read_uio(os, zp->z_id, uio, nbytes); + if (error) + break; + n -= nbytes; + } + +out: + zfs_range_unlock(rl); + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Fault in the pages of the first n bytes specified by the uio structure. + * 1 byte in each page is touched and the uio struct is unmodified. + * Any error will exit this routine as this is only a best + * attempt to get the pages resident. This is a copy of ufs_trans_touch(). + */ +static void +zfs_prefault_write(ssize_t n, struct uio *uio) +{ + struct iovec *iov; + ulong_t cnt, incr; + caddr_t p; + + if (uio->uio_segflg != UIO_USERSPACE) + return; + + iov = uio->uio_iov; + + while (n) { + cnt = MIN(iov->iov_len, n); + if (cnt == 0) { + /* empty iov entry */ + iov++; + continue; + } + n -= cnt; + /* + * touch each page in this segment. + */ + p = iov->iov_base; + while (cnt) { + if (fubyte(p) == -1) + return; + incr = MIN(cnt, PAGESIZE); + p += incr; + cnt -= incr; + } + /* + * touch the last byte in case it straddles a page. + */ + p--; + if (fubyte(p) == -1) + return; + iov++; + } +} + +/* + * Write the bytes to a file. + * + * IN: vp - vnode of file to be written to. + * uio - structure supplying write location, range info, + * and data buffer. + * ioflag - IO_APPEND flag set if in append mode. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - ctime|mtime updated if byte count > 0 + */ +/* ARGSUSED */ +static int +zfs_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + uio_t *uio = ap->a_uio; + int ioflag = ap->a_ioflag; + cred_t *cr = ap->a_cred; + znode_t *zp = VTOZ(vp); + ssize_t start_resid = uio->uio_resid; + ssize_t tx_bytes; + uint64_t end_size; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + offset_t woff; + ssize_t n, nbytes; + rl_t *rl; + int max_blksz = zfsvfs->z_max_blksz; + int error; + + /* + * Fasttrack empty write + */ + n = start_resid; + if (n == 0) + return (0); + + ZFS_ENTER(zfsvfs); + + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + */ + zfs_prefault_write(n, uio); + + /* + * If in append mode, set the io offset pointer to eof. + */ + if (ioflag & IO_APPEND) { + /* + * Range lock for a file append: + * The value for the start of range will be determined by + * zfs_range_lock() (to guarantee append semantics). + * If this write will cause the block size to increase, + * zfs_range_lock() will lock the entire file, so we must + * later reduce the range after we grow the block size. + */ + rl = zfs_range_lock(zp, 0, n, RL_APPEND); + if (rl->r_len == UINT64_MAX) { + /* overlocked, zp_size can't change */ + woff = uio->uio_loffset = zp->z_phys->zp_size; + } else { + woff = uio->uio_loffset = rl->r_off; + } + } else { + woff = uio->uio_loffset; + /* + * Validate file offset + */ + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * If we need to grow the block size then zfs_range_lock() + * will lock a wider range than we request here. + * Later after growing the block size we reduce the range. + */ + rl = zfs_range_lock(zp, woff, n, RL_WRITER); + } + + end_size = MAX(zp->z_phys->zp_size, woff + n); + + /* + * Write the file in reasonable size chunks. Each chunk is written + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting. + */ + while (n > 0) { + /* + * Start a transaction. + */ + woff = uio->uio_loffset; + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && + zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + continue; + } + dmu_tx_abort(tx); + break; + } + + /* + * If zfs_range_lock() over-locked we grow the blocksize + * and then reduce the lock range. This will only happen + * on the first iteration since zfs_range_reduce() will + * shrink down r_len to the appropriate size. + */ + if (rl->r_len == UINT64_MAX) { + uint64_t new_blksz; + + if (zp->z_blksz > max_blksz) { + ASSERT(!ISP2(zp->z_blksz)); + new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); + } else { + new_blksz = MIN(end_size, max_blksz); + } + zfs_grow_blocksize(zp, new_blksz, tx); + zfs_range_reduce(rl, woff, n); + } + + /* + * XXX - should we really limit each write to z_max_blksz? + * Perhaps we should use SPA_MAXBLOCKSIZE chunks? + */ + nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); + rw_enter(&zp->z_map_lock, RW_READER); + + tx_bytes = uio->uio_resid; + if (woff + nbytes > zp->z_phys->zp_size) + vnode_pager_setsize(vp, woff + nbytes); + + if (vn_has_cached_data(vp)) { + rw_exit(&zp->z_map_lock); + error = mappedwrite(vp, nbytes, uio, tx); + } else { + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, + uio, nbytes, tx); + rw_exit(&zp->z_map_lock); + } + tx_bytes -= uio->uio_resid; + + /* + * If we made no progress, we're done. If we made even + * partial progress, update the znode and ZIL accordingly. + */ + if (tx_bytes == 0) { + dmu_tx_commit(tx); + ASSERT(error != 0); + break; + } + + /* + * Clear Set-UID/Set-GID bits on successful write if not + * privileged and at least one of the excute bits is set. + * + * It would be nice to to this after all writes have + * been done, but that would still expose the ISUID/ISGID + * to another app after the partial write is committed. + */ + mutex_enter(&zp->z_acl_lock); + if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | + (S_IXUSR >> 6))) != 0 && + (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(cr, + (zp->z_phys->zp_mode & S_ISUID) != 0 && + zp->z_phys->zp_uid == 0) != 0) { + zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); + } + mutex_exit(&zp->z_acl_lock); + + /* + * Update time stamp. NOTE: This marks the bonus buffer as + * dirty, so we don't have to do it again for zp_size. + */ + zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + + /* + * Update the file size (zp_size) if it has changed; + * account for possible concurrent updates. + */ + while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) + (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, + uio->uio_loffset); + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); + dmu_tx_commit(tx); + + if (error != 0) + break; + ASSERT(tx_bytes == nbytes); + n -= nbytes; + } + + zfs_range_unlock(rl); + + /* + * If we're in replay mode, or we made no progress, return error. + * Otherwise, it's at least a partial write, so it's successful. + */ + if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (ioflag & IO_SYNC) + zil_commit(zilog, zp->z_last_itx, zp->z_id); + + ZFS_EXIT(zfsvfs); + return (0); +} + +void +zfs_get_done(dmu_buf_t *db, void *vzgd) +{ + zgd_t *zgd = (zgd_t *)vzgd; + rl_t *rl = zgd->zgd_rl; + vnode_t *vp = ZTOV(rl->r_zp); + int vfslocked; + + vfslocked = VFS_LOCK_GIANT(vp->v_vfsp); + dmu_buf_rele(db, vzgd); + zfs_range_unlock(rl); + VN_RELE(vp); + zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp))); + kmem_free(zgd, sizeof (zgd_t)); + VFS_UNLOCK_GIANT(vfslocked); +} + +/* + * Get data to generate a TX_WRITE intent log record. + */ +int +zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) +{ + zfsvfs_t *zfsvfs = arg; + objset_t *os = zfsvfs->z_os; + znode_t *zp; + uint64_t off = lr->lr_offset; + dmu_buf_t *db; + rl_t *rl; + zgd_t *zgd; + int dlen = lr->lr_length; /* length of user data */ + int error = 0; + + ASSERT(zio); + ASSERT(dlen != 0); + + /* + * Nothing to do if the file has been removed + */ + if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) + return (ENOENT); + if (zp->z_unlinked) { + VN_RELE(ZTOV(zp)); + return (ENOENT); + } + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. + */ + if (buf != NULL) { /* immediate write */ + rl = zfs_range_lock(zp, off, dlen, RL_READER); + /* test for truncation needs to be done while range locked */ + if (off >= zp->z_phys->zp_size) { + error = ENOENT; + goto out; + } + VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); + } else { /* indirect write */ + uint64_t boff; /* block starting offset */ + + /* + * Have to lock the whole block to ensure when it's + * written out and it's checksum is being calculated + * that no one can change the data. We need to re-check + * blocksize after we get the lock in case it's changed! + */ + for (;;) { + if (ISP2(zp->z_blksz)) { + boff = P2ALIGN_TYPED(off, zp->z_blksz, + uint64_t); + } else { + boff = 0; + } + dlen = zp->z_blksz; + rl = zfs_range_lock(zp, boff, dlen, RL_READER); + if (zp->z_blksz == dlen) + break; + zfs_range_unlock(rl); + } + /* test for truncation needs to be done while range locked */ + if (off >= zp->z_phys->zp_size) { + error = ENOENT; + goto out; + } + zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_rl = rl; + zgd->zgd_zilog = zfsvfs->z_log; + zgd->zgd_bp = &lr->lr_blkptr; + VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); + ASSERT(boff == db->db_offset); + lr->lr_blkoff = off - boff; + error = dmu_sync(zio, db, &lr->lr_blkptr, + lr->lr_common.lrc_txg, zfs_get_done, zgd); + ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz); + if (error == 0) { + zil_add_vdev(zfsvfs->z_log, + DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr))); + } + /* + * If we get EINPROGRESS, then we need to wait for a + * write IO initiated by dmu_sync() to complete before + * we can release this dbuf. We will finish everything + * up in the zfs_get_done() callback. + */ + if (error == EINPROGRESS) + return (0); + dmu_buf_rele(db, zgd); + kmem_free(zgd, sizeof (zgd_t)); + } +out: + zfs_range_unlock(rl); + VN_RELE(ZTOV(zp)); + return (error); +} + +/*ARGSUSED*/ +static int +zfs_access(ap) + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + error = zfs_zaccess_rwx(zp, ap->a_mode, ap->a_cred); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Lookup an entry in a directory, or an extended attribute directory. + * If it exists, return a held vnode reference for it. + * + * IN: dvp - vnode of directory to search. + * nm - name of entry to lookup. + * pnp - full pathname to lookup [UNUSED]. + * flags - LOOKUP_XATTR set if looking for an attribute. + * rdir - root directory vnode [UNUSED]. + * cr - credentials of caller. + * + * OUT: vpp - vnode of located entry, NULL if not found. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * NA + */ +/* ARGSUSED */ +int +zfs_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + kthread_t *td = ap->a_cnp->cn_thread; + struct componentname *cnp = ap->a_cnp; + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + u_long flags = cnp->cn_flags; + cred_t *cr = cnp->cn_cred; + int nameiop = cnp->cn_nameiop; + char nm[NAME_MAX + 1]; + znode_t *zdp = VTOZ(dvp); + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + int error; + + ASSERT(cnp->cn_namelen < sizeof(nm)); + strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); + + ZFS_ENTER(zfsvfs); + + *vpp = NULL; + +#ifdef TODO + if (flags & LOOKUP_XATTR) { + /* + * If the xattr property is off, refuse the lookup request. + */ + if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * We don't allow recursive attributes.. + * Maybe someday we will. + */ + if (zdp->z_phys->zp_flags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Do we have permission to get into attribute directory? + */ + + if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) { + VN_RELE(*vpp); + } + + ZFS_EXIT(zfsvfs); + return (error); + } +#endif /* TODO */ + + if (dvp->v_type != VDIR) { + ZFS_EXIT(zfsvfs); + return (ENOTDIR); + } + + /* + * Check accessibility of directory. + */ + + if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_dirlook(zdp, nm, vpp); + + ZFS_EXIT(zfsvfs); + + /* Translate errors and add SAVENAME when needed. */ + if (cnp->cn_flags & ISLASTCN) { + switch (nameiop) { + case CREATE: + case RENAME: + if (error == ENOENT) { + error = EJUSTRETURN; + cnp->cn_flags |= SAVENAME; + break; + } + /* FALLTHROUGH */ + case DELETE: + if (error == 0) + cnp->cn_flags |= SAVENAME; + break; + } + } + if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) { + if (flags & ISDOTDOT) + VOP_UNLOCK(dvp, 0, td); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); + if (flags & ISDOTDOT) + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); + } + +#ifdef FREEBSD_NAMECACHE + /* + * Insert name into cache (as non-existent) if appropriate. + */ + if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) + cache_enter(dvp, *vpp, cnp); + /* + * Insert name into cache if appropriate. + */ + if (error == 0 && (cnp->cn_flags & MAKEENTRY)) { + if (!(cnp->cn_flags & ISLASTCN) || + (nameiop != DELETE && nameiop != RENAME)) { + cache_enter(dvp, *vpp, cnp); + } + } +#endif + + return (error); +} + +/* + * Attempt to create a new entry in a directory. If the entry + * already exists, truncate the file if permissible, else return + * an error. Return the vp of the created or trunc'd file. + * + * IN: dvp - vnode of directory to put new file entry in. + * name - name of new file entry. + * vap - attributes of new file. + * excl - flag indicating exclusive or non-exclusive mode. + * mode - mode to open file with. + * cr - credentials of caller. + * flag - large file flag [UNUSED]. + * + * OUT: vpp - vnode of created or trunc'd entry. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime updated if new entry created + * vp - ctime|mtime always, atime if new + */ +/* ARGSUSED */ +static int +zfs_create(ap) + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + vattr_t *vap = ap->a_vap; + cred_t *cr = ap->a_cnp->cn_cred; + char *name = ap->a_cnp->cn_nameptr; + znode_t *zp, *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + objset_t *os = zfsvfs->z_os; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + uint64_t zoid; + int mode; + + ASSERT(ap->a_cnp->cn_flags & SAVENAME); + + vattr_init_mask(vap); + mode = vap->va_mode & ALLPERMS; + + ZFS_ENTER(zfsvfs); + +top: + *vpp = NULL; + + if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr)) + vap->va_mode &= ~VSVTX; + + if (*name == '\0') { + /* + * Null component name refers to the directory itself. + */ + VN_HOLD(dvp); + zp = dzp; + dl = NULL; + error = 0; + } else { + /* possible VN_HOLD(zp) */ + if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) { + if (strcmp(name, "..") == 0) + error = EISDIR; + ZFS_EXIT(zfsvfs); + return (error); + } + } + + zoid = zp ? zp->z_id : -1ULL; + + if (zp == NULL) { + /* + * Create a new file object and update the directory + * to reference it. + */ + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { + goto out; + } + + /* + * We only support the creation of regular files in + * extended attribute directories. + */ + if ((dzp->z_phys->zp_flags & ZFS_XATTR) && + (vap->va_type != VREG)) { + error = EINVAL; + goto out; + } + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_bonus(tx, dzp->z_id); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, SPA_MAXBLOCKSIZE); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART && + zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); + ASSERT(zp->z_id == zoid); + (void) zfs_link_create(dl, zp, tx, ZNEW); + zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name); + dmu_tx_commit(tx); + } else { + /* + * A directory entry already exists for this name. + */ + + /* + * Can't open a directory for writing. + */ + if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { + error = EISDIR; + goto out; + } + /* + * Verify requested access to file. + */ + if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) { + goto out; + } + + mutex_enter(&dzp->z_lock); + dzp->z_seq++; + mutex_exit(&dzp->z_lock); + + /* + * Truncate regular files if requested. + */ + if ((ZTOV(zp)->v_type == VREG) && + (zp->z_phys->zp_size != 0) && + (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { + error = zfs_freesp(zp, 0, 0, mode, TRUE); + if (error == ERESTART && + zfsvfs->z_assign == TXG_NOWAIT) { + /* NB: we already did dmu_tx_wait() */ + zfs_dirent_unlock(dl); + VN_RELE(ZTOV(zp)); + goto top; + } + } + } +out: + + if (error == 0) { + *vpp = ZTOV(zp); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread); + } + + if (dl) + zfs_dirent_unlock(dl); + + if (error) { + if (zp) + VN_RELE(ZTOV(zp)); + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove an entry from a directory. + * + * IN: dvp - vnode of directory to remove entry from. + * name - name of entry to remove. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime + * vp - ctime (if nlink > 0) + */ +static int +zfs_remove(ap) + struct vop_remove_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + vnode_t *dvp = ap->a_dvp; + cred_t *cr = ap->a_cnp->cn_cred; + char *name = ap->a_cnp->cn_nameptr; + znode_t *zp, *dzp = VTOZ(dvp); +#if 0 + znode_t *xzp = NULL; +#endif + vnode_t *vp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t xattr_obj; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + boolean_t unlinked; + int error; + + ASSERT(ap->a_cnp->cn_flags & SAVENAME); + + ZFS_ENTER(zfsvfs); + +top: + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + vp = ZTOV(zp); + ASSERT(vp == ap->a_vp); + ASSERT(vp->v_type != VDIR); + /* Drop an extra reference from zfs_dirent_lock(). */ + VN_RELE(vp); + ASSERT(vp->v_count > 0); + + if (error = zfs_zaccess_delete(dzp, zp, cr)) + goto out; + + dnlc_remove(dvp, name); + + /* + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the vnode. So we dmu_tx_hold() the right things to + * allow for either case. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_bonus(tx, zp->z_id); + + /* are there any extended attributes? */ + if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { + /* XXX - do we need this if we are deleting? */ + dmu_tx_hold_bonus(tx, xattr_obj); + } + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Remove the directory entry. + */ + error = zfs_link_destroy(dl, zp, tx, 0, &unlinked); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (unlinked) + zfs_unlinked_add(zp, tx); + + zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name); + + dmu_tx_commit(tx); +out: + zfs_dirent_unlock(dl); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Create a new directory and insert it into dvp using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dvp - vnode of directory to add subdir to. + * dirname - name of new directory. + * vap - attributes of new directory. + * cr - credentials of caller. + * + * OUT: vpp - vnode of created directory. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime updated + * vp - ctime|mtime|atime updated + */ +static int +zfs_mkdir(ap) + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + vattr_t *vap = ap->a_vap; + cred_t *cr = ap->a_cnp->cn_cred; + char *dirname = ap->a_cnp->cn_nameptr; + znode_t *zp, *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + zfs_dirlock_t *dl; + uint64_t zoid = 0; + dmu_tx_t *tx; + int error; + + ASSERT(ap->a_cnp->cn_flags & SAVENAME); + ASSERT(vap->va_type == VDIR); + vattr_init_mask(vap); + + ZFS_ENTER(zfsvfs); + + if (dzp->z_phys->zp_flags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } +top: + *vpp = NULL; + + /* + * First make sure the new directory doesn't exist. + */ + if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) { + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Add a new entry to the directory. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, SPA_MAXBLOCKSIZE); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create new node. + */ + zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); + + /* + * Now put new name in parent dir. + */ + (void) zfs_link_create(dl, zp, tx, ZNEW); + + *vpp = ZTOV(zp); + + zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname); + dmu_tx_commit(tx); + + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread); + + zfs_dirent_unlock(dl); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Remove a directory subdir entry. If the current working + * directory is the same as the subdir to be removed, the + * remove will fail. + * + * IN: dvp - vnode of directory to remove from. + * name - name of directory to be removed. + * cwd - vnode of current working directory. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime updated + */ +static int +zfs_rmdir(ap) + struct vop_rmdir_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + vnode_t *dvp = ap->a_dvp; + cred_t *cr = ap->a_cnp->cn_cred; + char *name = ap->a_cnp->cn_nameptr; + znode_t *dzp = VTOZ(dvp); + znode_t *zp; + vnode_t *vp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + + ASSERT(ap->a_cnp->cn_flags & SAVENAME); + + ZFS_ENTER(zfsvfs); +top: + zp = NULL; + + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + vp = ZTOV(zp); + ASSERT(vp == ap->a_vp); + ASSERT(vp->v_type == VDIR); + /* Drop an extra reference from zfs_dirent_lock(). */ + VN_RELE(vp); + ASSERT(vp->v_count > 0); + + if (error = zfs_zaccess_delete(dzp, zp, cr)) { + goto out; + } + + /* + * Grab a lock on the directory to make sure that noone is + * trying to add (or lookup) entries while we are removing it. + */ + rw_enter(&zp->z_name_lock, RW_WRITER); + + /* + * Grab a lock on the parent pointer to make sure we play well + * with the treewalk and directory rename code. + */ + rw_enter(&zp->z_parent_lock, RW_WRITER); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); + zfs_dirent_unlock(dl); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + +#ifdef FREEBSD_NAMECACHE + cache_purge(dvp); +#endif + + error = zfs_link_destroy(dl, zp, tx, 0, NULL); + + if (error == 0) + zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name); + + dmu_tx_commit(tx); + + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); +#ifdef FREEBSD_NAMECACHE + cache_purge(vp); +#endif +out: + zfs_dirent_unlock(dl); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Read as many directory entries as will fit into the provided + * buffer from the given directory cursor position (specified in + * the uio structure. + * + * IN: vp - vnode of directory to read. + * uio - structure supplying read location, range info, + * and return buffer. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range, buffer filled. + * eofp - set to true if end-of-file detected. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - atime updated + * + * Note that the low 4 bits of the cookie returned by zap is always zero. + * This allows us to use the low range for "special" directory entries: + * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, + * we use the offset 2 for the '.zfs' directory. + */ +/* ARGSUSED */ +static int +zfs_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + int *a_eofflag; + int *ncookies; + u_long **a_cookies; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + uio_t *uio = ap->a_uio; + int *eofp = ap->a_eofflag; + znode_t *zp = VTOZ(vp); + iovec_t *iovp; + dirent64_t *odp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os; + caddr_t outbuf; + size_t bufsize; + zap_cursor_t zc; + zap_attribute_t zap; + uint_t bytes_wanted; + uint64_t offset; /* must be unsigned; checks for < 1 */ + int local_eof; + int outcount; + int error; + uint8_t prefetch; + uint8_t type; + int ncookies; + u_long *cookies = NULL; + + ZFS_ENTER(zfsvfs); + + /* + * If we are not given an eof variable, + * use a local one. + */ + if (eofp == NULL) + eofp = &local_eof; + + /* + * Check for valid iov_len. + */ + if (uio->uio_iov->iov_len <= 0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * Quit if directory has been removed (posix) + */ + if ((*eofp = zp->z_unlinked) != 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + error = 0; + os = zfsvfs->z_os; + offset = uio->uio_loffset; + prefetch = zp->z_zn_prefetch; + + /* + * Initialize the iterator cursor. + */ + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, os, zp->z_id, offset); + } + + /* + * Get space to change directory entries into fs independent format. + */ + iovp = uio->uio_iov; + bytes_wanted = iovp->iov_len; + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { + bufsize = bytes_wanted; + outbuf = kmem_alloc(bufsize, KM_SLEEP); + odp = (struct dirent64 *)outbuf; + } else { + bufsize = bytes_wanted; + odp = (struct dirent64 *)iovp->iov_base; + } + + if (ap->a_ncookies) { + /* + * Minimum entry size is dirent size and 1 byte for a file name. + */ + ncookies = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); + cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK); + *ap->a_cookies = cookies; + *ap->a_ncookies = ncookies; + } + + /* + * Transform to file-system independent format + */ + outcount = 0; + while (outcount < bytes_wanted) { + ino64_t objnum; + ushort_t reclen; + + /* + * Special case `.', `..', and `.zfs'. + */ + if (offset == 0) { + (void) strcpy(zap.za_name, "."); + objnum = zp->z_id; + } else if (offset == 1) { + (void) strcpy(zap.za_name, ".."); + objnum = zp->z_phys->zp_parent; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); + objnum = ZFSCTL_INO_ROOT; + } else { + /* + * Grab next entry. + */ + if (error = zap_cursor_retrieve(&zc, &zap)) { + if ((*eofp = (error == ENOENT)) != 0) + break; + else + goto update; + } + + if (zap.za_integer_length != 8 || + zap.za_num_integers != 1) { + cmn_err(CE_WARN, "zap_readdir: bad directory " + "entry, obj = %lld, offset = %lld\n", + (u_longlong_t)zp->z_id, + (u_longlong_t)offset); + error = ENXIO; + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + /* + * MacOS X can extract the object type here such as: + * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); + */ + type = ZFS_DIRENT_TYPE(zap.za_first_integer); + } + reclen = DIRENT64_RECLEN(strlen(zap.za_name)); + + /* + * Will this entry fit in the buffer? + */ + if (outcount + reclen > bufsize) { + /* + * Did we manage to fit anything in the buffer? + */ + if (!outcount) { + error = EINVAL; + goto update; + } + break; + } + /* + * Add this entry: + */ + odp->d_ino = objnum; + odp->d_reclen = reclen; + odp->d_namlen = strlen(zap.za_name); + (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); + odp->d_type = type; + outcount += reclen; + odp = (dirent64_t *)((intptr_t)odp + reclen); + + ASSERT(outcount <= bufsize); + + /* Prefetch znode */ + if (prefetch) + dmu_prefetch(os, objnum, 0, 0); + + /* + * Move to the next entry, fill in the previous offset. + */ + if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + + if (cookies != NULL) { + *cookies++ = offset; + ncookies--; + KASSERT(ncookies >= 0, ("ncookies=%d", ncookies)); + } + } + zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ + + /* Subtract unused cookies */ + if (ap->a_ncookies) + *ap->a_ncookies -= ncookies; + + if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { + iovp->iov_base += outcount; + iovp->iov_len -= outcount; + uio->uio_resid -= outcount; + } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { + /* + * Reset the pointer. + */ + offset = uio->uio_loffset; + } + +update: + zap_cursor_fini(&zc); + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) + kmem_free(outbuf, bufsize); + + if (error == ENOENT) + error = 0; + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + uio->uio_loffset = offset; + ZFS_EXIT(zfsvfs); + if (error != 0) { + free(*ap->a_cookies, M_TEMP); + *ap->a_cookies = NULL; + *ap->a_ncookies = 0; + } + return (error); +} + +static int +zfs_fsync(struct vop_fsync_args *ap) +{ + znode_t *zp = VTOZ(ap->a_vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + vop_stdfsync(ap); + + ZFS_ENTER(zfsvfs); + zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Get the requested file attributes and place them in the provided + * vattr structure. + * + * IN: vp - vnode of file. + * vap - va_mask identifies requested attributes. + * flags - [UNUSED] + * cr - credentials of caller. + * + * OUT: vap - attribute values. + * + * RETURN: 0 (always succeeds) + */ +/* ARGSUSED */ +static int +zfs_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + cred_t *cr = ap->a_cred; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_phys_t *pzp = zp->z_phys; + uint32_t blksize; + u_longlong_t nblocks; + int error; + + ZFS_ENTER(zfsvfs); + + /* + * Return all attributes. It's cheaper to provide the answer + * than to determine whether we were asked the question. + */ + mutex_enter(&zp->z_lock); + + vap->va_type = IFTOVT(pzp->zp_mode); + vap->va_mode = pzp->zp_mode & ~S_IFMT; + vap->va_uid = zp->z_phys->zp_uid; + vap->va_gid = zp->z_phys->zp_gid; + vap->va_nodeid = zp->z_id; + vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX); /* nlink_t limit! */ + vap->va_size = pzp->zp_size; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; + vap->va_seq = zp->z_seq; + vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ + + ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); + ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); + ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); + ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime); + + /* + * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. + * Also, if we are the owner don't bother, since owner should + * always be allowed to read basic attributes of file. + */ + if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) && + (zp->z_phys->zp_uid != crgetuid(cr))) { + if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) { + mutex_exit(&zp->z_lock); + ZFS_EXIT(zfsvfs); + return (error); + } + } + + mutex_exit(&zp->z_lock); + + dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks); + vap->va_blksize = blksize; + vap->va_bytes = nblocks << 9; /* nblocks * 512 */ + + if (zp->z_blksz == 0) { + /* + * Block size hasn't been set; suggest maximal I/O transfers. + */ + vap->va_blksize = zfsvfs->z_max_blksz; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: vp - vnode of file to be modified. + * vap - new attribute values. + * flags - ATTR_UTIME set if non-default time values provided. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - ctime updated, mtime updated if size changed. + */ +/* ARGSUSED */ +static int +zfs_setattr(ap) + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + cred_t *cr = ap->a_cred; + znode_t *zp = VTOZ(vp); + znode_phys_t *pzp = zp->z_phys; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + dmu_tx_t *tx; + vattr_t oldva; + uint_t mask; + uint_t saved_mask; + int trim_mask = 0; + uint64_t new_mode; + znode_t *attrzp; + int need_policy = FALSE; + int flags = 0; + int err; + + /* No support for FreeBSD's chflags(2). */ + if (vap->va_flags != VNOVAL) + return (EOPNOTSUPP); + + vattr_init_mask(vap); + mask = vap->va_mask; + + if (mask == 0) + return (0); + + if (mask & AT_SIZE && vp->v_type == VDIR) + return (EISDIR); + + if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) + return (EINVAL); + + ZFS_ENTER(zfsvfs); + +top: + attrzp = NULL; + + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { + ZFS_EXIT(zfsvfs); + return (EROFS); + } + + /* + * First validate permissions + */ + + if (mask & AT_SIZE) { + err = zfs_zaccess(zp, ACE_WRITE_DATA, cr); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + /* + * XXX - Note, we are not providing any open + * mode flags here (like FNDELAY), so we may + * block if there are locks present... this + * should be addressed in openat(). + */ + do { + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + /* NB: we already did dmu_tx_wait() if necessary */ + } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + if (mask & (AT_ATIME|AT_MTIME)) + need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr); + + if (mask & (AT_UID|AT_GID)) { + int idmask = (mask & (AT_UID|AT_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. + */ + + if (!(mask & AT_MODE)) + vap->va_mode = pzp->zp_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr); + + /* + * If both AT_UID and AT_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. + * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || + ((idmask == AT_UID) && take_owner) || + ((idmask == AT_GID) && take_group)) { + if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + secpolicy_setid_clear(vap, cr); + trim_mask = (mask & (AT_UID|AT_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + mutex_enter(&zp->z_lock); + oldva.va_mode = pzp->zp_mode; + oldva.va_uid = zp->z_phys->zp_uid; + oldva.va_gid = zp->z_phys->zp_gid; + mutex_exit(&zp->z_lock); + + if (mask & AT_MODE) { + if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) { + err = secpolicy_setid_setsticky_clear(vp, vap, + &oldva, cr); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + trim_mask |= AT_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. + */ + + if (trim_mask) { + saved_mask = vap->va_mask; + vap->va_mask &= ~trim_mask; + + } + err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + + if (trim_mask) + vap->va_mask |= saved_mask; + } + + /* + * secpolicy_vnode_setattr, or take ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + + if (mask & AT_MODE) { + uint64_t pmode = pzp->zp_mode; + + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + if (zp->z_phys->zp_acl.z_acl_extern_obj) + dmu_tx_hold_write(tx, + pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE); + else + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, ZFS_ACL_SIZE(MAX_ACL_SIZE)); + } + + if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) { + err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp); + if (err) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (err); + } + dmu_tx_hold_bonus(tx, attrzp->z_id); + } + + err = dmu_tx_assign(tx, zfsvfs->z_assign); + if (err) { + if (attrzp) + VN_RELE(ZTOV(attrzp)); + if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (err); + } + + dmu_buf_will_dirty(zp->z_dbuf, tx); + + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. + */ + + mutex_enter(&zp->z_lock); + + if (mask & AT_MODE) { + err = zfs_acl_chmod_setattr(zp, new_mode, tx); + ASSERT3U(err, ==, 0); + } + + if (attrzp) + mutex_enter(&attrzp->z_lock); + + if (mask & AT_UID) { + zp->z_phys->zp_uid = (uint64_t)vap->va_uid; + if (attrzp) { + attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid; + } + } + + if (mask & AT_GID) { + zp->z_phys->zp_gid = (uint64_t)vap->va_gid; + if (attrzp) + attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid; + } + + if (attrzp) + mutex_exit(&attrzp->z_lock); + + if (mask & AT_ATIME) + ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); + + if (mask & AT_MTIME) + ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); + + if (mask & AT_SIZE) + zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); + else if (mask != 0) + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask); + + mutex_exit(&zp->z_lock); + + if (attrzp) + VN_RELE(ZTOV(attrzp)); + + dmu_tx_commit(tx); + + ZFS_EXIT(zfsvfs); + return (err); +} + +typedef struct zfs_zlock { + krwlock_t *zl_rwlock; /* lock we acquired */ + znode_t *zl_znode; /* znode we held */ + struct zfs_zlock *zl_next; /* next in list */ +} zfs_zlock_t; + +/* + * Drop locks and release vnodes that were held by zfs_rename_lock(). + */ +static void +zfs_rename_unlock(zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + + while ((zl = *zlpp) != NULL) { + if (zl->zl_znode != NULL) + VN_RELE(ZTOV(zl->zl_znode)); + rw_exit(zl->zl_rwlock); + *zlpp = zl->zl_next; + kmem_free(zl, sizeof (*zl)); + } +} + +/* + * Search back through the directory tree, using the ".." entries. + * Lock each directory in the chain to prevent concurrent renames. + * Fail any attempt to move a directory into one of its own descendants. + * XXX - z_parent_lock can overlap with map or grow locks + */ +static int +zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + znode_t *zp = tdzp; + uint64_t rootid = zp->z_zfsvfs->z_root; + uint64_t *oidp = &zp->z_id; + krwlock_t *rwlp = &szp->z_parent_lock; + krw_t rw = RW_WRITER; + + /* + * First pass write-locks szp and compares to zp->z_id. + * Later passes read-lock zp and compare to zp->z_parent. + */ + do { + if (!rw_tryenter(rwlp, rw)) { + /* + * Another thread is renaming in this path. + * Note that if we are a WRITER, we don't have any + * parent_locks held yet. + */ + if (rw == RW_READER && zp->z_id > szp->z_id) { + /* + * Drop our locks and restart + */ + zfs_rename_unlock(&zl); + *zlpp = NULL; + zp = tdzp; + oidp = &zp->z_id; + rwlp = &szp->z_parent_lock; + rw = RW_WRITER; + continue; + } else { + /* + * Wait for other thread to drop its locks + */ + rw_enter(rwlp, rw); + } + } + + zl = kmem_alloc(sizeof (*zl), KM_SLEEP); + zl->zl_rwlock = rwlp; + zl->zl_znode = NULL; + zl->zl_next = *zlpp; + *zlpp = zl; + + if (*oidp == szp->z_id) /* We're a descendant of szp */ + return (EINVAL); + + if (*oidp == rootid) /* We've hit the top */ + return (0); + + if (rw == RW_READER) { /* i.e. not the first pass */ + int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp); + if (error) + return (error); + zl->zl_znode = zp; + } + oidp = &zp->z_phys->zp_parent; + rwlp = &zp->z_parent_lock; + rw = RW_READER; + + } while (zp->z_id != sdzp->z_id); + + return (0); +} + +/* + * Move an entry from the provided source directory to the target + * directory. Change the entry name as indicated. + * + * IN: sdvp - Source directory containing the "old entry". + * snm - Old entry name. + * tdvp - Target directory to contain the "new entry". + * tnm - New entry name. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * sdvp,tdvp - ctime|mtime updated + */ +static int +zfs_rename(ap) + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap; +{ + vnode_t *svp = ap->a_fvp; + vnode_t *tvp = ap->a_tvp; + vnode_t *sdvp = ap->a_fdvp; + vnode_t *tdvp = ap->a_tdvp; + char *snm = ap->a_fcnp->cn_nameptr; + char *tnm = ap->a_tcnp->cn_nameptr; + cred_t *cr = ap->a_fcnp->cn_cred; + znode_t *tdzp, *szp, *tzp; + znode_t *sdzp = VTOZ(sdvp); + zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + zfs_dirlock_t *sdl, *tdl; + dmu_tx_t *tx; + zfs_zlock_t *zl; + int serr, terr, error; + + ASSERT(ap->a_fcnp->cn_flags & SAVENAME); + ASSERT(ap->a_tcnp->cn_flags & SAVENAME); + + ZFS_ENTER(zfsvfs); + + if (tdvp->v_vfsp != sdvp->v_vfsp) { + error = EXDEV; +abort: + ZFS_EXIT(zfsvfs); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + vrele(sdvp); + vrele(svp); + return (error); + } + + tdzp = VTOZ(tdvp); +top: + szp = NULL; + tzp = NULL; + zl = NULL; + + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/outof an attribute directory. + * See the comment in zfs_link() for why this is considered bad. + */ + if ((tdzp->z_phys->zp_flags & ZFS_XATTR) != + (sdzp->z_phys->zp_flags & ZFS_XATTR)) { + error = EINVAL; + goto abort; + } + + /* + * POSIX: "If the old argument and the new argument + * both refer to links to the same existing file, + * the rename() function shall return successfully + * and perform no other action." + * This case is already handled in kern_rename(). + */ + ASSERT(svp != tvp); + + /* + * Lock source and target directory entries. To prevent deadlock, + * a lock ordering must be defined. We lock the directory with + * the smallest object id first, or if it's a tie, the one with + * the lexically first name. + */ + if (sdzp->z_id < tdzp->z_id) { + serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); + terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); + } else /* if (sdzp->z_id > tdzp->z_id) */ { + terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); + serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); + } + if (!terr && tzp != NULL) { + if (tvp) + vrele(tvp); + else { + ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org"); + tvp = ZTOV(tzp); + vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, curthread); + } + ASSERT(tvp->v_count > 0); + } + + if (serr || svp != ZTOV(szp)) { + /* + * Source entry invalid or not there. + */ + if (!serr) + zfs_dirent_unlock(sdl); + if (!terr) + zfs_dirent_unlock(tdl); + if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) + serr = EINVAL; + error = serr; + goto abort; + } + if (terr) { + zfs_dirent_unlock(sdl); + if (strcmp(tnm, "..") == 0) + terr = EINVAL; + error = terr; + goto abort; + } + /* Remove an extra reference from zfs_dirent_lock(). */ + vrele(svp); + ASSERT(svp->v_count > 0); + + /* + * Must have write access at the source to remove the old entry + * and write access at the target to create the new entry. + * Note that if target and source are the same, this can be + * done in a single check. + */ + + if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) + goto out; + + if (svp->v_type == VDIR) { + /* + * Check to make sure rename is valid. + * Can't do a move like this: /usr/a/b to /usr/a/b/c/d + */ + if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) + goto out; + } + + /* + * Does target exist? + */ + if (tzp) { + /* + * Source and target must be the same type. + */ + if (svp->v_type == VDIR) { + if (tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + } else { + if (tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + /* + * POSIX dictates that when the source and target + * entries refer to the same file object, rename + * must do nothing and exit without error. + */ + if (szp->z_id == tzp->z_id) { + error = 0; + goto out; + } + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ + dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ + dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); + if (sdzp != tdzp) + dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ + if (tzp) + dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (zl != NULL) + zfs_rename_unlock(&zl); + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + if (ap->a_tvp == NULL && tvp != NULL) { + vput(tvp); + tvp = NULL; + } + goto top; + } + dmu_tx_abort(tx); + goto abort; + } + + if (tzp) /* Attempt to remove the existing target */ + error = zfs_link_destroy(tdl, tzp, tx, 0, NULL); + + if (error == 0) { + error = zfs_link_create(tdl, szp, tx, ZRENAMING); + if (error == 0) { + error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); + ASSERT(error == 0); + zfs_log_rename(zilog, tx, TX_RENAME, sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp); + } +#ifdef FREEBSD_NAMECACHE + if (error == 0) { + cache_purge(sdvp); + cache_purge(tdvp); + } +#endif + } + + dmu_tx_commit(tx); +out: + if (zl != NULL) + zfs_rename_unlock(&zl); + + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + + if (tzp) + vput(tvp); + vput(tdvp); + vrele(sdvp); + vrele(svp); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Insert the indicated symbolic reference entry into the directory. + * + * IN: dvp - Directory to contain new symbolic link. + * link - Name for new symlink entry. + * vap - Attributes of new entry. + * target - Target path of new symlink. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime updated + */ +static int +zfs_symlink(ap) + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap; +{ + vnode_t *dvp = ap->a_dvp; + vattr_t *vap = ap->a_vap; + cred_t *cr = ap->a_cnp->cn_cred; + char *name = ap->a_cnp->cn_nameptr; + char *link = ap->a_target; + znode_t *zp, *dzp = VTOZ(dvp); + zfs_dirlock_t *dl; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t zoid; + int len = strlen(link); + int error; + + ASSERT(ap->a_cnp->cn_flags & SAVENAME); + + vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ + vattr_init_mask(vap); + + ZFS_ENTER(zfsvfs); +top: + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (len > MAXPATHLEN) { + ZFS_EXIT(zfsvfs); + return (ENAMETOOLONG); + } + + /* + * Attempt to lock directory; fail if entry already exists. + */ + if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); + dmu_tx_hold_bonus(tx, dzp->z_id); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + dmu_buf_will_dirty(dzp->z_dbuf, tx); + + /* + * Create a new object for the symlink. + * Put the link content into bonus buffer if it will fit; + * otherwise, store it just like any other file data. + */ + zoid = 0; + if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { + zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len); + if (len != 0) + bcopy(link, zp->z_phys + 1, len); + } else { + dmu_buf_t *dbp; + + zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); + + /* + * Nothing can access the znode yet so no locking needed + * for growing the znode's blocksize. + */ + zfs_grow_blocksize(zp, len, tx); + + VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp)); + dmu_buf_will_dirty(dbp, tx); + + ASSERT3U(len, <=, dbp->db_size); + bcopy(link, dbp->db_data, len); + dmu_buf_rele(dbp, FTAG); + } + zp->z_phys->zp_size = len; + + /* + * Insert the new object into the directory. + */ + (void) zfs_link_create(dl, zp, tx, ZNEW); + + if (error == 0) { + zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link); + + *ap->a_vpp = ZTOV(zp); + + vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY, + ap->a_cnp->cn_thread); + } + + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Return, in the buffer contained in the provided uio structure, + * the symbolic path referred to by vp. + * + * IN: vp - vnode of symbolic link. + * uoip - structure to contain the link path. + * cr - credentials of caller. + * + * OUT: uio - structure to contain the link path. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - atime updated + */ +/* ARGSUSED */ +static int +zfs_readlink(ap) + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + uio_t *uio = ap->a_uio; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + size_t bufsz; + int error; + + ZFS_ENTER(zfsvfs); + + bufsz = (size_t)zp->z_phys->zp_size; + if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { + error = uiomove(zp->z_phys + 1, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + } else { + dmu_buf_t *dbp; + error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + error = uiomove(dbp->db_data, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + dmu_buf_rele(dbp, FTAG); + } + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Insert a new entry into directory tdvp referencing svp. + * + * IN: tdvp - Directory to contain new entry. + * svp - vnode of new entry. + * name - name of new entry. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * tdvp - ctime|mtime updated + * svp - ctime updated + */ +/* ARGSUSED */ +static int +zfs_link(ap) + struct vop_link_args /* { + struct vnode *a_tdvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + vnode_t *svp = ap->a_vp; + vnode_t *tdvp = ap->a_tdvp; + cred_t *cr = ap->a_cnp->cn_cred; + char *name = ap->a_cnp->cn_nameptr; + znode_t *dzp = VTOZ(tdvp); + znode_t *tzp, *szp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + + ASSERT(ap->a_cnp->cn_flags & SAVENAME); + ASSERT(tdvp->v_type == VDIR); + + ZFS_ENTER(zfsvfs); + + if (svp->v_vfsp != tdvp->v_vfsp) { + ZFS_EXIT(zfsvfs); + return (EXDEV); + } + + szp = VTOZ(svp); +top: + /* + * We do not support links between attributes and non-attributes + * because of the potential security risk of creating links + * into "normal" file space in order to circumvent restrictions + * imposed in attribute space. + */ + if ((szp->z_phys->zp_flags & ZFS_XATTR) != + (dzp->z_phys->zp_flags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * POSIX dictates that we return EPERM here. + * Better choices include ENOTSUP or EISDIR. + */ + if (svp->v_type == VDIR) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) && + secpolicy_basic_link(cr) != 0) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Attempt to lock directory; fail if entry already exists. + */ + if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, szp->z_id); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_create(dl, szp, tx, 0); + + if (error == 0) + zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name); + + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + ZFS_EXIT(zfsvfs); + return (error); +} + +static int +zfs_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + /* + * Destroy the vm object and flush associated pages. + */ + vnode_destroy_vobject(vp); + + rw_enter(&zfsvfs->z_um_lock, RW_READER); + if (zfsvfs->z_unmounted2) { + ASSERT(zp->z_dbuf_held == 0); + + mutex_enter(&zp->z_lock); + VI_LOCK(vp); + vp->v_count = 0; /* count arrives as 1 */ + VI_UNLOCK(vp); + if (zp->z_dbuf == NULL) { + mutex_exit(&zp->z_lock); + zfs_znode_free(zp); + } else { + mutex_exit(&zp->z_lock); + } + rw_exit(&zfsvfs->z_um_lock); + VFS_RELE(zfsvfs->z_vfs); + return (0); + } + + if (zp->z_atime_dirty && zp->z_unlinked == 0) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_bonus(tx, zp->z_id); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_buf_will_dirty(zp->z_dbuf, tx); + mutex_enter(&zp->z_lock); + zp->z_atime_dirty = 0; + mutex_exit(&zp->z_lock); + dmu_tx_commit(tx); + } + } + + zfs_zinactive(zp); + rw_exit(&zfsvfs->z_um_lock); + return (0); +} + +static int +zfs_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + + if (zp != NULL) + mutex_enter(&zp->z_lock); +#if 0 /* + * We do it from zfs_inactive(), because after zfs_inactive() we can't + * VOP_WRITE() to the vnode. + */ + /* + * Destroy the vm object and flush associated pages. + */ + vnode_destroy_vobject(vp); +#endif + VI_LOCK(vp); + vp->v_data = NULL; + if (zp != NULL) + zp->z_vnode = NULL; + ASSERT(vp->v_holdcnt > 1); + vdropl(vp); + if (zp != NULL) + mutex_exit(&zp->z_lock); + return (0); +} + +CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); +CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); + +static int +zfs_fid(ap) + struct vop_fid_args /* { + struct vnode *a_vp; + struct fid *a_fid; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + fid_t *fidp = (void *)ap->a_fid; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint32_t gen = (uint32_t)zp->z_phys->zp_gen; + uint64_t object = zp->z_id; + zfid_short_t *zfid; + int size, i; + + ZFS_ENTER(zfsvfs); + + size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; + fidp->fid_len = size; + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = size; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* Must have a non-zero generation number to distinguish from .zfs */ + if (gen == 0) + gen = 1; + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); + + if (size == LONG_FID_LEN) { + uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); + zfid_long_t *zlfid; + + zlfid = (zfid_long_t *)fidp; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); + + /* XXX - this should be the generation number for the objset */ + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + zlfid->zf_setgen[i] = 0; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + int cmd = ap->a_name; + int *valp = ap->a_retval; +#if 0 + vnode_t *vp = ap->a_vp; + znode_t *zp, *xzp; + zfsvfs_t *zfsvfs; + zfs_dirlock_t *dl; + int error; +#endif + + switch (cmd) { + case _PC_LINK_MAX: + *valp = INT_MAX; + return (0); + + case _PC_FILESIZEBITS: + *valp = 64; + return (0); + +#if 0 + case _PC_XATTR_EXISTS: + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + *valp = 0; + error = zfs_dirent_lock(&dl, zp, "", &xzp, + ZXATTR | ZEXISTS | ZSHARED); + if (error == 0) { + zfs_dirent_unlock(dl); + if (!zfs_dirempty(xzp)) + *valp = 1; + VN_RELE(ZTOV(xzp)); + } else if (error == ENOENT) { + /* + * If there aren't extended attributes, it's the + * same as having zero of them. + */ + error = 0; + } + ZFS_EXIT(zfsvfs); + return (error); +#endif + + case _PC_ACL_EXTENDED: + *valp = 0; /* TODO */ + return (0); + + case _PC_MIN_HOLE_SIZE: + *valp = (int)SPA_MINBLOCKSIZE; + return (0); + + default: + return (vop_stdpathconf(ap)); + } +} + +#ifdef TODO +/*ARGSUSED*/ +static int +zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + error = zfs_getacl(zp, vsecp, cr); + ZFS_EXIT(zfsvfs); + + return (error); +} +#endif /* TODO */ + +#ifdef TODO +/*ARGSUSED*/ +static int +zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + error = zfs_setacl(zp, vsecp, cr); + ZFS_EXIT(zfsvfs); + return (error); +} +#endif /* TODO */ + +/* + * Advisory record locking support + */ +static int +zfs_advlock(ap) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; +{ + znode_t *zp = VTOZ(ap->a_vp); + + return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size)); +} + +struct vop_vector zfs_vnodeops; +struct vop_vector zfs_fifoops; + +struct vop_vector zfs_vnodeops = { + .vop_default = &default_vnodeops, + .vop_inactive = zfs_inactive, + .vop_reclaim = zfs_reclaim, + .vop_access = zfs_access, +#ifdef FREEBSD_NAMECACHE + .vop_lookup = vfs_cache_lookup, + .vop_cachedlookup = zfs_lookup, +#else + .vop_lookup = zfs_lookup, +#endif + .vop_getattr = zfs_getattr, + .vop_setattr = zfs_setattr, + .vop_create = zfs_create, + .vop_mknod = zfs_create, + .vop_mkdir = zfs_mkdir, + .vop_readdir = zfs_readdir, + .vop_fsync = zfs_fsync, + .vop_open = zfs_open, + .vop_close = zfs_close, + .vop_rmdir = zfs_rmdir, + .vop_ioctl = zfs_ioctl, + .vop_link = zfs_link, + .vop_symlink = zfs_symlink, + .vop_readlink = zfs_readlink, + .vop_read = zfs_read, + .vop_write = zfs_write, + .vop_remove = zfs_remove, + .vop_rename = zfs_rename, + .vop_advlock = zfs_advlock, + .vop_pathconf = zfs_pathconf, + .vop_bmap = VOP_EOPNOTSUPP, + .vop_fid = zfs_fid, +}; + +struct vop_vector zfs_fifoops = { + .vop_default = &fifo_specops, + .vop_fsync = VOP_PANIC, + .vop_access = zfs_access, + .vop_getattr = zfs_getattr, + .vop_inactive = zfs_inactive, + .vop_read = VOP_PANIC, + .vop_reclaim = zfs_reclaim, + .vop_setattr = zfs_setattr, + .vop_write = VOP_PANIC, + .vop_fid = zfs_fid, +}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c new file mode 100644 index 0000000..d2806b9 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -0,0 +1,1061 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef _KERNEL +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/mntent.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/atomic.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_rlock.h> +#include <sys/fs/zfs.h> +#endif /* _KERNEL */ + +#include <sys/dmu.h> +#include <sys/refcount.h> +#include <sys/stat.h> +#include <sys/zap.h> +#include <sys/zfs_znode.h> +#include <sys/refcount.h> + +/* + * Functions needed for userland (ie: libzpool) are not put under + * #ifdef_KERNEL; the rest of the functions have dependencies + * (such as VFS logic) that will not compile easily in userland. + */ +#ifdef _KERNEL +struct kmem_cache *znode_cache = NULL; + +/*ARGSUSED*/ +static void +znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr) +{ + znode_t *zp = user_ptr; + vnode_t *vp = ZTOV(zp); + + mutex_enter(&zp->z_lock); + if (vp == NULL) { + mutex_exit(&zp->z_lock); + zfs_znode_free(zp); + } else if (vp->v_count == 0) { + ZTOV(zp) = NULL; + mutex_exit(&zp->z_lock); + vhold(vp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + vrecycle(vp, curthread); + VOP_UNLOCK(vp, 0, curthread); + vdrop(vp); + zfs_znode_free(zp); + } else { + /* signal force unmount that this znode can be freed */ + zp->z_dbuf = NULL; + mutex_exit(&zp->z_lock); + } +} + +extern struct vop_vector zfs_vnodeops; +extern struct vop_vector zfs_fifoops; + +/* + * XXX: We cannot use this function as a cache constructor, because + * there is one global cache for all file systems and we need + * to pass vfsp here, which is not possible, because argument + * 'cdrarg' is defined at kmem_cache_create() time. + */ +static int +zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags) +{ + znode_t *zp = buf; + vfs_t *vfsp = cdrarg; + int error; + + if (cdrarg != NULL) { + error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &zp->z_vnode); + ASSERT(error == 0); + zp->z_vnode->v_data = (caddr_t)zp; + vhold(zp->z_vnode); + } else { + zp->z_vnode = NULL; + } + mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); + + mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&zp->z_range_avl, zfs_range_compare, + sizeof (rl_t), offsetof(rl_t, r_node)); + + zp->z_dbuf_held = 0; + zp->z_dirlocks = 0; + zp->z_lockf = NULL; + return (0); +} + +/*ARGSUSED*/ +static void +zfs_znode_cache_destructor(void *buf, void *cdarg) +{ + znode_t *zp = buf; + + ASSERT(zp->z_dirlocks == 0); + mutex_destroy(&zp->z_lock); + rw_destroy(&zp->z_map_lock); + rw_destroy(&zp->z_parent_lock); + rw_destroy(&zp->z_name_lock); + mutex_destroy(&zp->z_acl_lock); + mutex_destroy(&zp->z_range_lock); + avl_destroy(&zp->z_range_avl); + + ASSERT(zp->z_dbuf_held == 0); +} + +void +zfs_znode_init(void) +{ + /* + * Initialize zcache + */ + ASSERT(znode_cache == NULL); + znode_cache = kmem_cache_create("zfs_znode_cache", + sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL, + zfs_znode_cache_destructor, NULL, NULL, NULL, 0); +} + +void +zfs_znode_fini(void) +{ + /* + * Cleanup zcache + */ + if (znode_cache) + kmem_cache_destroy(znode_cache); + znode_cache = NULL; +} + +/* + * zfs_init_fs - Initialize the zfsvfs struct and the file system + * incore "master" object. Verify version compatibility. + */ +int +zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) +{ + objset_t *os = zfsvfs->z_os; + uint64_t version = ZPL_VERSION; + int i, error; + dmu_object_info_t doi; + uint64_t fsid_guid; + + *zpp = NULL; + + /* + * XXX - hack to auto-create the pool root filesystem at + * the first attempted mount. + */ + if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) { + dmu_tx_t *tx = dmu_tx_create(os); + + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */ + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */ + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */ + error = dmu_tx_assign(tx, TXG_WAIT); + ASSERT3U(error, ==, 0); + zfs_create_fs(os, cr, tx); + dmu_tx_commit(tx); + } + + error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1, + &version); + if (error) { + return (error); + } else if (version != ZPL_VERSION) { + (void) printf("Mismatched versions: File system " + "is version %lld on-disk format, which is " + "incompatible with this software version %lld!", + (u_longlong_t)version, ZPL_VERSION); + return (ENOTSUP); + } + + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + fsid_guid = dmu_objset_fsid_guid(os); + ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); + zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid; + zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | + zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF; + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error) + return (error); + ASSERT(zfsvfs->z_root != 0); + + /* + * Create the per mount vop tables. + */ + + /* + * Initialize zget mutex's + */ + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); + if (error) + return (error); + ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error) + return (error); + + return (0); +} + +/* + * define a couple of values we need available + * for both 64 and 32 bit environments. + */ +#ifndef NBITSMINOR64 +#define NBITSMINOR64 32 +#endif +#ifndef MAXMAJ64 +#define MAXMAJ64 0xffffffffUL +#endif +#ifndef MAXMIN64 +#define MAXMIN64 0xffffffffUL +#endif + +/* + * Create special expldev for ZFS private use. + * Can't use standard expldev since it doesn't do + * what we want. The standard expldev() takes a + * dev32_t in LP64 and expands it to a long dev_t. + * We need an interface that takes a dev32_t in ILP32 + * and expands it to a long dev_t. + */ +static uint64_t +zfs_expldev(dev_t dev) +{ + return ((uint64_t)0); +} +/* + * Special cmpldev for ZFS private use. + * Can't use standard cmpldev since it takes + * a long dev_t and compresses it to dev32_t in + * LP64. We need to do a compaction of a long dev_t + * to a dev32_t in ILP32. + */ +dev_t +zfs_cmpldev(uint64_t dev) +{ + return ((dev_t)0); +} + +/* + * Construct a new znode/vnode and intialize. + * + * This does not do a call to dmu_set_user() that is + * up to the caller to do, in case you don't want to + * return the znode + */ +static znode_t * +zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz) +{ + znode_t *zp; + vnode_t *vp; + int error; + + zp = kmem_cache_alloc(znode_cache, KM_SLEEP); + zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0); + + ASSERT(zp->z_dirlocks == NULL); + + zp->z_phys = db->db_data; + zp->z_zfsvfs = zfsvfs; + zp->z_unlinked = 0; + zp->z_atime_dirty = 0; + zp->z_dbuf_held = 0; + zp->z_mapcnt = 0; + zp->z_last_itx = 0; + zp->z_dbuf = db; + zp->z_id = obj_num; + zp->z_blksz = blksz; + zp->z_seq = 0x7A4653; + zp->z_sync_cnt = 0; + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + mutex_exit(&zfsvfs->z_znodes_lock); + + vp = ZTOV(zp); + if (vp == NULL) + return (zp); + + error = insmntque(vp, zfsvfs->z_vfs); + KASSERT(error == 0, ("insmntque() failed: error %d", error)); + + vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); + switch (vp->v_type) { + case VDIR: + zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ + break; + case VFIFO: + vp->v_op = &zfs_fifoops; + break; + } + + return (zp); +} + +static void +zfs_znode_dmu_init(znode_t *zp) +{ + znode_t *nzp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_buf_t *db = zp->z_dbuf; + + mutex_enter(&zp->z_lock); + + nzp = dmu_buf_set_user(db, zp, &zp->z_phys, znode_pageout_func); + + /* + * there should be no + * concurrent zgets on this object. + */ + ASSERT3P(nzp, ==, NULL); + + /* + * Slap on VROOT if we are the root znode + */ + if (zp->z_id == zfsvfs->z_root) { + ZTOV(zp)->v_flag |= VROOT; + } + + ASSERT(zp->z_dbuf_held == 0); + zp->z_dbuf_held = 1; + VFS_HOLD(zfsvfs->z_vfs); + mutex_exit(&zp->z_lock); +} + +/* + * Create a new DMU object to hold a zfs znode. + * + * IN: dzp - parent directory for new znode + * vap - file attributes for new znode + * tx - dmu transaction id for zap operations + * cr - credentials of caller + * flag - flags: + * IS_ROOT_NODE - new object will be root + * IS_XATTR - new object is an attribute + * IS_REPLAY - intent log replay + * + * OUT: oid - ID of created object + * + */ +void +zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, int bonuslen) +{ + dmu_buf_t *dbp; + znode_phys_t *pzp; + znode_t *zp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + timestruc_t now; + uint64_t gen; + int err; + + ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); + + if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ + *oid = vap->va_nodeid; + flag |= IS_REPLAY; + now = vap->va_ctime; /* see zfs_replay_create() */ + gen = vap->va_nblocks; /* ditto */ + } else { + *oid = 0; + gethrestime(&now); + gen = dmu_tx_get_txg(tx); + } + + /* + * Create a new DMU object. + */ + /* + * There's currently no mechanism for pre-reading the blocks that will + * be to needed allocate a new object, so we accept the small chance + * that there will be an i/o error and we will fail one of the + * assertions below. + */ + if (vap->va_type == VDIR) { + if (flag & IS_REPLAY) { + err = zap_create_claim(zfsvfs->z_os, *oid, + DMU_OT_DIRECTORY_CONTENTS, + DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + ASSERT3U(err, ==, 0); + } else { + *oid = zap_create(zfsvfs->z_os, + DMU_OT_DIRECTORY_CONTENTS, + DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + } + } else { + if (flag & IS_REPLAY) { + err = dmu_object_claim(zfsvfs->z_os, *oid, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + ASSERT3U(err, ==, 0); + } else { + *oid = dmu_object_alloc(zfsvfs->z_os, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + } + } + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp)); + dmu_buf_will_dirty(dbp, tx); + + /* + * Initialize the znode physical data to zero. + */ + ASSERT(dbp->db_size >= sizeof (znode_phys_t)); + bzero(dbp->db_data, dbp->db_size); + pzp = dbp->db_data; + + /* + * If this is the root, fix up the half-initialized parent pointer + * to reference the just-allocated physical data area. + */ + if (flag & IS_ROOT_NODE) { + dzp->z_phys = pzp; + dzp->z_id = *oid; + } + + /* + * If parent is an xattr, so am I. + */ + if (dzp->z_phys->zp_flags & ZFS_XATTR) + flag |= IS_XATTR; + + if (vap->va_type == VBLK || vap->va_type == VCHR) { + pzp->zp_rdev = zfs_expldev(vap->va_rdev); + } + + if (vap->va_type == VDIR) { + pzp->zp_size = 2; /* contents ("." and "..") */ + pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; + } + + pzp->zp_parent = dzp->z_id; + if (flag & IS_XATTR) + pzp->zp_flags |= ZFS_XATTR; + + pzp->zp_gen = gen; + + ZFS_TIME_ENCODE(&now, pzp->zp_crtime); + ZFS_TIME_ENCODE(&now, pzp->zp_ctime); + + if (vap->va_mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); + } else { + ZFS_TIME_ENCODE(&now, pzp->zp_atime); + } + + if (vap->va_mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); + } else { + ZFS_TIME_ENCODE(&now, pzp->zp_mtime); + } + + pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); + zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0); + + zfs_perm_init(zp, dzp, flag, vap, tx, cr); + + if (zpp) { + kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp); + + mutex_enter(hash_mtx); + zfs_znode_dmu_init(zp); + mutex_exit(hash_mtx); + + *zpp = zp; + } else { + if (ZTOV(zp) != NULL) + ZTOV(zp)->v_count = 0; + dmu_buf_rele(dbp, NULL); + zfs_znode_free(zp); + } +} + +int +zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) +{ + dmu_object_info_t doi; + dmu_buf_t *db; + znode_t *zp; + vnode_t *vp; + int err; + + *zpp = NULL; + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); + + err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_ZNODE || + doi.doi_bonus_size < sizeof (znode_phys_t)) { + dmu_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (EINVAL); + } + + ASSERT(db->db_object == obj_num); + ASSERT(db->db_offset == -1); + ASSERT(db->db_data != NULL); + + zp = dmu_buf_get_user(db); + + if (zp != NULL) { + mutex_enter(&zp->z_lock); + + ASSERT3U(zp->z_id, ==, obj_num); + if (zp->z_unlinked) { + dmu_buf_rele(db, NULL); + mutex_exit(&zp->z_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (ENOENT); + } else if (zp->z_dbuf_held) { + dmu_buf_rele(db, NULL); + } else { + zp->z_dbuf_held = 1; + VFS_HOLD(zfsvfs->z_vfs); + } + + if (ZTOV(zp) != NULL) + VN_HOLD(ZTOV(zp)); + else { + err = getnewvnode("zfs", zfsvfs->z_vfs, &zfs_vnodeops, + &zp->z_vnode); + ASSERT(err == 0); + vp = ZTOV(zp); + vp->v_data = (caddr_t)zp; + vhold(vp); + vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); + if (vp->v_type == VDIR) + zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ + err = insmntque(vp, zfsvfs->z_vfs); + KASSERT(err == 0, ("insmntque() failed: error %d", err)); + } + mutex_exit(&zp->z_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + *zpp = zp; + return (0); + } + + /* + * Not found create new znode/vnode + */ + zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size); + ASSERT3U(zp->z_id, ==, obj_num); + zfs_znode_dmu_init(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + *zpp = zp; + return (0); +} + +void +zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); + if (zp->z_phys->zp_acl.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + zp->z_phys->zp_acl.z_acl_extern_obj, tx); + ASSERT3U(error, ==, 0); + } + error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx); + ASSERT3U(error, ==, 0); + zp->z_dbuf_held = 0; + ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); + dmu_buf_rele(zp->z_dbuf, NULL); +} + +void +zfs_zinactive(znode_t *zp) +{ + vnode_t *vp = ZTOV(zp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t z_id = zp->z_id; + + ASSERT(zp->z_dbuf_held && zp->z_phys); + + /* + * Don't allow a zfs_zget() while were trying to release this znode + */ + ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); + + mutex_enter(&zp->z_lock); + VI_LOCK(vp); + if (vp->v_count > 0) { + /* + * If the hold count is greater than zero, somebody has + * obtained a new reference on this znode while we were + * processing it here, so we are done. + */ + VI_UNLOCK(vp); + mutex_exit(&zp->z_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + return; + } + VI_UNLOCK(vp); + + /* + * If this was the last reference to a file with no links, + * remove the file from the file system. + */ + if (zp->z_unlinked) { + ZTOV(zp) = NULL; + mutex_exit(&zp->z_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + ASSERT(vp->v_count == 0); + vrecycle(vp, curthread); + zfs_rmnode(zp); + VFS_RELE(zfsvfs->z_vfs); + return; + } + ASSERT(zp->z_phys); + ASSERT(zp->z_dbuf_held); + + zp->z_dbuf_held = 0; + mutex_exit(&zp->z_lock); + dmu_buf_rele(zp->z_dbuf, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + VFS_RELE(zfsvfs->z_vfs); +} + +/* + * FreeBSD: Should be called from ->vop_reclaim(). + */ +void +zfs_znode_free(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + mutex_enter(&zfsvfs->z_znodes_lock); + list_remove(&zfsvfs->z_all_znodes, zp); + mutex_exit(&zfsvfs->z_znodes_lock); + + kmem_cache_free(znode_cache, zp); +} + +void +zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) +{ + timestruc_t now; + + ASSERT(MUTEX_HELD(&zp->z_lock)); + + gethrestime(&now); + + if (tx) { + dmu_buf_will_dirty(zp->z_dbuf, tx); + zp->z_atime_dirty = 0; + zp->z_seq++; + } else { + zp->z_atime_dirty = 1; + } + + if (flag & AT_ATIME) + ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); + + if (flag & AT_MTIME) + ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); + + if (flag & AT_CTIME) + ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); +} + +/* + * Update the requested znode timestamps with the current time. + * If we are in a transaction, then go ahead and mark the znode + * dirty in the transaction so the timestamps will go to disk. + * Otherwise, we will get pushed next time the znode is updated + * in a transaction, or when this znode eventually goes inactive. + * + * Why is this OK? + * 1 - Only the ACCESS time is ever updated outside of a transaction. + * 2 - Multiple consecutive updates will be collapsed into a single + * znode update by the transaction grouping semantics of the DMU. + */ +void +zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) +{ + mutex_enter(&zp->z_lock); + zfs_time_stamper_locked(zp, flag, tx); + mutex_exit(&zp->z_lock); +} + +/* + * Grow the block size for a file. + * + * IN: zp - znode of file to free data in. + * size - requested block size + * tx - open transaction. + * + * NOTE: this function assumes that the znode is write locked. + */ +void +zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) +{ + int error; + u_longlong_t dummy; + + if (size <= zp->z_blksz) + return; + /* + * If the file size is already greater than the current blocksize, + * we will not grow. If there is more than one block in a file, + * the blocksize cannot change. + */ + if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) + return; + + error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, + size, 0, tx); + if (error == ENOTSUP) + return; + ASSERT3U(error, ==, 0); + + /* What blocksize did we actually get? */ + dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); +} + +/* + * Free space in a file. + * + * IN: zp - znode of file to free data in. + * off - start of section to free. + * len - length of section to free (0 => to EOF). + * flag - current file open mode flags. + * + * RETURN: 0 if success + * error code if failure + */ +int +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +{ + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + rl_t *rl; + uint64_t end = off + len; + uint64_t size, new_blksz; + int error; + + if (ZTOV(zp)->v_type == VFIFO) + return (0); + + /* + * If we will change zp_size then lock the whole file, + * otherwise just lock the range being freed. + */ + if (len == 0 || off + len > zp->z_phys->zp_size) { + rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + } else { + rl = zfs_range_lock(zp, off, len, RL_WRITER); + /* recheck, in case zp_size changed */ + if (off + len > zp->z_phys->zp_size) { + /* lost race: file size changed, lock whole file */ + zfs_range_unlock(rl); + rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + } + } + + /* + * Nothing to do if file already at desired length. + */ + size = zp->z_phys->zp_size; + if (len == 0 && size == off) { + zfs_range_unlock(rl); + return (0); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + new_blksz = 0; + if (end > size && + (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { + /* + * We are growing the file past the current block size. + */ + if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { + ASSERT(!ISP2(zp->z_blksz)); + new_blksz = MIN(end, SPA_MAXBLOCKSIZE); + } else { + new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz); + } + dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz)); + } else if (off < size) { + /* + * If len == 0, we are truncating the file. + */ + dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END); + } + + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) + dmu_tx_wait(tx); + dmu_tx_abort(tx); + zfs_range_unlock(rl); + return (error); + } + + if (new_blksz) + zfs_grow_blocksize(zp, new_blksz, tx); + + if (end > size || len == 0) + zp->z_phys->zp_size = end; + + if (off < size) { + objset_t *os = zfsvfs->z_os; + uint64_t rlen = len; + + if (len == 0) + rlen = -1; + else if (end > size) + rlen = size - off; + VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx)); + } + + if (log) { + zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + } + + zfs_range_unlock(rl); + + dmu_tx_commit(tx); + + /* + * Clear any mapped pages in the truncated region. This has to + * happen outside of the transaction to avoid the possibility of + * a deadlock with someone trying to push a page that we are + * about to invalidate. + */ + rw_enter(&zp->z_map_lock, RW_WRITER); + if (end > size) + vnode_pager_setsize(vp, end); + else if (len == 0) { +#if 0 + error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE); +#else + error = vinvalbuf(vp, V_SAVE, curthread, 0, 0); + vnode_pager_setsize(vp, end); +#endif + } + rw_exit(&zp->z_map_lock); + + return (0); +} + +void +zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) +{ + zfsvfs_t zfsvfs; + uint64_t moid, doid, roid = 0; + uint64_t version = ZPL_VERSION; + int error; + znode_t *rootzp = NULL; + vattr_t vattr; + + /* + * First attempt to create master node. + */ + /* + * In an empty objset, there are no blocks to read and thus + * there can be no i/o errors (which we assert below). + */ + moid = MASTER_NODE_OBJ; + error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + /* + * Set starting attributes. + */ + + error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx); + ASSERT(error == 0); + + /* + * Create a delete queue. + */ + doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx); + ASSERT(error == 0); + + /* + * Create root znode. Create minimal znode/vnode/zfsvfs + * to allow zfs_mknode to work. + */ + vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0755; + vattr.va_uid = UID_ROOT; + vattr.va_gid = GID_WHEEL; + + rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); + zfs_znode_cache_constructor(rootzp, NULL, 0); + rootzp->z_zfsvfs = &zfsvfs; + rootzp->z_unlinked = 0; + rootzp->z_atime_dirty = 0; + rootzp->z_dbuf_held = 0; + + bzero(&zfsvfs, sizeof (zfsvfs_t)); + + zfsvfs.z_os = os; + zfsvfs.z_assign = TXG_NOWAIT; + zfsvfs.z_parent = &zfsvfs; + + mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0); + ASSERT3U(rootzp->z_id, ==, roid); + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx); + ASSERT(error == 0); + + kmem_cache_free(znode_cache, rootzp); +} +#endif /* _KERNEL */ + +/* + * Given an object number, return its parent object number and whether + * or not the object is an extended attribute directory. + */ +static int +zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) +{ + dmu_buf_t *db; + dmu_object_info_t doi; + znode_phys_t *zp; + int error; + + if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) + return (error); + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_ZNODE || + doi.doi_bonus_size < sizeof (znode_phys_t)) { + dmu_buf_rele(db, FTAG); + return (EINVAL); + } + + zp = db->db_data; + *pobjp = zp->zp_parent; + *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && + S_ISDIR(zp->zp_mode); + dmu_buf_rele(db, FTAG); + + return (0); +} + +int +zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) +{ + char *path = buf + len - 1; + int error; + + *path = '\0'; + + for (;;) { + uint64_t pobj; + char component[MAXNAMELEN + 2]; + size_t complen; + int is_xattrdir; + + if ((error = zfs_obj_to_pobj(osp, obj, &pobj, + &is_xattrdir)) != 0) + break; + + if (pobj == obj) { + if (path[0] != '/') + *--path = '/'; + break; + } + + component[0] = '/'; + if (is_xattrdir) { + (void) sprintf(component + 1, "<xattrdir>"); + } else { + error = zap_value_search(osp, pobj, obj, component + 1); + if (error != 0) + break; + } + + complen = strlen(component); + path -= complen; + ASSERT(path >= buf); + bcopy(component, path, complen); + obj = pobj; + } + + if (error == 0) + (void) memmove(buf, path, buf + len - path); + return (error); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c new file mode 100644 index 0000000..6b52b55 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -0,0 +1,1607 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <sys/arc.h> +#include <sys/stat.h> +#include <sys/resource.h> +#include <sys/zil.h> +#include <sys/zil_impl.h> +#include <sys/dsl_dataset.h> +#include <sys/vdev.h> +#include <sys/dmu_tx.h> + +/* + * The zfs intent log (ZIL) saves transaction records of system calls + * that change the file system in memory with enough information + * to be able to replay them. These are stored in memory until + * either the DMU transaction group (txg) commits them to the stable pool + * and they can be discarded, or they are flushed to the stable log + * (also in the pool) due to a fsync, O_DSYNC or other synchronous + * requirement. In the event of a panic or power fail then those log + * records (transactions) are replayed. + * + * There is one ZIL per file system. Its on-disk (pool) format consists + * of 3 parts: + * + * - ZIL header + * - ZIL blocks + * - ZIL records + * + * A log record holds a system call transaction. Log blocks can + * hold many log records and the blocks are chained together. + * Each ZIL block contains a block pointer (blkptr_t) to the next + * ZIL block in the chain. The ZIL header points to the first + * block in the chain. Note there is not a fixed place in the pool + * to hold blocks. They are dynamically allocated and freed as + * needed from the blocks available. Figure X shows the ZIL structure: + */ + +/* + * This global ZIL switch affects all pools + */ +int zil_disable = 0; /* disable intent logging */ +SYSCTL_DECL(_vfs_zfs); +TUNABLE_INT("vfs.zfs.zil_disable", &zil_disable); +SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_disable, CTLFLAG_RDTUN, &zil_disable, 0, + "Disable ZFS Intent Log (ZIL)"); + +/* + * Tunable parameter for debugging or performance analysis. Setting + * zfs_nocacheflush will cause corruption on power loss if a volatile + * out-of-order write cache is enabled. + */ +boolean_t zfs_nocacheflush = B_FALSE; +TUNABLE_INT("vfs.zfs.cache_flush_disable", &zfs_nocacheflush); +SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN, + &zfs_nocacheflush, 0, "Disable cache flush"); + +static kmem_cache_t *zil_lwb_cache; + +static int +zil_dva_compare(const void *x1, const void *x2) +{ + const dva_t *dva1 = x1; + const dva_t *dva2 = x2; + + if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) + return (-1); + if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) + return (1); + + if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) + return (-1); + if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) + return (1); + + return (0); +} + +static void +zil_dva_tree_init(avl_tree_t *t) +{ + avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t), + offsetof(zil_dva_node_t, zn_node)); +} + +static void +zil_dva_tree_fini(avl_tree_t *t) +{ + zil_dva_node_t *zn; + void *cookie = NULL; + + while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(zn, sizeof (zil_dva_node_t)); + + avl_destroy(t); +} + +static int +zil_dva_tree_add(avl_tree_t *t, dva_t *dva) +{ + zil_dva_node_t *zn; + avl_index_t where; + + if (avl_find(t, dva, &where) != NULL) + return (EEXIST); + + zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP); + zn->zn_dva = *dva; + avl_insert(t, zn, where); + + return (0); +} + +static zil_header_t * +zil_header_in_syncing_context(zilog_t *zilog) +{ + return ((zil_header_t *)zilog->zl_header); +} + +static void +zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) +{ + zio_cksum_t *zc = &bp->blk_cksum; + + zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); + zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); + zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); + zc->zc_word[ZIL_ZC_SEQ] = 1ULL; +} + +/* + * Read a log block, make sure it's valid, and byteswap it if necessary. + */ +static int +zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp) +{ + blkptr_t blk = *bp; + zbookmark_t zb; + uint32_t aflags = ARC_WAIT; + int error; + + zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET]; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; + + *abufpp = NULL; + + error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array, + arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb); + + if (error == 0) { + char *data = (*abufpp)->b_data; + uint64_t blksz = BP_GET_LSIZE(bp); + zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1; + zio_cksum_t cksum = bp->blk_cksum; + + /* + * Sequence numbers should be... sequential. The checksum + * verifier for the next block should be bp's checksum plus 1. + */ + cksum.zc_word[ZIL_ZC_SEQ]++; + + if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum))) + error = ESTALE; + else if (BP_IS_HOLE(&ztp->zit_next_blk)) + error = ENOENT; + else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) + error = EOVERFLOW; + + if (error) { + VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1); + *abufpp = NULL; + } + } + + dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid); + + return (error); +} + +/* + * Parse the intent log, and call parse_func for each valid record within. + * Return the highest sequence number. + */ +uint64_t +zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, + zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) +{ + const zil_header_t *zh = zilog->zl_header; + uint64_t claim_seq = zh->zh_claim_seq; + uint64_t seq = 0; + uint64_t max_seq = 0; + blkptr_t blk = zh->zh_log; + arc_buf_t *abuf; + char *lrbuf, *lrp; + zil_trailer_t *ztp; + int reclen, error; + + if (BP_IS_HOLE(&blk)) + return (max_seq); + + /* + * Starting at the block pointed to by zh_log we read the log chain. + * For each block in the chain we strongly check that block to + * ensure its validity. We stop when an invalid block is found. + * For each block pointer in the chain we call parse_blk_func(). + * For each record in each valid block we call parse_lr_func(). + * If the log has been claimed, stop if we encounter a sequence + * number greater than the highest claimed sequence number. + */ + zil_dva_tree_init(&zilog->zl_dva_tree); + for (;;) { + seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; + + if (claim_seq != 0 && seq > claim_seq) + break; + + ASSERT(max_seq < seq); + max_seq = seq; + + error = zil_read_log_block(zilog, &blk, &abuf); + + if (parse_blk_func != NULL) + parse_blk_func(zilog, &blk, arg, txg); + + if (error) + break; + + lrbuf = abuf->b_data; + ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1; + blk = ztp->zit_next_blk; + + if (parse_lr_func == NULL) { + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + continue; + } + + for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) { + lr_t *lr = (lr_t *)lrp; + reclen = lr->lrc_reclen; + ASSERT3U(reclen, >=, sizeof (lr_t)); + parse_lr_func(zilog, lr, arg, txg); + } + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + } + zil_dva_tree_fini(&zilog->zl_dva_tree); + + return (max_seq); +} + +/* ARGSUSED */ +static void +zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) +{ + spa_t *spa = zilog->zl_spa; + int err; + + /* + * Claim log block if not already committed and not already claimed. + */ + if (bp->blk_birth >= first_txg && + zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) { + err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL)); + ASSERT(err == 0); + } +} + +static void +zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) +{ + if (lrc->lrc_txtype == TX_WRITE) { + lr_write_t *lr = (lr_write_t *)lrc; + zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg); + } +} + +/* ARGSUSED */ +static void +zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) +{ + zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx)); +} + +static void +zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) +{ + /* + * If we previously claimed it, we need to free it. + */ + if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) { + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + if (bp->blk_birth >= claim_txg && + !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) { + (void) arc_free(NULL, zilog->zl_spa, + dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT); + } + } +} + +/* + * Create an on-disk intent log. + */ +static void +zil_create(zilog_t *zilog) +{ + const zil_header_t *zh = zilog->zl_header; + lwb_t *lwb; + uint64_t txg = 0; + dmu_tx_t *tx = NULL; + blkptr_t blk; + int error = 0; + + /* + * Wait for any previous destroy to complete. + */ + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + + ASSERT(zh->zh_claim_txg == 0); + ASSERT(zh->zh_replay_seq == 0); + + blk = zh->zh_log; + + /* + * If we don't already have an initial log block, allocate one now. + */ + if (BP_IS_HOLE(&blk)) { + tx = dmu_tx_create(zilog->zl_os); + (void) dmu_tx_assign(tx, TXG_WAIT); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + + error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk, + NULL, txg); + + if (error == 0) + zil_init_log_chain(zilog, &blk); + } + + /* + * Allocate a log write buffer (lwb) for the first log block. + */ + if (error == 0) { + lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); + lwb->lwb_zilog = zilog; + lwb->lwb_blk = blk; + lwb->lwb_nused = 0; + lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk); + lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz); + lwb->lwb_max_txg = txg; + lwb->lwb_zio = NULL; + + mutex_enter(&zilog->zl_lock); + list_insert_tail(&zilog->zl_lwb_list, lwb); + mutex_exit(&zilog->zl_lock); + } + + /* + * If we just allocated the first log block, commit our transaction + * and wait for zil_sync() to stuff the block poiner into zh_log. + * (zh is part of the MOS, so we cannot modify it in open context.) + */ + if (tx != NULL) { + dmu_tx_commit(tx); + txg_wait_synced(zilog->zl_dmu_pool, txg); + } + + ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); +} + +/* + * In one tx, free all log blocks and clear the log header. + * If keep_first is set, then we're replaying a log with no content. + * We want to keep the first block, however, so that the first + * synchronous transaction doesn't require a txg_wait_synced() + * in zil_create(). We don't need to txg_wait_synced() here either + * when keep_first is set, because both zil_create() and zil_destroy() + * will wait for any in-progress destroys to complete. + */ +void +zil_destroy(zilog_t *zilog, boolean_t keep_first) +{ + const zil_header_t *zh = zilog->zl_header; + lwb_t *lwb; + dmu_tx_t *tx; + uint64_t txg; + + /* + * Wait for any previous destroy to complete. + */ + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + + if (BP_IS_HOLE(&zh->zh_log)) + return; + + tx = dmu_tx_create(zilog->zl_os); + (void) dmu_tx_assign(tx, TXG_WAIT); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + + mutex_enter(&zilog->zl_lock); + + ASSERT3U(zilog->zl_destroy_txg, <, txg); + zilog->zl_destroy_txg = txg; + zilog->zl_keep_first = keep_first; + + if (!list_is_empty(&zilog->zl_lwb_list)) { + ASSERT(zh->zh_claim_txg == 0); + ASSERT(!keep_first); + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { + list_remove(&zilog->zl_lwb_list, lwb); + if (lwb->lwb_buf != NULL) + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg); + kmem_cache_free(zil_lwb_cache, lwb); + } + } else { + if (!keep_first) { + (void) zil_parse(zilog, zil_free_log_block, + zil_free_log_record, tx, zh->zh_claim_txg); + } + } + mutex_exit(&zilog->zl_lock); + + dmu_tx_commit(tx); + + if (keep_first) /* no need to wait in this case */ + return; + + txg_wait_synced(zilog->zl_dmu_pool, txg); + ASSERT(BP_IS_HOLE(&zh->zh_log)); +} + +int +zil_claim(char *osname, void *txarg) +{ + dmu_tx_t *tx = txarg; + uint64_t first_txg = dmu_tx_get_txg(tx); + zilog_t *zilog; + zil_header_t *zh; + objset_t *os; + int error; + + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os); + if (error) { + cmn_err(CE_WARN, "can't process intent log for %s", osname); + return (0); + } + + zilog = dmu_objset_zil(os); + zh = zil_header_in_syncing_context(zilog); + + /* + * Claim all log blocks if we haven't already done so, and remember + * the highest claimed sequence number. This ensures that if we can + * read only part of the log now (e.g. due to a missing device), + * but we can read the entire log later, we will not try to replay + * or destroy beyond the last block we successfully claimed. + */ + ASSERT3U(zh->zh_claim_txg, <=, first_txg); + if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { + zh->zh_claim_txg = first_txg; + zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block, + zil_claim_log_record, tx, first_txg); + dsl_dataset_dirty(dmu_objset_ds(os), tx); + } + + ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); + dmu_objset_close(os); + return (0); +} + +void +zil_add_vdev(zilog_t *zilog, uint64_t vdev) +{ + zil_vdev_t *zv, *new; + uint64_t bmap_sz = sizeof (zilog->zl_vdev_bmap) << 3; + uchar_t *cp; + + if (zfs_nocacheflush) + return; + + if (vdev < bmap_sz) { + cp = zilog->zl_vdev_bmap + (vdev / 8); + atomic_or_8(cp, 1 << (vdev % 8)); + } else { + /* + * insert into ordered list + */ + mutex_enter(&zilog->zl_lock); + for (zv = list_head(&zilog->zl_vdev_list); zv != NULL; + zv = list_next(&zilog->zl_vdev_list, zv)) { + if (zv->vdev == vdev) { + /* duplicate found - just return */ + mutex_exit(&zilog->zl_lock); + return; + } + if (zv->vdev > vdev) { + /* insert before this entry */ + new = kmem_alloc(sizeof (zil_vdev_t), + KM_SLEEP); + new->vdev = vdev; + list_insert_before(&zilog->zl_vdev_list, + zv, new); + mutex_exit(&zilog->zl_lock); + return; + } + } + /* ran off end of list, insert at the end */ + ASSERT(zv == NULL); + new = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP); + new->vdev = vdev; + list_insert_tail(&zilog->zl_vdev_list, new); + mutex_exit(&zilog->zl_lock); + } +} + +/* start an async flush of the write cache for this vdev */ +void +zil_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio) +{ + vdev_t *vd; + + if (*zio == NULL) + *zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + vd = vdev_lookup_top(spa, vdev); + ASSERT(vd); + + (void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE, + NULL, NULL, ZIO_PRIORITY_NOW, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); +} + +void +zil_flush_vdevs(zilog_t *zilog) +{ + zil_vdev_t *zv; + zio_t *zio = NULL; + spa_t *spa = zilog->zl_spa; + uint64_t vdev; + uint8_t b; + int i, j; + + ASSERT(zilog->zl_writer); + + for (i = 0; i < sizeof (zilog->zl_vdev_bmap); i++) { + b = zilog->zl_vdev_bmap[i]; + if (b == 0) + continue; + for (j = 0; j < 8; j++) { + if (b & (1 << j)) { + vdev = (i << 3) + j; + zil_flush_vdev(spa, vdev, &zio); + } + } + zilog->zl_vdev_bmap[i] = 0; + } + + while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) { + zil_flush_vdev(spa, zv->vdev, &zio); + list_remove(&zilog->zl_vdev_list, zv); + kmem_free(zv, sizeof (zil_vdev_t)); + } + /* + * Wait for all the flushes to complete. Not all devices actually + * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails. + */ + if (zio) + (void) zio_wait(zio); +} + +/* + * Function called when a log block write completes + */ +static void +zil_lwb_write_done(zio_t *zio) +{ + lwb_t *lwb = zio->io_private; + zilog_t *zilog = lwb->lwb_zilog; + + /* + * Now that we've written this log block, we have a stable pointer + * to the next block in the chain, so it's OK to let the txg in + * which we allocated the next block sync. + */ + txg_rele_to_sync(&lwb->lwb_txgh); + + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + mutex_enter(&zilog->zl_lock); + lwb->lwb_buf = NULL; + if (zio->io_error) { + zilog->zl_log_error = B_TRUE; + mutex_exit(&zilog->zl_lock); + return; + } + mutex_exit(&zilog->zl_lock); +} + +/* + * Initialize the io for a log block. + * + * Note, we should not initialize the IO until we are about + * to use it, since zio_rewrite() does a spa_config_enter(). + */ +static void +zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) +{ + zbookmark_t zb; + + zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET]; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; + + if (zilog->zl_root_zio == NULL) { + zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + } + if (lwb->lwb_zio == NULL) { + lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, + ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf, + lwb->lwb_sz, zil_lwb_write_done, lwb, + ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + } +} + +/* + * Start a log block write and advance to the next log block. + * Calls are serialized. + */ +static lwb_t * +zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) +{ + lwb_t *nlwb; + zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1; + spa_t *spa = zilog->zl_spa; + blkptr_t *bp = &ztp->zit_next_blk; + uint64_t txg; + uint64_t zil_blksz; + int error; + + ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb)); + + /* + * Allocate the next block and save its address in this block + * before writing it in order to establish the log chain. + * Note that if the allocation of nlwb synced before we wrote + * the block that points at it (lwb), we'd leak it if we crashed. + * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done(). + */ + txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh); + txg_rele_to_quiesce(&lwb->lwb_txgh); + + /* + * Pick a ZIL blocksize. We request a size that is the + * maximum of the previous used size, the current used size and + * the amount waiting in the queue. + */ + zil_blksz = MAX(zilog->zl_prev_used, + zilog->zl_cur_used + sizeof (*ztp)); + zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp)); + zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t); + if (zil_blksz > ZIL_MAX_BLKSZ) + zil_blksz = ZIL_MAX_BLKSZ; + + BP_ZERO(bp); + /* pass the old blkptr in order to spread log blocks across devs */ + error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg); + if (error) { + dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg); + + /* + * We dirty the dataset to ensure that zil_sync() will + * be called to remove this lwb from our zl_lwb_list. + * Failing to do so, may leave an lwb with a NULL lwb_buf + * hanging around on the zl_lwb_list. + */ + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + dmu_tx_commit(tx); + + /* + * Since we've just experienced an allocation failure so we + * terminate the current lwb and send it on its way. + */ + ztp->zit_pad = 0; + ztp->zit_nused = lwb->lwb_nused; + ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; + zio_nowait(lwb->lwb_zio); + + /* + * By returning NULL the caller will call tx_wait_synced() + */ + return (NULL); + } + + ASSERT3U(bp->blk_birth, ==, txg); + ztp->zit_pad = 0; + ztp->zit_nused = lwb->lwb_nused; + ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; + bp->blk_cksum = lwb->lwb_blk.blk_cksum; + bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; + + /* + * Allocate a new log write buffer (lwb). + */ + nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); + + nlwb->lwb_zilog = zilog; + nlwb->lwb_blk = *bp; + nlwb->lwb_nused = 0; + nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk); + nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz); + nlwb->lwb_max_txg = txg; + nlwb->lwb_zio = NULL; + + /* + * Put new lwb at the end of the log chain + */ + mutex_enter(&zilog->zl_lock); + list_insert_tail(&zilog->zl_lwb_list, nlwb); + mutex_exit(&zilog->zl_lock); + + /* Record the vdev for later flushing */ + zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk)))); + + /* + * kick off the write for the old log block + */ + dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg); + ASSERT(lwb->lwb_zio); + zio_nowait(lwb->lwb_zio); + + return (nlwb); +} + +static lwb_t * +zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) +{ + lr_t *lrc = &itx->itx_lr; /* common log record */ + lr_write_t *lr = (lr_write_t *)lrc; + uint64_t txg = lrc->lrc_txg; + uint64_t reclen = lrc->lrc_reclen; + uint64_t dlen; + + if (lwb == NULL) + return (NULL); + ASSERT(lwb->lwb_buf != NULL); + + if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) + dlen = P2ROUNDUP_TYPED( + lr->lr_length, sizeof (uint64_t), uint64_t); + else + dlen = 0; + + zilog->zl_cur_used += (reclen + dlen); + + zil_lwb_write_init(zilog, lwb); + + /* + * If this record won't fit in the current log block, start a new one. + */ + if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) { + lwb = zil_lwb_write_start(zilog, lwb); + if (lwb == NULL) + return (NULL); + zil_lwb_write_init(zilog, lwb); + ASSERT(lwb->lwb_nused == 0); + if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) { + txg_wait_synced(zilog->zl_dmu_pool, txg); + return (lwb); + } + } + + /* + * Update the lrc_seq, to be log record sequence number. See zil.h + * Then copy the record to the log buffer. + */ + lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */ + bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen); + + /* + * If it's a write, fetch the data or get its blkptr as appropriate. + */ + if (lrc->lrc_txtype == TX_WRITE) { + if (txg > spa_freeze_txg(zilog->zl_spa)) + txg_wait_synced(zilog->zl_dmu_pool, txg); + if (itx->itx_wr_state != WR_COPIED) { + char *dbuf; + int error; + + /* alignment is guaranteed */ + lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused); + if (dlen) { + ASSERT(itx->itx_wr_state == WR_NEED_COPY); + dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen; + lr->lr_common.lrc_reclen += dlen; + } else { + ASSERT(itx->itx_wr_state == WR_INDIRECT); + dbuf = NULL; + } + error = zilog->zl_get_data( + itx->itx_private, lr, dbuf, lwb->lwb_zio); + if (error) { + ASSERT(error == ENOENT || error == EEXIST || + error == EALREADY); + return (lwb); + } + } + } + + lwb->lwb_nused += reclen + dlen; + lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); + ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb)); + ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0); + + return (lwb); +} + +itx_t * +zil_itx_create(int txtype, size_t lrsize) +{ + itx_t *itx; + + lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); + + itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); + itx->itx_lr.lrc_txtype = txtype; + itx->itx_lr.lrc_reclen = lrsize; + itx->itx_lr.lrc_seq = 0; /* defensive */ + + return (itx); +} + +uint64_t +zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) +{ + uint64_t seq; + + ASSERT(itx->itx_lr.lrc_seq == 0); + + mutex_enter(&zilog->zl_lock); + list_insert_tail(&zilog->zl_itx_list, itx); + zilog->zl_itx_list_sz += itx->itx_lr.lrc_reclen; + itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); + itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq; + mutex_exit(&zilog->zl_lock); + + return (seq); +} + +/* + * Free up all in-memory intent log transactions that have now been synced. + */ +static void +zil_itx_clean(zilog_t *zilog) +{ + uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa); + uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa); + list_t clean_list; + itx_t *itx; + + list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); + + mutex_enter(&zilog->zl_lock); + /* wait for a log writer to finish walking list */ + while (zilog->zl_writer) { + cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); + } + + /* + * Move the sync'd log transactions to a separate list so we can call + * kmem_free without holding the zl_lock. + * + * There is no need to set zl_writer as we don't drop zl_lock here + */ + while ((itx = list_head(&zilog->zl_itx_list)) != NULL && + itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) { + list_remove(&zilog->zl_itx_list, itx); + zilog->zl_itx_list_sz -= itx->itx_lr.lrc_reclen; + list_insert_tail(&clean_list, itx); + } + cv_broadcast(&zilog->zl_cv_writer); + mutex_exit(&zilog->zl_lock); + + /* destroy sync'd log transactions */ + while ((itx = list_head(&clean_list)) != NULL) { + list_remove(&clean_list, itx); + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); + } + list_destroy(&clean_list); +} + +/* + * If there are any in-memory intent log transactions which have now been + * synced then start up a taskq to free them. + */ +void +zil_clean(zilog_t *zilog) +{ + itx_t *itx; + + mutex_enter(&zilog->zl_lock); + itx = list_head(&zilog->zl_itx_list); + if ((itx != NULL) && + (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) { + (void) taskq_dispatch(zilog->zl_clean_taskq, + (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP); + } + mutex_exit(&zilog->zl_lock); +} + +void +zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) +{ + uint64_t txg; + uint64_t reclen; + uint64_t commit_seq = 0; + itx_t *itx, *itx_next = (itx_t *)-1; + lwb_t *lwb; + spa_t *spa; + + zilog->zl_writer = B_TRUE; + zilog->zl_root_zio = NULL; + spa = zilog->zl_spa; + + if (zilog->zl_suspend) { + lwb = NULL; + } else { + lwb = list_tail(&zilog->zl_lwb_list); + if (lwb == NULL) { + /* + * Return if there's nothing to flush before we + * dirty the fs by calling zil_create() + */ + if (list_is_empty(&zilog->zl_itx_list)) { + zilog->zl_writer = B_FALSE; + return; + } + mutex_exit(&zilog->zl_lock); + zil_create(zilog); + mutex_enter(&zilog->zl_lock); + lwb = list_tail(&zilog->zl_lwb_list); + } + } + + /* Loop through in-memory log transactions filling log blocks. */ + DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); + for (;;) { + /* + * Find the next itx to push: + * Push all transactions related to specified foid and all + * other transactions except TX_WRITE, TX_TRUNCATE, + * TX_SETATTR and TX_ACL for all other files. + */ + if (itx_next != (itx_t *)-1) + itx = itx_next; + else + itx = list_head(&zilog->zl_itx_list); + for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) { + if (foid == 0) /* push all foids? */ + break; + if (itx->itx_sync) /* push all O_[D]SYNC */ + break; + switch (itx->itx_lr.lrc_txtype) { + case TX_SETATTR: + case TX_WRITE: + case TX_TRUNCATE: + case TX_ACL: + /* lr_foid is same offset for these records */ + if (((lr_write_t *)&itx->itx_lr)->lr_foid + != foid) { + continue; /* skip this record */ + } + } + break; + } + if (itx == NULL) + break; + + reclen = itx->itx_lr.lrc_reclen; + if ((itx->itx_lr.lrc_seq > seq) && + ((lwb == NULL) || (lwb->lwb_nused == 0) || + (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)))) { + break; + } + + /* + * Save the next pointer. Even though we soon drop + * zl_lock all threads that may change the list + * (another writer or zil_itx_clean) can't do so until + * they have zl_writer. + */ + itx_next = list_next(&zilog->zl_itx_list, itx); + list_remove(&zilog->zl_itx_list, itx); + mutex_exit(&zilog->zl_lock); + txg = itx->itx_lr.lrc_txg; + ASSERT(txg); + + if (txg > spa_last_synced_txg(spa) || + txg > spa_freeze_txg(spa)) + lwb = zil_lwb_commit(zilog, itx, lwb); + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); + mutex_enter(&zilog->zl_lock); + zilog->zl_itx_list_sz -= reclen; + } + DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); + /* determine commit sequence number */ + itx = list_head(&zilog->zl_itx_list); + if (itx) + commit_seq = itx->itx_lr.lrc_seq; + else + commit_seq = zilog->zl_itx_seq; + mutex_exit(&zilog->zl_lock); + + /* write the last block out */ + if (lwb != NULL && lwb->lwb_zio != NULL) + lwb = zil_lwb_write_start(zilog, lwb); + + zilog->zl_prev_used = zilog->zl_cur_used; + zilog->zl_cur_used = 0; + + /* + * Wait if necessary for the log blocks to be on stable storage. + */ + if (zilog->zl_root_zio) { + DTRACE_PROBE1(zil__cw3, zilog_t *, zilog); + (void) zio_wait(zilog->zl_root_zio); + DTRACE_PROBE1(zil__cw4, zilog_t *, zilog); + if (!zfs_nocacheflush) + zil_flush_vdevs(zilog); + } + + if (zilog->zl_log_error || lwb == NULL) { + zilog->zl_log_error = 0; + txg_wait_synced(zilog->zl_dmu_pool, 0); + } + + mutex_enter(&zilog->zl_lock); + zilog->zl_writer = B_FALSE; + + ASSERT3U(commit_seq, >=, zilog->zl_commit_seq); + zilog->zl_commit_seq = commit_seq; +} + +/* + * Push zfs transactions to stable storage up to the supplied sequence number. + * If foid is 0 push out all transactions, otherwise push only those + * for that file or might have been used to create that file. + */ +void +zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid) +{ + if (zilog == NULL || seq == 0) + return; + + mutex_enter(&zilog->zl_lock); + + seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */ + + while (zilog->zl_writer) { + cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); + if (seq < zilog->zl_commit_seq) { + mutex_exit(&zilog->zl_lock); + return; + } + } + zil_commit_writer(zilog, seq, foid); /* drops zl_lock */ + /* wake up others waiting on the commit */ + cv_broadcast(&zilog->zl_cv_writer); + mutex_exit(&zilog->zl_lock); +} + +/* + * Called in syncing context to free committed log blocks and update log header. + */ +void +zil_sync(zilog_t *zilog, dmu_tx_t *tx) +{ + zil_header_t *zh = zil_header_in_syncing_context(zilog); + uint64_t txg = dmu_tx_get_txg(tx); + spa_t *spa = zilog->zl_spa; + lwb_t *lwb; + + mutex_enter(&zilog->zl_lock); + + ASSERT(zilog->zl_stop_sync == 0); + + zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK]; + + if (zilog->zl_destroy_txg == txg) { + blkptr_t blk = zh->zh_log; + + ASSERT(list_head(&zilog->zl_lwb_list) == NULL); + ASSERT(spa_sync_pass(spa) == 1); + + bzero(zh, sizeof (zil_header_t)); + bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq)); + + if (zilog->zl_keep_first) { + /* + * If this block was part of log chain that couldn't + * be claimed because a device was missing during + * zil_claim(), but that device later returns, + * then this block could erroneously appear valid. + * To guard against this, assign a new GUID to the new + * log chain so it doesn't matter what blk points to. + */ + zil_init_log_chain(zilog, &blk); + zh->zh_log = blk; + } + } + + for (;;) { + lwb = list_head(&zilog->zl_lwb_list); + if (lwb == NULL) { + mutex_exit(&zilog->zl_lock); + return; + } + zh->zh_log = lwb->lwb_blk; + if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) + break; + list_remove(&zilog->zl_lwb_list, lwb); + zio_free_blk(spa, &lwb->lwb_blk, txg); + kmem_cache_free(zil_lwb_cache, lwb); + + /* + * If we don't have anything left in the lwb list then + * we've had an allocation failure and we need to zero + * out the zil_header blkptr so that we don't end + * up freeing the same block twice. + */ + if (list_head(&zilog->zl_lwb_list) == NULL) + BP_ZERO(&zh->zh_log); + } + mutex_exit(&zilog->zl_lock); +} + +void +zil_init(void) +{ + zil_lwb_cache = kmem_cache_create("zil_lwb_cache", + sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +zil_fini(void) +{ + kmem_cache_destroy(zil_lwb_cache); +} + +zilog_t * +zil_alloc(objset_t *os, zil_header_t *zh_phys) +{ + zilog_t *zilog; + + zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); + + zilog->zl_header = zh_phys; + zilog->zl_os = os; + zilog->zl_spa = dmu_objset_spa(os); + zilog->zl_dmu_pool = dmu_objset_pool(os); + zilog->zl_destroy_txg = TXG_INITIAL - 1; + + mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); + cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); + + list_create(&zilog->zl_itx_list, sizeof (itx_t), + offsetof(itx_t, itx_node)); + + list_create(&zilog->zl_lwb_list, sizeof (lwb_t), + offsetof(lwb_t, lwb_node)); + + list_create(&zilog->zl_vdev_list, sizeof (zil_vdev_t), + offsetof(zil_vdev_t, vdev_seq_node)); + + return (zilog); +} + +void +zil_free(zilog_t *zilog) +{ + lwb_t *lwb; + zil_vdev_t *zv; + + zilog->zl_stop_sync = 1; + + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { + list_remove(&zilog->zl_lwb_list, lwb); + if (lwb->lwb_buf != NULL) + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + kmem_cache_free(zil_lwb_cache, lwb); + } + list_destroy(&zilog->zl_lwb_list); + + while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) { + list_remove(&zilog->zl_vdev_list, zv); + kmem_free(zv, sizeof (zil_vdev_t)); + } + list_destroy(&zilog->zl_vdev_list); + + ASSERT(list_head(&zilog->zl_itx_list) == NULL); + list_destroy(&zilog->zl_itx_list); + cv_destroy(&zilog->zl_cv_suspend); + cv_destroy(&zilog->zl_cv_writer); + mutex_destroy(&zilog->zl_lock); + + kmem_free(zilog, sizeof (zilog_t)); +} + +/* + * return true if the initial log block is not valid + */ +static int +zil_empty(zilog_t *zilog) +{ + const zil_header_t *zh = zilog->zl_header; + arc_buf_t *abuf = NULL; + + if (BP_IS_HOLE(&zh->zh_log)) + return (1); + + if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0) + return (1); + + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + return (0); +} + +/* + * Open an intent log. + */ +zilog_t * +zil_open(objset_t *os, zil_get_data_t *get_data) +{ + zilog_t *zilog = dmu_objset_zil(os); + + zilog->zl_get_data = get_data; + zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, + 2, 2, TASKQ_PREPOPULATE); + + return (zilog); +} + +/* + * Close an intent log. + */ +void +zil_close(zilog_t *zilog) +{ + /* + * If the log isn't already committed, mark the objset dirty + * (so zil_sync() will be called) and wait for that txg to sync. + */ + if (!zil_is_committed(zilog)) { + uint64_t txg; + dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); + (void) dmu_tx_assign(tx, TXG_WAIT); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + dmu_tx_commit(tx); + txg_wait_synced(zilog->zl_dmu_pool, txg); + } + + taskq_destroy(zilog->zl_clean_taskq); + zilog->zl_clean_taskq = NULL; + zilog->zl_get_data = NULL; + + zil_itx_clean(zilog); + ASSERT(list_head(&zilog->zl_itx_list) == NULL); +} + +/* + * Suspend an intent log. While in suspended mode, we still honor + * synchronous semantics, but we rely on txg_wait_synced() to do it. + * We suspend the log briefly when taking a snapshot so that the snapshot + * contains all the data it's supposed to, and has an empty intent log. + */ +int +zil_suspend(zilog_t *zilog) +{ + const zil_header_t *zh = zilog->zl_header; + + mutex_enter(&zilog->zl_lock); + if (zh->zh_claim_txg != 0) { /* unplayed log */ + mutex_exit(&zilog->zl_lock); + return (EBUSY); + } + if (zilog->zl_suspend++ != 0) { + /* + * Someone else already began a suspend. + * Just wait for them to finish. + */ + while (zilog->zl_suspending) + cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); + ASSERT(BP_IS_HOLE(&zh->zh_log)); + mutex_exit(&zilog->zl_lock); + return (0); + } + zilog->zl_suspending = B_TRUE; + mutex_exit(&zilog->zl_lock); + + zil_commit(zilog, UINT64_MAX, 0); + + /* + * Wait for any in-flight log writes to complete. + */ + mutex_enter(&zilog->zl_lock); + while (zilog->zl_writer) + cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); + mutex_exit(&zilog->zl_lock); + + zil_destroy(zilog, B_FALSE); + + mutex_enter(&zilog->zl_lock); + ASSERT(BP_IS_HOLE(&zh->zh_log)); + zilog->zl_suspending = B_FALSE; + cv_broadcast(&zilog->zl_cv_suspend); + mutex_exit(&zilog->zl_lock); + + return (0); +} + +void +zil_resume(zilog_t *zilog) +{ + mutex_enter(&zilog->zl_lock); + ASSERT(zilog->zl_suspend != 0); + zilog->zl_suspend--; + mutex_exit(&zilog->zl_lock); +} + +typedef struct zil_replay_arg { + objset_t *zr_os; + zil_replay_func_t **zr_replay; + void *zr_arg; + uint64_t *zr_txgp; + boolean_t zr_byteswap; + char *zr_lrbuf; +} zil_replay_arg_t; + +static void +zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) +{ + zil_replay_arg_t *zr = zra; + const zil_header_t *zh = zilog->zl_header; + uint64_t reclen = lr->lrc_reclen; + uint64_t txtype = lr->lrc_txtype; + char *name; + int pass, error, sunk; + + if (zilog->zl_stop_replay) + return; + + if (lr->lrc_txg < claim_txg) /* already committed */ + return; + + if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ + return; + + /* + * Make a copy of the data so we can revise and extend it. + */ + bcopy(lr, zr->zr_lrbuf, reclen); + + /* + * The log block containing this lr may have been byteswapped + * so that we can easily examine common fields like lrc_txtype. + * However, the log is a mix of different data types, and only the + * replay vectors know how to byteswap their records. Therefore, if + * the lr was byteswapped, undo it before invoking the replay vector. + */ + if (zr->zr_byteswap) + byteswap_uint64_array(zr->zr_lrbuf, reclen); + + /* + * If this is a TX_WRITE with a blkptr, suck in the data. + */ + if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { + lr_write_t *lrw = (lr_write_t *)lr; + blkptr_t *wbp = &lrw->lr_blkptr; + uint64_t wlen = lrw->lr_length; + char *wbuf = zr->zr_lrbuf + reclen; + + if (BP_IS_HOLE(wbp)) { /* compressed to a hole */ + bzero(wbuf, wlen); + } else { + /* + * A subsequent write may have overwritten this block, + * in which case wbp may have been been freed and + * reallocated, and our read of wbp may fail with a + * checksum error. We can safely ignore this because + * the later write will provide the correct data. + */ + zbookmark_t zb; + + zb.zb_objset = dmu_objset_id(zilog->zl_os); + zb.zb_object = lrw->lr_foid; + zb.zb_level = -1; + zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp); + + (void) zio_wait(zio_read(NULL, zilog->zl_spa, + wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); + (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen); + } + } + + /* + * We must now do two things atomically: replay this log record, + * and update the log header to reflect the fact that we did so. + * We use the DMU's ability to assign into a specific txg to do this. + */ + for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) { + uint64_t replay_txg; + dmu_tx_t *replay_tx; + + replay_tx = dmu_tx_create(zr->zr_os); + error = dmu_tx_assign(replay_tx, TXG_WAIT); + if (error) { + dmu_tx_abort(replay_tx); + break; + } + + replay_txg = dmu_tx_get_txg(replay_tx); + + if (txtype == 0 || txtype >= TX_MAX_TYPE) { + error = EINVAL; + } else { + /* + * On the first pass, arrange for the replay vector + * to fail its dmu_tx_assign(). That's the only way + * to ensure that those code paths remain well tested. + */ + *zr->zr_txgp = replay_txg - (pass == 1); + error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, + zr->zr_byteswap); + *zr->zr_txgp = TXG_NOWAIT; + } + + if (error == 0) { + dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx); + zilog->zl_replay_seq[replay_txg & TXG_MASK] = + lr->lrc_seq; + } + + dmu_tx_commit(replay_tx); + + if (!error) + return; + + /* + * The DMU's dnode layer doesn't see removes until the txg + * commits, so a subsequent claim can spuriously fail with + * EEXIST. So if we receive any error other than ERESTART + * we try syncing out any removes then retrying the + * transaction. + */ + if (error != ERESTART && !sunk) { + txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); + sunk = B_TRUE; + continue; /* retry */ + } + + if (error != ERESTART) + break; + + if (pass != 1) + txg_wait_open(spa_get_dsl(zilog->zl_spa), + replay_txg + 1); + + dprintf("pass %d, retrying\n", pass); + } + + ASSERT(error && error != ERESTART); + name = kmem_alloc(MAXNAMELEN, KM_SLEEP); + dmu_objset_name(zr->zr_os, name); + cmn_err(CE_WARN, "ZFS replay transaction error %d, " + "dataset %s, seq 0x%llx, txtype %llu\n", + error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype); + zilog->zl_stop_replay = 1; + kmem_free(name, MAXNAMELEN); +} + +/* ARGSUSED */ +static void +zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + zilog->zl_replay_blks++; +} + +/* + * If this dataset has a non-empty intent log, replay it and destroy it. + */ +void +zil_replay(objset_t *os, void *arg, uint64_t *txgp, + zil_replay_func_t *replay_func[TX_MAX_TYPE]) +{ + zilog_t *zilog = dmu_objset_zil(os); + const zil_header_t *zh = zilog->zl_header; + zil_replay_arg_t zr; + + if (zil_empty(zilog)) { + zil_destroy(zilog, B_TRUE); + return; + } + //printf("ZFS: Replaying ZIL on %s...\n", os->os->os_spa->spa_name); + + zr.zr_os = os; + zr.zr_replay = replay_func; + zr.zr_arg = arg; + zr.zr_txgp = txgp; + zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); + zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); + + /* + * Wait for in-progress removes to sync before starting replay. + */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + + zilog->zl_stop_replay = 0; + zilog->zl_replay_time = lbolt; + ASSERT(zilog->zl_replay_blks == 0); + (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, + zh->zh_claim_txg); + kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE); + + zil_destroy(zilog, B_FALSE); + //printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name); +} + +/* + * Report whether all transactions are committed + */ +int +zil_is_committed(zilog_t *zilog) +{ + lwb_t *lwb; + int ret; + + mutex_enter(&zilog->zl_lock); + while (zilog->zl_writer) + cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); + + /* recent unpushed intent log transactions? */ + if (!list_is_empty(&zilog->zl_itx_list)) { + ret = B_FALSE; + goto out; + } + + /* intent log never used? */ + lwb = list_head(&zilog->zl_lwb_list); + if (lwb == NULL) { + ret = B_TRUE; + goto out; + } + + /* + * more than 1 log buffer means zil_sync() hasn't yet freed + * entries after a txg has committed + */ + if (list_next(&zilog->zl_lwb_list, lwb)) { + ret = B_FALSE; + goto out; + } + + ASSERT(zil_empty(zilog)); + ret = B_TRUE; +out: + cv_broadcast(&zilog->zl_cv_writer); + mutex_exit(&zilog->zl_lock); + return (ret); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c new file mode 100644 index 0000000..6bc4a36 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -0,0 +1,1853 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/fm/fs/zfs.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/zio_impl.h> +#include <sys/zio_compress.h> +#include <sys/zio_checksum.h> + +/* + * ========================================================================== + * I/O priority table + * ========================================================================== + */ +uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { + 0, /* ZIO_PRIORITY_NOW */ + 0, /* ZIO_PRIORITY_SYNC_READ */ + 0, /* ZIO_PRIORITY_SYNC_WRITE */ + 6, /* ZIO_PRIORITY_ASYNC_READ */ + 4, /* ZIO_PRIORITY_ASYNC_WRITE */ + 4, /* ZIO_PRIORITY_FREE */ + 0, /* ZIO_PRIORITY_CACHE_FILL */ + 0, /* ZIO_PRIORITY_LOG_WRITE */ + 10, /* ZIO_PRIORITY_RESILVER */ + 20, /* ZIO_PRIORITY_SCRUB */ +}; + +/* + * ========================================================================== + * I/O type descriptions + * ========================================================================== + */ +char *zio_type_name[ZIO_TYPES] = { + "null", "read", "write", "free", "claim", "ioctl" }; + +/* At or above this size, force gang blocking - for testing */ +uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; + +/* Force an allocation failure when non-zero */ +uint16_t zio_zil_fail_shift = 0; + +typedef struct zio_sync_pass { + int zp_defer_free; /* defer frees after this pass */ + int zp_dontcompress; /* don't compress after this pass */ + int zp_rewrite; /* rewrite new bps after this pass */ +} zio_sync_pass_t; + +zio_sync_pass_t zio_sync_pass = { + 1, /* zp_defer_free */ + 4, /* zp_dontcompress */ + 1, /* zp_rewrite */ +}; + +#ifdef ZIO_USE_UMA +/* + * ========================================================================== + * I/O kmem caches + * ========================================================================== + */ +kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +#endif + +#ifdef _KERNEL +extern vmem_t *zio_alloc_arena; +#endif + +void +zio_init(void) +{ +#ifdef ZIO_USE_UMA + size_t c; +#endif +#if 0 + vmem_t *data_alloc_arena = NULL; + +#ifdef _KERNEL + data_alloc_arena = zio_alloc_arena; +#endif +#endif + +#ifdef ZIO_USE_UMA + /* + * For small buffers, we want a cache for each multiple of + * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache + * for each quarter-power of 2. For large buffers, we want + * a cache for each multiple of PAGESIZE. + */ + for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + size_t size = (c + 1) << SPA_MINBLOCKSHIFT; + size_t p2 = size; + size_t align = 0; + + while (p2 & (p2 - 1)) + p2 &= p2 - 1; + + if (size <= 4 * SPA_MINBLOCKSIZE) { + align = SPA_MINBLOCKSIZE; + } else if (P2PHASE(size, PAGESIZE) == 0) { + align = PAGESIZE; + } else if (P2PHASE(size, p2 >> 2) == 0) { + align = p2 >> 2; + } + + if (align != 0) { + char name[36]; + (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, + align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); + + (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); + zio_data_buf_cache[c] = kmem_cache_create(name, size, + align, NULL, NULL, NULL, NULL, data_alloc_arena, + KMC_NODEBUG); + + dprintf("creating cache for size %5lx align %5lx\n", + size, align); + } + } + + while (--c != 0) { + ASSERT(zio_buf_cache[c] != NULL); + if (zio_buf_cache[c - 1] == NULL) + zio_buf_cache[c - 1] = zio_buf_cache[c]; + + ASSERT(zio_data_buf_cache[c] != NULL); + if (zio_data_buf_cache[c - 1] == NULL) + zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; + } +#endif + + zio_inject_init(); +} + +void +zio_fini(void) +{ +#ifdef ZIO_USE_UMA + size_t c; + kmem_cache_t *last_cache = NULL; + kmem_cache_t *last_data_cache = NULL; + + for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + if (zio_buf_cache[c] != last_cache) { + last_cache = zio_buf_cache[c]; + kmem_cache_destroy(zio_buf_cache[c]); + } + zio_buf_cache[c] = NULL; + + if (zio_data_buf_cache[c] != last_data_cache) { + last_data_cache = zio_data_buf_cache[c]; + kmem_cache_destroy(zio_data_buf_cache[c]); + } + zio_data_buf_cache[c] = NULL; + } +#endif + + zio_inject_fini(); +} + +/* + * ========================================================================== + * Allocate and free I/O buffers + * ========================================================================== + */ + +/* + * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a + * crashdump if the kernel panics, so use it judiciously. Obviously, it's + * useful to inspect ZFS metadata, but if possible, we should avoid keeping + * excess / transient data in-core during a crashdump. + */ +void * +zio_buf_alloc(size_t size) +{ +#ifdef ZIO_USE_UMA + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); +#else + return (kmem_alloc(size, KM_SLEEP)); +#endif +} + +/* + * Use zio_data_buf_alloc to allocate data. The data will not appear in a + * crashdump if the kernel panics. This exists so that we will limit the amount + * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount + * of kernel heap dumped to disk when the kernel panics) + */ +void * +zio_data_buf_alloc(size_t size) +{ +#ifdef ZIO_USE_UMA + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP)); +#else + return (kmem_alloc(size, KM_SLEEP)); +#endif +} + +void +zio_buf_free(void *buf, size_t size) +{ +#ifdef ZIO_USE_UMA + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + kmem_cache_free(zio_buf_cache[c], buf); +#else + kmem_free(buf, size); +#endif +} + +void +zio_data_buf_free(void *buf, size_t size) +{ +#ifdef ZIO_USE_UMA + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + kmem_cache_free(zio_data_buf_cache[c], buf); +#else + kmem_free(buf, size); +#endif +} + +/* + * ========================================================================== + * Push and pop I/O transform buffers + * ========================================================================== + */ +static void +zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) +{ + zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); + + zt->zt_data = data; + zt->zt_size = size; + zt->zt_bufsize = bufsize; + + zt->zt_next = zio->io_transform_stack; + zio->io_transform_stack = zt; + + zio->io_data = data; + zio->io_size = size; +} + +static void +zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) +{ + zio_transform_t *zt = zio->io_transform_stack; + + *data = zt->zt_data; + *size = zt->zt_size; + *bufsize = zt->zt_bufsize; + + zio->io_transform_stack = zt->zt_next; + kmem_free(zt, sizeof (zio_transform_t)); + + if ((zt = zio->io_transform_stack) != NULL) { + zio->io_data = zt->zt_data; + zio->io_size = zt->zt_size; + } +} + +static void +zio_clear_transform_stack(zio_t *zio) +{ + void *data; + uint64_t size, bufsize; + + ASSERT(zio->io_transform_stack != NULL); + + zio_pop_transform(zio, &data, &size, &bufsize); + while (zio->io_transform_stack != NULL) { + zio_buf_free(data, bufsize); + zio_pop_transform(zio, &data, &size, &bufsize); + } +} + +/* + * ========================================================================== + * Create the various types of I/O (read, write, free) + * ========================================================================== + */ +static zio_t * +zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + void *data, uint64_t size, zio_done_func_t *done, void *private, + zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) +{ + zio_t *zio; + + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); + + zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); + zio->io_parent = pio; + zio->io_spa = spa; + zio->io_txg = txg; + if (bp != NULL) { + zio->io_bp = bp; + zio->io_bp_copy = *bp; + zio->io_bp_orig = *bp; + } + zio->io_done = done; + zio->io_private = private; + zio->io_type = type; + zio->io_priority = priority; + zio->io_stage = stage; + zio->io_pipeline = pipeline; + zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; + zio->io_timestamp = lbolt64; + zio->io_flags = flags; + mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); + zio_push_transform(zio, data, size, size); + + /* + * Note on config lock: + * + * If CONFIG_HELD is set, then the caller already has the config + * lock, so we don't need it for this io. + * + * We set CONFIG_GRABBED to indicate that we have grabbed the + * config lock on behalf of this io, so it should be released + * in zio_done. + * + * Unless CONFIG_HELD is set, we will grab the config lock for + * any top-level (parent-less) io, *except* NULL top-level ios. + * The NULL top-level ios rarely have any children, so we delay + * grabbing the lock until the first child is added (but it is + * still grabbed on behalf of the top-level i/o, so additional + * children don't need to also grab it). This greatly reduces + * contention on the config lock. + */ + if (pio == NULL) { + if (type != ZIO_TYPE_NULL && + !(flags & ZIO_FLAG_CONFIG_HELD)) { + spa_config_enter(zio->io_spa, RW_READER, zio); + zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; + } + zio->io_root = zio; + } else { + zio->io_root = pio->io_root; + if (!(flags & ZIO_FLAG_NOBOOKMARK)) + zio->io_logical = pio->io_logical; + mutex_enter(&pio->io_lock); + if (pio->io_parent == NULL && + pio->io_type == ZIO_TYPE_NULL && + !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && + !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { + pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; + spa_config_enter(zio->io_spa, RW_READER, pio); + } + if (stage < ZIO_STAGE_READY) + pio->io_children_notready++; + pio->io_children_notdone++; + zio->io_sibling_next = pio->io_child; + zio->io_sibling_prev = NULL; + if (pio->io_child != NULL) + pio->io_child->io_sibling_prev = zio; + pio->io_child = zio; + zio->io_ndvas = pio->io_ndvas; + mutex_exit(&pio->io_lock); + } + + return (zio); +} + +zio_t * +zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, + int flags) +{ + zio_t *zio; + + zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, + ZIO_WAIT_FOR_CHILDREN_PIPELINE); + + return (zio); +} + +zio_t * +zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) +{ + return (zio_null(NULL, spa, done, private, flags)); +} + +zio_t * +zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, + uint64_t size, zio_done_func_t *done, void *private, + int priority, int flags, zbookmark_t *zb) +{ + zio_t *zio; + + ASSERT3U(size, ==, BP_GET_LSIZE(bp)); + + zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, + ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, + ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); + zio->io_bookmark = *zb; + + zio->io_logical = zio; + + /* + * Work off our copy of the bp so the caller can free it. + */ + zio->io_bp = &zio->io_bp_copy; + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { + uint64_t csize = BP_GET_PSIZE(bp); + void *cbuf = zio_buf_alloc(csize); + + zio_push_transform(zio, cbuf, csize, csize); + zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; + } + + if (BP_IS_GANG(bp)) { + uint64_t gsize = SPA_GANGBLOCKSIZE; + void *gbuf = zio_buf_alloc(gsize); + + zio_push_transform(zio, gbuf, gsize, gsize); + zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; + } + + return (zio); +} + +zio_t * +zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, + uint64_t txg, blkptr_t *bp, void *data, uint64_t size, + zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, + int flags, zbookmark_t *zb) +{ + zio_t *zio; + + ASSERT(checksum >= ZIO_CHECKSUM_OFF && + checksum < ZIO_CHECKSUM_FUNCTIONS); + + ASSERT(compress >= ZIO_COMPRESS_OFF && + compress < ZIO_COMPRESS_FUNCTIONS); + + zio = zio_create(pio, spa, txg, bp, data, size, done, private, + ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, + ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); + + zio->io_ready = ready; + + zio->io_bookmark = *zb; + + zio->io_logical = zio; + + zio->io_checksum = checksum; + zio->io_compress = compress; + zio->io_ndvas = ncopies; + + if (compress != ZIO_COMPRESS_OFF) + zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; + + if (bp->blk_birth != txg) { + /* XXX the bp usually (always?) gets re-zeroed later */ + BP_ZERO(bp); + BP_SET_LSIZE(bp, size); + BP_SET_PSIZE(bp, size); + } else { + /* Make sure someone doesn't change their mind on overwrites */ + ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), + spa_max_replication(spa)) == BP_GET_NDVAS(bp)); + } + + return (zio); +} + +zio_t * +zio_rewrite(zio_t *pio, spa_t *spa, int checksum, + uint64_t txg, blkptr_t *bp, void *data, uint64_t size, + zio_done_func_t *done, void *private, int priority, int flags, + zbookmark_t *zb) +{ + zio_t *zio; + + zio = zio_create(pio, spa, txg, bp, data, size, done, private, + ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, + ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); + + zio->io_bookmark = *zb; + zio->io_checksum = checksum; + zio->io_compress = ZIO_COMPRESS_OFF; + + if (pio != NULL) + ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); + + return (zio); +} + +static zio_t * +zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, + uint64_t txg, blkptr_t *bp, void *data, uint64_t size, + zio_done_func_t *done, void *private, int priority, int flags) +{ + zio_t *zio; + + BP_ZERO(bp); + BP_SET_LSIZE(bp, size); + BP_SET_PSIZE(bp, size); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + + zio = zio_create(pio, spa, txg, bp, data, size, done, private, + ZIO_TYPE_WRITE, priority, flags, + ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); + + zio->io_checksum = checksum; + zio->io_compress = ZIO_COMPRESS_OFF; + + return (zio); +} + +zio_t * +zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private) +{ + zio_t *zio; + + ASSERT(!BP_IS_HOLE(bp)); + + if (txg == spa->spa_syncing_txg && + spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { + bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); + return (zio_null(pio, spa, NULL, NULL, 0)); + } + + zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, + ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, + ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); + + zio->io_bp = &zio->io_bp_copy; + + return (zio); +} + +zio_t * +zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private) +{ + zio_t *zio; + + /* + * A claim is an allocation of a specific block. Claims are needed + * to support immediate writes in the intent log. The issue is that + * immediate writes contain committed data, but in a txg that was + * *not* committed. Upon opening the pool after an unclean shutdown, + * the intent log claims all blocks that contain immediate write data + * so that the SPA knows they're in use. + * + * All claims *must* be resolved in the first txg -- before the SPA + * starts allocating blocks -- so that nothing is allocated twice. + */ + ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); + ASSERT3U(spa_first_txg(spa), <=, txg); + + zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, + ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, + ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); + + zio->io_bp = &zio->io_bp_copy; + + return (zio); +} + +zio_t * +zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, + zio_done_func_t *done, void *private, int priority, int flags) +{ + zio_t *zio; + int c; + + if (vd->vdev_children == 0) { + zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, + ZIO_TYPE_IOCTL, priority, flags, + ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); + + zio->io_vd = vd; + zio->io_cmd = cmd; + } else { + zio = zio_null(pio, spa, NULL, NULL, flags); + + for (c = 0; c < vd->vdev_children; c++) + zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, + done, private, priority, flags)); + } + + return (zio); +} + +static void +zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, + int checksum) +{ + ASSERT(vd->vdev_children == 0); + + ASSERT(size <= SPA_MAXBLOCKSIZE); + ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); + ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); + + ASSERT(offset + size <= VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT3U(offset + size, <=, vd->vdev_psize); + + BP_ZERO(bp); + + BP_SET_LSIZE(bp, size); + BP_SET_PSIZE(bp, size); + + BP_SET_CHECKSUM(bp, checksum); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + if (checksum != ZIO_CHECKSUM_OFF) + ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); +} + +zio_t * +zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, + void *data, int checksum, zio_done_func_t *done, void *private, + int priority, int flags) +{ + zio_t *zio; + blkptr_t blk; + + zio_phys_bp_init(vd, &blk, offset, size, checksum); + + zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, + ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, + ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); + + zio->io_vd = vd; + zio->io_offset = offset; + + /* + * Work off our copy of the bp so the caller can free it. + */ + zio->io_bp = &zio->io_bp_copy; + + return (zio); +} + +zio_t * +zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, + void *data, int checksum, zio_done_func_t *done, void *private, + int priority, int flags) +{ + zio_block_tail_t *zbt; + void *wbuf; + zio_t *zio; + blkptr_t blk; + + zio_phys_bp_init(vd, &blk, offset, size, checksum); + + zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, + ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, + ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); + + zio->io_vd = vd; + zio->io_offset = offset; + + zio->io_bp = &zio->io_bp_copy; + zio->io_checksum = checksum; + + if (zio_checksum_table[checksum].ci_zbt) { + /* + * zbt checksums are necessarily destructive -- they modify + * one word of the write buffer to hold the verifier/checksum. + * Therefore, we must make a local copy in case the data is + * being written to multiple places. + */ + wbuf = zio_buf_alloc(size); + bcopy(data, wbuf, size); + zio_push_transform(zio, wbuf, size, size); + + zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; + zbt->zbt_cksum = blk.blk_cksum; + } + + return (zio); +} + +/* + * Create a child I/O to do some work for us. It has no associated bp. + */ +zio_t * +zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, + void *data, uint64_t size, int type, int priority, int flags, + zio_done_func_t *done, void *private) +{ + uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; + zio_t *cio; + + if (type == ZIO_TYPE_READ && bp != NULL) { + /* + * If we have the bp, then the child should perform the + * checksum and the parent need not. This pushes error + * detection as close to the leaves as possible and + * eliminates redundant checksums in the interior nodes. + */ + pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; + zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); + } + + cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, + done, private, type, priority, + (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, + ZIO_STAGE_VDEV_IO_START - 1, pipeline); + + cio->io_vd = vd; + cio->io_offset = offset; + + return (cio); +} + +/* + * ========================================================================== + * Initiate I/O, either sync or async + * ========================================================================== + */ +int +zio_wait(zio_t *zio) +{ + int error; + + ASSERT(zio->io_stage == ZIO_STAGE_OPEN); + + zio->io_waiter = curthread; + + zio_next_stage_async(zio); + + mutex_enter(&zio->io_lock); + while (zio->io_stalled != ZIO_STAGE_DONE) + cv_wait(&zio->io_cv, &zio->io_lock); + mutex_exit(&zio->io_lock); + + error = zio->io_error; + cv_destroy(&zio->io_cv); + mutex_destroy(&zio->io_lock); + kmem_free(zio, sizeof (zio_t)); + + return (error); +} + +void +zio_nowait(zio_t *zio) +{ + zio_next_stage_async(zio); +} + +/* + * ========================================================================== + * I/O pipeline interlocks: parent/child dependency scoreboarding + * ========================================================================== + */ +static void +zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) +{ + mutex_enter(&zio->io_lock); + if (*countp == 0) { + ASSERT(zio->io_stalled == 0); + mutex_exit(&zio->io_lock); + zio_next_stage(zio); + } else { + zio->io_stalled = stage; + mutex_exit(&zio->io_lock); + } +} + +static void +zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) +{ + zio_t *pio = zio->io_parent; + + mutex_enter(&pio->io_lock); + if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) + pio->io_error = zio->io_error; + if (--*countp == 0 && pio->io_stalled == stage) { + pio->io_stalled = 0; + mutex_exit(&pio->io_lock); + zio_next_stage_async(pio); + } else { + mutex_exit(&pio->io_lock); + } +} + +static void +zio_wait_children_ready(zio_t *zio) +{ + zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, + &zio->io_children_notready); +} + +void +zio_wait_children_done(zio_t *zio) +{ + zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, + &zio->io_children_notdone); +} + +static void +zio_ready(zio_t *zio) +{ + zio_t *pio = zio->io_parent; + + if (zio->io_ready) + zio->io_ready(zio); + + if (pio != NULL) + zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, + &pio->io_children_notready); + + if (zio->io_bp) + zio->io_bp_copy = *zio->io_bp; + + zio_next_stage(zio); +} + +static void +zio_done(zio_t *zio) +{ + zio_t *pio = zio->io_parent; + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + vdev_t *vd = zio->io_vd; + + ASSERT(zio->io_children_notready == 0); + ASSERT(zio->io_children_notdone == 0); + + if (bp != NULL) { + ASSERT(bp->blk_pad[0] == 0); + ASSERT(bp->blk_pad[1] == 0); + ASSERT(bp->blk_pad[2] == 0); + ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); + if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && + !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { + ASSERT(!BP_SHOULD_BYTESWAP(bp)); + if (zio->io_ndvas != 0) + ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); + ASSERT(BP_COUNT_GANG(bp) == 0 || + (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); + } + } + + if (vd != NULL) + vdev_stat_update(zio); + + if (zio->io_error) { + /* + * If this I/O is attached to a particular vdev, + * generate an error message describing the I/O failure + * at the block level. We ignore these errors if the + * device is currently unavailable. + */ + if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) + zfs_ereport_post(FM_EREPORT_ZFS_IO, + zio->io_spa, vd, zio, 0, 0); + + if ((zio->io_error == EIO || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && + zio->io_logical == zio) { + /* + * For root I/O requests, tell the SPA to log the error + * appropriately. Also, generate a logical data + * ereport. + */ + spa_log_error(zio->io_spa, zio); + + zfs_ereport_post(FM_EREPORT_ZFS_DATA, + zio->io_spa, NULL, zio, 0, 0); + } + + /* + * For I/O requests that cannot fail, panic appropriately. + */ + if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { + char *blkbuf; + + blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); + if (blkbuf) { + sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, + bp ? bp : &zio->io_bp_copy); + } + panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " + "%d", zio->io_error == ECKSUM ? + "bad checksum" : "I/O failure", + zio_type_name[zio->io_type], + vdev_description(vd), + (u_longlong_t)zio->io_offset, + zio, blkbuf ? blkbuf : "", zio->io_error); + } + } + zio_clear_transform_stack(zio); + + if (zio->io_done) + zio->io_done(zio); + + ASSERT(zio->io_delegate_list == NULL); + ASSERT(zio->io_delegate_next == NULL); + + if (pio != NULL) { + zio_t *next, *prev; + + mutex_enter(&pio->io_lock); + next = zio->io_sibling_next; + prev = zio->io_sibling_prev; + if (next != NULL) + next->io_sibling_prev = prev; + if (prev != NULL) + prev->io_sibling_next = next; + if (pio->io_child == zio) + pio->io_child = next; + mutex_exit(&pio->io_lock); + + zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, + &pio->io_children_notdone); + } + + /* + * Note: this I/O is now done, and will shortly be + * kmem_free()'d, so there is no need to clear this (or any + * other) flag. + */ + if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED) + spa_config_exit(spa, zio); + + if (zio->io_waiter != NULL) { + mutex_enter(&zio->io_lock); + ASSERT(zio->io_stage == ZIO_STAGE_DONE); + zio->io_stalled = zio->io_stage; + cv_broadcast(&zio->io_cv); + mutex_exit(&zio->io_lock); + } else { + kmem_free(zio, sizeof (zio_t)); + } +} + +/* + * ========================================================================== + * Compression support + * ========================================================================== + */ +static void +zio_write_compress(zio_t *zio) +{ + int compress = zio->io_compress; + blkptr_t *bp = zio->io_bp; + void *cbuf; + uint64_t lsize = zio->io_size; + uint64_t csize = lsize; + uint64_t cbufsize = 0; + int pass; + + if (bp->blk_birth == zio->io_txg) { + /* + * We're rewriting an existing block, which means we're + * working on behalf of spa_sync(). For spa_sync() to + * converge, it must eventually be the case that we don't + * have to allocate new blocks. But compression changes + * the blocksize, which forces a reallocate, and makes + * convergence take longer. Therefore, after the first + * few passes, stop compressing to ensure convergence. + */ + pass = spa_sync_pass(zio->io_spa); + if (pass > zio_sync_pass.zp_dontcompress) + compress = ZIO_COMPRESS_OFF; + } else { + ASSERT(BP_IS_HOLE(bp)); + pass = 1; + } + + if (compress != ZIO_COMPRESS_OFF) + if (!zio_compress_data(compress, zio->io_data, zio->io_size, + &cbuf, &csize, &cbufsize)) + compress = ZIO_COMPRESS_OFF; + + if (compress != ZIO_COMPRESS_OFF && csize != 0) + zio_push_transform(zio, cbuf, csize, cbufsize); + + /* + * The final pass of spa_sync() must be all rewrites, but the first + * few passes offer a trade-off: allocating blocks defers convergence, + * but newly allocated blocks are sequential, so they can be written + * to disk faster. Therefore, we allow the first few passes of + * spa_sync() to reallocate new blocks, but force rewrites after that. + * There should only be a handful of blocks after pass 1 in any case. + */ + if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && + pass > zio_sync_pass.zp_rewrite) { + ASSERT(csize != 0); + BP_SET_LSIZE(bp, lsize); + BP_SET_COMPRESS(bp, compress); + zio->io_pipeline = ZIO_REWRITE_PIPELINE; + } else { + if (bp->blk_birth == zio->io_txg) + BP_ZERO(bp); + if (csize == 0) { + BP_ZERO(bp); + zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; + } else { + ASSERT3U(BP_GET_NDVAS(bp), ==, 0); + BP_SET_LSIZE(bp, lsize); + BP_SET_PSIZE(bp, csize); + BP_SET_COMPRESS(bp, compress); + zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE; + } + } + + zio_next_stage(zio); +} + +static void +zio_read_decompress(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + void *data; + uint64_t size; + uint64_t bufsize; + int compress = BP_GET_COMPRESS(bp); + + ASSERT(compress != ZIO_COMPRESS_OFF); + + zio_pop_transform(zio, &data, &size, &bufsize); + + if (zio_decompress_data(compress, data, size, + zio->io_data, zio->io_size)) + zio->io_error = EIO; + + zio_buf_free(data, bufsize); + + zio_next_stage(zio); +} + +/* + * ========================================================================== + * Gang block support + * ========================================================================== + */ +static void +zio_gang_pipeline(zio_t *zio) +{ + /* + * By default, the pipeline assumes that we're dealing with a gang + * block. If we're not, strip out any gang-specific stages. + */ + if (!BP_IS_GANG(zio->io_bp)) + zio->io_pipeline &= ~ZIO_GANG_STAGES; + + zio_next_stage(zio); +} + +static void +zio_gang_byteswap(zio_t *zio) +{ + ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); + + if (BP_SHOULD_BYTESWAP(zio->io_bp)) + byteswap_uint64_array(zio->io_data, zio->io_size); +} + +static void +zio_get_gang_header(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + uint64_t gsize = SPA_GANGBLOCKSIZE; + void *gbuf = zio_buf_alloc(gsize); + + ASSERT(BP_IS_GANG(bp)); + + zio_push_transform(zio, gbuf, gsize, gsize); + + zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, + NULL, NULL, ZIO_TYPE_READ, zio->io_priority, + zio->io_flags & ZIO_FLAG_GANG_INHERIT, + ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); + + zio_wait_children_done(zio); +} + +static void +zio_read_gang_members(zio_t *zio) +{ + zio_gbh_phys_t *gbh; + uint64_t gsize, gbufsize, loff, lsize; + int i; + + ASSERT(BP_IS_GANG(zio->io_bp)); + + zio_gang_byteswap(zio); + zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); + + for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { + blkptr_t *gbp = &gbh->zg_blkptr[i]; + lsize = BP_GET_PSIZE(gbp); + + ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); + ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); + ASSERT3U(loff + lsize, <=, zio->io_size); + ASSERT(i < SPA_GBH_NBLKPTRS); + ASSERT(!BP_IS_HOLE(gbp)); + + zio_nowait(zio_read(zio, zio->io_spa, gbp, + (char *)zio->io_data + loff, lsize, NULL, NULL, + zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, + &zio->io_bookmark)); + } + + zio_buf_free(gbh, gbufsize); + zio_wait_children_done(zio); +} + +static void +zio_rewrite_gang_members(zio_t *zio) +{ + zio_gbh_phys_t *gbh; + uint64_t gsize, gbufsize, loff, lsize; + int i; + + ASSERT(BP_IS_GANG(zio->io_bp)); + ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); + + zio_gang_byteswap(zio); + zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); + + ASSERT(gsize == gbufsize); + + for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { + blkptr_t *gbp = &gbh->zg_blkptr[i]; + lsize = BP_GET_PSIZE(gbp); + + ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); + ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); + ASSERT3U(loff + lsize, <=, zio->io_size); + ASSERT(i < SPA_GBH_NBLKPTRS); + ASSERT(!BP_IS_HOLE(gbp)); + + zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, + zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, + NULL, NULL, zio->io_priority, zio->io_flags, + &zio->io_bookmark)); + } + + zio_push_transform(zio, gbh, gsize, gbufsize); + zio_wait_children_ready(zio); +} + +static void +zio_free_gang_members(zio_t *zio) +{ + zio_gbh_phys_t *gbh; + uint64_t gsize, gbufsize; + int i; + + ASSERT(BP_IS_GANG(zio->io_bp)); + + zio_gang_byteswap(zio); + zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); + + for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { + blkptr_t *gbp = &gbh->zg_blkptr[i]; + + if (BP_IS_HOLE(gbp)) + continue; + zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, + gbp, NULL, NULL)); + } + + zio_buf_free(gbh, gbufsize); + zio_next_stage(zio); +} + +static void +zio_claim_gang_members(zio_t *zio) +{ + zio_gbh_phys_t *gbh; + uint64_t gsize, gbufsize; + int i; + + ASSERT(BP_IS_GANG(zio->io_bp)); + + zio_gang_byteswap(zio); + zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); + + for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { + blkptr_t *gbp = &gbh->zg_blkptr[i]; + if (BP_IS_HOLE(gbp)) + continue; + zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, + gbp, NULL, NULL)); + } + + zio_buf_free(gbh, gbufsize); + zio_next_stage(zio); +} + +static void +zio_write_allocate_gang_member_done(zio_t *zio) +{ + zio_t *pio = zio->io_parent; + dva_t *cdva = zio->io_bp->blk_dva; + dva_t *pdva = pio->io_bp->blk_dva; + uint64_t asize; + int d; + + ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); + ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); + ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); + ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); + + mutex_enter(&pio->io_lock); + for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { + ASSERT(DVA_GET_GANG(&pdva[d])); + asize = DVA_GET_ASIZE(&pdva[d]); + asize += DVA_GET_ASIZE(&cdva[d]); + DVA_SET_ASIZE(&pdva[d], asize); + } + mutex_exit(&pio->io_lock); +} + +static void +zio_write_allocate_gang_members(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + dva_t *dva = bp->blk_dva; + spa_t *spa = zio->io_spa; + zio_gbh_phys_t *gbh; + uint64_t txg = zio->io_txg; + uint64_t resid = zio->io_size; + uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE); + uint64_t gsize, loff, lsize; + uint32_t gbps_left; + int ndvas = zio->io_ndvas; + int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); + int error; + int i, d; + + gsize = SPA_GANGBLOCKSIZE; + gbps_left = SPA_GBH_NBLKPTRS; + + error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE); + if (error == ENOSPC) + panic("can't allocate gang block header"); + ASSERT(error == 0); + + for (d = 0; d < gbh_ndvas; d++) + DVA_SET_GANG(&dva[d], 1); + + bp->blk_birth = txg; + + gbh = zio_buf_alloc(gsize); + bzero(gbh, gsize); + + /* We need to test multi-level gang blocks */ + if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0) + maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE); + + for (loff = 0, i = 0; loff != zio->io_size; + loff += lsize, resid -= lsize, gbps_left--, i++) { + blkptr_t *gbp = &gbh->zg_blkptr[i]; + dva = gbp->blk_dva; + + ASSERT(gbps_left != 0); + maxalloc = MIN(maxalloc, resid); + + while (resid <= maxalloc * gbps_left) { + error = metaslab_alloc(spa, maxalloc, gbp, ndvas, + txg, bp, B_FALSE); + if (error == 0) + break; + ASSERT3U(error, ==, ENOSPC); + if (maxalloc == SPA_MINBLOCKSIZE) + panic("really out of space"); + maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); + } + + if (resid <= maxalloc * gbps_left) { + lsize = maxalloc; + BP_SET_LSIZE(gbp, lsize); + BP_SET_PSIZE(gbp, lsize); + BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); + gbp->blk_birth = txg; + zio_nowait(zio_rewrite(zio, spa, + zio->io_checksum, txg, gbp, + (char *)zio->io_data + loff, lsize, + zio_write_allocate_gang_member_done, NULL, + zio->io_priority, zio->io_flags, + &zio->io_bookmark)); + } else { + lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); + ASSERT(lsize != SPA_MINBLOCKSIZE); + zio_nowait(zio_write_allocate(zio, spa, + zio->io_checksum, txg, gbp, + (char *)zio->io_data + loff, lsize, + zio_write_allocate_gang_member_done, NULL, + zio->io_priority, zio->io_flags)); + } + } + + ASSERT(resid == 0 && loff == zio->io_size); + + zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; + + zio_push_transform(zio, gbh, gsize, gsize); + /* + * As much as we'd like this to be zio_wait_children_ready(), + * updating our ASIZE doesn't happen until the io_done callback, + * so we have to wait for that to finish in order for our BP + * to be stable. + */ + zio_wait_children_done(zio); +} + +/* + * ========================================================================== + * Allocate and free blocks + * ========================================================================== + */ +static void +zio_dva_allocate(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + int error; + + ASSERT(BP_IS_HOLE(bp)); + ASSERT3U(BP_GET_NDVAS(bp), ==, 0); + ASSERT3U(zio->io_ndvas, >, 0); + ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa)); + + /* For testing, make some blocks above a certain size be gang blocks */ + if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { + zio_write_allocate_gang_members(zio); + return; + } + + ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); + + error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas, + zio->io_txg, NULL, B_FALSE); + + if (error == 0) { + bp->blk_birth = zio->io_txg; + } else if (error == ENOSPC) { + if (zio->io_size == SPA_MINBLOCKSIZE) + panic("really, truly out of space"); + zio_write_allocate_gang_members(zio); + return; + } else { + zio->io_error = error; + } + zio_next_stage(zio); +} + +static void +zio_dva_free(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); + + BP_ZERO(bp); + + zio_next_stage(zio); +} + +static void +zio_dva_claim(zio_t *zio) +{ + zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); + + zio_next_stage(zio); +} + +/* + * ========================================================================== + * Read and write to physical devices + * ========================================================================== + */ + +static void +zio_vdev_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd ? vd->vdev_top : NULL; + blkptr_t *bp = zio->io_bp; + uint64_t align; + + if (vd == NULL) { + /* The mirror_ops handle multiple DVAs in a single BP */ + vdev_mirror_ops.vdev_op_io_start(zio); + return; + } + + align = 1ULL << tvd->vdev_ashift; + + if (zio->io_retries == 0 && vd == tvd) + zio->io_flags |= ZIO_FLAG_FAILFAST; + + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && + vd->vdev_children == 0) { + zio->io_flags |= ZIO_FLAG_PHYSICAL; + zio->io_offset += VDEV_LABEL_START_SIZE; + } + + if (P2PHASE(zio->io_size, align) != 0) { + uint64_t asize = P2ROUNDUP(zio->io_size, align); + char *abuf = zio_buf_alloc(asize); + ASSERT(vd == tvd); + if (zio->io_type == ZIO_TYPE_WRITE) { + bcopy(zio->io_data, abuf, zio->io_size); + bzero(abuf + zio->io_size, asize - zio->io_size); + } + zio_push_transform(zio, abuf, asize, asize); + ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); + zio->io_flags |= ZIO_FLAG_SUBBLOCK; + } + + ASSERT(P2PHASE(zio->io_offset, align) == 0); + ASSERT(P2PHASE(zio->io_size, align) == 0); + ASSERT(bp == NULL || + P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); + ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); + + vdev_io_start(zio); + + /* zio_next_stage_async() gets called from io completion interrupt */ +} + +static void +zio_vdev_io_done(zio_t *zio) +{ + if (zio->io_vd == NULL) + /* The mirror_ops handle multiple DVAs in a single BP */ + vdev_mirror_ops.vdev_op_io_done(zio); + else + vdev_io_done(zio); +} + +/* XXPOLICY */ +boolean_t +zio_should_retry(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + if (zio->io_error == 0) + return (B_FALSE); + if (zio->io_delegate_list != NULL) + return (B_FALSE); + if (vd && vd != vd->vdev_top) + return (B_FALSE); + if (zio->io_flags & ZIO_FLAG_DONT_RETRY) + return (B_FALSE); + if (zio->io_retries > 0) + return (B_FALSE); + + return (B_TRUE); +} + +static void +zio_vdev_io_assess(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd ? vd->vdev_top : NULL; + + ASSERT(zio->io_vsd == NULL); + + if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { + void *abuf; + uint64_t asize; + ASSERT(vd == tvd); + zio_pop_transform(zio, &abuf, &asize, &asize); + if (zio->io_type == ZIO_TYPE_READ) + bcopy(abuf, zio->io_data, zio->io_size); + zio_buf_free(abuf, asize); + zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; + } + + if (zio_injection_enabled && !zio->io_error) + zio->io_error = zio_handle_fault_injection(zio, EIO); + + /* + * If the I/O failed, determine whether we should attempt to retry it. + */ + /* XXPOLICY */ + if (zio_should_retry(zio)) { + ASSERT(tvd == vd); + + zio->io_retries++; + zio->io_error = 0; + zio->io_flags &= ZIO_FLAG_VDEV_INHERIT | + ZIO_FLAG_CONFIG_GRABBED; + /* XXPOLICY */ + zio->io_flags &= ~ZIO_FLAG_FAILFAST; + zio->io_flags |= ZIO_FLAG_DONT_CACHE; + zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; + + dprintf("retry #%d for %s to %s offset %llx\n", + zio->io_retries, zio_type_name[zio->io_type], + vdev_description(vd), zio->io_offset); + + zio_next_stage_async(zio); + return; + } + + if (zio->io_error != 0 && zio->io_error != ECKSUM && + !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) { + /* + * Poor man's hotplug support. Even if we're done retrying this + * I/O, try to reopen the vdev to see if it's still attached. + * To avoid excessive thrashing, we only try it once a minute. + * This also has the effect of detecting when missing devices + * have come back, by polling the device once a minute. + * + * We need to do this asynchronously because we can't grab + * all the necessary locks way down here. + */ + if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) { + vd->vdev_last_try = gethrtime(); + tvd->vdev_reopen_wanted = 1; + spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN); + } + } + + zio_next_stage(zio); +} + +void +zio_vdev_io_reissue(zio_t *zio) +{ + ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); + ASSERT(zio->io_error == 0); + + zio->io_stage--; +} + +void +zio_vdev_io_redone(zio_t *zio) +{ + ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); + + zio->io_stage--; +} + +void +zio_vdev_io_bypass(zio_t *zio) +{ + ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); + ASSERT(zio->io_error == 0); + + zio->io_flags |= ZIO_FLAG_IO_BYPASS; + zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; +} + +/* + * ========================================================================== + * Generate and verify checksums + * ========================================================================== + */ +static void +zio_checksum_generate(zio_t *zio) +{ + int checksum = zio->io_checksum; + blkptr_t *bp = zio->io_bp; + + ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); + + BP_SET_CHECKSUM(bp, checksum); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); + + zio_next_stage(zio); +} + +static void +zio_gang_checksum_generate(zio_t *zio) +{ + zio_cksum_t zc; + zio_gbh_phys_t *gbh = zio->io_data; + + ASSERT(BP_IS_GANG(zio->io_bp)); + ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); + + zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); + + zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); + + zio_next_stage(zio); +} + +static void +zio_checksum_verify(zio_t *zio) +{ + if (zio->io_bp != NULL) { + zio->io_error = zio_checksum_error(zio); + if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) + zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, + zio->io_spa, zio->io_vd, zio, 0, 0); + } + + zio_next_stage(zio); +} + +/* + * Called by RAID-Z to ensure we don't compute the checksum twice. + */ +void +zio_checksum_verified(zio_t *zio) +{ + zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); +} + +/* + * Set the external verifier for a gang block based on stuff in the bp + */ +void +zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) +{ + blkptr_t *bp = zio->io_bp; + + zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp)); + zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp)); + zcp->zc_word[2] = bp->blk_birth; + zcp->zc_word[3] = 0; +} + +/* + * ========================================================================== + * Define the pipeline + * ========================================================================== + */ +typedef void zio_pipe_stage_t(zio_t *zio); + +static void +zio_badop(zio_t *zio) +{ + panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio); +} + +zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { + zio_badop, + zio_wait_children_ready, + zio_write_compress, + zio_checksum_generate, + zio_gang_pipeline, + zio_get_gang_header, + zio_rewrite_gang_members, + zio_free_gang_members, + zio_claim_gang_members, + zio_dva_allocate, + zio_dva_free, + zio_dva_claim, + zio_gang_checksum_generate, + zio_ready, + zio_vdev_io_start, + zio_vdev_io_done, + zio_vdev_io_assess, + zio_wait_children_done, + zio_checksum_verify, + zio_read_gang_members, + zio_read_decompress, + zio_done, + zio_badop +}; + +/* + * Move an I/O to the next stage of the pipeline and execute that stage. + * There's no locking on io_stage because there's no legitimate way for + * multiple threads to be attempting to process the same I/O. + */ +void +zio_next_stage(zio_t *zio) +{ + uint32_t pipeline = zio->io_pipeline; + + ASSERT(!MUTEX_HELD(&zio->io_lock)); + + if (zio->io_error) { + dprintf("zio %p vdev %s offset %llx stage %d error %d\n", + zio, vdev_description(zio->io_vd), + zio->io_offset, zio->io_stage, zio->io_error); + if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) + pipeline &= ZIO_ERROR_PIPELINE_MASK; + } + + while (((1U << ++zio->io_stage) & pipeline) == 0) + continue; + + ASSERT(zio->io_stage <= ZIO_STAGE_DONE); + ASSERT(zio->io_stalled == 0); + + /* + * See the comment in zio_next_stage_async() about per-CPU taskqs. + */ + if (((1U << zio->io_stage) & zio->io_async_stages) && + (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) && + !(zio->io_flags & ZIO_FLAG_METADATA)) { + taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; + (void) taskq_dispatch(tq, + (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); + } else { + zio_pipeline[zio->io_stage](zio); + } +} + +void +zio_next_stage_async(zio_t *zio) +{ + taskq_t *tq; + uint32_t pipeline = zio->io_pipeline; + + ASSERT(!MUTEX_HELD(&zio->io_lock)); + + if (zio->io_error) { + dprintf("zio %p vdev %s offset %llx stage %d error %d\n", + zio, vdev_description(zio->io_vd), + zio->io_offset, zio->io_stage, zio->io_error); + if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) + pipeline &= ZIO_ERROR_PIPELINE_MASK; + } + + while (((1U << ++zio->io_stage) & pipeline) == 0) + continue; + + ASSERT(zio->io_stage <= ZIO_STAGE_DONE); + ASSERT(zio->io_stalled == 0); + + /* + * For performance, we'll probably want two sets of task queues: + * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU + * part is for read performance: since we have to make a pass over + * the data to checksum it anyway, we want to do this on the same CPU + * that issued the read, because (assuming CPU scheduling affinity) + * that thread is probably still there. Getting this optimization + * right avoids performance-hostile cache-to-cache transfers. + * + * Note that having two sets of task queues is also necessary for + * correctness: if all of the issue threads get bogged down waiting + * for dependent reads (e.g. metaslab freelist) to complete, then + * there won't be any threads available to service I/O completion + * interrupts. + */ + if ((1U << zio->io_stage) & zio->io_async_stages) { + if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) + tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; + else + tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; + (void) taskq_dispatch(tq, + (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); + } else { + zio_pipeline[zio->io_stage](zio); + } +} + +static boolean_t +zio_alloc_should_fail(void) +{ + static uint16_t allocs = 0; + + return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0); +} + +/* + * Try to allocate an intent log block. Return 0 on success, errno on failure. + */ +int +zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, + uint64_t txg) +{ + int error; + + spa_config_enter(spa, RW_READER, FTAG); + + if (zio_zil_fail_shift && zio_alloc_should_fail()) { + spa_config_exit(spa, FTAG); + return (ENOSPC); + } + + /* + * We were passed the previous log blocks dva_t in bp->blk_dva[0]. + */ + error = metaslab_alloc(spa, size, new_bp, 1, txg, old_bp, B_TRUE); + + if (error == 0) { + BP_SET_LSIZE(new_bp, size); + BP_SET_PSIZE(new_bp, size); + BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); + BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); + BP_SET_LEVEL(new_bp, 0); + BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); + new_bp->blk_birth = txg; + } + + spa_config_exit(spa, FTAG); + + return (error); +} + +/* + * Free an intent log block. We know it can't be a gang block, so there's + * nothing to do except metaslab_free() it. + */ +void +zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) +{ + ASSERT(!BP_IS_GANG(bp)); + + spa_config_enter(spa, RW_READER, FTAG); + + metaslab_free(spa, bp, txg, B_FALSE); + + spa_config_exit(spa, FTAG); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c new file mode 100644 index 0000000..f0d9a14 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c @@ -0,0 +1,172 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> + +/* + * Checksum vectors. + * + * In the SPA, everything is checksummed. We support checksum vectors + * for three distinct reasons: + * + * 1. Different kinds of data need different levels of protection. + * For SPA metadata, we always want a very strong checksum. + * For user data, we let users make the trade-off between speed + * and checksum strength. + * + * 2. Cryptographic hash and MAC algorithms are an area of active research. + * It is likely that in future hash functions will be at least as strong + * as current best-of-breed, and may be substantially faster as well. + * We want the ability to take advantage of these new hashes as soon as + * they become available. + * + * 3. If someone develops hardware that can compute a strong hash quickly, + * we want the ability to take advantage of that hardware. + * + * Of course, we don't want a checksum upgrade to invalidate existing + * data, so we store the checksum *function* in five bits of the DVA. + * This gives us room for up to 32 different checksum functions. + * + * When writing a block, we always checksum it with the latest-and-greatest + * checksum function of the appropriate strength. When reading a block, + * we compare the expected checksum against the actual checksum, which we + * compute via the checksum function specified in the DVA encoding. + */ + +/*ARGSUSED*/ +static void +zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); +} + +zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { + {{NULL, NULL}, 0, 0, "inherit"}, + {{NULL, NULL}, 0, 0, "on"}, + {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"}, + {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, "zilog"}, + {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"}, + {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"}, +}; + +uint8_t +zio_checksum_select(uint8_t child, uint8_t parent) +{ + ASSERT(child < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); + + if (child == ZIO_CHECKSUM_INHERIT) + return (parent); + + if (child == ZIO_CHECKSUM_ON) + return (ZIO_CHECKSUM_ON_VALUE); + + return (child); +} + +/* + * Generate the checksum. + */ +void +zio_checksum(uint_t checksum, zio_cksum_t *zcp, void *data, uint64_t size) +{ + zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + zio_cksum_t zbt_cksum; + + ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(ci->ci_func[0] != NULL); + + if (ci->ci_zbt) { + *zcp = zbt->zbt_cksum; + zbt->zbt_magic = ZBT_MAGIC; + ci->ci_func[0](data, size, &zbt_cksum); + zbt->zbt_cksum = zbt_cksum; + } else { + ci->ci_func[0](data, size, zcp); + } +} + +int +zio_checksum_error(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + zio_cksum_t zc = bp->blk_cksum; + uint_t checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : + BP_GET_CHECKSUM(bp); + int byteswap = BP_SHOULD_BYTESWAP(bp); + void *data = zio->io_data; + uint64_t size = ZIO_GET_IOSIZE(zio); + zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + zio_cksum_t actual_cksum, expected_cksum; + + if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) + return (EINVAL); + + if (ci->ci_zbt) { + if (checksum == ZIO_CHECKSUM_GANG_HEADER) + zio_set_gang_verifier(zio, &zc); + + if (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)) { + expected_cksum = zbt->zbt_cksum; + byteswap_uint64_array(&expected_cksum, + sizeof (zio_cksum_t)); + zbt->zbt_cksum = zc; + byteswap_uint64_array(&zbt->zbt_cksum, + sizeof (zio_cksum_t)); + ci->ci_func[1](data, size, &actual_cksum); + zbt->zbt_cksum = expected_cksum; + byteswap_uint64_array(&zbt->zbt_cksum, + sizeof (zio_cksum_t)); + } else { + expected_cksum = zbt->zbt_cksum; + zbt->zbt_cksum = zc; + ci->ci_func[0](data, size, &actual_cksum); + zbt->zbt_cksum = expected_cksum; + } + zc = expected_cksum; + } else { + ASSERT(!BP_IS_GANG(bp)); + ci->ci_func[byteswap](data, size, &actual_cksum); + } + + if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) + return (ECKSUM); + + if (zio_injection_enabled && !zio->io_error) + return (zio_handle_fault_injection(zio, ECKSUM)); + + return (0); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c new file mode 100644 index 0000000..c563be4 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c @@ -0,0 +1,148 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/compress.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/zio_compress.h> + +/* + * Compression vectors. + */ + +zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { + {NULL, NULL, 0, "inherit"}, + {NULL, NULL, 0, "on"}, + {NULL, NULL, 0, "uncompressed"}, + {lzjb_compress, lzjb_decompress, 0, "lzjb"}, + {NULL, NULL, 0, "empty"}, + {gzip_compress, gzip_decompress, 1, "gzip-1"}, + {gzip_compress, gzip_decompress, 2, "gzip-2"}, + {gzip_compress, gzip_decompress, 3, "gzip-3"}, + {gzip_compress, gzip_decompress, 4, "gzip-4"}, + {gzip_compress, gzip_decompress, 5, "gzip-5"}, + {gzip_compress, gzip_decompress, 6, "gzip-6"}, + {gzip_compress, gzip_decompress, 7, "gzip-7"}, + {gzip_compress, gzip_decompress, 8, "gzip-8"}, + {gzip_compress, gzip_decompress, 9, "gzip-9"}, +}; + +uint8_t +zio_compress_select(uint8_t child, uint8_t parent) +{ + ASSERT(child < ZIO_COMPRESS_FUNCTIONS); + ASSERT(parent < ZIO_COMPRESS_FUNCTIONS); + ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON); + + if (child == ZIO_COMPRESS_INHERIT) + return (parent); + + if (child == ZIO_COMPRESS_ON) + return (ZIO_COMPRESS_ON_VALUE); + + return (child); +} + +int +zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp, + uint64_t *destsizep, uint64_t *destbufsizep) +{ + uint64_t *word, *word_end; + uint64_t ciosize, gapsize, destbufsize; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + char *dest; + uint_t allzero; + + ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); + ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); + + /* + * If the data is all zeroes, we don't even need to allocate + * a block for it. We indicate this by setting *destsizep = 0. + */ + allzero = 1; + word = src; + word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize); + while (word < word_end) { + if (*word++ != 0) { + allzero = 0; + break; + } + } + if (allzero) { + *destp = NULL; + *destsizep = 0; + *destbufsizep = 0; + return (1); + } + + if (cpfunc == ZIO_COMPRESS_EMPTY) + return (0); + + /* Compress at least 12.5% */ + destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE); + if (destbufsize == 0) + return (0); + dest = zio_buf_alloc(destbufsize); + + ciosize = ci->ci_compress(src, dest, (size_t)srcsize, + (size_t)destbufsize, ci->ci_level); + if (ciosize > destbufsize) { + zio_buf_free(dest, destbufsize); + return (0); + } + + /* Cool. We compressed at least as much as we were hoping to. */ + + /* For security, make sure we don't write random heap crap to disk */ + gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize; + if (gapsize != 0) { + bzero(dest + ciosize, gapsize); + ciosize += gapsize; + } + + ASSERT3U(ciosize, <=, destbufsize); + ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0); + *destp = dest; + *destsizep = ciosize; + *destbufsizep = destbufsize; + + return (1); +} + +int +zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, + void *dest, uint64_t destsize) +{ + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + + ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); + + return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c new file mode 100644 index 0000000..4cada09 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c @@ -0,0 +1,315 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ZFS fault injection + * + * To handle fault injection, we keep track of a series of zinject_record_t + * structures which describe which logical block(s) should be injected with a + * fault. These are kept in a global list. Each record corresponds to a given + * spa_t and maintains a special hold on the spa_t so that it cannot be deleted + * or exported while the injection record exists. + * + * Device level injection is done using the 'zi_guid' field. If this is set, it + * means that the error is destined for a particular device, not a piece of + * data. + * + * This is a rather poor data structure and algorithm, but we don't expect more + * than a few faults at any one time, so it should be sufficient for our needs. + */ + +#include <sys/arc.h> +#include <sys/zio_impl.h> +#include <sys/zfs_ioctl.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> + +uint32_t zio_injection_enabled; + +typedef struct inject_handler { + int zi_id; + spa_t *zi_spa; + zinject_record_t zi_record; + list_node_t zi_link; +} inject_handler_t; + +static list_t inject_handlers; +static krwlock_t inject_lock; +static int inject_next_id = 1; + +/* + * Returns true if the given record matches the I/O in progress. + */ +static boolean_t +zio_match_handler(zbookmark_t *zb, uint64_t type, + zinject_record_t *record, int error) +{ + /* + * Check for a match against the MOS, which is based on type + */ + if (zb->zb_objset == 0 && record->zi_objset == 0 && + record->zi_object == 0) { + if (record->zi_type == DMU_OT_NONE || + type == record->zi_type) + return (record->zi_freq == 0 || + spa_get_random(100) < record->zi_freq); + else + return (B_FALSE); + } + + /* + * Check for an exact match. + */ + if (zb->zb_objset == record->zi_objset && + zb->zb_object == record->zi_object && + zb->zb_level == record->zi_level && + zb->zb_blkid >= record->zi_start && + zb->zb_blkid <= record->zi_end && + error == record->zi_error) + return (record->zi_freq == 0 || + spa_get_random(100) < record->zi_freq); + + return (B_FALSE); +} + +/* + * Determine if the I/O in question should return failure. Returns the errno + * to be returned to the caller. + */ +int +zio_handle_fault_injection(zio_t *zio, int error) +{ + int ret = 0; + inject_handler_t *handler; + + /* + * Ignore I/O not associated with any logical data. + */ + if (zio->io_logical == NULL) + return (0); + + /* + * Currently, we only support fault injection on reads. + */ + if (zio->io_type != ZIO_TYPE_READ) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + /* Ignore errors not destined for this pool */ + if (zio->io_spa != handler->zi_spa) + continue; + + /* Ignore device errors */ + if (handler->zi_record.zi_guid != 0) + continue; + + /* If this handler matches, return EIO */ + if (zio_match_handler(&zio->io_logical->io_bookmark, + zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, + &handler->zi_record, error)) { + ret = error; + break; + } + } + + rw_exit(&inject_lock); + + return (ret); +} + +int +zio_handle_device_injection(vdev_t *vd, int error) +{ + inject_handler_t *handler; + int ret = 0; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (vd->vdev_guid == handler->zi_record.zi_guid) { + if (handler->zi_record.zi_error == error) { + /* + * For a failed open, pretend like the device + * has gone away. + */ + if (error == ENXIO) + vd->vdev_stat.vs_aux = + VDEV_AUX_OPEN_FAILED; + ret = error; + break; + } + if (handler->zi_record.zi_error == ENXIO) { + ret = EIO; + break; + } + } + } + + rw_exit(&inject_lock); + + return (ret); +} + +/* + * Create a new handler for the given record. We add it to the list, adding + * a reference to the spa_t in the process. We increment zio_injection_enabled, + * which is the switch to trigger all fault injection. + */ +int +zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) +{ + inject_handler_t *handler; + int error; + spa_t *spa; + + /* + * If this is pool-wide metadata, make sure we unload the corresponding + * spa_t, so that the next attempt to load it will trigger the fault. + * We call spa_reset() to unload the pool appropriately. + */ + if (flags & ZINJECT_UNLOAD_SPA) + if ((error = spa_reset(name)) != 0) + return (error); + + if (!(flags & ZINJECT_NULL)) { + /* + * spa_inject_ref() will add an injection reference, which will + * prevent the pool from being removed from the namespace while + * still allowing it to be unloaded. + */ + if ((spa = spa_inject_addref(name)) == NULL) + return (ENOENT); + + handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); + + rw_enter(&inject_lock, RW_WRITER); + + *id = handler->zi_id = inject_next_id++; + handler->zi_spa = spa; + handler->zi_record = *record; + list_insert_tail(&inject_handlers, handler); + atomic_add_32(&zio_injection_enabled, 1); + + rw_exit(&inject_lock); + } + + /* + * Flush the ARC, so that any attempts to read this data will end up + * going to the ZIO layer. Note that this is a little overkill, but + * we don't have the necessary ARC interfaces to do anything else, and + * fault injection isn't a performance critical path. + */ + if (flags & ZINJECT_FLUSH_ARC) + arc_flush(); + + return (0); +} + +/* + * Returns the next record with an ID greater than that supplied to the + * function. Used to iterate over all handlers in the system. + */ +int +zio_inject_list_next(int *id, char *name, size_t buflen, + zinject_record_t *record) +{ + inject_handler_t *handler; + int ret; + + mutex_enter(&spa_namespace_lock); + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) + if (handler->zi_id > *id) + break; + + if (handler) { + *record = handler->zi_record; + *id = handler->zi_id; + (void) strncpy(name, spa_name(handler->zi_spa), buflen); + ret = 0; + } else { + ret = ENOENT; + } + + rw_exit(&inject_lock); + mutex_exit(&spa_namespace_lock); + + return (ret); +} + +/* + * Clear the fault handler with the given identifier, or return ENOENT if none + * exists. + */ +int +zio_clear_fault(int id) +{ + inject_handler_t *handler; + int ret; + + rw_enter(&inject_lock, RW_WRITER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) + if (handler->zi_id == id) + break; + + if (handler == NULL) { + ret = ENOENT; + } else { + list_remove(&inject_handlers, handler); + spa_inject_delref(handler->zi_spa); + kmem_free(handler, sizeof (inject_handler_t)); + atomic_add_32(&zio_injection_enabled, -1); + ret = 0; + } + + rw_exit(&inject_lock); + + return (ret); +} + +void +zio_inject_init(void) +{ + list_create(&inject_handlers, sizeof (inject_handler_t), + offsetof(inject_handler_t, zi_link)); +} + +void +zio_inject_fini(void) +{ + list_destroy(&inject_handlers); +} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c new file mode 100644 index 0000000..9aa8b7f --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -0,0 +1,796 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ZFS volume emulation driver. + * + * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. + * Volumes are accessed through the symbolic links named: + * + * /dev/zvol/dsk/<pool_name>/<dataset_name> + * /dev/zvol/rdsk/<pool_name>/<dataset_name> + * + * These links are created by the ZFS-specific devfsadm link generator. + * Volumes are persistent through reboot. No user command needs to be + * run before opening and using a device. + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/errno.h> +#include <sys/uio.h> +#include <sys/bio.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/zap.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dsl_prop.h> +#include <sys/byteorder.h> +#include <sys/dirent.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_ioctl.h> +#include <sys/zil.h> +#include <sys/refcount.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_rlock.h> +#include <geom/geom.h> + +#include "zfs_namecheck.h" + +struct g_class zfs_zvol_class = { + .name = "ZFS::ZVOL", + .version = G_VERSION, +}; + +DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); + +#define ZVOL_OBJ 1ULL +#define ZVOL_ZAP_OBJ 2ULL + +static uint32_t zvol_minors; + +/* + * The in-core state of each volume. + */ +typedef struct zvol_state { + char zv_name[MAXPATHLEN]; /* pool/dd name */ + uint64_t zv_volsize; /* amount of space we advertise */ + uint64_t zv_volblocksize; /* volume block size */ + struct g_provider *zv_provider; /* GEOM provider */ + uint8_t zv_min_bs; /* minimum addressable block shift */ + uint8_t zv_readonly; /* hard readonly; like write-protect */ + objset_t *zv_objset; /* objset handle */ + uint32_t zv_mode; /* DS_MODE_* flags at open time */ + uint32_t zv_total_opens; /* total open count */ + zilog_t *zv_zilog; /* ZIL handle */ + uint64_t zv_txg_assign; /* txg to assign during ZIL replay */ + znode_t zv_znode; /* for range locking */ + int zv_state; + struct bio_queue_head zv_queue; + struct mtx zv_queue_mtx; /* zv_queue mutex */ +} zvol_state_t; + +/* + * zvol maximum transfer in one DMU tx. + */ +int zvol_maxphys = DMU_MAX_ACCESS/2; + +static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio); + +int +zvol_check_volsize(uint64_t volsize, uint64_t blocksize) +{ + if (volsize == 0) + return (EINVAL); + + if (volsize % blocksize != 0) + return (EINVAL); + +#ifdef _ILP32 + if (volsize - 1 > SPEC_MAXOFFSET_T) + return (EOVERFLOW); +#endif + return (0); +} + +int +zvol_check_volblocksize(uint64_t volblocksize) +{ + if (volblocksize < SPA_MINBLOCKSIZE || + volblocksize > SPA_MAXBLOCKSIZE || + !ISP2(volblocksize)) + return (EDOM); + + return (0); +} + +static void +zvol_readonly_changed_cb(void *arg, uint64_t newval) +{ + zvol_state_t *zv = arg; + + zv->zv_readonly = (uint8_t)newval; +} + +int +zvol_get_stats(objset_t *os, nvlist_t *nv) +{ + int error; + dmu_object_info_t doi; + uint64_t val; + + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); + if (error) + return (error); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); + + error = dmu_object_info(os, ZVOL_OBJ, &doi); + + if (error == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, + doi.doi_data_block_size); + } + + return (error); +} + +static zvol_state_t * +zvol_minor_lookup(const char *name) +{ + struct g_provider *pp; + struct g_geom *gp; + + g_topology_assert(); + + LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) { + LIST_FOREACH(pp, &gp->provider, provider) { + if (strcmp(pp->name + sizeof(ZVOL_DEV_DIR), name) == 0) + return (pp->private); + } + } + + return (NULL); +} + +static int +zvol_access(struct g_provider *pp, int acr, int acw, int ace) +{ + zvol_state_t *zv; + + g_topology_assert(); + + zv = pp->private; + if (zv == NULL) { + if (acr <= 0 && acw <= 0 && ace <= 0) + return (0); + return (pp->error); + } + + ASSERT(zv->zv_objset != NULL); + + if (acw > 0 && (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY))) + return (EROFS); + + zv->zv_total_opens += acr + acw + ace; + + return (0); +} + +/* + * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. + * + * We store data in the log buffers if it's small enough. + * Otherwise we will later flush the data out via dmu_sync(). + */ +ssize_t zvol_immediate_write_sz = 32768; + +static void +zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) +{ + uint32_t blocksize = zv->zv_volblocksize; + lr_write_t *lr; + + while (len) { + ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); + itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + + itx->itx_wr_state = + len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY; + itx->itx_private = zv; + lr = (lr_write_t *)&itx->itx_lr; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = off; + lr->lr_length = nbytes; + lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t); + BP_ZERO(&lr->lr_blkptr); + + (void) zil_itx_assign(zv->zv_zilog, itx, tx); + len -= nbytes; + off += nbytes; + } +} + +static void +zvol_start(struct bio *bp) +{ + zvol_state_t *zv; + + switch (bp->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + case BIO_FLUSH: + zv = bp->bio_to->private; + ASSERT(zv != NULL); + mtx_lock(&zv->zv_queue_mtx); + bioq_insert_tail(&zv->zv_queue, bp); + wakeup_one(&zv->zv_queue); + mtx_unlock(&zv->zv_queue_mtx); + break; + case BIO_DELETE: + case BIO_GETATTR: + default: + g_io_deliver(bp, EOPNOTSUPP); + break; + } +} + +static void +zvol_serve_one(zvol_state_t *zv, struct bio *bp) +{ + uint64_t off, volsize; + size_t size, resid; + char *addr; + objset_t *os; + rl_t *rl; + int error = 0; + boolean_t reading; + + off = bp->bio_offset; + volsize = zv->zv_volsize; + + os = zv->zv_objset; + ASSERT(os != NULL); + + addr = bp->bio_data; + resid = bp->bio_length; + + error = 0; + + /* + * There must be no buffer changes when doing a dmu_sync() because + * we can't change the data whilst calculating the checksum. + * A better approach than a per zvol rwlock would be to lock ranges. + */ + reading = (bp->bio_cmd == BIO_READ); + rl = zfs_range_lock(&zv->zv_znode, off, resid, + reading ? RL_READER : RL_WRITER); + + while (resid != 0 && off < volsize) { + + size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */ + + if (size > volsize - off) /* don't write past the end */ + size = volsize - off; + + if (reading) { + error = dmu_read(os, ZVOL_OBJ, off, size, addr); + } else { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_write(os, ZVOL_OBJ, off, size, addr, tx); + zvol_log_write(zv, tx, off, size); + dmu_tx_commit(tx); + } + } + if (error) + break; + off += size; + addr += size; + resid -= size; + } + zfs_range_unlock(rl); + + bp->bio_completed = bp->bio_length - resid; + if (bp->bio_completed < bp->bio_length) + bp->bio_error = (off > volsize ? EINVAL : error); + + /* + * XXX: We are devilering here? + * Looks like I don't understand something here, but I was sure it was + * an async request. + */ + g_io_deliver(bp, bp->bio_error); +} + +static void +zvol_worker(void *arg) +{ + zvol_state_t *zv; + struct bio *bp; + + zv = arg; + for (;;) { + mtx_lock(&zv->zv_queue_mtx); + bp = bioq_takefirst(&zv->zv_queue); + if (bp == NULL) { + if (zv->zv_state == 1) { + zv->zv_state = 2; + wakeup(&zv->zv_state); + mtx_unlock(&zv->zv_queue_mtx); + kthread_exit(0); + } + msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP, + "zvol:io", 0); + continue; + } + mtx_unlock(&zv->zv_queue_mtx); + if (bp->bio_cmd == BIO_FLUSH) { + zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); + g_io_deliver(bp, 0); + } else { + zvol_serve_one(zv, bp); + } + } +} + +void +zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx) +{ + zfs_create_data_t *zc = arg; + int error; + uint64_t volblocksize, volsize; + + VERIFY(nvlist_lookup_uint64(zc->zc_props, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); + if (nvlist_lookup_uint64(zc->zc_props, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) + volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); + + /* + * These properites must be removed from the list so the generic + * property setting step won't apply to them. + */ + VERIFY(nvlist_remove_all(zc->zc_props, + zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); + (void) nvlist_remove_all(zc->zc_props, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); + + error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); + ASSERT(error == 0); +} + +/* + * Replay a TX_WRITE ZIL transaction that didn't get committed + * after a system failure + */ +static int +zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) +{ + objset_t *os = zv->zv_objset; + char *data = (char *)(lr + 1); /* data follows lr_write_t */ + uint64_t off = lr->lr_offset; + uint64_t len = lr->lr_length; + dmu_tx_t *tx; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); + error = dmu_tx_assign(tx, zv->zv_txg_assign); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_write(os, ZVOL_OBJ, off, len, data, tx); + dmu_tx_commit(tx); + } + + return (error); +} + +/* ARGSUSED */ +static int +zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap) +{ + return (ENOTSUP); +} + +/* + * Callback vectors for replaying records. + * Only TX_WRITE is needed for zvol. + */ +zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { + zvol_replay_err, /* 0 no such transaction type */ + zvol_replay_err, /* TX_CREATE */ + zvol_replay_err, /* TX_MKDIR */ + zvol_replay_err, /* TX_MKXATTR */ + zvol_replay_err, /* TX_SYMLINK */ + zvol_replay_err, /* TX_REMOVE */ + zvol_replay_err, /* TX_RMDIR */ + zvol_replay_err, /* TX_LINK */ + zvol_replay_err, /* TX_RENAME */ + zvol_replay_write, /* TX_WRITE */ + zvol_replay_err, /* TX_TRUNCATE */ + zvol_replay_err, /* TX_SETATTR */ + zvol_replay_err, /* TX_ACL */ +}; + +/* + * Create a minor node for the specified volume. + */ +int +zvol_create_minor(const char *name, dev_t dev) +{ + struct g_provider *pp; + struct g_geom *gp; + zvol_state_t *zv; + objset_t *os; + dmu_object_info_t doi; + uint64_t volsize; + int ds_mode = DS_MODE_PRIMARY; + int error; + + DROP_GIANT(); + g_topology_lock(); + + if ((zv = zvol_minor_lookup(name)) != NULL) { + error = EEXIST; + goto end; + } + + if (strchr(name, '@') != 0) + ds_mode |= DS_MODE_READONLY; + + error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os); + if (error) + goto end; + + g_topology_unlock(); + PICKUP_GIANT(); + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + DROP_GIANT(); + g_topology_lock(); + if (error) { + dmu_objset_close(os); + goto end; + } + + gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); + gp->start = zvol_start; + gp->access = zvol_access; + pp = g_new_providerf(gp, "%s/%s", ZVOL_DEV_DIR, name); + pp->mediasize = volsize; + pp->sectorsize = DEV_BSIZE; + + zv = kmem_zalloc(sizeof(*zv), KM_SLEEP); + (void) strcpy(zv->zv_name, name); + zv->zv_min_bs = DEV_BSHIFT; + zv->zv_provider = pp; + zv->zv_volsize = pp->mediasize; + zv->zv_objset = os; + zv->zv_mode = ds_mode; + zv->zv_zilog = zil_open(os, zvol_get_data); + mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, + sizeof (rl_t), offsetof(rl_t, r_node)); + + + /* get and cache the blocksize */ + error = dmu_object_info(os, ZVOL_OBJ, &doi); + ASSERT(error == 0); + zv->zv_volblocksize = doi.doi_data_block_size; + + zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector); + + /* XXX this should handle the possible i/o error */ + VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset), + "readonly", zvol_readonly_changed_cb, zv) == 0); + + pp->private = zv; + g_error_provider(pp, 0); + + bioq_init(&zv->zv_queue); + mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF); + zv->zv_state = 0; + kthread_create(zvol_worker, zv, NULL, 0, 0, "zvol:worker %s", pp->name); + + zvol_minors++; +end: + g_topology_unlock(); + PICKUP_GIANT(); + + return (error); +} + +/* + * Remove minor node for the specified volume. + */ +int +zvol_remove_minor(const char *name) +{ + struct g_provider *pp; + zvol_state_t *zv; + int error = 0; + + DROP_GIANT(); + g_topology_lock(); + + if ((zv = zvol_minor_lookup(name)) == NULL) { + error = ENXIO; + goto end; + } + + if (zv->zv_total_opens != 0) { + error = EBUSY; + goto end; + } + + VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset), + "readonly", zvol_readonly_changed_cb, zv) == 0); + + mtx_lock(&zv->zv_queue_mtx); + zv->zv_state = 1; + wakeup_one(&zv->zv_queue); + while (zv->zv_state != 2) + msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0); + mtx_unlock(&zv->zv_queue_mtx); + mtx_destroy(&zv->zv_queue_mtx); + + pp = zv->zv_provider; + pp->private = NULL; + g_wither_geom(pp->geom, ENXIO); + + zil_close(zv->zv_zilog); + zv->zv_zilog = NULL; + dmu_objset_close(zv->zv_objset); + zv->zv_objset = NULL; + avl_destroy(&zv->zv_znode.z_range_avl); + mutex_destroy(&zv->zv_znode.z_range_lock); + + kmem_free(zv, sizeof(*zv)); + + zvol_minors--; +end: + g_topology_unlock(); + PICKUP_GIANT(); + + return (error); +} + +int +zvol_set_volsize(const char *name, dev_t dev, uint64_t volsize) +{ + zvol_state_t *zv; + dmu_tx_t *tx; + int error; + dmu_object_info_t doi; + + DROP_GIANT(); + g_topology_lock(); + + if ((zv = zvol_minor_lookup(name)) == NULL) { + error = ENXIO; + goto end; + } + + if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 || + (error = zvol_check_volsize(volsize, + doi.doi_data_block_size)) != 0) { + goto end; + } + + if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) { + error = EROFS; + goto end; + } + + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto end; + } + + error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1, + &volsize, tx); + if (error == 0) { + error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, volsize, + DMU_OBJECT_END, tx); + } + + dmu_tx_commit(tx); + + if (error == 0) { + zv->zv_volsize = volsize; + zv->zv_provider->mediasize = volsize; /* XXX: Not supported. */ + } +end: + g_topology_unlock(); + PICKUP_GIANT(); + + return (error); +} + +int +zvol_set_volblocksize(const char *name, uint64_t volblocksize) +{ + zvol_state_t *zv; + dmu_tx_t *tx; + int error; + + DROP_GIANT(); + g_topology_lock(); + + if ((zv = zvol_minor_lookup(name)) == NULL) { + error = ENXIO; + goto end; + } + + if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) { + error = EROFS; + goto end; + } + + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_bonus(tx, ZVOL_OBJ); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, + volblocksize, 0, tx); + if (error == ENOTSUP) + error = EBUSY; + dmu_tx_commit(tx); + /* XXX: Not supported. */ +#if 0 + if (error == 0) + zv->zv_provider->sectorsize = zc->zc_volblocksize; +#endif + } +end: + g_topology_unlock(); + PICKUP_GIANT(); + + return (error); +} + +void +zvol_get_done(dmu_buf_t *db, void *vzgd) +{ + zgd_t *zgd = (zgd_t *)vzgd; + rl_t *rl = zgd->zgd_rl; + + dmu_buf_rele(db, vzgd); + zfs_range_unlock(rl); + zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp))); + kmem_free(zgd, sizeof (zgd_t)); +} + +/* + * Get data to generate a TX_WRITE intent log record. + */ +static int +zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) +{ + zvol_state_t *zv = arg; + objset_t *os = zv->zv_objset; + dmu_buf_t *db; + rl_t *rl; + zgd_t *zgd; + uint64_t boff; /* block starting offset */ + int dlen = lr->lr_length; /* length of user data */ + int error; + + ASSERT(zio); + ASSERT(dlen != 0); + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. + */ + if (buf != NULL) /* immediate write */ + return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf)); + + zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_zilog = zv->zv_zilog; + zgd->zgd_bp = &lr->lr_blkptr; + + /* + * Lock the range of the block to ensure that when the data is + * written out and it's checksum is being calculated that no other + * thread can change the block. + */ + boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t); + rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize, + RL_READER); + zgd->zgd_rl = rl; + + VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db)); + error = dmu_sync(zio, db, &lr->lr_blkptr, + lr->lr_common.lrc_txg, zvol_get_done, zgd); + if (error == 0) + zil_add_vdev(zv->zv_zilog, + DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr))); + /* + * If we get EINPROGRESS, then we need to wait for a + * write IO initiated by dmu_sync() to complete before + * we can release this dbuf. We will finish everything + * up in the zvol_get_done() callback. + */ + if (error == EINPROGRESS) + return (0); + dmu_buf_rele(db, zgd); + zfs_range_unlock(rl); + kmem_free(zgd, sizeof (zgd_t)); + return (error); +} + +int +zvol_busy(void) +{ + return (zvol_minors != 0); +} + +void +zvol_init(void) +{ + ZFS_LOG(1, "ZVOL Initialized."); +} + +void +zvol_fini(void) +{ + ZFS_LOG(1, "ZVOL Deinitialized."); +} diff --git a/sys/contrib/opensolaris/uts/common/os/callb.c b/sys/contrib/opensolaris/uts/common/os/callb.c new file mode 100644 index 0000000..9bfae13 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/os/callb.c @@ -0,0 +1,363 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/mutex.h> +#include <sys/condvar.h> +#include <sys/callb.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/kobj.h> +#include <sys/systm.h> /* for delay() */ +#include <sys/taskq.h> /* For TASKQ_NAMELEN */ +#include <sys/kernel.h> + +#define CB_MAXNAME TASKQ_NAMELEN + +/* + * The callb mechanism provides generic event scheduling/echoing. + * A callb function is registered and called on behalf of the event. + */ +typedef struct callb { + struct callb *c_next; /* next in class or on freelist */ + kthread_id_t c_thread; /* ptr to caller's thread struct */ + char c_flag; /* info about the callb state */ + uchar_t c_class; /* this callb's class */ + kcondvar_t c_done_cv; /* signal callb completion */ + boolean_t (*c_func)(); /* cb function: returns true if ok */ + void *c_arg; /* arg to c_func */ + char c_name[CB_MAXNAME+1]; /* debug:max func name length */ +} callb_t; + +/* + * callb c_flag bitmap definitions + */ +#define CALLB_FREE 0x0 +#define CALLB_TAKEN 0x1 +#define CALLB_EXECUTING 0x2 + +/* + * Basic structure for a callb table. + * All callbs are organized into different class groups described + * by ct_class array. + * The callbs within a class are single-linked and normally run by a + * serial execution. + */ +typedef struct callb_table { + kmutex_t ct_lock; /* protect all callb states */ + callb_t *ct_freelist; /* free callb structures */ + int ct_busy; /* != 0 prevents additions */ + kcondvar_t ct_busy_cv; /* to wait for not busy */ + int ct_ncallb; /* num of callbs allocated */ + callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */ +} callb_table_t; + +int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC; + +static callb_id_t callb_add_common(boolean_t (*)(void *, int), + void *, int, char *, kthread_id_t); + +static callb_table_t callb_table; /* system level callback table */ +static callb_table_t *ct = &callb_table; +static kmutex_t callb_safe_mutex; +callb_cpr_t callb_cprinfo_safe = { + &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, 0, 0 }; + +/* + * Init all callb tables in the system. + */ +void +callb_init(void *dummy __unused) +{ + callb_table.ct_busy = 0; /* mark table open for additions */ + mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +callb_fini(void *dummy __unused) +{ + callb_t *cp; + + mutex_enter(&ct->ct_lock); + while ((cp = ct->ct_freelist) != NULL) { + ct->ct_freelist = cp->c_next; + ct->ct_ncallb--; + kmem_free(cp, sizeof (callb_t)); + } + ASSERT(ct->ct_ncallb == 0); + mutex_exit(&ct->ct_lock); + mutex_destroy(&callb_safe_mutex); + mutex_destroy(&callb_table.ct_lock); +} + +/* + * callout_add() is called to register func() be called later. + */ +static callb_id_t +callb_add_common(boolean_t (*func)(void *arg, int code), + void *arg, int class, char *name, kthread_id_t t) +{ + callb_t *cp; + + ASSERT(class < NCBCLASS); + + mutex_enter(&ct->ct_lock); + while (ct->ct_busy) + cv_wait(&ct->ct_busy_cv, &ct->ct_lock); + if ((cp = ct->ct_freelist) == NULL) { + ct->ct_ncallb++; + cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP); + } + ct->ct_freelist = cp->c_next; + cp->c_thread = t; + cp->c_func = func; + cp->c_arg = arg; + cp->c_class = (uchar_t)class; + cp->c_flag |= CALLB_TAKEN; +#ifdef DEBUG + if (strlen(name) > CB_MAXNAME) + cmn_err(CE_WARN, "callb_add: name of callback function '%s' " + "too long -- truncated to %d chars", + name, CB_MAXNAME); +#endif + (void) strncpy(cp->c_name, name, CB_MAXNAME); + cp->c_name[CB_MAXNAME] = '\0'; + + /* + * Insert the new callb at the head of its class list. + */ + cp->c_next = ct->ct_first_cb[class]; + ct->ct_first_cb[class] = cp; + + mutex_exit(&ct->ct_lock); + return ((callb_id_t)cp); +} + +/* + * The default function to add an entry to the callback table. Since + * it uses curthread as the thread identifier to store in the table, + * it should be used for the normal case of a thread which is calling + * to add ITSELF to the table. + */ +callb_id_t +callb_add(boolean_t (*func)(void *arg, int code), + void *arg, int class, char *name) +{ + return (callb_add_common(func, arg, class, name, curthread)); +} + +/* + * A special version of callb_add() above for use by threads which + * might be adding an entry to the table on behalf of some other + * thread (for example, one which is constructed but not yet running). + * In this version the thread id is an argument. + */ +callb_id_t +callb_add_thread(boolean_t (*func)(void *arg, int code), + void *arg, int class, char *name, kthread_id_t t) +{ + return (callb_add_common(func, arg, class, name, t)); +} + +/* + * callout_delete() is called to remove an entry identified by id + * that was originally placed there by a call to callout_add(). + * return -1 if fail to delete a callb entry otherwise return 0. + */ +int +callb_delete(callb_id_t id) +{ + callb_t **pp; + callb_t *me = (callb_t *)id; + + mutex_enter(&ct->ct_lock); + + for (;;) { + pp = &ct->ct_first_cb[me->c_class]; + while (*pp != NULL && *pp != me) + pp = &(*pp)->c_next; + +#ifdef DEBUG + if (*pp != me) { + cmn_err(CE_WARN, "callb delete bogus entry 0x%p", + (void *)me); + mutex_exit(&ct->ct_lock); + return (-1); + } +#endif /* DEBUG */ + + /* + * It is not allowed to delete a callb in the middle of + * executing otherwise, the callb_execute() will be confused. + */ + if (!(me->c_flag & CALLB_EXECUTING)) + break; + + cv_wait(&me->c_done_cv, &ct->ct_lock); + } + /* relink the class list */ + *pp = me->c_next; + + /* clean up myself and return the free callb to the head of freelist */ + me->c_flag = CALLB_FREE; + me->c_next = ct->ct_freelist; + ct->ct_freelist = me; + + mutex_exit(&ct->ct_lock); + return (0); +} + +/* + * class: indicates to execute all callbs in the same class; + * code: optional argument for the callb functions. + * return: = 0: success + * != 0: ptr to string supplied when callback was registered + */ +void * +callb_execute_class(int class, int code) +{ + callb_t *cp; + void *ret = NULL; + + ASSERT(class < NCBCLASS); + + mutex_enter(&ct->ct_lock); + + for (cp = ct->ct_first_cb[class]; + cp != NULL && ret == 0; cp = cp->c_next) { + while (cp->c_flag & CALLB_EXECUTING) + cv_wait(&cp->c_done_cv, &ct->ct_lock); + /* + * cont if the callb is deleted while we're sleeping + */ + if (cp->c_flag == CALLB_FREE) + continue; + cp->c_flag |= CALLB_EXECUTING; + +#ifdef CALLB_DEBUG + printf("callb_execute: name=%s func=%p arg=%p\n", + cp->c_name, (void *)cp->c_func, (void *)cp->c_arg); +#endif /* CALLB_DEBUG */ + + mutex_exit(&ct->ct_lock); + /* If callback function fails, pass back client's name */ + if (!(*cp->c_func)(cp->c_arg, code)) + ret = cp->c_name; + mutex_enter(&ct->ct_lock); + + cp->c_flag &= ~CALLB_EXECUTING; + cv_broadcast(&cp->c_done_cv); + } + mutex_exit(&ct->ct_lock); + return (ret); +} + +/* + * callers make sure no recursive entries to this func. + * dp->cc_lockp is registered by callb_add to protect callb_cpr_t structure. + * + * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we + * use a cv_timedwait() in case the kernel thread is blocked. + * + * Note that this is a generic callback handler for daemon CPR and + * should NOT be changed to accommodate any specific requirement in a daemon. + * Individual daemons that require changes to the handler shall write + * callback routines in their own daemon modules. + */ +boolean_t +callb_generic_cpr(void *arg, int code) +{ + callb_cpr_t *cp = (callb_cpr_t *)arg; + clock_t ret = 0; /* assume success */ + + mutex_enter(cp->cc_lockp); + + switch (code) { + case CB_CODE_CPR_CHKPT: + cp->cc_events |= CALLB_CPR_START; + while (!(cp->cc_events & CALLB_CPR_SAFE)) + /* cv_timedwait() returns -1 if it times out. */ + if ((ret = cv_timedwait(&cp->cc_callb_cv, + cp->cc_lockp, + callb_timeout_sec * hz)) == -1) + break; + break; + + case CB_CODE_CPR_RESUME: + cp->cc_events &= ~CALLB_CPR_START; + cv_signal(&cp->cc_stop_cv); + break; + } + mutex_exit(cp->cc_lockp); + return (ret != -1); +} + +/* + * The generic callback function associated with kernel threads which + * are always considered safe. + */ +/* ARGSUSED */ +boolean_t +callb_generic_cpr_safe(void *arg, int code) +{ + return (B_TRUE); +} +/* + * Prevent additions to callback table. + */ +void +callb_lock_table(void) +{ + mutex_enter(&ct->ct_lock); + ASSERT(ct->ct_busy == 0); + ct->ct_busy = 1; + mutex_exit(&ct->ct_lock); +} + +/* + * Allow additions to callback table. + */ +void +callb_unlock_table(void) +{ + mutex_enter(&ct->ct_lock); + ASSERT(ct->ct_busy != 0); + ct->ct_busy = 0; + cv_broadcast(&ct->ct_busy_cv); + mutex_exit(&ct->ct_lock); +} + +SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL) +SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL); diff --git a/sys/contrib/opensolaris/uts/common/os/list.c b/sys/contrib/opensolaris/uts/common/os/list.c new file mode 100644 index 0000000..f9b6fcb --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/os/list.c @@ -0,0 +1,193 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Generic doubly-linked list implementation + */ + +#include <sys/list.h> +#include <sys/list_impl.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> + +#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) +#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) +#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head) + +#define list_insert_after_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_prev = node; \ + lnew->list_next = node->list_next; \ + node->list_next->list_prev = lnew; \ + node->list_next = lnew; \ +} + +#define list_insert_before_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_next = node; \ + lnew->list_prev = node->list_prev; \ + node->list_prev->list_next = lnew; \ + node->list_prev = lnew; \ +} + +void +list_create(list_t *list, size_t size, size_t offset) +{ + ASSERT(list); + ASSERT(size > 0); + ASSERT(size >= offset + sizeof (list_node_t)); + + list->list_size = size; + list->list_offset = offset; + list->list_head.list_next = list->list_head.list_prev = + &list->list_head; +} + +void +list_destroy(list_t *list) +{ + list_node_t *node = &list->list_head; + + ASSERT(list); + ASSERT(list->list_head.list_next == node); + ASSERT(list->list_head.list_prev == node); + + node->list_next = node->list_prev = NULL; +} + +void +list_insert_after(list_t *list, void *object, void *nobject) +{ + list_node_t *lold = list_d2l(list, object); + list_insert_after_node(list, lold, nobject); +} + +void +list_insert_before(list_t *list, void *object, void *nobject) +{ + list_node_t *lold = list_d2l(list, object); + list_insert_before_node(list, lold, nobject) +} + +void +list_insert_head(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_after_node(list, lold, object); +} + +void +list_insert_tail(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_before_node(list, lold, object); +} + +void +list_remove(list_t *list, void *object) +{ + list_node_t *lold = list_d2l(list, object); + ASSERT(!list_empty(list)); + lold->list_prev->list_next = lold->list_next; + lold->list_next->list_prev = lold->list_prev; + lold->list_next = lold->list_prev = NULL; +} + +void * +list_head(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_next)); +} + +void * +list_tail(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_prev)); +} + +void * +list_next(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->list_next != &list->list_head) + return (list_object(list, node->list_next)); + + return (NULL); +} + +void * +list_prev(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->list_prev != &list->list_head) + return (list_object(list, node->list_prev)); + + return (NULL); +} + +/* + * Insert src list after dst list. Empty src list thereafter. + */ +void +list_move_tail(list_t *dst, list_t *src) +{ + list_node_t *dstnode = &dst->list_head; + list_node_t *srcnode = &src->list_head; + + ASSERT(dst->list_size == src->list_size); + ASSERT(dst->list_offset == src->list_offset); + + if (list_empty(src)) + return; + + dstnode->list_prev->list_next = srcnode->list_next; + srcnode->list_next->list_prev = dstnode->list_prev; + dstnode->list_prev = srcnode->list_prev; + srcnode->list_prev->list_next = dstnode; + + /* empty src list */ + srcnode->list_next = srcnode->list_prev = srcnode; +} + +int +list_link_active(list_node_t *link) +{ + return (link->list_next != NULL); +} + +int +list_is_empty(list_t *list) +{ + return (list_empty(list)); +} diff --git a/sys/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c b/sys/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c new file mode 100644 index 0000000..3682853 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/nvpair.h> + +static void * +nv_alloc_sys(nv_alloc_t *nva, size_t size) +{ + return (kmem_alloc(size, (int)(uintptr_t)nva->nva_arg)); +} + +/*ARGSUSED*/ +static void +nv_free_sys(nv_alloc_t *nva, void *buf, size_t size) +{ + kmem_free(buf, size); +} + +static const nv_alloc_ops_t system_ops = { + NULL, /* nv_ao_init() */ + NULL, /* nv_ao_fini() */ + nv_alloc_sys, /* nv_ao_alloc() */ + nv_free_sys, /* nv_ao_free() */ + NULL /* nv_ao_reset() */ +}; + +nv_alloc_t nv_alloc_sleep_def = { + &system_ops, + (void *)KM_SLEEP +}; + +nv_alloc_t nv_alloc_nosleep_def = { + &system_ops, + (void *)KM_NOSLEEP +}; + +nv_alloc_t *nv_alloc_sleep = &nv_alloc_sleep_def; +nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def; diff --git a/sys/contrib/opensolaris/uts/common/os/taskq.c b/sys/contrib/opensolaris/uts/common/os/taskq.c new file mode 100644 index 0000000..efaea2e --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/os/taskq.c @@ -0,0 +1,1020 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Kernel task queues: general-purpose asynchronous task scheduling. + * + * A common problem in kernel programming is the need to schedule tasks + * to be performed later, by another thread. There are several reasons + * you may want or need to do this: + * + * (1) The task isn't time-critical, but your current code path is. + * + * (2) The task may require grabbing locks that you already hold. + * + * (3) The task may need to block (e.g. to wait for memory), but you + * cannot block in your current context. + * + * (4) Your code path can't complete because of some condition, but you can't + * sleep or fail, so you queue the task for later execution when condition + * disappears. + * + * (5) You just want a simple way to launch multiple tasks in parallel. + * + * Task queues provide such a facility. In its simplest form (used when + * performance is not a critical consideration) a task queue consists of a + * single list of tasks, together with one or more threads to service the + * list. There are some cases when this simple queue is not sufficient: + * + * (1) The task queues are very hot and there is a need to avoid data and lock + * contention over global resources. + * + * (2) Some tasks may depend on other tasks to complete, so they can't be put in + * the same list managed by the same thread. + * + * (3) Some tasks may block for a long time, and this should not block other + * tasks in the queue. + * + * To provide useful service in such cases we define a "dynamic task queue" + * which has an individual thread for each of the tasks. These threads are + * dynamically created as they are needed and destroyed when they are not in + * use. The API for managing task pools is the same as for managing task queues + * with the exception of a taskq creation flag TASKQ_DYNAMIC which tells that + * dynamic task pool behavior is desired. + * + * Dynamic task queues may also place tasks in the normal queue (called "backing + * queue") when task pool runs out of resources. Users of task queues may + * disallow such queued scheduling by specifying TQ_NOQUEUE in the dispatch + * flags. + * + * The backing task queue is also used for scheduling internal tasks needed for + * dynamic task queue maintenance. + * + * INTERFACES: + * + * taskq_t *taskq_create(name, nthreads, pri_t pri, minalloc, maxall, flags); + * + * Create a taskq with specified properties. + * Possible 'flags': + * + * TASKQ_DYNAMIC: Create task pool for task management. If this flag is + * specified, 'nthreads' specifies the maximum number of threads in + * the task queue. Task execution order for dynamic task queues is + * not predictable. + * + * If this flag is not specified (default case) a + * single-list task queue is created with 'nthreads' threads + * servicing it. Entries in this queue are managed by + * taskq_ent_alloc() and taskq_ent_free() which try to keep the + * task population between 'minalloc' and 'maxalloc', but the + * latter limit is only advisory for TQ_SLEEP dispatches and the + * former limit is only advisory for TQ_NOALLOC dispatches. If + * TASKQ_PREPOPULATE is set in 'flags', the taskq will be + * prepopulated with 'minalloc' task structures. + * + * Since non-DYNAMIC taskqs are queues, tasks are guaranteed to be + * executed in the order they are scheduled if nthreads == 1. + * If nthreads > 1, task execution order is not predictable. + * + * TASKQ_PREPOPULATE: Prepopulate task queue with threads. + * Also prepopulate the task queue with 'minalloc' task structures. + * + * TASKQ_CPR_SAFE: This flag specifies that users of the task queue will + * use their own protocol for handling CPR issues. This flag is not + * supported for DYNAMIC task queues. + * + * The 'pri' field specifies the default priority for the threads that + * service all scheduled tasks. + * + * void taskq_destroy(tap): + * + * Waits for any scheduled tasks to complete, then destroys the taskq. + * Caller should guarantee that no new tasks are scheduled in the closing + * taskq. + * + * taskqid_t taskq_dispatch(tq, func, arg, flags): + * + * Dispatches the task "func(arg)" to taskq. The 'flags' indicates whether + * the caller is willing to block for memory. The function returns an + * opaque value which is zero iff dispatch fails. If flags is TQ_NOSLEEP + * or TQ_NOALLOC and the task can't be dispatched, taskq_dispatch() fails + * and returns (taskqid_t)0. + * + * ASSUMES: func != NULL. + * + * Possible flags: + * TQ_NOSLEEP: Do not wait for resources; may fail. + * + * TQ_NOALLOC: Do not allocate memory; may fail. May only be used with + * non-dynamic task queues. + * + * TQ_NOQUEUE: Do not enqueue a task if it can't dispatch it due to + * lack of available resources and fail. If this flag is not + * set, and the task pool is exhausted, the task may be scheduled + * in the backing queue. This flag may ONLY be used with dynamic + * task queues. + * + * NOTE: This flag should always be used when a task queue is used + * for tasks that may depend on each other for completion. + * Enqueueing dependent tasks may create deadlocks. + * + * TQ_SLEEP: May block waiting for resources. May still fail for + * dynamic task queues if TQ_NOQUEUE is also specified, otherwise + * always succeed. + * + * NOTE: Dynamic task queues are much more likely to fail in + * taskq_dispatch() (especially if TQ_NOQUEUE was specified), so it + * is important to have backup strategies handling such failures. + * + * void taskq_wait(tq): + * + * Waits for all previously scheduled tasks to complete. + * + * NOTE: It does not stop any new task dispatches. + * Do NOT call taskq_wait() from a task: it will cause deadlock. + * + * void taskq_suspend(tq) + * + * Suspend all task execution. Tasks already scheduled for a dynamic task + * queue will still be executed, but all new scheduled tasks will be + * suspended until taskq_resume() is called. + * + * int taskq_suspended(tq) + * + * Returns 1 if taskq is suspended and 0 otherwise. It is intended to + * ASSERT that the task queue is suspended. + * + * void taskq_resume(tq) + * + * Resume task queue execution. + * + * int taskq_member(tq, thread) + * + * Returns 1 if 'thread' belongs to taskq 'tq' and 0 otherwise. The + * intended use is to ASSERT that a given function is called in taskq + * context only. + * + * system_taskq + * + * Global system-wide dynamic task queue for common uses. It may be used by + * any subsystem that needs to schedule tasks and does not need to manage + * its own task queues. It is initialized quite early during system boot. + * + * IMPLEMENTATION. + * + * This is schematic representation of the task queue structures. + * + * taskq: + * +-------------+ + * |tq_lock | +---< taskq_ent_free() + * +-------------+ | + * |... | | tqent: tqent: + * +-------------+ | +------------+ +------------+ + * | tq_freelist |-->| tqent_next |--> ... ->| tqent_next | + * +-------------+ +------------+ +------------+ + * |... | | ... | | ... | + * +-------------+ +------------+ +------------+ + * | tq_task | | + * | | +-------------->taskq_ent_alloc() + * +--------------------------------------------------------------------------+ + * | | | tqent tqent | + * | +---------------------+ +--> +------------+ +--> +------------+ | + * | | ... | | | func, arg | | | func, arg | | + * +>+---------------------+ <---|-+ +------------+ <---|-+ +------------+ | + * | tq_taskq.tqent_next | ----+ | | tqent_next | --->+ | | tqent_next |--+ + * +---------------------+ | +------------+ ^ | +------------+ + * +-| tq_task.tqent_prev | +--| tqent_prev | | +--| tqent_prev | ^ + * | +---------------------+ +------------+ | +------------+ | + * | |... | | ... | | | ... | | + * | +---------------------+ +------------+ | +------------+ | + * | ^ | | + * | | | | + * +--------------------------------------+--------------+ TQ_APPEND() -+ + * | | | + * |... | taskq_thread()-----+ + * +-------------+ + * | tq_buckets |--+-------> [ NULL ] (for regular task queues) + * +-------------+ | + * | DYNAMIC TASK QUEUES: + * | + * +-> taskq_bucket[nCPU] taskq_bucket_dispatch() + * +-------------------+ ^ + * +--->| tqbucket_lock | | + * | +-------------------+ +--------+ +--------+ + * | | tqbucket_freelist |-->| tqent |-->...| tqent | ^ + * | +-------------------+<--+--------+<--...+--------+ | + * | | ... | | thread | | thread | | + * | +-------------------+ +--------+ +--------+ | + * | +-------------------+ | + * taskq_dispatch()--+--->| tqbucket_lock | TQ_APPEND()------+ + * TQ_HASH() | +-------------------+ +--------+ +--------+ + * | | tqbucket_freelist |-->| tqent |-->...| tqent | + * | +-------------------+<--+--------+<--...+--------+ + * | | ... | | thread | | thread | + * | +-------------------+ +--------+ +--------+ + * +---> ... + * + * + * Task queues use tq_task field to link new entry in the queue. The queue is a + * circular doubly-linked list. Entries are put in the end of the list with + * TQ_APPEND() and processed from the front of the list by taskq_thread() in + * FIFO order. Task queue entries are cached in the free list managed by + * taskq_ent_alloc() and taskq_ent_free() functions. + * + * All threads used by task queues mark t_taskq field of the thread to + * point to the task queue. + * + * Dynamic Task Queues Implementation. + * + * For a dynamic task queues there is a 1-to-1 mapping between a thread and + * taskq_ent_structure. Each entry is serviced by its own thread and each thread + * is controlled by a single entry. + * + * Entries are distributed over a set of buckets. To avoid using modulo + * arithmetics the number of buckets is 2^n and is determined as the nearest + * power of two roundown of the number of CPUs in the system. Tunable + * variable 'taskq_maxbuckets' limits the maximum number of buckets. Each entry + * is attached to a bucket for its lifetime and can't migrate to other buckets. + * + * Entries that have scheduled tasks are not placed in any list. The dispatch + * function sets their "func" and "arg" fields and signals the corresponding + * thread to execute the task. Once the thread executes the task it clears the + * "func" field and places an entry on the bucket cache of free entries pointed + * by "tqbucket_freelist" field. ALL entries on the free list should have "func" + * field equal to NULL. The free list is a circular doubly-linked list identical + * in structure to the tq_task list above, but entries are taken from it in LIFO + * order - the last freed entry is the first to be allocated. The + * taskq_bucket_dispatch() function gets the most recently used entry from the + * free list, sets its "func" and "arg" fields and signals a worker thread. + * + * After executing each task a per-entry thread taskq_d_thread() places its + * entry on the bucket free list and goes to a timed sleep. If it wakes up + * without getting new task it removes the entry from the free list and destroys + * itself. The thread sleep time is controlled by a tunable variable + * `taskq_thread_timeout'. + * + * There is various statistics kept in the bucket which allows for later + * analysis of taskq usage patterns. Also, a global copy of taskq creation and + * death statistics is kept in the global taskq data structure. Since thread + * creation and death happen rarely, updating such global data does not present + * a performance problem. + * + * NOTE: Threads are not bound to any CPU and there is absolutely no association + * between the bucket and actual thread CPU, so buckets are used only to + * split resources and reduce resource contention. Having threads attached + * to the CPU denoted by a bucket may reduce number of times the job + * switches between CPUs. + * + * Current algorithm creates a thread whenever a bucket has no free + * entries. It would be nice to know how many threads are in the running + * state and don't create threads if all CPUs are busy with existing + * tasks, but it is unclear how such strategy can be implemented. + * + * Currently buckets are created statically as an array attached to task + * queue. On some system with nCPUs < max_ncpus it may waste system + * memory. One solution may be allocation of buckets when they are first + * touched, but it is not clear how useful it is. + * + * SUSPEND/RESUME implementation. + * + * Before executing a task taskq_thread() (executing non-dynamic task + * queues) obtains taskq's thread lock as a reader. The taskq_suspend() + * function gets the same lock as a writer blocking all non-dynamic task + * execution. The taskq_resume() function releases the lock allowing + * taskq_thread to continue execution. + * + * For dynamic task queues, each bucket is marked as TQBUCKET_SUSPEND by + * taskq_suspend() function. After that taskq_bucket_dispatch() always + * fails, so that taskq_dispatch() will either enqueue tasks for a + * suspended backing queue or fail if TQ_NOQUEUE is specified in dispatch + * flags. + * + * NOTE: taskq_suspend() does not immediately block any tasks already + * scheduled for dynamic task queues. It only suspends new tasks + * scheduled after taskq_suspend() was called. + * + * taskq_member() function works by comparing a thread t_taskq pointer with + * the passed thread pointer. + * + * LOCKS and LOCK Hierarchy: + * + * There are two locks used in task queues. + * + * 1) Task queue structure has a lock, protecting global task queue state. + * + * 2) Each per-CPU bucket has a lock for bucket management. + * + * If both locks are needed, task queue lock should be taken only after bucket + * lock. + * + * DEBUG FACILITIES. + * + * For DEBUG kernels it is possible to induce random failures to + * taskq_dispatch() function when it is given TQ_NOSLEEP argument. The value of + * taskq_dmtbf and taskq_smtbf tunables control the mean time between induced + * failures for dynamic and static task queues respectively. + * + * Setting TASKQ_STATISTIC to 0 will disable per-bucket statistics. + * + * TUNABLES + * + * system_taskq_size - Size of the global system_taskq. + * This value is multiplied by nCPUs to determine + * actual size. + * Default value: 64 + * + * taskq_thread_timeout - Maximum idle time for taskq_d_thread() + * Default value: 5 minutes + * + * taskq_maxbuckets - Maximum number of buckets in any task queue + * Default value: 128 + * + * taskq_search_depth - Maximum # of buckets searched for a free entry + * Default value: 4 + * + * taskq_dmtbf - Mean time between induced dispatch failures + * for dynamic task queues. + * Default value: UINT_MAX (no induced failures) + * + * taskq_smtbf - Mean time between induced dispatch failures + * for static task queues. + * Default value: UINT_MAX (no induced failures) + * + * CONDITIONAL compilation. + * + * TASKQ_STATISTIC - If set will enable bucket statistic (default). + * + */ + +#include <sys/taskq_impl.h> +#include <sys/proc.h> +#include <sys/kmem.h> +#include <sys/callb.h> +#include <sys/systm.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/sysmacros.h> +#include <sys/sdt.h> +#include <sys/mutex.h> +#include <sys/kernel.h> +#include <sys/limits.h> + +static kmem_cache_t *taskq_ent_cache, *taskq_cache; + +/* Global system task queue for common use */ +taskq_t *system_taskq; + +/* + * Maxmimum number of entries in global system taskq is + * system_taskq_size * max_ncpus + */ +#define SYSTEM_TASKQ_SIZE 64 +int system_taskq_size = SYSTEM_TASKQ_SIZE; + +/* + * Dynamic task queue threads that don't get any work within + * taskq_thread_timeout destroy themselves + */ +#define TASKQ_THREAD_TIMEOUT (60 * 5) +int taskq_thread_timeout = TASKQ_THREAD_TIMEOUT; + +#define TASKQ_MAXBUCKETS 128 +int taskq_maxbuckets = TASKQ_MAXBUCKETS; + +/* + * When a bucket has no available entries another buckets are tried. + * taskq_search_depth parameter limits the amount of buckets that we search + * before failing. This is mostly useful in systems with many CPUs where we may + * spend too much time scanning busy buckets. + */ +#define TASKQ_SEARCH_DEPTH 4 +int taskq_search_depth = TASKQ_SEARCH_DEPTH; + +/* + * Hashing function: mix various bits of x. May be pretty much anything. + */ +#define TQ_HASH(x) ((x) ^ ((x) >> 11) ^ ((x) >> 17) ^ ((x) ^ 27)) + +/* + * We do not create any new threads when the system is low on memory and start + * throttling memory allocations. The following macro tries to estimate such + * condition. + */ +#define ENOUGH_MEMORY() (freemem > throttlefree) + +/* + * Static functions. + */ +static taskq_t *taskq_create_common(const char *, int, int, pri_t, int, + int, uint_t); +static void taskq_thread(void *); +static int taskq_constructor(void *, void *, int); +static void taskq_destructor(void *, void *); +static int taskq_ent_constructor(void *, void *, int); +static void taskq_ent_destructor(void *, void *); +static taskq_ent_t *taskq_ent_alloc(taskq_t *, int); +static void taskq_ent_free(taskq_t *, taskq_ent_t *); + +/* + * Collect per-bucket statistic when TASKQ_STATISTIC is defined. + */ +#define TASKQ_STATISTIC 1 + +#if TASKQ_STATISTIC +#define TQ_STAT(b, x) b->tqbucket_stat.x++ +#else +#define TQ_STAT(b, x) +#endif + +/* + * Random fault injection. + */ +uint_t taskq_random; +uint_t taskq_dmtbf = UINT_MAX; /* mean time between injected failures */ +uint_t taskq_smtbf = UINT_MAX; /* mean time between injected failures */ + +/* + * TQ_NOSLEEP dispatches on dynamic task queues are always allowed to fail. + * + * TQ_NOSLEEP dispatches on static task queues can't arbitrarily fail because + * they could prepopulate the cache and make sure that they do not use more + * then minalloc entries. So, fault injection in this case insures that + * either TASKQ_PREPOPULATE is not set or there are more entries allocated + * than is specified by minalloc. TQ_NOALLOC dispatches are always allowed + * to fail, but for simplicity we treat them identically to TQ_NOSLEEP + * dispatches. + */ +#ifdef DEBUG +#define TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flag) \ + taskq_random = (taskq_random * 2416 + 374441) % 1771875;\ + if ((flag & TQ_NOSLEEP) && \ + taskq_random < 1771875 / taskq_dmtbf) { \ + return (NULL); \ + } + +#define TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flag) \ + taskq_random = (taskq_random * 2416 + 374441) % 1771875;\ + if ((flag & (TQ_NOSLEEP | TQ_NOALLOC)) && \ + (!(tq->tq_flags & TASKQ_PREPOPULATE) || \ + (tq->tq_nalloc > tq->tq_minalloc)) && \ + (taskq_random < (1771875 / taskq_smtbf))) { \ + mutex_exit(&tq->tq_lock); \ + return ((taskqid_t)0); \ + } +#else +#define TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flag) +#define TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flag) +#endif + +#define IS_EMPTY(l) (((l).tqent_prev == (l).tqent_next) && \ + ((l).tqent_prev == &(l))) + +/* + * Append `tqe' in the end of the doubly-linked list denoted by l. + */ +#define TQ_APPEND(l, tqe) { \ + tqe->tqent_next = &l; \ + tqe->tqent_prev = l.tqent_prev; \ + tqe->tqent_next->tqent_prev = tqe; \ + tqe->tqent_prev->tqent_next = tqe; \ +} + +/* + * Schedule a task specified by func and arg into the task queue entry tqe. + */ +#define TQ_ENQUEUE(tq, tqe, func, arg) { \ + ASSERT(MUTEX_HELD(&tq->tq_lock)); \ + TQ_APPEND(tq->tq_task, tqe); \ + tqe->tqent_func = (func); \ + tqe->tqent_arg = (arg); \ + tq->tq_tasks++; \ + if (tq->tq_tasks - tq->tq_executed > tq->tq_maxtasks) \ + tq->tq_maxtasks = tq->tq_tasks - tq->tq_executed; \ + cv_signal(&tq->tq_dispatch_cv); \ + DTRACE_PROBE2(taskq__enqueue, taskq_t *, tq, taskq_ent_t *, tqe); \ +} + +/* + * Do-nothing task which may be used to prepopulate thread caches. + */ +/*ARGSUSED*/ +void +nulltask(void *unused) +{ +} + + +/*ARGSUSED*/ +static int +taskq_constructor(void *buf, void *cdrarg, int kmflags) +{ + taskq_t *tq = buf; + + bzero(tq, sizeof (taskq_t)); + + mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL); + cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL); + + tq->tq_task.tqent_next = &tq->tq_task; + tq->tq_task.tqent_prev = &tq->tq_task; + + return (0); +} + +/*ARGSUSED*/ +static void +taskq_destructor(void *buf, void *cdrarg) +{ + taskq_t *tq = buf; + + mutex_destroy(&tq->tq_lock); + rw_destroy(&tq->tq_threadlock); + cv_destroy(&tq->tq_dispatch_cv); + cv_destroy(&tq->tq_wait_cv); +} + +/*ARGSUSED*/ +static int +taskq_ent_constructor(void *buf, void *cdrarg, int kmflags) +{ + taskq_ent_t *tqe = buf; + + tqe->tqent_thread = NULL; + cv_init(&tqe->tqent_cv, NULL, CV_DEFAULT, NULL); + + return (0); +} + +/*ARGSUSED*/ +static void +taskq_ent_destructor(void *buf, void *cdrarg) +{ + taskq_ent_t *tqe = buf; + + ASSERT(tqe->tqent_thread == NULL); + cv_destroy(&tqe->tqent_cv); +} + +/* + * Create global system dynamic task queue. + */ +void +system_taskq_init(void) +{ + system_taskq = taskq_create_common("system_taskq", 0, + system_taskq_size * max_ncpus, minclsyspri, 4, 512, + TASKQ_PREPOPULATE); +} + +void +system_taskq_fini(void) +{ + taskq_destroy(system_taskq); +} + +static void +taskq_init(void *dummy __unused) +{ + taskq_ent_cache = kmem_cache_create("taskq_ent_cache", + sizeof (taskq_ent_t), 0, taskq_ent_constructor, + taskq_ent_destructor, NULL, NULL, NULL, 0); + taskq_cache = kmem_cache_create("taskq_cache", sizeof (taskq_t), + 0, taskq_constructor, taskq_destructor, NULL, NULL, NULL, 0); + system_taskq_init(); +} + +static void +taskq_fini(void *dummy __unused) +{ + system_taskq_fini(); + kmem_cache_destroy(taskq_cache); + kmem_cache_destroy(taskq_ent_cache); +} + +/* + * taskq_ent_alloc() + * + * Allocates a new taskq_ent_t structure either from the free list or from the + * cache. Returns NULL if it can't be allocated. + * + * Assumes: tq->tq_lock is held. + */ +static taskq_ent_t * +taskq_ent_alloc(taskq_t *tq, int flags) +{ + int kmflags = (flags & TQ_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; + + taskq_ent_t *tqe; + + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + /* + * TQ_NOALLOC allocations are allowed to use the freelist, even if + * we are below tq_minalloc. + */ + if ((tqe = tq->tq_freelist) != NULL && + ((flags & TQ_NOALLOC) || tq->tq_nalloc >= tq->tq_minalloc)) { + tq->tq_freelist = tqe->tqent_next; + } else { + if (flags & TQ_NOALLOC) + return (NULL); + + mutex_exit(&tq->tq_lock); + if (tq->tq_nalloc >= tq->tq_maxalloc) { + if (kmflags & KM_NOSLEEP) { + mutex_enter(&tq->tq_lock); + return (NULL); + } + /* + * We don't want to exceed tq_maxalloc, but we can't + * wait for other tasks to complete (and thus free up + * task structures) without risking deadlock with + * the caller. So, we just delay for one second + * to throttle the allocation rate. + */ + delay(hz); + } + tqe = kmem_cache_alloc(taskq_ent_cache, kmflags); + mutex_enter(&tq->tq_lock); + if (tqe != NULL) + tq->tq_nalloc++; + } + return (tqe); +} + +/* + * taskq_ent_free() + * + * Free taskq_ent_t structure by either putting it on the free list or freeing + * it to the cache. + * + * Assumes: tq->tq_lock is held. + */ +static void +taskq_ent_free(taskq_t *tq, taskq_ent_t *tqe) +{ + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + if (tq->tq_nalloc <= tq->tq_minalloc) { + tqe->tqent_next = tq->tq_freelist; + tq->tq_freelist = tqe; + } else { + tq->tq_nalloc--; + mutex_exit(&tq->tq_lock); + kmem_cache_free(taskq_ent_cache, tqe); + mutex_enter(&tq->tq_lock); + } +} + +/* + * Dispatch a task. + * + * Assumes: func != NULL + * + * Returns: NULL if dispatch failed. + * non-NULL if task dispatched successfully. + * Actual return value is the pointer to taskq entry that was used to + * dispatch a task. This is useful for debugging. + */ +/* ARGSUSED */ +taskqid_t +taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) +{ + taskq_ent_t *tqe = NULL; + + ASSERT(tq != NULL); + ASSERT(func != NULL); + ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC)); + + /* + * TQ_NOQUEUE flag can't be used with non-dynamic task queues. + */ + ASSERT(! (flags & TQ_NOQUEUE)); + + /* + * Enqueue the task to the underlying queue. + */ + mutex_enter(&tq->tq_lock); + + TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flags); + + if ((tqe = taskq_ent_alloc(tq, flags)) == NULL) { + mutex_exit(&tq->tq_lock); + return ((taskqid_t)NULL); + } + TQ_ENQUEUE(tq, tqe, func, arg); + mutex_exit(&tq->tq_lock); + return ((taskqid_t)tqe); +} + +/* + * Wait for all pending tasks to complete. + * Calling taskq_wait from a task will cause deadlock. + */ +void +taskq_wait(taskq_t *tq) +{ + + mutex_enter(&tq->tq_lock); + while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0) + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + mutex_exit(&tq->tq_lock); +} + +/* + * Suspend execution of tasks. + * + * Tasks in the queue part will be suspended immediately upon return from this + * function. Pending tasks in the dynamic part will continue to execute, but all + * new tasks will be suspended. + */ +void +taskq_suspend(taskq_t *tq) +{ + rw_enter(&tq->tq_threadlock, RW_WRITER); + + /* + * Mark task queue as being suspended. Needed for taskq_suspended(). + */ + mutex_enter(&tq->tq_lock); + ASSERT(!(tq->tq_flags & TASKQ_SUSPENDED)); + tq->tq_flags |= TASKQ_SUSPENDED; + mutex_exit(&tq->tq_lock); +} + +/* + * returns: 1 if tq is suspended, 0 otherwise. + */ +int +taskq_suspended(taskq_t *tq) +{ + return ((tq->tq_flags & TASKQ_SUSPENDED) != 0); +} + +/* + * Resume taskq execution. + */ +void +taskq_resume(taskq_t *tq) +{ + ASSERT(RW_WRITE_HELD(&tq->tq_threadlock)); + + mutex_enter(&tq->tq_lock); + ASSERT(tq->tq_flags & TASKQ_SUSPENDED); + tq->tq_flags &= ~TASKQ_SUSPENDED; + mutex_exit(&tq->tq_lock); + + rw_exit(&tq->tq_threadlock); +} + +/* + * Worker thread for processing task queue. + */ +static void +taskq_thread(void *arg) +{ + taskq_t *tq = arg; + taskq_ent_t *tqe; + callb_cpr_t cprinfo; + hrtime_t start, end; + + CALLB_CPR_INIT(&cprinfo, &tq->tq_lock, callb_generic_cpr, tq->tq_name); + + mutex_enter(&tq->tq_lock); + while (tq->tq_flags & TASKQ_ACTIVE) { + if ((tqe = tq->tq_task.tqent_next) == &tq->tq_task) { + if (--tq->tq_active == 0) + cv_broadcast(&tq->tq_wait_cv); + if (tq->tq_flags & TASKQ_CPR_SAFE) { + cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock); + } else { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock); + CALLB_CPR_SAFE_END(&cprinfo, &tq->tq_lock); + } + tq->tq_active++; + continue; + } + tqe->tqent_prev->tqent_next = tqe->tqent_next; + tqe->tqent_next->tqent_prev = tqe->tqent_prev; + mutex_exit(&tq->tq_lock); + + rw_enter(&tq->tq_threadlock, RW_READER); + start = gethrtime(); + DTRACE_PROBE2(taskq__exec__start, taskq_t *, tq, + taskq_ent_t *, tqe); + tqe->tqent_func(tqe->tqent_arg); + DTRACE_PROBE2(taskq__exec__end, taskq_t *, tq, + taskq_ent_t *, tqe); + end = gethrtime(); + rw_exit(&tq->tq_threadlock); + + mutex_enter(&tq->tq_lock); + tq->tq_totaltime += end - start; + tq->tq_executed++; + + taskq_ent_free(tq, tqe); + } + tq->tq_nthreads--; + cv_broadcast(&tq->tq_wait_cv); + ASSERT(!(tq->tq_flags & TASKQ_CPR_SAFE)); + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); +} + +/* + * Taskq creation. May sleep for memory. + * Always use automatically generated instances to avoid kstat name space + * collisions. + */ + +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, int minalloc, + int maxalloc, uint_t flags) +{ + return taskq_create_common(name, 0, nthreads, pri, minalloc, + maxalloc, flags | TASKQ_NOINSTANCE); +} + +static taskq_t * +taskq_create_common(const char *name, int instance, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags) +{ + taskq_t *tq = kmem_cache_alloc(taskq_cache, KM_SLEEP); + uint_t ncpus = ((boot_max_ncpus == -1) ? max_ncpus : boot_max_ncpus); + uint_t bsize; /* # of buckets - always power of 2 */ + + ASSERT(instance == 0); + ASSERT(flags == TASKQ_PREPOPULATE | TASKQ_NOINSTANCE); + + /* + * TASKQ_CPR_SAFE and TASKQ_DYNAMIC flags are mutually exclusive. + */ + ASSERT((flags & (TASKQ_DYNAMIC | TASKQ_CPR_SAFE)) != + ((TASKQ_DYNAMIC | TASKQ_CPR_SAFE))); + + ASSERT(tq->tq_buckets == NULL); + + bsize = 1 << (highbit(ncpus) - 1); + ASSERT(bsize >= 1); + bsize = MIN(bsize, taskq_maxbuckets); + + tq->tq_maxsize = nthreads; + + (void) strncpy(tq->tq_name, name, TASKQ_NAMELEN + 1); + tq->tq_name[TASKQ_NAMELEN] = '\0'; + /* Make sure the name conforms to the rules for C indentifiers */ + strident_canon(tq->tq_name, TASKQ_NAMELEN); + + tq->tq_flags = flags | TASKQ_ACTIVE; + tq->tq_active = nthreads; + tq->tq_nthreads = nthreads; + tq->tq_minalloc = minalloc; + tq->tq_maxalloc = maxalloc; + tq->tq_nbuckets = bsize; + tq->tq_pri = pri; + + if (flags & TASKQ_PREPOPULATE) { + mutex_enter(&tq->tq_lock); + while (minalloc-- > 0) + taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP)); + mutex_exit(&tq->tq_lock); + } + + if (nthreads == 1) { + tq->tq_thread = thread_create(NULL, 0, taskq_thread, tq, + 0, NULL, TS_RUN, pri); + } else { + kthread_t **tpp = kmem_alloc(sizeof (kthread_t *) * nthreads, + KM_SLEEP); + + tq->tq_threadlist = tpp; + + mutex_enter(&tq->tq_lock); + while (nthreads-- > 0) { + *tpp = thread_create(NULL, 0, taskq_thread, tq, + 0, NULL, TS_RUN, pri); + tpp++; + } + mutex_exit(&tq->tq_lock); + } + + return (tq); +} + +/* + * taskq_destroy(). + * + * Assumes: by the time taskq_destroy is called no one will use this task queue + * in any way and no one will try to dispatch entries in it. + */ +void +taskq_destroy(taskq_t *tq) +{ + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + + ASSERT(! (tq->tq_flags & TASKQ_CPR_SAFE)); + + /* + * Wait for any pending entries to complete. + */ + taskq_wait(tq); + + mutex_enter(&tq->tq_lock); + ASSERT((tq->tq_task.tqent_next == &tq->tq_task) && + (tq->tq_active == 0)); + + if ((tq->tq_nthreads > 1) && (tq->tq_threadlist != NULL)) + kmem_free(tq->tq_threadlist, sizeof (kthread_t *) * + tq->tq_nthreads); + + tq->tq_flags &= ~TASKQ_ACTIVE; + cv_broadcast(&tq->tq_dispatch_cv); + while (tq->tq_nthreads != 0) + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + + tq->tq_minalloc = 0; + while (tq->tq_nalloc != 0) + taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP)); + + mutex_exit(&tq->tq_lock); + + /* + * Mark each bucket as closing and wakeup all sleeping threads. + */ + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + taskq_ent_t *tqe; + + mutex_enter(&b->tqbucket_lock); + + b->tqbucket_flags |= TQBUCKET_CLOSE; + /* Wakeup all sleeping threads */ + + for (tqe = b->tqbucket_freelist.tqent_next; + tqe != &b->tqbucket_freelist; tqe = tqe->tqent_next) + cv_signal(&tqe->tqent_cv); + + ASSERT(b->tqbucket_nalloc == 0); + + /* + * At this point we waited for all pending jobs to complete (in + * both the task queue and the bucket and no new jobs should + * arrive. Wait for all threads to die. + */ + while (b->tqbucket_nfree > 0) + cv_wait(&b->tqbucket_cv, &b->tqbucket_lock); + mutex_exit(&b->tqbucket_lock); + mutex_destroy(&b->tqbucket_lock); + cv_destroy(&b->tqbucket_cv); + } + + if (tq->tq_buckets != NULL) { + ASSERT(tq->tq_flags & TASKQ_DYNAMIC); + kmem_free(tq->tq_buckets, + sizeof (taskq_bucket_t) * tq->tq_nbuckets); + + /* Cleanup fields before returning tq to the cache */ + tq->tq_buckets = NULL; + tq->tq_tcreates = 0; + tq->tq_tdeaths = 0; + } else { + ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC)); + } + + tq->tq_totaltime = 0; + tq->tq_tasks = 0; + tq->tq_maxtasks = 0; + tq->tq_executed = 0; + kmem_cache_free(taskq_cache, tq); +} + +SYSINIT(sol_taskq, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, taskq_init, NULL) +SYSUNINIT(sol_taskq, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, taskq_fini, NULL); diff --git a/sys/contrib/opensolaris/uts/common/rpc/xdr.c b/sys/contrib/opensolaris/uts/common/rpc/xdr.c new file mode 100644 index 0000000..65b73b6 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/rpc/xdr.c @@ -0,0 +1,671 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * xdr.c, generic XDR routines implementation. + * These are the "generic" xdr routines used to serialize and de-serialize + * most common data items. See xdr.h for more info on the interface to + * xdr. + */ + +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/types.h> +#include <sys/systm.h> + +#include <rpc/types.h> +#include <rpc/xdr.h> + +#pragma weak xdr_int32_t = xdr_int +#pragma weak xdr_uint32_t = xdr_u_int +#pragma weak xdr_int64_t = xdr_longlong_t +#pragma weak xdr_uint64_t = xdr_u_longlong_t + +#if !defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN) +#error "Exactly one of _BIG_ENDIAN or _LITTLE_ENDIAN must be defined" +#elif defined(_BIG_ENDIAN) && defined(_LITTLE_ENDIAN) +#error "Only one of _BIG_ENDIAN or _LITTLE_ENDIAN may be defined" +#endif + +/* + * constants specific to the xdr "protocol" + */ +#define XDR_FALSE ((int32_t)0) +#define XDR_TRUE ((int32_t)1) +#define LASTUNSIGNED ((uint_t)0-1) + +/* + * for unit alignment + */ +static char xdr_zero[BYTES_PER_XDR_UNIT] = { 0, 0, 0, 0 }; + +/* + * Free a data structure using XDR + * Not a filter, but a convenient utility nonetheless + */ +void +xdr_free(xdrproc_t proc, char *objp) +{ + XDR x; + + x.x_op = XDR_FREE; + (*proc)(&x, objp); +} + +/* + * XDR nothing + */ +bool_t +xdr_void(void) +{ + return (TRUE); +} + +/* + * XDR integers + * + * PSARC 2003/523 Contract Private Interface + * xdr_int + * Changes must be reviewed by Solaris File Sharing + * Changes must be communicated to contract-2003-523@sun.com + */ +bool_t +xdr_int(XDR *xdrs, int *ip) +{ + if (xdrs->x_op == XDR_ENCODE) + return (XDR_PUTINT32(xdrs, ip)); + + if (xdrs->x_op == XDR_DECODE) + return (XDR_GETINT32(xdrs, ip)); + + if (xdrs->x_op == XDR_FREE) + return (TRUE); + +#ifdef DEBUG + printf("xdr_int: FAILED\n"); +#endif + return (FALSE); +} + +/* + * XDR unsigned integers + * + * PSARC 2003/523 Contract Private Interface + * xdr_u_int + * Changes must be reviewed by Solaris File Sharing + * Changes must be communicated to contract-2003-523@sun.com + */ +bool_t +xdr_u_int(XDR *xdrs, uint_t *up) +{ + if (xdrs->x_op == XDR_ENCODE) + return (XDR_PUTINT32(xdrs, (int32_t *)up)); + + if (xdrs->x_op == XDR_DECODE) + return (XDR_GETINT32(xdrs, (int32_t *)up)); + + if (xdrs->x_op == XDR_FREE) + return (TRUE); + +#ifdef DEBUG + printf("xdr_int: FAILED\n"); +#endif + return (FALSE); +} + + +#if defined(_ILP32) +/* + * xdr_long and xdr_u_long for binary compatability on ILP32 kernels. + * + * No prototypes since new code should not be using these interfaces. + */ +bool_t +xdr_long(XDR *xdrs, long *ip) +{ + return (xdr_int(xdrs, (int *)ip)); +} + +bool_t +xdr_u_long(XDR *xdrs, unsigned long *up) +{ + return (xdr_u_int(xdrs, (uint_t *)up)); +} +#endif /* _ILP32 */ + + +/* + * XDR long long integers + */ +bool_t +xdr_longlong_t(XDR *xdrs, longlong_t *hp) +{ + if (xdrs->x_op == XDR_ENCODE) { +#if defined(_LITTLE_ENDIAN) + if (XDR_PUTINT32(xdrs, (int32_t *)((char *)hp + + BYTES_PER_XDR_UNIT)) == TRUE) { + return (XDR_PUTINT32(xdrs, (int32_t *)hp)); + } +#elif defined(_BIG_ENDIAN) + if (XDR_PUTINT32(xdrs, (int32_t *)hp) == TRUE) { + return (XDR_PUTINT32(xdrs, (int32_t *)((char *)hp + + BYTES_PER_XDR_UNIT))); + } +#endif + return (FALSE); + + } + if (xdrs->x_op == XDR_DECODE) { +#if defined(_LITTLE_ENDIAN) + if (XDR_GETINT32(xdrs, (int32_t *)((char *)hp + + BYTES_PER_XDR_UNIT)) == TRUE) { + return (XDR_GETINT32(xdrs, (int32_t *)hp)); + } +#elif defined(_BIG_ENDIAN) + if (XDR_GETINT32(xdrs, (int32_t *)hp) == TRUE) { + return (XDR_GETINT32(xdrs, (int32_t *)((char *)hp + + BYTES_PER_XDR_UNIT))); + } +#endif + return (FALSE); + } + return (TRUE); +} + +/* + * XDR unsigned long long integers + */ +bool_t +xdr_u_longlong_t(XDR *xdrs, u_longlong_t *hp) +{ + + if (xdrs->x_op == XDR_ENCODE) { +#if defined(_LITTLE_ENDIAN) + if (XDR_PUTINT32(xdrs, (int32_t *)((char *)hp + + BYTES_PER_XDR_UNIT)) == TRUE) { + return (XDR_PUTINT32(xdrs, (int32_t *)hp)); + } +#elif defined(_BIG_ENDIAN) + if (XDR_PUTINT32(xdrs, (int32_t *)hp) == TRUE) { + return (XDR_PUTINT32(xdrs, (int32_t *)((char *)hp + + BYTES_PER_XDR_UNIT))); + } +#endif + return (FALSE); + + } + if (xdrs->x_op == XDR_DECODE) { +#if defined(_LITTLE_ENDIAN) + if (XDR_GETINT32(xdrs, (int32_t *)((char *)hp + + BYTES_PER_XDR_UNIT)) == TRUE) { + return (XDR_GETINT32(xdrs, (int32_t *)hp)); + } +#elif defined(_BIG_ENDIAN) + if (XDR_GETINT32(xdrs, (int32_t *)hp) == TRUE) { + return (XDR_GETINT32(xdrs, (int32_t *)((char *)hp + + BYTES_PER_XDR_UNIT))); + } +#endif + return (FALSE); + } + return (TRUE); +} + +/* + * XDR short integers + */ +bool_t +xdr_short(XDR *xdrs, short *sp) +{ + int32_t l; + + switch (xdrs->x_op) { + + case XDR_ENCODE: + l = (int32_t)*sp; + return (XDR_PUTINT32(xdrs, &l)); + + case XDR_DECODE: + if (!XDR_GETINT32(xdrs, &l)) + return (FALSE); + *sp = (short)l; + return (TRUE); + + case XDR_FREE: + return (TRUE); + } + return (FALSE); +} + +/* + * XDR unsigned short integers + */ +bool_t +xdr_u_short(XDR *xdrs, ushort_t *usp) +{ + uint32_t l; + + switch (xdrs->x_op) { + + case XDR_ENCODE: + l = (uint32_t)*usp; + return (XDR_PUTINT32(xdrs, (int32_t *)&l)); + + case XDR_DECODE: + if (!XDR_GETINT32(xdrs, (int32_t *)&l)) { +#ifdef DEBUG + printf("xdr_u_short: decode FAILED\n"); +#endif + return (FALSE); + } + *usp = (ushort_t)l; + return (TRUE); + + case XDR_FREE: + return (TRUE); + } +#ifdef DEBUG + printf("xdr_u_short: bad op FAILED\n"); +#endif + return (FALSE); +} + + +/* + * XDR a char + */ +bool_t +xdr_char(XDR *xdrs, char *cp) +{ + int i; + + i = (*cp); + if (!xdr_int(xdrs, &i)) { + return (FALSE); + } + *cp = (char)i; + return (TRUE); +} + +/* + * XDR booleans + * + * PSARC 2003/523 Contract Private Interface + * xdr_bool + * Changes must be reviewed by Solaris File Sharing + * Changes must be communicated to contract-2003-523@sun.com + */ +bool_t +xdr_bool(XDR *xdrs, bool_t *bp) +{ + int32_t i32b; + + switch (xdrs->x_op) { + + case XDR_ENCODE: + i32b = *bp ? XDR_TRUE : XDR_FALSE; + return (XDR_PUTINT32(xdrs, &i32b)); + + case XDR_DECODE: + if (!XDR_GETINT32(xdrs, &i32b)) { +#ifdef DEBUG + printf("xdr_bool: decode FAILED\n"); +#endif + return (FALSE); + } + *bp = (i32b == XDR_FALSE) ? FALSE : TRUE; + return (TRUE); + + case XDR_FREE: + return (TRUE); + } +#ifdef DEBUG + printf("xdr_bool: bad op FAILED\n"); +#endif + return (FALSE); +} + +/* + * XDR enumerations + * + * PSARC 2003/523 Contract Private Interface + * xdr_enum + * Changes must be reviewed by Solaris File Sharing + * Changes must be communicated to contract-2003-523@sun.com + */ +#ifndef lint +enum sizecheck { SIZEVAL } sizecheckvar; /* used to find the size of */ + /* an enum */ +#endif +bool_t +xdr_enum(XDR *xdrs, enum_t *ep) +{ +#ifndef lint + /* + * enums are treated as ints + */ + if (sizeof (sizecheckvar) == sizeof (int32_t)) { + return (xdr_int(xdrs, (int32_t *)ep)); + } else if (sizeof (sizecheckvar) == sizeof (short)) { + return (xdr_short(xdrs, (short *)ep)); + } else { + return (FALSE); + } +#else + (void) (xdr_short(xdrs, (short *)ep)); + return (xdr_int(xdrs, (int32_t *)ep)); +#endif +} + +/* + * XDR opaque data + * Allows the specification of a fixed size sequence of opaque bytes. + * cp points to the opaque object and cnt gives the byte length. + * + * PSARC 2003/523 Contract Private Interface + * xdr_opaque + * Changes must be reviewed by Solaris File Sharing + * Changes must be communicated to contract-2003-523@sun.com + */ +bool_t +xdr_opaque(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + uint_t rndup; + static char crud[BYTES_PER_XDR_UNIT]; + + /* + * if no data we are done + */ + if (cnt == 0) + return (TRUE); + + /* + * round byte count to full xdr units + */ + rndup = cnt % BYTES_PER_XDR_UNIT; + if (rndup != 0) + rndup = BYTES_PER_XDR_UNIT - rndup; + + if (xdrs->x_op == XDR_DECODE) { + if (!XDR_GETBYTES(xdrs, cp, cnt)) { +#ifdef DEBUG + printf("xdr_opaque: decode FAILED\n"); +#endif + return (FALSE); + } + if (rndup == 0) + return (TRUE); + return (XDR_GETBYTES(xdrs, (caddr_t)crud, rndup)); + } + + if (xdrs->x_op == XDR_ENCODE) { + if (!XDR_PUTBYTES(xdrs, cp, cnt)) { +#ifdef DEBUG + printf("xdr_opaque: encode FAILED\n"); +#endif + return (FALSE); + } + if (rndup == 0) + return (TRUE); + return (XDR_PUTBYTES(xdrs, xdr_zero, rndup)); + } + + if (xdrs->x_op == XDR_FREE) + return (TRUE); + +#ifdef DEBUG + printf("xdr_opaque: bad op FAILED\n"); +#endif + return (FALSE); +} + +/* + * XDR counted bytes + * *cpp is a pointer to the bytes, *sizep is the count. + * If *cpp is NULL maxsize bytes are allocated + * + * PSARC 2003/523 Contract Private Interface + * xdr_bytes + * Changes must be reviewed by Solaris File Sharing + * Changes must be communicated to contract-2003-523@sun.com + */ +bool_t +xdr_bytes(XDR *xdrs, char **cpp, uint_t *sizep, const uint_t maxsize) +{ + char *sp = *cpp; /* sp is the actual string pointer */ + uint_t nodesize; + + /* + * first deal with the length since xdr bytes are counted + */ + if (!xdr_u_int(xdrs, sizep)) { +#ifdef DEBUG + printf("xdr_bytes: size FAILED\n"); +#endif + return (FALSE); + } + nodesize = *sizep; + if ((nodesize > maxsize) && (xdrs->x_op != XDR_FREE)) { +#ifdef DEBUG + printf("xdr_bytes: bad size (%d) FAILED (%d max)\n", + nodesize, maxsize); +#endif + return (FALSE); + } + + /* + * now deal with the actual bytes + */ + switch (xdrs->x_op) { + case XDR_DECODE: + if (nodesize == 0) + return (TRUE); + if (sp == NULL) + *cpp = sp = (char *)mem_alloc(nodesize); + /* FALLTHROUGH */ + + case XDR_ENCODE: + return (xdr_opaque(xdrs, sp, nodesize)); + + case XDR_FREE: + if (sp != NULL) { + mem_free(sp, nodesize); + *cpp = NULL; + } + return (TRUE); + } +#ifdef DEBUG + printf("xdr_bytes: bad op FAILED\n"); +#endif + return (FALSE); +} + +/* + * Implemented here due to commonality of the object. + */ +bool_t +xdr_netobj(XDR *xdrs, struct netobj *np) +{ + return (xdr_bytes(xdrs, &np->n_bytes, &np->n_len, MAX_NETOBJ_SZ)); +} + +/* + * XDR a descriminated union + * Support routine for discriminated unions. + * You create an array of xdrdiscrim structures, terminated with + * an entry with a null procedure pointer. The routine gets + * the discriminant value and then searches the array of xdrdiscrims + * looking for that value. It calls the procedure given in the xdrdiscrim + * to handle the discriminant. If there is no specific routine a default + * routine may be called. + * If there is no specific or default routine an error is returned. + */ +bool_t +xdr_union(XDR *xdrs, enum_t *dscmp, char *unp, + const struct xdr_discrim *choices, const xdrproc_t dfault) +{ + enum_t dscm; + + /* + * we deal with the discriminator; it's an enum + */ + if (!xdr_enum(xdrs, dscmp)) { +#ifdef DEBUG + printf("xdr_enum: dscmp FAILED\n"); +#endif + return (FALSE); + } + dscm = *dscmp; + + /* + * search choices for a value that matches the discriminator. + * if we find one, execute the xdr routine for that value. + */ + for (; choices->proc != NULL_xdrproc_t; choices++) { + if (choices->value == dscm) + return ((*(choices->proc))(xdrs, unp, LASTUNSIGNED)); + } + + /* + * no match - execute the default xdr routine if there is one + */ + return ((dfault == NULL_xdrproc_t) ? FALSE : + (*dfault)(xdrs, unp, LASTUNSIGNED)); +} + + +/* + * Non-portable xdr primitives. + * Care should be taken when moving these routines to new architectures. + */ + + +/* + * XDR null terminated ASCII strings + * xdr_string deals with "C strings" - arrays of bytes that are + * terminated by a NULL character. The parameter cpp references a + * pointer to storage; If the pointer is null, then the necessary + * storage is allocated. The last parameter is the max allowed length + * of the string as specified by a protocol. + */ +bool_t +xdr_string(XDR *xdrs, char **cpp, const uint_t maxsize) +{ + char *sp = *cpp; /* sp is the actual string pointer */ + uint_t size; + uint_t nodesize; + + /* + * first deal with the length since xdr strings are counted-strings + */ + switch (xdrs->x_op) { + case XDR_FREE: + if (sp == NULL) + return (TRUE); /* already free */ + /* FALLTHROUGH */ + case XDR_ENCODE: + size = (sp != NULL) ? (uint_t)strlen(sp) : 0; + break; + case XDR_DECODE: + break; + } + if (!xdr_u_int(xdrs, &size)) { +#ifdef DEBUG + printf("xdr_string: size FAILED\n"); +#endif + return (FALSE); + } + if (size > maxsize) { +#ifdef DEBUG + printf("xdr_string: bad size FAILED\n"); +#endif + return (FALSE); + } + nodesize = size + 1; + + /* + * now deal with the actual bytes + */ + switch (xdrs->x_op) { + case XDR_DECODE: + if (nodesize == 0) + return (TRUE); + if (sp == NULL) + sp = (char *)mem_alloc(nodesize); + sp[size] = 0; + if (!xdr_opaque(xdrs, sp, size)) { + /* + * free up memory if allocated here + */ + if (*cpp == NULL) { + mem_free(sp, nodesize); + } + return (FALSE); + } + if (strlen(sp) != size) { + if (*cpp == NULL) { + mem_free(sp, nodesize); + } + return (FALSE); + } + *cpp = sp; + return (TRUE); + + case XDR_ENCODE: + return (xdr_opaque(xdrs, sp, size)); + + case XDR_FREE: + mem_free(sp, nodesize); + *cpp = NULL; + return (TRUE); + } +#ifdef DEBUG + printf("xdr_string: bad op FAILED\n"); +#endif + return (FALSE); +} + +/* + * Wrapper for xdr_string that can be called directly from + * routines like clnt_call + */ +bool_t +xdr_wrapstring(XDR *xdrs, char **cpp) +{ + if (xdr_string(xdrs, cpp, LASTUNSIGNED)) + return (TRUE); + return (FALSE); +} diff --git a/sys/contrib/opensolaris/uts/common/rpc/xdr.h b/sys/contrib/opensolaris/uts/common/rpc/xdr.h new file mode 100644 index 0000000..c8eb419 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/rpc/xdr.h @@ -0,0 +1,605 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ +/* + * Portions of this source code were derived from Berkeley + * 4.3 BSD under license from the Regents of the University of + * California. + */ + +/* + * xdr.h, External Data Representation Serialization Routines. + * + */ + +#ifndef _RPC_XDR_H +#define _RPC_XDR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/byteorder.h> /* For all ntoh* and hton*() kind of macros */ +#include <rpc/types.h> /* For all ntoh* and hton*() kind of macros */ +#ifndef _KERNEL +#include <stdio.h> /* defines FILE *, used in ANSI C function prototypes */ +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * XDR provides a conventional way for converting between C data + * types and an external bit-string representation. Library supplied + * routines provide for the conversion on built-in C data types. These + * routines and utility routines defined here are used to help implement + * a type encode/decode routine for each user-defined type. + * + * Each data type provides a single procedure which takes two arguments: + * + * bool_t + * xdrproc(xdrs, argresp) + * XDR *xdrs; + * <type> *argresp; + * + * xdrs is an instance of a XDR handle, to which or from which the data + * type is to be converted. argresp is a pointer to the structure to be + * converted. The XDR handle contains an operation field which indicates + * which of the operations (ENCODE, DECODE * or FREE) is to be performed. + * + * XDR_DECODE may allocate space if the pointer argresp is null. This + * data can be freed with the XDR_FREE operation. + * + * We write only one procedure per data type to make it easy + * to keep the encode and decode procedures for a data type consistent. + * In many cases the same code performs all operations on a user defined type, + * because all the hard work is done in the component type routines. + * decode as a series of calls on the nested data types. + */ + +/* + * Xdr operations. XDR_ENCODE causes the type to be encoded into the + * stream. XDR_DECODE causes the type to be extracted from the stream. + * XDR_FREE can be used to release the space allocated by an XDR_DECODE + * request. + */ +enum xdr_op { + XDR_ENCODE = 0, + XDR_DECODE = 1, + XDR_FREE = 2 +}; + +/* + * This is the number of bytes per unit of external data. + */ +#define BYTES_PER_XDR_UNIT (4) +#define RNDUP(x) ((((x) + BYTES_PER_XDR_UNIT - 1) / BYTES_PER_XDR_UNIT) \ + * BYTES_PER_XDR_UNIT) + +/* + * The XDR handle. + * Contains operation which is being applied to the stream, + * an operations vector for the paticular implementation (e.g. see xdr_mem.c), + * and two private fields for the use of the particular impelementation. + * + * PSARC 2003/523 Contract Private Interface + * XDR + * Changes must be reviewed by Solaris File Sharing + * Changes must be communicated to contract-2003-523@sun.com + */ +typedef struct XDR { + enum xdr_op x_op; /* operation; fast additional param */ + struct xdr_ops *x_ops; + caddr_t x_public; /* users' data */ + caddr_t x_private; /* pointer to private data */ + caddr_t x_base; /* private used for position info */ + int x_handy; /* extra private word */ +} XDR; + +/* + * PSARC 2003/523 Contract Private Interface + * xdr_ops + * Changes must be reviewed by Solaris File Sharing + * Changes must be communicated to contract-2003-523@sun.com + */ +struct xdr_ops { +#ifdef __STDC__ +#if !defined(_KERNEL) + bool_t (*x_getlong)(struct XDR *, long *); + /* get a long from underlying stream */ + bool_t (*x_putlong)(struct XDR *, long *); + /* put a long to " */ +#endif /* KERNEL */ + bool_t (*x_getbytes)(struct XDR *, caddr_t, int); + /* get some bytes from " */ + bool_t (*x_putbytes)(struct XDR *, caddr_t, int); + /* put some bytes to " */ + uint_t (*x_getpostn)(struct XDR *); + /* returns bytes off from beginning */ + bool_t (*x_setpostn)(struct XDR *, uint_t); + /* lets you reposition the stream */ + rpc_inline_t *(*x_inline)(struct XDR *, int); + /* buf quick ptr to buffered data */ + void (*x_destroy)(struct XDR *); + /* free privates of this xdr_stream */ + bool_t (*x_control)(struct XDR *, int, void *); +#if defined(_LP64) || defined(_KERNEL) + bool_t (*x_getint32)(struct XDR *, int32_t *); + /* get a int from underlying stream */ + bool_t (*x_putint32)(struct XDR *, int32_t *); + /* put an int to " */ +#endif /* _LP64 || _KERNEL */ +#else +#if !defined(_KERNEL) + bool_t (*x_getlong)(); /* get a long from underlying stream */ + bool_t (*x_putlong)(); /* put a long to " */ +#endif /* KERNEL */ + bool_t (*x_getbytes)(); /* get some bytes from " */ + bool_t (*x_putbytes)(); /* put some bytes to " */ + uint_t (*x_getpostn)(); /* returns bytes off from beginning */ + bool_t (*x_setpostn)(); /* lets you reposition the stream */ + rpc_inline_t *(*x_inline)(); + /* buf quick ptr to buffered data */ + void (*x_destroy)(); /* free privates of this xdr_stream */ + bool_t (*x_control)(); +#if defined(_LP64) || defined(_KERNEL) + bool_t (*x_getint32)(); + bool_t (*x_putint32)(); +#endif /* _LP64 || defined(_KERNEL) */ +#endif +}; + +/* + * Operations defined on a XDR handle + * + * XDR *xdrs; + * long *longp; + * caddr_t addr; + * uint_t len; + * uint_t pos; + */ +#if !defined(_KERNEL) +#define XDR_GETLONG(xdrs, longp) \ + (*(xdrs)->x_ops->x_getlong)(xdrs, longp) +#define xdr_getlong(xdrs, longp) \ + (*(xdrs)->x_ops->x_getlong)(xdrs, longp) + +#define XDR_PUTLONG(xdrs, longp) \ + (*(xdrs)->x_ops->x_putlong)(xdrs, longp) +#define xdr_putlong(xdrs, longp) \ + (*(xdrs)->x_ops->x_putlong)(xdrs, longp) +#endif /* KERNEL */ + + +#if !defined(_LP64) && !defined(_KERNEL) + +/* + * For binary compatability on ILP32 we do not change the shape + * of the XDR structure and the GET/PUTINT32 functions just use + * the get/putlong vectors which operate on identically-sized + * units of data. + */ + +#define XDR_GETINT32(xdrs, int32p) \ + (*(xdrs)->x_ops->x_getlong)(xdrs, (long *)int32p) +#define xdr_getint32(xdrs, int32p) \ + (*(xdrs)->x_ops->x_getlong)(xdrs, (long *)int32p) + +#define XDR_PUTINT32(xdrs, int32p) \ + (*(xdrs)->x_ops->x_putlong)(xdrs, (long *)int32p) +#define xdr_putint32(xdrs, int32p) \ + (*(xdrs)->x_ops->x_putlong)(xdrs, (long *)int32p) + +#else /* !_LP64 && !_KERNEL */ + +#define XDR_GETINT32(xdrs, int32p) \ + (*(xdrs)->x_ops->x_getint32)(xdrs, int32p) +#define xdr_getint32(xdrs, int32p) \ + (*(xdrs)->x_ops->x_getint32)(xdrs, int32p) + +#define XDR_PUTINT32(xdrs, int32p) \ + (*(xdrs)->x_ops->x_putint32)(xdrs, int32p) +#define xdr_putint32(xdrs, int32p) \ + (*(xdrs)->x_ops->x_putint32)(xdrs, int32p) + +#endif /* !_LP64 && !_KERNEL */ + +#define XDR_GETBYTES(xdrs, addr, len) \ + (*(xdrs)->x_ops->x_getbytes)(xdrs, addr, len) +#define xdr_getbytes(xdrs, addr, len) \ + (*(xdrs)->x_ops->x_getbytes)(xdrs, addr, len) + +#define XDR_PUTBYTES(xdrs, addr, len) \ + (*(xdrs)->x_ops->x_putbytes)(xdrs, addr, len) +#define xdr_putbytes(xdrs, addr, len) \ + (*(xdrs)->x_ops->x_putbytes)(xdrs, addr, len) + +#define XDR_GETPOS(xdrs) \ + (*(xdrs)->x_ops->x_getpostn)(xdrs) +#define xdr_getpos(xdrs) \ + (*(xdrs)->x_ops->x_getpostn)(xdrs) + +#define XDR_SETPOS(xdrs, pos) \ + (*(xdrs)->x_ops->x_setpostn)(xdrs, pos) +#define xdr_setpos(xdrs, pos) \ + (*(xdrs)->x_ops->x_setpostn)(xdrs, pos) + +#define XDR_INLINE(xdrs, len) \ + (*(xdrs)->x_ops->x_inline)(xdrs, len) +#define xdr_inline(xdrs, len) \ + (*(xdrs)->x_ops->x_inline)(xdrs, len) + +#define XDR_DESTROY(xdrs) \ + (*(xdrs)->x_ops->x_destroy)(xdrs) +#define xdr_destroy(xdrs) \ + (*(xdrs)->x_ops->x_destroy)(xdrs) + +#define XDR_CONTROL(xdrs, req, op) \ + (*(xdrs)->x_ops->x_control)(xdrs, req, op) +#define xdr_control(xdrs, req, op) \ + (*(xdrs)->x_ops->x_control)(xdrs, req, op) + +/* + * Support struct for discriminated unions. + * You create an array of xdrdiscrim structures, terminated with + * a entry with a null procedure pointer. The xdr_union routine gets + * the discriminant value and then searches the array of structures + * for a matching value. If a match is found the associated xdr routine + * is called to handle that part of the union. If there is + * no match, then a default routine may be called. + * If there is no match and no default routine it is an error. + */ + + +/* + * A xdrproc_t exists for each data type which is to be encoded or decoded. + * + * The second argument to the xdrproc_t is a pointer to an opaque pointer. + * The opaque pointer generally points to a structure of the data type + * to be decoded. If this pointer is 0, then the type routines should + * allocate dynamic storage of the appropriate size and return it. + * bool_t (*xdrproc_t)(XDR *, void *); + */ +#ifdef __cplusplus +typedef bool_t (*xdrproc_t)(XDR *, void *); +#else +#ifdef __STDC__ +typedef bool_t (*xdrproc_t)(); /* For Backward compatibility */ +#else +typedef bool_t (*xdrproc_t)(); +#endif +#endif + +#define NULL_xdrproc_t ((xdrproc_t)0) + +#if defined(_LP64) || defined(_I32LPx) +#define xdr_rpcvers(xdrs, versp) xdr_u_int(xdrs, versp) +#define xdr_rpcprog(xdrs, progp) xdr_u_int(xdrs, progp) +#define xdr_rpcproc(xdrs, procp) xdr_u_int(xdrs, procp) +#define xdr_rpcprot(xdrs, protp) xdr_u_int(xdrs, protp) +#define xdr_rpcport(xdrs, portp) xdr_u_int(xdrs, portp) +#else +#define xdr_rpcvers(xdrs, versp) xdr_u_long(xdrs, versp) +#define xdr_rpcprog(xdrs, progp) xdr_u_long(xdrs, progp) +#define xdr_rpcproc(xdrs, procp) xdr_u_long(xdrs, procp) +#define xdr_rpcprot(xdrs, protp) xdr_u_long(xdrs, protp) +#define xdr_rpcport(xdrs, portp) xdr_u_long(xdrs, portp) +#endif + +struct xdr_discrim { + int value; + xdrproc_t proc; +}; + +/* + * In-line routines for fast encode/decode of primitve data types. + * Caveat emptor: these use single memory cycles to get the + * data from the underlying buffer, and will fail to operate + * properly if the data is not aligned. The standard way to use these + * is to say: + * if ((buf = XDR_INLINE(xdrs, count)) == NULL) + * return (FALSE); + * <<< macro calls >>> + * where ``count'' is the number of bytes of data occupied + * by the primitive data types. + * + * N.B. and frozen for all time: each data type here uses 4 bytes + * of external representation. + */ + +#define IXDR_GET_INT32(buf) ((int32_t)ntohl((uint32_t)*(buf)++)) +#define IXDR_PUT_INT32(buf, v) (*(buf)++ = (int32_t)htonl((uint32_t)v)) +#define IXDR_GET_U_INT32(buf) ((uint32_t)IXDR_GET_INT32(buf)) +#define IXDR_PUT_U_INT32(buf, v) IXDR_PUT_INT32((buf), ((int32_t)(v))) + +#if !defined(_KERNEL) && !defined(_LP64) + +#define IXDR_GET_LONG(buf) ((long)ntohl((ulong_t)*(buf)++)) +#define IXDR_PUT_LONG(buf, v) (*(buf)++ = (long)htonl((ulong_t)v)) +#define IXDR_GET_U_LONG(buf) ((ulong_t)IXDR_GET_LONG(buf)) +#define IXDR_PUT_U_LONG(buf, v) IXDR_PUT_LONG((buf), ((long)(v))) + +#define IXDR_GET_BOOL(buf) ((bool_t)IXDR_GET_LONG(buf)) +#define IXDR_GET_ENUM(buf, t) ((t)IXDR_GET_LONG(buf)) +#define IXDR_GET_SHORT(buf) ((short)IXDR_GET_LONG(buf)) +#define IXDR_GET_U_SHORT(buf) ((ushort_t)IXDR_GET_LONG(buf)) + +#define IXDR_PUT_BOOL(buf, v) IXDR_PUT_LONG((buf), ((long)(v))) +#define IXDR_PUT_ENUM(buf, v) IXDR_PUT_LONG((buf), ((long)(v))) +#define IXDR_PUT_SHORT(buf, v) IXDR_PUT_LONG((buf), ((long)(v))) +#define IXDR_PUT_U_SHORT(buf, v) IXDR_PUT_LONG((buf), ((long)(v))) + +#else + +#define IXDR_GET_BOOL(buf) ((bool_t)IXDR_GET_INT32(buf)) +#define IXDR_GET_ENUM(buf, t) ((t)IXDR_GET_INT32(buf)) +#define IXDR_GET_SHORT(buf) ((short)IXDR_GET_INT32(buf)) +#define IXDR_GET_U_SHORT(buf) ((ushort_t)IXDR_GET_INT32(buf)) + +#define IXDR_PUT_BOOL(buf, v) IXDR_PUT_INT32((buf), ((int)(v))) +#define IXDR_PUT_ENUM(buf, v) IXDR_PUT_INT32((buf), ((int)(v))) +#define IXDR_PUT_SHORT(buf, v) IXDR_PUT_INT32((buf), ((int)(v))) +#define IXDR_PUT_U_SHORT(buf, v) IXDR_PUT_INT32((buf), ((int)(v))) + +#endif + +#ifndef _LITTLE_ENDIAN +#define IXDR_GET_HYPER(buf, v) { \ + *((int32_t *)(&v)) = ntohl(*(uint32_t *)buf++); \ + *((int32_t *)(((char *)&v) + BYTES_PER_XDR_UNIT)) \ + = ntohl(*(uint32_t *)buf++); \ + } +#define IXDR_PUT_HYPER(buf, v) { \ + *(buf)++ = (int32_t)htonl(*(uint32_t *) \ + ((char *)&v)); \ + *(buf)++ = \ + (int32_t)htonl(*(uint32_t *)(((char *)&v) \ + + BYTES_PER_XDR_UNIT)); \ + } +#else + +#define IXDR_GET_HYPER(buf, v) { \ + *((int32_t *)(((char *)&v) + \ + BYTES_PER_XDR_UNIT)) \ + = ntohl(*(uint32_t *)buf++); \ + *((int32_t *)(&v)) = \ + ntohl(*(uint32_t *)buf++); \ + } + +#define IXDR_PUT_HYPER(buf, v) { \ + *(buf)++ = \ + (int32_t)htonl(*(uint32_t *)(((char *)&v) + \ + BYTES_PER_XDR_UNIT)); \ + *(buf)++ = \ + (int32_t)htonl(*(uint32_t *)((char *)&v)); \ + } +#endif +#define IXDR_GET_U_HYPER(buf, v) IXDR_GET_HYPER(buf, v) +#define IXDR_PUT_U_HYPER(buf, v) IXDR_PUT_HYPER(buf, v) + + +/* + * These are the "generic" xdr routines. + */ +#ifdef __STDC__ +extern bool_t xdr_void(void); +extern bool_t xdr_int(XDR *, int *); +extern bool_t xdr_u_int(XDR *, uint_t *); +extern bool_t xdr_long(XDR *, long *); +extern bool_t xdr_u_long(XDR *, ulong_t *); +extern bool_t xdr_short(XDR *, short *); +extern bool_t xdr_u_short(XDR *, ushort_t *); +extern bool_t xdr_bool(XDR *, bool_t *); +extern bool_t xdr_enum(XDR *, enum_t *); +extern bool_t xdr_array(XDR *, caddr_t *, uint_t *, const uint_t, + const uint_t, const xdrproc_t); +extern bool_t xdr_bytes(XDR *, char **, uint_t *, const uint_t); +extern bool_t xdr_opaque(XDR *, caddr_t, const uint_t); +extern bool_t xdr_string(XDR *, char **, const uint_t); +extern bool_t xdr_union(XDR *, enum_t *, char *, + const struct xdr_discrim *, const xdrproc_t); +extern unsigned int xdr_sizeof(xdrproc_t, void *); + +extern bool_t xdr_hyper(XDR *, longlong_t *); +extern bool_t xdr_longlong_t(XDR *, longlong_t *); +extern bool_t xdr_u_hyper(XDR *, u_longlong_t *); +extern bool_t xdr_u_longlong_t(XDR *, u_longlong_t *); + +extern bool_t xdr_char(XDR *, char *); +extern bool_t xdr_wrapstring(XDR *, char **); +extern bool_t xdr_reference(XDR *, caddr_t *, uint_t, const xdrproc_t); +extern bool_t xdr_pointer(XDR *, char **, uint_t, const xdrproc_t); +extern void xdr_free(xdrproc_t, char *); +extern bool_t xdr_time_t(XDR *, time_t *); + +extern bool_t xdr_int8_t(XDR *, int8_t *); +extern bool_t xdr_uint8_t(XDR *, uint8_t *); +extern bool_t xdr_int16_t(XDR *, int16_t *); +extern bool_t xdr_uint16_t(XDR *, uint16_t *); +extern bool_t xdr_int32_t(XDR *, int32_t *); +extern bool_t xdr_uint32_t(XDR *, uint32_t *); +#if defined(_INT64_TYPE) +extern bool_t xdr_int64_t(XDR *, int64_t *); +extern bool_t xdr_uint64_t(XDR *, uint64_t *); +#endif + +#ifndef _KERNEL +extern bool_t xdr_u_char(XDR *, uchar_t *); +extern bool_t xdr_vector(XDR *, char *, const uint_t, const uint_t, const +xdrproc_t); +extern bool_t xdr_float(XDR *, float *); +extern bool_t xdr_double(XDR *, double *); +extern bool_t xdr_quadruple(XDR *, long double *); +#endif /* !_KERNEL */ +#else +extern bool_t xdr_void(); +extern bool_t xdr_int(); +extern bool_t xdr_u_int(); +extern bool_t xdr_long(); +extern bool_t xdr_u_long(); +extern bool_t xdr_short(); +extern bool_t xdr_u_short(); +extern bool_t xdr_bool(); +extern bool_t xdr_enum(); +extern bool_t xdr_array(); +extern bool_t xdr_bytes(); +extern bool_t xdr_opaque(); +extern bool_t xdr_string(); +extern bool_t xdr_union(); + +extern bool_t xdr_hyper(); +extern bool_t xdr_longlong_t(); +extern bool_t xdr_u_hyper(); +extern bool_t xdr_u_longlong_t(); +extern bool_t xdr_char(); +extern bool_t xdr_reference(); +extern bool_t xdr_pointer(); +extern void xdr_free(); +extern bool_t xdr_wrapstring(); +extern bool_t xdr_time_t(); + +extern bool_t xdr_int8_t(); +extern bool_t xdr_uint8_t(); +extern bool_t xdr_int16_t(); +extern bool_t xdr_uint16_t(); +extern bool_t xdr_int32_t(); +extern bool_t xdr_uint32_t(); +#if defined(_INT64_TYPE) +extern bool_t xdr_int64_t(); +extern bool_t xdr_uint64_t(); +#endif + +#ifndef _KERNEL +extern bool_t xdr_u_char(); +extern bool_t xdr_vector(); +extern bool_t xdr_float(); +extern bool_t xdr_double(); +extern bool_t xdr_quadruple(); +#endif /* !_KERNEL */ +#endif + +/* + * Common opaque bytes objects used by many rpc protocols; + * declared here due to commonality. + */ +#define MAX_NETOBJ_SZ 1024 +struct netobj { + uint_t n_len; + char *n_bytes; +}; +typedef struct netobj netobj; + +#ifdef __STDC__ +extern bool_t xdr_netobj(XDR *, netobj *); +#else +extern bool_t xdr_netobj(); +#endif + +/* + * These are XDR control operators + */ + +#define XDR_GET_BYTES_AVAIL 1 + +struct xdr_bytesrec { + bool_t xc_is_last_record; + size_t xc_num_avail; +}; + +typedef struct xdr_bytesrec xdr_bytesrec; + +/* + * These are the request arguments to XDR_CONTROL. + * + * XDR_PEEK - returns the contents of the next XDR unit on the XDR stream. + * XDR_SKIPBYTES - skips the next N bytes in the XDR stream. + * XDR_RDMAGET - for xdr implementation over RDMA, gets private flags from + * the XDR stream being moved over RDMA + * XDR_RDMANOCHUNK - for xdr implementaion over RDMA, sets private flags in + * the XDR stream moving over RDMA. + */ +#ifdef _KERNEL +#define XDR_PEEK 2 +#define XDR_SKIPBYTES 3 +#define XDR_RDMAGET 4 +#define XDR_RDMASET 5 +#endif + +/* + * These are the public routines for the various implementations of + * xdr streams. + */ +#ifndef _KERNEL +#ifdef __STDC__ +extern void xdrmem_create(XDR *, const caddr_t, const uint_t, const enum +xdr_op); + /* XDR using memory buffers */ +extern void xdrrec_create(XDR *, const uint_t, const uint_t, const caddr_t, +int (*) (void *, caddr_t, int), int (*) (void *, caddr_t, int)); +/* XDR pseudo records for tcp */ +extern bool_t xdrrec_endofrecord(XDR *, bool_t); +/* make end of xdr record */ +extern bool_t xdrrec_skiprecord(XDR *); +/* move to beginning of next record */ +extern bool_t xdrrec_eof(XDR *); +extern uint_t xdrrec_readbytes(XDR *, caddr_t, uint_t); +/* true if no more input */ +#else +extern void xdrmem_create(); +extern void xdrstdio_create(); +extern void xdrrec_create(); +extern bool_t xdrrec_endofrecord(); +extern bool_t xdrrec_skiprecord(); +extern bool_t xdrrec_eof(); +extern uint_t xdrrec_readbytes(); +#endif +#else + +extern void xdrmem_create(XDR *, caddr_t, uint_t, enum xdr_op); + +extern struct xdr_ops xdrmblk_ops; + +struct rpc_msg; +extern bool_t xdr_callmsg(XDR *, struct rpc_msg *); +extern bool_t xdr_replymsg_body(XDR *, struct rpc_msg *); +extern bool_t xdr_replymsg_hdr(XDR *, struct rpc_msg *); + +#include <sys/malloc.h> +#ifdef mem_alloc +#undef mem_alloc +#define mem_alloc(size) malloc((size), M_TEMP, M_WAITOK | M_ZERO) +#endif +#ifdef mem_free +#undef mem_free +#define mem_free(ptr, size) free((ptr), M_TEMP) +#endif + +#endif /* !_KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* !_RPC_XDR_H */ diff --git a/sys/contrib/opensolaris/uts/common/rpc/xdr_array.c b/sys/contrib/opensolaris/uts/common/rpc/xdr_array.c new file mode 100644 index 0000000..3711e53 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/rpc/xdr_array.c @@ -0,0 +1,123 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * xdr_array.c, Generic XDR routines impelmentation. + * These are the "non-trivial" xdr primitives used to serialize and de-serialize + * arrays. See xdr.h for more info on the interface to xdr. + */ + +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/types.h> +#include <sys/systm.h> + +#include <rpc/types.h> +#include <rpc/xdr.h> + +#define LASTUNSIGNED ((uint_t)0-1) + +/* + * XDR an array of arbitrary elements + * *addrp is a pointer to the array, *sizep is the number of elements. + * If addrp is NULL (*sizep * elsize) bytes are allocated. + * elsize is the size (in bytes) of each element, and elproc is the + * xdr procedure to call to handle each element of the array. + */ +bool_t +xdr_array(XDR *xdrs, caddr_t *addrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i; + caddr_t target = *addrp; + uint_t c; /* the actual element count */ + bool_t stat = TRUE; + uint_t nodesize; + + /* like strings, arrays are really counted arrays */ + if (!xdr_u_int(xdrs, sizep)) { +#ifdef DEBUG + printf("xdr_array: size FAILED\n"); +#endif + return (FALSE); + } + c = *sizep; + if ((c > maxsize || LASTUNSIGNED / elsize < c) && + xdrs->x_op != XDR_FREE) { +#ifdef DEBUG + printf("xdr_array: bad size FAILED\n"); +#endif + return (FALSE); + } + nodesize = c * elsize; + + /* + * if we are deserializing, we may need to allocate an array. + * We also save time by checking for a null array if we are freeing. + */ + if (target == NULL) + switch (xdrs->x_op) { + case XDR_DECODE: + if (c == 0) + return (TRUE); + *addrp = target = (char *)mem_alloc(nodesize); + bzero(target, nodesize); + break; + + case XDR_FREE: + return (TRUE); + + case XDR_ENCODE: + break; + } + + /* + * now we xdr each element of array + */ + for (i = 0; (i < c) && stat; i++) { + stat = (*elproc)(xdrs, target, LASTUNSIGNED); + target += elsize; + } + + /* + * the array may need freeing + */ + if (xdrs->x_op == XDR_FREE) { + mem_free(*addrp, nodesize); + *addrp = NULL; + } + return (stat); +} diff --git a/sys/contrib/opensolaris/uts/common/rpc/xdr_mem.c b/sys/contrib/opensolaris/uts/common/rpc/xdr_mem.c new file mode 100644 index 0000000..32ff32d --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/rpc/xdr_mem.c @@ -0,0 +1,209 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * xdr_mem.c, XDR implementation using memory buffers. + * + * If you have some data to be interpreted as external data representation + * or to be converted to external data representation in a memory buffer, + * then this is the package for you. + */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/systm.h> + +#include <rpc/types.h> +#include <rpc/xdr.h> + +static struct xdr_ops *xdrmem_ops(void); + +/* + * The procedure xdrmem_create initializes a stream descriptor for a + * memory buffer. + */ +void +xdrmem_create(XDR *xdrs, caddr_t addr, uint_t size, enum xdr_op op) +{ + xdrs->x_op = op; + xdrs->x_ops = xdrmem_ops(); + xdrs->x_private = xdrs->x_base = addr; + xdrs->x_handy = size; + xdrs->x_public = NULL; +} + +/* ARGSUSED */ +static void +xdrmem_destroy(XDR *xdrs) +{ +} + +static bool_t +xdrmem_getint32(XDR *xdrs, int32_t *int32p) +{ + if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0) + return (FALSE); + /* LINTED pointer alignment */ + *int32p = (int32_t)ntohl((uint32_t)(*((int32_t *)(xdrs->x_private)))); + xdrs->x_private += sizeof (int32_t); + return (TRUE); +} + +static bool_t +xdrmem_putint32(XDR *xdrs, int32_t *int32p) +{ + if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0) + return (FALSE); + /* LINTED pointer alignment */ + *(int32_t *)xdrs->x_private = (int32_t)htonl((uint32_t)(*int32p)); + xdrs->x_private += sizeof (int32_t); + return (TRUE); +} + +static bool_t +xdrmem_getbytes(XDR *xdrs, caddr_t addr, int len) +{ + if ((xdrs->x_handy -= len) < 0) + return (FALSE); + bcopy(xdrs->x_private, addr, len); + xdrs->x_private += len; + return (TRUE); +} + +static bool_t +xdrmem_putbytes(XDR *xdrs, caddr_t addr, int len) +{ + if ((xdrs->x_handy -= len) < 0) + return (FALSE); + bcopy(addr, xdrs->x_private, len); + xdrs->x_private += len; + return (TRUE); +} + +static uint_t +xdrmem_getpos(XDR *xdrs) +{ + return ((uint_t)((uintptr_t)xdrs->x_private - (uintptr_t)xdrs->x_base)); +} + +static bool_t +xdrmem_setpos(XDR *xdrs, uint_t pos) +{ + caddr_t newaddr = xdrs->x_base + pos; + caddr_t lastaddr = xdrs->x_private + xdrs->x_handy; + ptrdiff_t diff; + + if (newaddr > lastaddr) + return (FALSE); + xdrs->x_private = newaddr; + diff = lastaddr - newaddr; + xdrs->x_handy = (int)diff; + return (TRUE); +} + +static rpc_inline_t * +xdrmem_inline(XDR *xdrs, int len) +{ + rpc_inline_t *buf = NULL; + + if (xdrs->x_handy >= len) { + xdrs->x_handy -= len; + /* LINTED pointer alignment */ + buf = (rpc_inline_t *)xdrs->x_private; + xdrs->x_private += len; + } + return (buf); +} + +static bool_t +xdrmem_control(XDR *xdrs, int request, void *info) +{ + xdr_bytesrec *xptr; + int32_t *int32p; + int len; + + switch (request) { + + case XDR_GET_BYTES_AVAIL: + xptr = (xdr_bytesrec *)info; + xptr->xc_is_last_record = TRUE; + xptr->xc_num_avail = xdrs->x_handy; + return (TRUE); + + case XDR_PEEK: + /* + * Return the next 4 byte unit in the XDR stream. + */ + if (xdrs->x_handy < sizeof (int32_t)) + return (FALSE); + int32p = (int32_t *)info; + *int32p = (int32_t)ntohl((uint32_t) + (*((int32_t *)(xdrs->x_private)))); + return (TRUE); + + case XDR_SKIPBYTES: + /* + * Skip the next N bytes in the XDR stream. + */ + int32p = (int32_t *)info; + len = RNDUP((int)(*int32p)); + if ((xdrs->x_handy -= len) < 0) + return (FALSE); + xdrs->x_private += len; + return (TRUE); + + } + return (FALSE); +} + +static struct xdr_ops * +xdrmem_ops(void) +{ + static struct xdr_ops ops; + + if (ops.x_getint32 == NULL) { + ops.x_getbytes = xdrmem_getbytes; + ops.x_putbytes = xdrmem_putbytes; + ops.x_getpostn = xdrmem_getpos; + ops.x_setpostn = xdrmem_setpos; + ops.x_inline = xdrmem_inline; + ops.x_destroy = xdrmem_destroy; + ops.x_control = xdrmem_control; + ops.x_getint32 = xdrmem_getint32; + ops.x_putint32 = xdrmem_putint32; + } + return (&ops); +} diff --git a/sys/contrib/opensolaris/uts/common/sys/asm_linkage.h b/sys/contrib/opensolaris/uts/common/sys/asm_linkage.h new file mode 100644 index 0000000..66481a6 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/asm_linkage.h @@ -0,0 +1,304 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _IA32_SYS_ASM_LINKAGE_H +#define _IA32_SYS_ASM_LINKAGE_H + + + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _ASM /* The remainder of this file is only for assembly files */ + +/* + * make annoying differences in assembler syntax go away + */ + +/* + * D16 and A16 are used to insert instructions prefixes; the + * macros help the assembler code be slightly more portable. + */ +#if !defined(__GNUC_AS__) +/* + * /usr/ccs/bin/as prefixes are parsed as separate instructions + */ +#define D16 data16; +#define A16 addr16; + +/* + * (There are some weird constructs in constant expressions) + */ +#define _CONST(const) [const] +#define _BITNOT(const) -1!_CONST(const) +#define _MUL(a, b) _CONST(a \* b) + +#else +/* + * Why not use the 'data16' and 'addr16' prefixes .. well, the + * assembler doesn't quite believe in real mode, and thus argues with + * us about what we're trying to do. + */ +#define D16 .byte 0x66; +#define A16 .byte 0x67; + +#define _CONST(const) (const) +#define _BITNOT(const) ~_CONST(const) +#define _MUL(a, b) _CONST(a * b) + +#endif + +/* + * C pointers are different sizes between i386 and amd64. + * These constants can be used to compute offsets into pointer arrays. + */ +#if defined(__amd64) +#define CLONGSHIFT 3 +#define CLONGSIZE 8 +#define CLONGMASK 7 +#elif defined(__i386) +#define CLONGSHIFT 2 +#define CLONGSIZE 4 +#define CLONGMASK 3 +#endif + +/* + * Since we know we're either ILP32 or LP64 .. + */ +#define CPTRSHIFT CLONGSHIFT +#define CPTRSIZE CLONGSIZE +#define CPTRMASK CLONGMASK + +#if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) +#error "inconsistent shift constants" +#endif + +#if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) +#error "inconsistent mask constants" +#endif + +#define ASM_ENTRY_ALIGN 16 + +/* + * SSE register alignment and save areas + */ + +#define XMM_SIZE 16 +#define XMM_ALIGN 16 + +#if defined(__amd64) + +#define SAVE_XMM_PROLOG(sreg, nreg) \ + subq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp; \ + movq %rsp, sreg + +#define RSTOR_XMM_EPILOG(sreg, nreg) \ + addq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp + +#elif defined(__i386) + +#define SAVE_XMM_PROLOG(sreg, nreg) \ + subl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; \ + movl %esp, sreg; \ + addl $XMM_ALIGN, sreg; \ + andl $_BITNOT(XMM_ALIGN-1), sreg + +#define RSTOR_XMM_EPILOG(sreg, nreg) \ + addl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; + +#endif /* __i386 */ + +/* + * profiling causes definitions of the MCOUNT and RTMCOUNT + * particular to the type + */ +#ifdef GPROF + +#define MCOUNT(x) \ + pushl %ebp; \ + movl %esp, %ebp; \ + call _mcount; \ + popl %ebp + +#endif /* GPROF */ + +#ifdef PROF + +#define MCOUNT(x) \ +/* CSTYLED */ \ + .lcomm .L_/**/x/**/1, 4, 4; \ + pushl %ebp; \ + movl %esp, %ebp; \ +/* CSTYLED */ \ + movl $.L_/**/x/**/1, %edx; \ + call _mcount; \ + popl %ebp + +#endif /* PROF */ + +/* + * if we are not profiling, MCOUNT should be defined to nothing + */ +#if !defined(PROF) && !defined(GPROF) +#define MCOUNT(x) +#endif /* !defined(PROF) && !defined(GPROF) */ + +#define RTMCOUNT(x) MCOUNT(x) + +/* + * Macro to define weak symbol aliases. These are similar to the ANSI-C + * #pragma weak name = _name + * except a compiler can determine type. The assembler must be told. Hence, + * the second parameter must be the type of the symbol (i.e.: function,...) + */ +#define ANSI_PRAGMA_WEAK(sym, stype) \ + .weak sym; \ + .type sym, @stype; \ +/* CSTYLED */ \ +sym = _/**/sym + +/* + * Like ANSI_PRAGMA_WEAK(), but for unrelated names, as in: + * #pragma weak sym1 = sym2 + */ +#define ANSI_PRAGMA_WEAK2(sym1, sym2, stype) \ + .weak sym1; \ + .type sym1, @stype; \ +sym1 = sym2 + +/* + * ENTRY provides the standard procedure entry code and an easy way to + * insert the calls to mcount for profiling. ENTRY_NP is identical, but + * never calls mcount. + */ +#define ENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x; \ + .type x, @function; \ +x: MCOUNT(x) + +#define ENTRY_NP(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x; \ + .type x, @function; \ +x: + +#define RTENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x; \ + .type x, @function; \ +x: RTMCOUNT(x) + +/* + * ENTRY2 is identical to ENTRY but provides two labels for the entry point. + */ +#define ENTRY2(x, y) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x, y; \ + .type x, @function; \ + .type y, @function; \ +/* CSTYLED */ \ +x: ; \ +y: MCOUNT(x) + +#define ENTRY_NP2(x, y) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x, y; \ + .type x, @function; \ + .type y, @function; \ +/* CSTYLED */ \ +x: ; \ +y: + + +/* + * ALTENTRY provides for additional entry points. + */ +#define ALTENTRY(x) \ + .globl x; \ + .type x, @function; \ +x: + +/* + * DGDEF and DGDEF2 provide global data declarations. + * + * DGDEF provides a word aligned word of storage. + * + * DGDEF2 allocates "sz" bytes of storage with **NO** alignment. This + * implies this macro is best used for byte arrays. + * + * DGDEF3 allocates "sz" bytes of storage with "algn" alignment. + */ +#define DGDEF2(name, sz) \ + .data; \ + .globl name; \ + .type name, @object; \ + .size name, sz; \ +name: + +#define DGDEF3(name, sz, algn) \ + .data; \ + .align algn; \ + .globl name; \ + .type name, @object; \ + .size name, sz; \ +name: + +#define DGDEF(name) DGDEF3(name, 4, 4) + +/* + * SET_SIZE trails a function and set the size for the ELF symbol table. + */ +#define SET_SIZE(x) \ + .size x, [.-x] + +/* + * NWORD provides native word value. + */ +#if defined(__amd64) + +/*CSTYLED*/ +#define NWORD quad + +#elif defined(__i386) + +#define NWORD long + +#endif /* __i386 */ + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/atomic.h b/sys/contrib/opensolaris/uts/common/sys/atomic.h new file mode 100644 index 0000000..de968c5 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/atomic.h @@ -0,0 +1,431 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ATOMIC_H +#define _SYS_ATOMIC_H + + + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_KERNEL) && defined(__GNUC__) && defined(_ASM_INLINES) && \ + (defined(__i386) || defined(__amd64)) +#include <asm/atomic.h> +#endif + +#if defined(_KERNEL) || defined(__STDC__) +/* + * Increment target. + */ +extern void atomic_inc_8(volatile uint8_t *); +extern void atomic_inc_uchar(volatile uchar_t *); +extern void atomic_inc_16(volatile uint16_t *); +extern void atomic_inc_ushort(volatile ushort_t *); +extern void atomic_inc_32(volatile uint32_t *); +extern void atomic_inc_uint(volatile uint_t *); +extern void atomic_inc_ulong(volatile ulong_t *); +#if defined(_KERNEL) || defined(_INT64_TYPE) +extern void atomic_inc_64(volatile uint64_t *); +#endif + +/* + * Decrement target + */ +extern void atomic_dec_8(volatile uint8_t *); +extern void atomic_dec_uchar(volatile uchar_t *); +extern void atomic_dec_16(volatile uint16_t *); +extern void atomic_dec_ushort(volatile ushort_t *); +extern void atomic_dec_32(volatile uint32_t *); +extern void atomic_dec_uint(volatile uint_t *); +extern void atomic_dec_ulong(volatile ulong_t *); +#if defined(_KERNEL) || defined(_INT64_TYPE) +extern void atomic_dec_64(volatile uint64_t *); +#endif + +/* + * Add delta to target + */ +#ifdef __i386__ +#if defined(_KERNEL) || defined(_INT64_TYPE) +extern void atomic_add_64(volatile uint64_t *, int64_t); +#endif +#endif + +/* + * logical OR bits with target + */ +extern void atomic_or_8(volatile uint8_t *, uint8_t); +extern void atomic_or_uchar(volatile uchar_t *, uchar_t); +extern void atomic_or_16(volatile uint16_t *, uint16_t); +extern void atomic_or_ushort(volatile ushort_t *, ushort_t); +extern void atomic_or_32(volatile uint32_t *, uint32_t); +extern void atomic_or_uint(volatile uint_t *, uint_t); +extern void atomic_or_ulong(volatile ulong_t *, ulong_t); +#if defined(_KERNEL) || defined(_INT64_TYPE) +extern void atomic_or_64(volatile uint64_t *, uint64_t); +#endif + +/* + * logical AND bits with target + */ +extern void atomic_and_8(volatile uint8_t *, uint8_t); +extern void atomic_and_uchar(volatile uchar_t *, uchar_t); +extern void atomic_and_16(volatile uint16_t *, uint16_t); +extern void atomic_and_ushort(volatile ushort_t *, ushort_t); +extern void atomic_and_32(volatile uint32_t *, uint32_t); +extern void atomic_and_uint(volatile uint_t *, uint_t); +extern void atomic_and_ulong(volatile ulong_t *, ulong_t); +#if defined(_KERNEL) || defined(_INT64_TYPE) +extern void atomic_and_64(volatile uint64_t *, uint64_t); +#endif + +/* + * As above, but return the new value. Note that these _nv() variants are + * substantially more expensive on some platforms than the no-return-value + * versions above, so don't use them unless you really need to know the + * new value *atomically* (e.g. when decrementing a reference count and + * checking whether it went to zero). + */ + +/* + * Increment target and return new value. + */ +extern uint8_t atomic_inc_8_nv(volatile uint8_t *); +extern uchar_t atomic_inc_uchar_nv(volatile uchar_t *); +extern uint16_t atomic_inc_16_nv(volatile uint16_t *); +extern ushort_t atomic_inc_ushort_nv(volatile ushort_t *); +extern uint32_t atomic_inc_32_nv(volatile uint32_t *); +extern uint_t atomic_inc_uint_nv(volatile uint_t *); +extern ulong_t atomic_inc_ulong_nv(volatile ulong_t *); +#if defined(_KERNEL) || defined(_INT64_TYPE) +extern uint64_t atomic_inc_64_nv(volatile uint64_t *); +#endif + +/* + * Decrement target and return new value. + */ +extern uint8_t atomic_dec_8_nv(volatile uint8_t *); +extern uchar_t atomic_dec_uchar_nv(volatile uchar_t *); +extern uint16_t atomic_dec_16_nv(volatile uint16_t *); +extern ushort_t atomic_dec_ushort_nv(volatile ushort_t *); +extern uint32_t atomic_dec_32_nv(volatile uint32_t *); +extern uint_t atomic_dec_uint_nv(volatile uint_t *); +extern ulong_t atomic_dec_ulong_nv(volatile ulong_t *); +#if defined(_KERNEL) || defined(_INT64_TYPE) +extern uint64_t atomic_dec_64_nv(volatile uint64_t *); +#endif + +/* + * Add delta to target + */ +extern uint8_t atomic_add_8_nv(volatile uint8_t *, int8_t); +extern uchar_t atomic_add_char_nv(volatile uchar_t *, signed char); +extern uint16_t atomic_add_16_nv(volatile uint16_t *, int16_t); +extern ushort_t atomic_add_short_nv(volatile ushort_t *, short); +extern uint32_t atomic_add_32_nv(volatile uint32_t *, int32_t); +extern uint_t atomic_add_int_nv(volatile uint_t *, int); +extern void *atomic_add_ptr_nv(volatile void *, ssize_t); +extern ulong_t atomic_add_long_nv(volatile ulong_t *, long); +#if defined(_KERNEL) || defined(_INT64_TYPE) +extern uint64_t atomic_add_64_nv(volatile uint64_t *, int64_t); +#endif + +/* + * logical OR bits with target and return new value. + */ +extern uint8_t atomic_or_8_nv(volatile uint8_t *, uint8_t); +extern uchar_t atomic_or_uchar_nv(volatile uchar_t *, uchar_t); +extern uint16_t atomic_or_16_nv(volatile uint16_t *, uint16_t); +extern ushort_t atomic_or_ushort_nv(volatile ushort_t *, ushort_t); +extern uint32_t atomic_or_32_nv(volatile uint32_t *, uint32_t); +extern uint_t atomic_or_uint_nv(volatile uint_t *, uint_t); +extern ulong_t atomic_or_ulong_nv(volatile ulong_t *, ulong_t); +#if defined(_KERNEL) || defined(_INT64_TYPE) +extern uint64_t atomic_or_64_nv(volatile uint64_t *, uint64_t); +#endif + +/* + * logical AND bits with target and return new value. + */ +extern uint8_t atomic_and_8_nv(volatile uint8_t *, uint8_t); +extern uchar_t atomic_and_uchar_nv(volatile uchar_t *, uchar_t); +extern uint16_t atomic_and_16_nv(volatile uint16_t *, uint16_t); +extern ushort_t atomic_and_ushort_nv(volatile ushort_t *, ushort_t); +extern uint32_t atomic_and_32_nv(volatile uint32_t *, uint32_t); +extern uint_t atomic_and_uint_nv(volatile uint_t *, uint_t); +extern ulong_t atomic_and_ulong_nv(volatile ulong_t *, ulong_t); +#if defined(_KERNEL) || defined(_INT64_TYPE) +extern uint64_t atomic_and_64_nv(volatile uint64_t *, uint64_t); +#endif + +/* + * If *arg1 == arg2, set *arg1 = arg3; return old value + */ +extern uint8_t atomic_cas_8(volatile uint8_t *, uint8_t, uint8_t); +extern uchar_t atomic_cas_uchar(volatile uchar_t *, uchar_t, uchar_t); +extern uint16_t atomic_cas_16(volatile uint16_t *, uint16_t, uint16_t); +extern ushort_t atomic_cas_ushort(volatile ushort_t *, ushort_t, ushort_t); +extern uint32_t atomic_cas_32(volatile uint32_t *, uint32_t, uint32_t); +extern uint_t atomic_cas_uint(volatile uint_t *, uint_t, uint_t); +extern void *atomic_cas_ptr(volatile void *, void *, void *); +extern ulong_t atomic_cas_ulong(volatile ulong_t *, ulong_t, ulong_t); +#if defined(_KERNEL) || defined(_INT64_TYPE) +extern uint64_t atomic_cas_64(volatile uint64_t *, uint64_t, uint64_t); +#endif + +/* + * Swap target and return old value + */ +extern uint8_t atomic_swap_8(volatile uint8_t *, uint8_t); +extern uchar_t atomic_swap_uchar(volatile uchar_t *, uchar_t); +extern uint16_t atomic_swap_16(volatile uint16_t *, uint16_t); +extern ushort_t atomic_swap_ushort(volatile ushort_t *, ushort_t); +extern uint32_t atomic_swap_32(volatile uint32_t *, uint32_t); +extern uint_t atomic_swap_uint(volatile uint_t *, uint_t); +extern void *atomic_swap_ptr(volatile void *, void *); +extern ulong_t atomic_swap_ulong(volatile ulong_t *, ulong_t); +#if defined(_KERNEL) || defined(_INT64_TYPE) +extern uint64_t atomic_swap_64(volatile uint64_t *, uint64_t); +#endif + +/* + * Perform an exclusive atomic bit set/clear on a target. + * Returns 0 if bit was sucessfully set/cleared, or -1 + * if the bit was already set/cleared. + */ +extern int atomic_set_long_excl(volatile ulong_t *, uint_t); +extern int atomic_clear_long_excl(volatile ulong_t *, uint_t); + +/* + * Generic memory barrier used during lock entry, placed after the + * memory operation that acquires the lock to guarantee that the lock + * protects its data. No stores from after the memory barrier will + * reach visibility, and no loads from after the barrier will be + * resolved, before the lock acquisition reaches global visibility. + */ +extern void membar_enter(void); + +/* + * Generic memory barrier used during lock exit, placed before the + * memory operation that releases the lock to guarantee that the lock + * protects its data. All loads and stores issued before the barrier + * will be resolved before the subsequent lock update reaches visibility. + */ +extern void membar_exit(void); + +/* + * Arrange that all stores issued before this point in the code reach + * global visibility before any stores that follow; useful in producer + * modules that update a data item, then set a flag that it is available. + * The memory barrier guarantees that the available flag is not visible + * earlier than the updated data, i.e. it imposes store ordering. + */ +extern void membar_producer(void); + +/* + * Arrange that all loads issued before this point in the code are + * completed before any subsequent loads; useful in consumer modules + * that check to see if data is available and read the data. + * The memory barrier guarantees that the data is not sampled until + * after the available flag has been seen, i.e. it imposes load ordering. + */ +extern void membar_consumer(void); +#endif + +#if !defined(_KERNEL) && !defined(__STDC__) +extern void atomic_inc_8(); +extern void atomic_inc_uchar(); +extern void atomic_inc_16(); +extern void atomic_inc_ushort(); +extern void atomic_inc_32(); +extern void atomic_inc_uint(); +extern void atomic_inc_ulong(); +#if defined(_INT64_TYPE) +extern void atomic_inc_64(); +#endif /* defined(_INT64_TYPE) */ +extern void atomic_dec_8(); +extern void atomic_dec_uchar(); +extern void atomic_dec_16(); +extern void atomic_dec_ushort(); +extern void atomic_dec_32(); +extern void atomic_dec_uint(); +extern void atomic_dec_ulong(); +#if defined(_INT64_TYPE) +extern void atomic_dec_64(); +#endif /* defined(_INT64_TYPE) */ +extern void atomic_add_8(); +extern void atomic_add_char(); +extern void atomic_add_16(); +extern void atomic_add_short(); +extern void atomic_add_32(); +extern void atomic_add_int(); +extern void atomic_add_ptr(); +extern void atomic_add_long(); +#if defined(_INT64_TYPE) +extern void atomic_add_64(); +#endif /* defined(_INT64_TYPE) */ +extern void atomic_or_8(); +extern void atomic_or_uchar(); +extern void atomic_or_16(); +extern void atomic_or_ushort(); +extern void atomic_or_32(); +extern void atomic_or_uint(); +extern void atomic_or_ulong(); +#if defined(_INT64_TYPE) +extern void atomic_or_64(); +#endif /* defined(_INT64_TYPE) */ +extern void atomic_and_8(); +extern void atomic_and_uchar(); +extern void atomic_and_16(); +extern void atomic_and_ushort(); +extern void atomic_and_32(); +extern void atomic_and_uint(); +extern void atomic_and_ulong(); +#if defined(_INT64_TYPE) +extern void atomic_and_64(); +#endif /* defined(_INT64_TYPE) */ +extern uint8_t atomic_inc_8_nv(); +extern uchar_t atomic_inc_uchar_nv(); +extern uint16_t atomic_inc_16_nv(); +extern ushort_t atomic_inc_ushort_nv(); +extern uint32_t atomic_inc_32_nv(); +extern uint_t atomic_inc_uint_nv(); +extern ulong_t atomic_inc_ulong_nv(); +#if defined(_INT64_TYPE) +extern uint64_t atomic_inc_64_nv(); +#endif /* defined(_INT64_TYPE) */ +extern uint8_t atomic_dec_8_nv(); +extern uchar_t atomic_dec_uchar_nv(); +extern uint16_t atomic_dec_16_nv(); +extern ushort_t atomic_dec_ushort_nv(); +extern uint32_t atomic_dec_32_nv(); +extern uint_t atomic_dec_uint_nv(); +extern ulong_t atomic_dec_ulong_nv(); +#if defined(_INT64_TYPE) +extern uint64_t atomic_dec_64_nv(); +#endif /* defined(_INT64_TYPE) */ +extern uint8_t atomic_add_8_nv(); +extern uchar_t atomic_add_char_nv(); +extern uint16_t atomic_add_16_nv(); +extern ushort_t atomic_add_short_nv(); +extern uint32_t atomic_add_32_nv(); +extern uint_t atomic_add_int_nv(); +extern void *atomic_add_ptr_nv(); +extern ulong_t atomic_add_long_nv(); +#if defined(_INT64_TYPE) +extern uint64_t atomic_add_64_nv(); +#endif /* defined(_INT64_TYPE) */ +extern uint8_t atomic_or_8_nv(); +extern uchar_t atomic_or_uchar_nv(); +extern uint16_t atomic_or_16_nv(); +extern ushort_t atomic_or_ushort_nv(); +extern uint32_t atomic_or_32_nv(); +extern uint_t atomic_or_uint_nv(); +extern ulong_t atomic_or_ulong_nv(); +#if defined(_INT64_TYPE) +extern uint64_t atomic_or_64_nv(); +#endif /* defined(_INT64_TYPE) */ +extern uint8_t atomic_and_8_nv(); +extern uchar_t atomic_and_uchar_nv(); +extern uint16_t atomic_and_16_nv(); +extern ushort_t atomic_and_ushort_nv(); +extern uint32_t atomic_and_32_nv(); +extern uint_t atomic_and_uint_nv(); +extern ulong_t atomic_and_ulong_nv(); +#if defined(_INT64_TYPE) +extern uint64_t atomic_and_64_nv(); +#endif /* defined(_INT64_TYPE) */ +extern uint8_t atomic_cas_8(); +extern uchar_t atomic_cas_uchar(); +extern uint16_t atomic_cas_16(); +extern ushort_t atomic_cas_ushort(); +extern uint32_t atomic_cas_32(); +extern uint_t atomic_cas_uint(); +extern void *atomic_cas_ptr(); +extern ulong_t atomic_cas_ulong(); +#if defined(_INT64_TYPE) +extern uint64_t atomic_cas_64(); +#endif /* defined(_INT64_TYPE) */ +extern uint8_t atomic_swap_8(); +extern uchar_t atomic_swap_uchar(); +extern uint16_t atomic_swap_16(); +extern ushort_t atomic_swap_ushort(); +extern uint32_t atomic_swap_32(); +extern uint_t atomic_swap_uint(); +extern void *atomic_swap_ptr(); +extern ulong_t atomic_swap_ulong(); +#if defined(_INT64_TYPE) +extern uint64_t atomic_swap_64(); +#endif /* defined(_INT64_TYPE) */ + + +extern int atomic_set_long_excl(); +extern int atomic_clear_long_excl(); + +extern void membar_enter(); +extern void membar_exit(); +extern void membar_producer(); +extern void membar_consumer(); + +#endif + +#if defined(_KERNEL) + +#if defined(_LP64) || defined(_ILP32) +#define atomic_add_ip atomic_add_long +#define atomic_add_ip_nv atomic_add_long_nv +#define casip atomic_cas_ulong +#endif + +#if defined(__sparc) +extern uint8_t ldstub(uint8_t *); +#endif + +/* + * Legacy kernel interfaces; they will go away (eventually). + */ +extern uint8_t cas8(uint8_t *, uint8_t, uint8_t); +extern uint32_t cas32(uint32_t *, uint32_t, uint32_t); +extern uint64_t cas64(uint64_t *, uint64_t, uint64_t); +extern ulong_t caslong(ulong_t *, ulong_t, ulong_t); +extern void *casptr(void *, void *, void *); +extern void atomic_and_long(ulong_t *, ulong_t); +extern void atomic_or_long(ulong_t *, ulong_t); +#if defined(__sparc) +extern uint32_t swapl(uint32_t *, uint32_t); +#endif + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ATOMIC_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/avl.h b/sys/contrib/opensolaris/uts/common/sys/avl.h new file mode 100644 index 0000000..bf9af89 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/avl.h @@ -0,0 +1,298 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _AVL_H +#define _AVL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This is a private header file. Applications should not directly include + * this file. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/avl_impl.h> + +/* + * This is a generic implemenatation of AVL trees for use in the Solaris kernel. + * The interfaces provide an efficient way of implementing an ordered set of + * data structures. + * + * AVL trees provide an alternative to using an ordered linked list. Using AVL + * trees will usually be faster, however they requires more storage. An ordered + * linked list in general requires 2 pointers in each data structure. The + * AVL tree implementation uses 3 pointers. The following chart gives the + * approximate performance of operations with the different approaches: + * + * Operation Link List AVL tree + * --------- -------- -------- + * lookup O(n) O(log(n)) + * + * insert 1 node constant constant + * + * delete 1 node constant between constant and O(log(n)) + * + * delete all nodes O(n) O(n) + * + * visit the next + * or prev node constant between constant and O(log(n)) + * + * + * The data structure nodes are anchored at an "avl_tree_t" (the equivalent + * of a list header) and the individual nodes will have a field of + * type "avl_node_t" (corresponding to list pointers). + * + * The type "avl_index_t" is used to indicate a position in the list for + * certain calls. + * + * The usage scenario is generally: + * + * 1. Create the list/tree with: avl_create() + * + * followed by any mixture of: + * + * 2a. Insert nodes with: avl_add(), or avl_find() and avl_insert() + * + * 2b. Visited elements with: + * avl_first() - returns the lowest valued node + * avl_last() - returns the highest valued node + * AVL_NEXT() - given a node go to next higher one + * AVL_PREV() - given a node go to previous lower one + * + * 2c. Find the node with the closest value either less than or greater + * than a given value with avl_nearest(). + * + * 2d. Remove individual nodes from the list/tree with avl_remove(). + * + * and finally when the list is being destroyed + * + * 3. Use avl_destroy_nodes() to quickly process/free up any remaining nodes. + * Note that once you use avl_destroy_nodes(), you can no longer + * use any routine except avl_destroy_nodes() and avl_destoy(). + * + * 4. Use avl_destroy() to destroy the AVL tree itself. + * + * Any locking for multiple thread access is up to the user to provide, just + * as is needed for any linked list implementation. + */ + + +/* + * Type used for the root of the AVL tree. + */ +typedef struct avl_tree avl_tree_t; + +/* + * The data nodes in the AVL tree must have a field of this type. + */ +typedef struct avl_node avl_node_t; + +/* + * An opaque type used to locate a position in the tree where a node + * would be inserted. + */ +typedef uintptr_t avl_index_t; + + +/* + * Direction constants used for avl_nearest(). + */ +#define AVL_BEFORE (0) +#define AVL_AFTER (1) + + + +/* + * Prototypes + * + * Where not otherwise mentioned, "void *" arguments are a pointer to the + * user data structure which must contain a field of type avl_node_t. + * + * Also assume the user data structures looks like: + * stuct my_type { + * ... + * avl_node_t my_link; + * ... + * }; + */ + +/* + * Initialize an AVL tree. Arguments are: + * + * tree - the tree to be initialized + * compar - function to compare two nodes, it must return exactly: -1, 0, or +1 + * -1 for <, 0 for ==, and +1 for > + * size - the value of sizeof(struct my_type) + * offset - the value of OFFSETOF(struct my_type, my_link) + */ +extern void avl_create(avl_tree_t *tree, + int (*compar) (const void *, const void *), size_t size, size_t offset); + + +/* + * Find a node with a matching value in the tree. Returns the matching node + * found. If not found, it returns NULL and then if "where" is not NULL it sets + * "where" for use with avl_insert() or avl_nearest(). + * + * node - node that has the value being looked for + * where - position for use with avl_nearest() or avl_insert(), may be NULL + */ +extern void *avl_find(avl_tree_t *tree, void *node, avl_index_t *where); + +/* + * Insert a node into the tree. + * + * node - the node to insert + * where - position as returned from avl_find() + */ +extern void avl_insert(avl_tree_t *tree, void *node, avl_index_t where); + +/* + * Insert "new_data" in "tree" in the given "direction" either after + * or before the data "here". + * + * This might be usefull for avl clients caching recently accessed + * data to avoid doing avl_find() again for insertion. + * + * new_data - new data to insert + * here - existing node in "tree" + * direction - either AVL_AFTER or AVL_BEFORE the data "here". + */ +extern void avl_insert_here(avl_tree_t *tree, void *new_data, void *here, + int direction); + + +/* + * Return the first or last valued node in the tree. Will return NULL + * if the tree is empty. + * + */ +extern void *avl_first(avl_tree_t *tree); +extern void *avl_last(avl_tree_t *tree); + + +/* + * Return the next or previous valued node in the tree. + * AVL_NEXT() will return NULL if at the last node. + * AVL_PREV() will return NULL if at the first node. + * + * node - the node from which the next or previous node is found + */ +#define AVL_NEXT(tree, node) avl_walk(tree, node, AVL_AFTER) +#define AVL_PREV(tree, node) avl_walk(tree, node, AVL_BEFORE) + + +/* + * Find the node with the nearest value either greater or less than + * the value from a previous avl_find(). Returns the node or NULL if + * there isn't a matching one. + * + * where - position as returned from avl_find() + * direction - either AVL_BEFORE or AVL_AFTER + * + * EXAMPLE get the greatest node that is less than a given value: + * + * avl_tree_t *tree; + * struct my_data look_for_value = {....}; + * struct my_data *node; + * struct my_data *less; + * avl_index_t where; + * + * node = avl_find(tree, &look_for_value, &where); + * if (node != NULL) + * less = AVL_PREV(tree, node); + * else + * less = avl_nearest(tree, where, AVL_BEFORE); + */ +extern void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction); + + +/* + * Add a single node to the tree. + * The node must not be in the tree, and it must not + * compare equal to any other node already in the tree. + * + * node - the node to add + */ +extern void avl_add(avl_tree_t *tree, void *node); + + +/* + * Remove a single node from the tree. The node must be in the tree. + * + * node - the node to remove + */ +extern void avl_remove(avl_tree_t *tree, void *node); + + +/* + * Return the number of nodes in the tree + */ +extern ulong_t avl_numnodes(avl_tree_t *tree); + + +/* + * Used to destroy any remaining nodes in a tree. The cookie argument should + * be initialized to NULL before the first call. Returns a node that has been + * removed from the tree and may be free()'d. Returns NULL when the tree is + * empty. + * + * Once you call avl_destroy_nodes(), you can only continuing calling it and + * finally avl_destroy(). No other AVL routines will be valid. + * + * cookie - a "void *" used to save state between calls to avl_destroy_nodes() + * + * EXAMPLE: + * avl_tree_t *tree; + * struct my_data *node; + * void *cookie; + * + * cookie = NULL; + * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL) + * free(node); + * avl_destroy(tree); + */ +extern void *avl_destroy_nodes(avl_tree_t *tree, void **cookie); + + +/* + * Final destroy of an AVL tree. Arguments are: + * + * tree - the empty tree to destroy + */ +extern void avl_destroy(avl_tree_t *tree); + + + +#ifdef __cplusplus +} +#endif + +#endif /* _AVL_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/avl_impl.h b/sys/contrib/opensolaris/uts/common/sys/avl_impl.h new file mode 100644 index 0000000..620685f --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/avl_impl.h @@ -0,0 +1,164 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _AVL_IMPL_H +#define _AVL_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This is a private header file. Applications should not directly include + * this file. + */ + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * generic AVL tree implementation for kernel use + * + * There are 5 pieces of information stored for each node in an AVL tree + * + * pointer to less than child + * pointer to greater than child + * a pointer to the parent of this node + * an indication [0/1] of which child I am of my parent + * a "balance" (-1, 0, +1) indicating which child tree is taller + * + * Since they only need 3 bits, the last two fields are packed into the + * bottom bits of the parent pointer on 64 bit machines to save on space. + */ + +#ifndef _LP64 + +struct avl_node { + struct avl_node *avl_child[2]; /* left/right children */ + struct avl_node *avl_parent; /* this node's parent */ + unsigned short avl_child_index; /* my index in parent's avl_child[] */ + short avl_balance; /* balance value: -1, 0, +1 */ +}; + +#define AVL_XPARENT(n) ((n)->avl_parent) +#define AVL_SETPARENT(n, p) ((n)->avl_parent = (p)) + +#define AVL_XCHILD(n) ((n)->avl_child_index) +#define AVL_SETCHILD(n, c) ((n)->avl_child_index = (unsigned short)(c)) + +#define AVL_XBALANCE(n) ((n)->avl_balance) +#define AVL_SETBALANCE(n, b) ((n)->avl_balance = (short)(b)) + +#else /* _LP64 */ + +/* + * for 64 bit machines, avl_pcb contains parent pointer, balance and child_index + * values packed in the following manner: + * + * |63 3| 2 |1 0 | + * |-------------------------------------|-----------------|-------------| + * | avl_parent hi order bits | avl_child_index | avl_balance | + * | | | + 1 | + * |-------------------------------------|-----------------|-------------| + * + */ +struct avl_node { + struct avl_node *avl_child[2]; /* left/right children nodes */ + uintptr_t avl_pcb; /* parent, child_index, balance */ +}; + +/* + * macros to extract/set fields in avl_pcb + * + * pointer to the parent of the current node is the high order bits + */ +#define AVL_XPARENT(n) ((struct avl_node *)((n)->avl_pcb & ~7)) +#define AVL_SETPARENT(n, p) \ + ((n)->avl_pcb = (((n)->avl_pcb & 7) | (uintptr_t)(p))) + +/* + * index of this node in its parent's avl_child[]: bit #2 + */ +#define AVL_XCHILD(n) (((n)->avl_pcb >> 2) & 1) +#define AVL_SETCHILD(n, c) \ + ((n)->avl_pcb = (uintptr_t)(((n)->avl_pcb & ~4) | ((c) << 2))) + +/* + * balance indication for a node, lowest 2 bits. A valid balance is + * -1, 0, or +1, and is encoded by adding 1 to the value to get the + * unsigned values of 0, 1, 2. + */ +#define AVL_XBALANCE(n) ((int)(((n)->avl_pcb & 3) - 1)) +#define AVL_SETBALANCE(n, b) \ + ((n)->avl_pcb = (uintptr_t)((((n)->avl_pcb & ~3) | ((b) + 1)))) + +#endif /* _LP64 */ + + + +/* + * switch between a node and data pointer for a given tree + * the value of "o" is tree->avl_offset + */ +#define AVL_NODE2DATA(n, o) ((void *)((uintptr_t)(n) - (o))) +#define AVL_DATA2NODE(d, o) ((struct avl_node *)((uintptr_t)(d) + (o))) + + + +/* + * macros used to create/access an avl_index_t + */ +#define AVL_INDEX2NODE(x) ((avl_node_t *)((x) & ~1)) +#define AVL_INDEX2CHILD(x) ((x) & 1) +#define AVL_MKINDEX(n, c) ((avl_index_t)(n) | (c)) + + +/* + * The tree structure. The fields avl_root, avl_compar, and avl_offset come + * first since they are needed for avl_find(). We want them to fit into + * a single 64 byte cache line to make avl_find() as fast as possible. + */ +struct avl_tree { + struct avl_node *avl_root; /* root node in tree */ + int (*avl_compar)(const void *, const void *); + size_t avl_offset; /* offsetof(type, avl_link_t field) */ + ulong_t avl_numnodes; /* number of nodes in the tree */ + size_t avl_size; /* sizeof user type struct */ +}; + + +/* + * This will only by used via AVL_NEXT() or AVL_PREV() + */ +extern void *avl_walk(struct avl_tree *, void *, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _AVL_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/bitmap.h b/sys/contrib/opensolaris/uts/common/sys/bitmap.h new file mode 100644 index 0000000..d0dd12b --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/bitmap.h @@ -0,0 +1,194 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#ifndef _SYS_BITMAP_H +#define _SYS_BITMAP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/feature_tests.h> +#if defined(__GNUC__) && defined(_ASM_INLINES) && \ + (defined(__i386) || defined(__amd64)) +#include <asm/bitmap.h> +#endif + +/* + * Operations on bitmaps of arbitrary size + * A bitmap is a vector of 1 or more ulong_t's. + * The user of the package is responsible for range checks and keeping + * track of sizes. + */ + +#ifdef _LP64 +#define BT_ULSHIFT 6 /* log base 2 of BT_NBIPUL, to extract word index */ +#define BT_ULSHIFT32 5 /* log base 2 of BT_NBIPUL, to extract word index */ +#else +#define BT_ULSHIFT 5 /* log base 2 of BT_NBIPUL, to extract word index */ +#endif + +#define BT_NBIPUL (1 << BT_ULSHIFT) /* n bits per ulong_t */ +#define BT_ULMASK (BT_NBIPUL - 1) /* to extract bit index */ + +#ifdef _LP64 +#define BT_NBIPUL32 (1 << BT_ULSHIFT32) /* n bits per ulong_t */ +#define BT_ULMASK32 (BT_NBIPUL32 - 1) /* to extract bit index */ +#define BT_ULMAXMASK 0xffffffffffffffff /* used by bt_getlowbit */ +#else +#define BT_ULMAXMASK 0xffffffff +#endif + +/* + * bitmap is a ulong_t *, bitindex an index_t + * + * The macros BT_WIM and BT_BIW internal; there is no need + * for users of this package to use them. + */ + +/* + * word in map + */ +#define BT_WIM(bitmap, bitindex) \ + ((bitmap)[(bitindex) >> BT_ULSHIFT]) +/* + * bit in word + */ +#define BT_BIW(bitindex) \ + (1UL << ((bitindex) & BT_ULMASK)) + +#ifdef _LP64 +#define BT_WIM32(bitmap, bitindex) \ + ((bitmap)[(bitindex) >> BT_ULSHIFT32]) + +#define BT_BIW32(bitindex) \ + (1UL << ((bitindex) & BT_ULMASK32)) +#endif + +/* + * These are public macros + * + * BT_BITOUL == n bits to n ulong_t's + */ +#define BT_BITOUL(nbits) \ + (((nbits) + BT_NBIPUL - 1l) / BT_NBIPUL) +#define BT_SIZEOFMAP(nbits) \ + (BT_BITOUL(nbits) * sizeof (ulong_t)) +#define BT_TEST(bitmap, bitindex) \ + ((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0) +#define BT_SET(bitmap, bitindex) \ + { BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); } +#define BT_CLEAR(bitmap, bitindex) \ + { BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); } + +#ifdef _LP64 +#define BT_BITOUL32(nbits) \ + (((nbits) + BT_NBIPUL32 - 1l) / BT_NBIPUL32) +#define BT_SIZEOFMAP32(nbits) \ + (BT_BITOUL32(nbits) * sizeof (uint_t)) +#define BT_TEST32(bitmap, bitindex) \ + ((BT_WIM32((bitmap), (bitindex)) & BT_BIW32(bitindex)) ? 1 : 0) +#define BT_SET32(bitmap, bitindex) \ + { BT_WIM32((bitmap), (bitindex)) |= BT_BIW32(bitindex); } +#define BT_CLEAR32(bitmap, bitindex) \ + { BT_WIM32((bitmap), (bitindex)) &= ~BT_BIW32(bitindex); } +#endif /* _LP64 */ + + +/* + * BIT_ONLYONESET is a private macro not designed for bitmaps of + * arbitrary size. u must be an unsigned integer/long. It returns + * true if one and only one bit is set in u. + */ +#define BIT_ONLYONESET(u) \ + ((((u) == 0) ? 0 : ((u) & ((u) - 1)) == 0)) + +#if defined(_KERNEL) && !defined(_ASM) +#include <sys/atomic.h> + +/* + * return next available bit index from map with specified number of bits + */ +extern index_t bt_availbit(ulong_t *bitmap, size_t nbits); +/* + * find the highest order bit that is on, and is within or below + * the word specified by wx + */ +extern int bt_gethighbit(ulong_t *mapp, int wx); +extern int bt_range(ulong_t *bitmap, size_t *pos1, size_t *pos2, + size_t end_pos); +/* + * Find highest and lowest one bit set. + * Returns bit number + 1 of bit that is set, otherwise returns 0. + * Low order bit is 0, high order bit is 31. + */ +extern int highbit(ulong_t); +extern int lowbit(ulong_t); +extern int bt_getlowbit(ulong_t *bitmap, size_t start, size_t stop); +extern void bt_copy(ulong_t *, ulong_t *, ulong_t); + +/* + * find the parity + */ +extern int odd_parity(ulong_t); + +/* + * Atomically set/clear bits + * Atomic exclusive operations will set "result" to "-1" + * if the bit is already set/cleared. "result" will be set + * to 0 otherwise. + */ +#define BT_ATOMIC_SET(bitmap, bitindex) \ + { atomic_or_long(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); } +#define BT_ATOMIC_CLEAR(bitmap, bitindex) \ + { atomic_and_long(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); } + +#define BT_ATOMIC_SET_EXCL(bitmap, bitindex, result) \ + { result = atomic_set_long_excl(&(BT_WIM(bitmap, bitindex)), \ + (bitindex) % BT_NBIPUL); } +#define BT_ATOMIC_CLEAR_EXCL(bitmap, bitindex, result) \ + { result = atomic_clear_long_excl(&(BT_WIM(bitmap, bitindex)), \ + (bitindex) % BT_NBIPUL); } + +/* + * Extracts bits between index h (high, inclusive) and l (low, exclusive) from + * u, which must be an unsigned integer. + */ +#define BITX(u, h, l) (((u) >> (l)) & ((1LU << ((h) - (l) + 1LU)) - 1LU)) + +#endif /* _KERNEL && !_ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BITMAP_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/byteorder.h b/sys/contrib/opensolaris/uts/common/sys/byteorder.h new file mode 100644 index 0000000..b80a0f0 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/byteorder.h @@ -0,0 +1,137 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _SYS_BYTEORDER_H +#define _SYS_BYTEORDER_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/isa_defs.h> +#include <sys/int_types.h> + +#if defined(__GNUC__) && defined(_ASM_INLINES) && \ + (defined(__i386) || defined(__amd64)) +#include <asm/byteorder.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * macros for conversion between host and (internet) network byte order + */ + +#if defined(_BIG_ENDIAN) && !defined(ntohl) && !defined(__lint) +/* big-endian */ +#define ntohl(x) (x) +#define ntohs(x) (x) +#define htonl(x) (x) +#define htons(x) (x) + +#elif !defined(ntohl) /* little-endian */ + +#ifndef _IN_PORT_T +#define _IN_PORT_T +typedef uint16_t in_port_t; +#endif + +#ifndef _IN_ADDR_T +#define _IN_ADDR_T +typedef uint32_t in_addr_t; +#endif + +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5) +extern uint32_t htonl(uint32_t); +extern uint16_t htons(uint16_t); +extern uint32_t ntohl(uint32_t); +extern uint16_t ntohs(uint16_t); +#else +extern in_addr_t htonl(in_addr_t); +extern in_port_t htons(in_port_t); +extern in_addr_t ntohl(in_addr_t); +extern in_port_t ntohs(in_port_t); +#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5) */ +#endif + +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) + +/* + * Macros to reverse byte order + */ +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + +#define BMASK_8(x) ((x) & 0xff) +#define BMASK_16(x) ((x) & 0xffff) +#define BMASK_32(x) ((x) & 0xffffffff) +#define BMASK_64(x) (x) + +/* + * Macros to convert from a specific byte order to/from native byte order + */ +#ifdef _BIG_ENDIAN +#define BE_8(x) BMASK_8(x) +#define BE_16(x) BMASK_16(x) +#define BE_32(x) BMASK_32(x) +#define BE_64(x) BMASK_64(x) +#define LE_8(x) BSWAP_8(x) +#define LE_16(x) BSWAP_16(x) +#define LE_32(x) BSWAP_32(x) +#define LE_64(x) BSWAP_64(x) +#else +#define LE_8(x) BMASK_8(x) +#define LE_16(x) BMASK_16(x) +#define LE_32(x) BMASK_32(x) +#define LE_64(x) BMASK_64(x) +#define BE_8(x) BSWAP_8(x) +#define BE_16(x) BSWAP_16(x) +#define BE_32(x) BSWAP_32(x) +#define BE_64(x) BSWAP_64(x) +#endif + +#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BYTEORDER_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/callb.h b/sys/contrib/opensolaris/uts/common/sys/callb.h new file mode 100644 index 0000000..b12b2e2 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/callb.h @@ -0,0 +1,214 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CALLB_H +#define _SYS_CALLB_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/t_lock.h> +#include <sys/thread.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * definitions of callback classes (c_class) + * + * Callbacks belong in the same class if (1) their callback routines + * do the same kind of processing (ideally, using the same callback function) + * and (2) they can/should be executed at the same time in a cpr + * suspend/resume operation. + * + * Note: The DAEMON class, in particular, is for stopping kernel threads + * and nothing else. The CALLB_* macros below should be used to deal + * with kernel threads, and the callback function should be callb_generic_cpr. + * Another idiosyncrasy of the DAEMON class is that if a suspend operation + * fails, some of the callback functions may be called with the RESUME + * code which were never called with SUSPEND. Not a problem currently, + * but see bug 4201851. + */ +#define CB_CL_CPR_DAEMON 0 +#define CB_CL_CPR_VM 1 +#define CB_CL_CPR_CALLOUT 2 +#define CB_CL_CPR_OBP 3 +#define CB_CL_CPR_FB 4 +#define CB_CL_PANIC 5 +#define CB_CL_CPR_RPC 6 +#define CB_CL_CPR_PROMPRINTF 7 +#define CB_CL_UADMIN 8 +#define CB_CL_CPR_PM 9 +#define CB_CL_HALT 10 +#define CB_CL_CPR_DMA 11 +#define CB_CL_CPR_POST_USER 12 +#define CB_CL_UADMIN_PRE_VFS 13 +#define CB_CL_MDBOOT CB_CL_UADMIN +#define CB_CL_ENTER_DEBUGGER 14 +#define CB_CL_CPR_POST_KERNEL 15 +#define NCBCLASS 16 /* CHANGE ME if classes are added/removed */ + +/* + * CB_CL_CPR_DAEMON class specific definitions are given below: + */ + +/* + * code for CPR callb_execute_class + */ +#define CB_CODE_CPR_CHKPT 0 +#define CB_CODE_CPR_RESUME 1 + +typedef void * callb_id_t; +/* + * Per kernel thread structure for CPR daemon callbacks. + * Must be protected by either a existing lock in the daemon or + * a new lock created for such a purpose. + */ +typedef struct callb_cpr { + kmutex_t *cc_lockp; /* lock to protect this struct */ + char cc_events; /* various events for CPR */ + callb_id_t cc_id; /* callb id address */ + kcondvar_t cc_callb_cv; /* cv for callback waiting */ + kcondvar_t cc_stop_cv; /* cv to checkpoint block */ +} callb_cpr_t; + +/* + * cc_events definitions + */ +#define CALLB_CPR_START 1 /* a checkpoint request's started */ +#define CALLB_CPR_SAFE 2 /* thread is safe for CPR */ +#define CALLB_CPR_ALWAYS_SAFE 4 /* thread is ALWAYS safe for CPR */ + +/* + * Used when checking that all kernel threads are stopped. + */ +#define CALLB_MAX_RETRY 3 /* when waiting for kthread to sleep */ +#define CALLB_THREAD_DELAY 10 /* ticks allowed to reach sleep */ +#define CPR_KTHREAD_TIMEOUT_SEC 90 /* secs before callback times out -- */ + /* due to pwr mgmt of disks, make -- */ + /* big enough for worst spinup time */ + +#ifdef _KERNEL +/* + * + * CALLB_CPR_INIT macro is used by kernel threads to add their entry to + * the callback table and perform other initialization. It automatically + * adds the thread as being in the callback class CB_CL_CPR_DAEMON. + * + * cp - ptr to the callb_cpr_t structure for this kernel thread + * + * lockp - pointer to mutex protecting the callb_cpr_t stuct + * + * func - pointer to the callback function for this kernel thread. + * It has the prototype boolean_t <func>(void *arg, int code) + * where: arg - ptr to the callb_cpr_t structure + * code - not used for this type of callback + * returns: B_TRUE if successful; B_FALSE if unsuccessful. + * + * name - a string giving the name of the kernel thread + * + * Note: lockp is the lock to protect the callb_cpr_t (cp) structure + * later on. No lock held is needed for this initialization. + */ +#define CALLB_CPR_INIT(cp, lockp, func, name) { \ + bzero((caddr_t)(cp), sizeof (callb_cpr_t)); \ + (cp)->cc_lockp = lockp; \ + (cp)->cc_id = callb_add(func, (void *)(cp), \ + CB_CL_CPR_DAEMON, name); \ + } + +#ifndef __lock_lint +#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp)); +#else +#define CALLB_CPR_ASSERT(cp) +#endif +/* + * Some threads (like the idle threads) do not adhere to the callback + * protocol and are always considered safe. Such threads must never exit. + * They register their presence by calling this macro during their + * initialization. + * + * Args: + * t - thread pointer of the client kernel thread + * name - a string giving the name of the kernel thread + */ +#define CALLB_CPR_INIT_SAFE(t, name) { \ + (void) callb_add_thread(callb_generic_cpr_safe, \ + (void *) &callb_cprinfo_safe, CB_CL_CPR_DAEMON, \ + name, t); \ + } +/* + * The lock to protect cp's content must be held before + * calling the following two macros. + * + * Any code region between CALLB_CPR_SAFE_BEGIN and CALLB_CPR_SAFE_END + * is safe for checkpoint/resume. + */ +#define CALLB_CPR_SAFE_BEGIN(cp) { \ + CALLB_CPR_ASSERT(cp) \ + (cp)->cc_events |= CALLB_CPR_SAFE; \ + if ((cp)->cc_events & CALLB_CPR_START) \ + cv_signal(&(cp)->cc_callb_cv); \ + } +#define CALLB_CPR_SAFE_END(cp, lockp) { \ + CALLB_CPR_ASSERT(cp) \ + while ((cp)->cc_events & CALLB_CPR_START) \ + cv_wait(&(cp)->cc_stop_cv, lockp); \ + (cp)->cc_events &= ~CALLB_CPR_SAFE; \ + } +/* + * cv_destroy is nop right now but may be needed in the future. + */ +#define CALLB_CPR_EXIT(cp) { \ + CALLB_CPR_ASSERT(cp) \ + (cp)->cc_events |= CALLB_CPR_SAFE; \ + if ((cp)->cc_events & CALLB_CPR_START) \ + cv_signal(&(cp)->cc_callb_cv); \ + mutex_exit((cp)->cc_lockp); \ + (void) callb_delete((cp)->cc_id); \ + cv_destroy(&(cp)->cc_callb_cv); \ + cv_destroy(&(cp)->cc_stop_cv); \ + } + +extern callb_cpr_t callb_cprinfo_safe; +extern callb_id_t callb_add(boolean_t (*)(void *, int), void *, int, char *); +extern callb_id_t callb_add_thread(boolean_t (*)(void *, int), + void *, int, char *, kthread_id_t); +extern int callb_delete(callb_id_t); +extern void callb_execute(callb_id_t, int); +extern void *callb_execute_class(int, int); +extern boolean_t callb_generic_cpr(void *, int); +extern boolean_t callb_generic_cpr_safe(void *, int); +extern boolean_t callb_is_stopped(kthread_id_t, caddr_t *); +extern void callb_lock_table(void); +extern void callb_unlock_table(void); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CALLB_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/ccompile.h b/sys/contrib/opensolaris/uts/common/sys/ccompile.h new file mode 100644 index 0000000..c9857b08 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/ccompile.h @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CCOMPILE_H +#define _SYS_CCOMPILE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This file contains definitions designed to enable different compilers + * to be used harmoniously on Solaris systems. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Allow for version tests for compiler bugs and features. + */ +#if defined(__GNUC__) +#define __GNUC_VERSION \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#else +#define __GNUC_VERSION 0 +#endif + +#if defined(__ATTRIBUTE_IMPLEMENTED) || defined(__GNUC__) + +/* + * analogous to lint's PRINTFLIKEn + */ +#define __sun_attr___PRINTFLIKE__(__n) \ + __attribute__((__format__(printf, __n, (__n)+1))) +#define __sun_attr___VPRINTFLIKE__(__n) \ + __attribute__((__format__(printf, __n, 0))) + +/* + * Handle the kernel printf routines that can take '%b' too + */ +#if __GNUC_VERSION < 30402 +/* + * XX64 at least this doesn't work correctly yet with 3.4.1 anyway! + */ +#define __sun_attr___KPRINTFLIKE__ __sun_attr___PRINTFLIKE__ +#define __sun_attr___KVPRINTFLIKE__ __sun_attr___VPRINTFLIKE__ +#else +#define __sun_attr___KPRINTFLIKE__(__n) \ + __attribute__((__format__(cmn_err, __n, (__n)+1))) +#define __sun_attr___KVPRINTFLIKE__(__n) \ + __attribute__((__format__(cmn_err, __n, 0))) +#endif + +/* + * This one's pretty obvious -- the function never returns + */ +#define __sun_attr___noreturn__ __attribute__((__noreturn__)) + + +/* + * This is an appropriate label for functions that do not + * modify their arguments, e.g. strlen() + */ +#define __sun_attr___pure__ __attribute__((__pure__)) + +/* + * This is a stronger form of __pure__. Can be used for functions + * that do not modify their arguments and don't depend on global + * memory. + */ +#define __sun_attr___const__ __attribute__((__const__)) + +/* + * structure packing like #pragma pack(1) + */ +#define __sun_attr___packed__ __attribute__((__packed__)) + +#define ___sun_attr_inner(__a) __sun_attr_##__a +#define __sun_attr__(__a) ___sun_attr_inner __a + +#else /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */ + +#define __sun_attr__(__a) + +#endif /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */ + +/* + * Shorthand versions for readability + */ + +#define __PRINTFLIKE(__n) __sun_attr__((__PRINTFLIKE__(__n))) +#define __VPRINTFLIKE(__n) __sun_attr__((__VPRINTFLIKE__(__n))) +#define __KPRINTFLIKE(__n) __sun_attr__((__KPRINTFLIKE__(__n))) +#define __KVPRINTFLIKE(__n) __sun_attr__((__KVPRINTFLIKE__(__n))) +#define __NORETURN __sun_attr__((__noreturn__)) +#define __CONST __sun_attr__((__const__)) +#define __PURE __sun_attr__((__pure__)) + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CCOMPILE_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/compress.h b/sys/contrib/opensolaris/uts/common/sys/compress.h new file mode 100644 index 0000000..3d79d95 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/compress.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1998 by Sun Microsystems, Inc. + * All rights reserved. + */ + +#ifndef _SYS_COMPRESS_H +#define _SYS_COMPRESS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern size_t compress(void *, void *, size_t); +extern size_t decompress(void *, void *, size_t, size_t); +extern uint32_t checksum32(void *, size_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_COMPRESS_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/cred.h b/sys/contrib/opensolaris/uts/common/sys/cred.h new file mode 100644 index 0000000..c1400b8 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/cred.h @@ -0,0 +1,154 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ifndef _SYS_CRED_H +#define _SYS_CRED_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The credential is an opaque kernel private data structure defined in + * <sys/cred_impl.h>. + */ + +typedef struct cred cred_t; + +#ifdef _KERNEL + +#define CRED() curthread->t_cred + +struct proc; /* cred.h is included in proc.h */ +struct prcred; + +struct auditinfo_addr; /* cred.h is included in audit.h */ + +extern int ngroups_max; +/* + * kcred is used when you need all privileges. + */ +extern struct cred *kcred; + +extern void cred_init(void); +extern void crhold(cred_t *); +extern void crfree(cred_t *); +extern cred_t *cralloc(void); /* all but ref uninitialized */ +extern cred_t *crget(void); /* initialized */ +extern cred_t *crcopy(cred_t *); +extern void crcopy_to(cred_t *, cred_t *); +extern cred_t *crdup(cred_t *); +extern void crdup_to(cred_t *, cred_t *); +extern cred_t *crgetcred(void); +extern void crset(struct proc *, cred_t *); +extern int groupmember(gid_t, const cred_t *); +extern int supgroupmember(gid_t, const cred_t *); +extern int hasprocperm(const cred_t *, const cred_t *); +extern int prochasprocperm(struct proc *, struct proc *, const cred_t *); +extern int crcmp(const cred_t *, const cred_t *); +extern cred_t *zone_kcred(void); + +extern uid_t crgetuid(const cred_t *); +extern uid_t crgetruid(const cred_t *); +extern uid_t crgetsuid(const cred_t *); +extern gid_t crgetgid(const cred_t *); +extern gid_t crgetrgid(const cred_t *); +extern gid_t crgetsgid(const cred_t *); +extern zoneid_t crgetzoneid(const cred_t *); +extern projid_t crgetprojid(const cred_t *); + + +extern const struct auditinfo_addr *crgetauinfo(const cred_t *); +extern struct auditinfo_addr *crgetauinfo_modifiable(cred_t *); + +extern uint_t crgetref(const cred_t *); + +extern const gid_t *crgetgroups(const cred_t *); + +extern int crgetngroups(const cred_t *); + +/* + * Sets real, effective and/or saved uid/gid; + * -1 argument accepted as "no change". + */ +extern int crsetresuid(cred_t *, uid_t, uid_t, uid_t); +extern int crsetresgid(cred_t *, gid_t, gid_t, gid_t); + +/* + * Sets real, effective and saved uids/gids all to the same + * values. Both values must be non-negative and <= MAXUID + */ +extern int crsetugid(cred_t *, uid_t, gid_t); + +extern int crsetgroups(cred_t *, int, gid_t *); + +/* + * Private interface for setting zone association of credential. + */ +struct zone; +extern void crsetzone(cred_t *, struct zone *); +extern struct zone *crgetzone(const cred_t *); + +/* + * Private interface for setting project id in credential. + */ +extern void crsetprojid(cred_t *, projid_t); + +/* + * Private interface for nfs. + */ +extern cred_t *crnetadjust(cred_t *); + +/* + * Private interface for procfs. + */ +extern void cred2prcred(const cred_t *, struct prcred *); + +/* + * Private interfaces for Rampart Trusted Solaris. + */ +struct ts_label_s; +extern struct ts_label_s *crgetlabel(const cred_t *); +extern boolean_t crisremote(const cred_t *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CRED_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/debug.h b/sys/contrib/opensolaris/uts/common/sys/debug.h new file mode 100644 index 0000000..c87c884 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/debug.h @@ -0,0 +1,129 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#ifndef _SYS_DEBUG_H +#define _SYS_DEBUG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * ASSERT(ex) causes a panic or debugger entry if expression ex is not + * true. ASSERT() is included only for debugging, and is a no-op in + * production kernels. VERIFY(ex), on the other hand, behaves like + * ASSERT and is evaluated on both debug and non-debug kernels. + */ + +#if defined(__STDC__) +extern int assfail(const char *, const char *, int); +#define VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__))) +#if DEBUG +#define ASSERT(EX) VERIFY(EX) +#else +#define ASSERT(x) ((void)0) +#endif +#else /* defined(__STDC__) */ +extern int assfail(); +#define VERIFY(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__))) +#if DEBUG +#define ASSERT(EX) VERIFY(EX) +#else +#define ASSERT(x) ((void)0) +#endif +#endif /* defined(__STDC__) */ + +/* + * Assertion variants sensitive to the compilation data model + */ +#if defined(_LP64) +#define ASSERT64(x) ASSERT(x) +#define ASSERT32(x) +#else +#define ASSERT64(x) +#define ASSERT32(x) ASSERT(x) +#endif + +/* + * ASSERT3() behaves like ASSERT() except that it is an explicit conditional, + * and prints out the values of the left and right hand expressions as part of + * the panic message to ease debugging. The three variants imply the type + * of their arguments. ASSERT3S() is for signed data types, ASSERT3U() is + * for unsigned, and ASSERT3P() is for pointers. The VERIFY3*() macros + * have the same relationship as above. + */ +extern void assfail3(const char *, uintmax_t, const char *, uintmax_t, + const char *, int); +#define VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE) do { \ + const TYPE __left = (TYPE)(LEFT); \ + const TYPE __right = (TYPE)(RIGHT); \ + if (!(__left OP __right)) \ + assfail3(#LEFT " " #OP " " #RIGHT, \ + (uintmax_t)__left, #OP, (uintmax_t)__right, \ + __FILE__, __LINE__); \ +_NOTE(CONSTCOND) } while (0) + +#define VERIFY3S(x, y, z) VERIFY3_IMPL(x, y, z, int64_t) +#define VERIFY3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t) +#define VERIFY3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t) +#if DEBUG +#define ASSERT3S(x, y, z) VERIFY3S(x, y, z) +#define ASSERT3U(x, y, z) VERIFY3U(x, y, z) +#define ASSERT3P(x, y, z) VERIFY3P(x, y, z) +#else +#define ASSERT3S(x, y, z) ((void)0) +#define ASSERT3U(x, y, z) ((void)0) +#define ASSERT3P(x, y, z) ((void)0) +#endif + +#ifdef _KERNEL + +extern void abort_sequence_enter(char *); +extern void debug_enter(char *); + +#endif /* _KERNEL */ + +#if defined(DEBUG) && !defined(__sun) +/* CSTYLED */ +#define STATIC +#else +/* CSTYLED */ +#define STATIC static +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DEBUG_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/dkio.h b/sys/contrib/opensolaris/uts/common/sys/dkio.h new file mode 100644 index 0000000..b0ddd07 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/dkio.h @@ -0,0 +1,477 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DKIO_H +#define _SYS_DKIO_H + +#pragma ident "%Z%%M% %I% %E% SMI" /* SunOS-4.0 5.19 */ + +#include <sys/dklabel.h> /* Needed for NDKMAP define */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Structures and definitions for disk io control commands + */ + +/* + * Structures used as data by ioctl calls. + */ + +#define DK_DEVLEN 16 /* device name max length, including */ + /* unit # & NULL (ie - "xyc1") */ + +/* + * Used for controller info + */ +struct dk_cinfo { + char dki_cname[DK_DEVLEN]; /* controller name (no unit #) */ + ushort_t dki_ctype; /* controller type */ + ushort_t dki_flags; /* flags */ + ushort_t dki_cnum; /* controller number */ + uint_t dki_addr; /* controller address */ + uint_t dki_space; /* controller bus type */ + uint_t dki_prio; /* interrupt priority */ + uint_t dki_vec; /* interrupt vector */ + char dki_dname[DK_DEVLEN]; /* drive name (no unit #) */ + uint_t dki_unit; /* unit number */ + uint_t dki_slave; /* slave number */ + ushort_t dki_partition; /* partition number */ + ushort_t dki_maxtransfer; /* max. transfer size in DEV_BSIZE */ +}; + +/* + * Controller types + */ +#define DKC_UNKNOWN 0 +#define DKC_CDROM 1 /* CD-ROM, SCSI or otherwise */ +#define DKC_WDC2880 2 +#define DKC_XXX_0 3 /* unassigned */ +#define DKC_XXX_1 4 /* unassigned */ +#define DKC_DSD5215 5 +#define DKC_ACB4000 7 +#define DKC_MD21 8 +#define DKC_XXX_2 9 /* unassigned */ +#define DKC_NCRFLOPPY 10 +#define DKC_SMSFLOPPY 12 +#define DKC_SCSI_CCS 13 /* SCSI CCS compatible */ +#define DKC_INTEL82072 14 /* native floppy chip */ +#define DKC_MD 16 /* meta-disk (virtual-disk) driver */ +#define DKC_INTEL82077 19 /* 82077 floppy disk controller */ +#define DKC_DIRECT 20 /* Intel direct attached device i.e. IDE */ +#define DKC_PCMCIA_MEM 21 /* PCMCIA memory disk-like type */ +#define DKC_PCMCIA_ATA 22 /* PCMCIA AT Attached type */ + +/* + * Sun reserves up through 1023 + */ + +#define DKC_CUSTOMER_BASE 1024 + +/* + * Flags + */ +#define DKI_BAD144 0x01 /* use DEC std 144 bad sector fwding */ +#define DKI_MAPTRK 0x02 /* controller does track mapping */ +#define DKI_FMTTRK 0x04 /* formats only full track at a time */ +#define DKI_FMTVOL 0x08 /* formats only full volume at a time */ +#define DKI_FMTCYL 0x10 /* formats only full cylinders at a time */ +#define DKI_HEXUNIT 0x20 /* unit number is printed as 3 hex digits */ +#define DKI_PCMCIA_PFD 0x40 /* PCMCIA pseudo-floppy memory card */ + +/* + * Used for all partitions + */ +struct dk_allmap { + struct dk_map dka_map[NDKMAP]; +}; + +#if defined(_SYSCALL32) +struct dk_allmap32 { + struct dk_map32 dka_map[NDKMAP]; +}; +#endif /* _SYSCALL32 */ + +/* + * Definition of a disk's geometry + */ +struct dk_geom { + unsigned short dkg_ncyl; /* # of data cylinders */ + unsigned short dkg_acyl; /* # of alternate cylinders */ + unsigned short dkg_bcyl; /* cyl offset (for fixed head area) */ + unsigned short dkg_nhead; /* # of heads */ + unsigned short dkg_obs1; /* obsolete */ + unsigned short dkg_nsect; /* # of data sectors per track */ + unsigned short dkg_intrlv; /* interleave factor */ + unsigned short dkg_obs2; /* obsolete */ + unsigned short dkg_obs3; /* obsolete */ + unsigned short dkg_apc; /* alternates per cyl (SCSI only) */ + unsigned short dkg_rpm; /* revolutions per minute */ + unsigned short dkg_pcyl; /* # of physical cylinders */ + unsigned short dkg_write_reinstruct; /* # sectors to skip, writes */ + unsigned short dkg_read_reinstruct; /* # sectors to skip, reads */ + unsigned short dkg_extra[7]; /* for compatible expansion */ +}; + +/* + * These defines are for historic compatibility with old drivers. + */ +#define dkg_bhead dkg_obs1 /* used to be head offset */ +#define dkg_gap1 dkg_obs2 /* used to be gap1 */ +#define dkg_gap2 dkg_obs3 /* used to be gap2 */ + +/* + * Disk io control commands + * Warning: some other ioctls with the DIOC prefix exist elsewhere. + * The Generic DKIOC numbers are from 0 - 50. + * The Floppy Driver uses 51 - 100. + * The Hard Disk (except SCSI) 101 - 106. (these are obsolete) + * The CDROM Driver 151 - 200. + * The USCSI ioctl 201 - 250. + */ +#define DKIOC (0x04 << 8) + +/* + * The following ioctls are generic in nature and need to be + * suported as appropriate by all disk drivers + */ +#define DKIOCGGEOM (DKIOC|1) /* Get geometry */ +#define DKIOCINFO (DKIOC|3) /* Get info */ +#define DKIOCEJECT (DKIOC|6) /* Generic 'eject' */ +#define DKIOCGVTOC (DKIOC|11) /* Get VTOC */ +#define DKIOCSVTOC (DKIOC|12) /* Set VTOC & Write to Disk */ + +/* + * Disk Cache Controls. These ioctls should be supported by + * all disk drivers. + * + * DKIOCFLUSHWRITECACHE when used from user-mode ignores the ioctl + * argument, but it should be passed as NULL to allow for future + * reinterpretation. From user-mode, this ioctl request is synchronous. + * + * When invoked from within the kernel, the arg can be NULL to indicate + * a synchronous request or can be the address of a struct dk_callback + * to request an asynchronous callback when the flush request is complete. + * In this case, the flag to the ioctl must include FKIOCTL and the + * dkc_callback field of the pointed to struct must be non-null or the + * request is made synchronously. + * + * In the callback case: if the ioctl returns 0, a callback WILL be performed. + * If the ioctl returns non-zero, a callback will NOT be performed. + * NOTE: In some cases, the callback may be done BEFORE the ioctl call + * returns. The caller's locking strategy should be prepared for this case. + */ +#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ + +struct dk_callback { + void (*dkc_callback)(void *dkc_cookie, int error); + void *dkc_cookie; +}; + +#define DKIOCGETWCE (DKIOC|36) /* Get current write cache */ + /* enablement status */ +#define DKIOCSETWCE (DKIOC|37) /* Enable/Disable write cache */ + +/* + * The following ioctls are used by Sun drivers to communicate + * with their associated format routines. Support of these ioctls + * is not required of foreign drivers + */ +#define DKIOCSGEOM (DKIOC|2) /* Set geometry */ +#define DKIOCSAPART (DKIOC|4) /* Set all partitions */ +#define DKIOCGAPART (DKIOC|5) /* Get all partitions */ +#define DKIOCG_PHYGEOM (DKIOC|32) /* get physical geometry */ +#define DKIOCG_VIRTGEOM (DKIOC|33) /* get virtual geometry */ + +/* + * The following ioctl's are removable media support + */ +#define DKIOCLOCK (DKIOC|7) /* Generic 'lock' */ +#define DKIOCUNLOCK (DKIOC|8) /* Generic 'unlock' */ +#define DKIOCSTATE (DKIOC|13) /* Inquire insert/eject state */ +#define DKIOCREMOVABLE (DKIOC|16) /* is media removable */ + + +/* + * ioctl for hotpluggable devices + */ +#define DKIOCHOTPLUGGABLE (DKIOC|35) /* is hotpluggable */ + +/* + * Ioctl to force driver to re-read the alternate partition and rebuild + * the internal defect map. + */ +#define DKIOCADDBAD (DKIOC|20) /* Re-read the alternate map (IDE) */ +#define DKIOCGETDEF (DKIOC|21) /* read defect list (IDE) */ + +/* + * Used by applications to get disk defect information from IDE + * drives. + */ +#ifdef _SYSCALL32 +struct defect_header32 { + int head; + caddr32_t buffer; +}; +#endif /* _SYSCALL32 */ + +struct defect_header { + int head; + caddr_t buffer; +}; + +#define DKIOCPARTINFO (DKIOC|22) /* Get partition or slice parameters */ + +/* + * Used by applications to get partition or slice information + */ +#ifdef _SYSCALL32 +struct part_info32 { + daddr32_t p_start; + int p_length; +}; +#endif /* _SYSCALL32 */ + +struct part_info { + daddr_t p_start; + int p_length; +}; + +/* The following ioctls are for Optical Memory Device */ +#define DKIOC_EBP_ENABLE (DKIOC|40) /* enable by pass erase on write */ +#define DKIOC_EBP_DISABLE (DKIOC|41) /* disable by pass erase on write */ + +/* + * This state enum is the argument passed to the DKIOCSTATE ioctl. + */ +enum dkio_state { DKIO_NONE, DKIO_EJECTED, DKIO_INSERTED, DKIO_DEV_GONE }; + +#define DKIOCGMEDIAINFO (DKIOC|42) /* get information about the media */ + +/* + * ioctls to read/write mboot info. + */ +#define DKIOCGMBOOT (DKIOC|43) /* get mboot info */ +#define DKIOCSMBOOT (DKIOC|44) /* set mboot info */ + +/* + * ioctl to get the device temperature. + */ +#define DKIOCGTEMPERATURE (DKIOC|45) /* get temperature */ + +/* + * Used for providing the temperature. + */ + +struct dk_temperature { + uint_t dkt_flags; /* Flags */ + short dkt_cur_temp; /* Current disk temperature */ + short dkt_ref_temp; /* reference disk temperature */ +}; + +#define DKT_BYPASS_PM 0x1 +#define DKT_INVALID_TEMP 0xFFFF + + +/* + * Used for Media info or the current profile info + */ +struct dk_minfo { + uint_t dki_media_type; /* Media type or profile info */ + uint_t dki_lbsize; /* Logical blocksize of media */ + diskaddr_t dki_capacity; /* Capacity as # of dki_lbsize blks */ +}; + +/* + * Media types or profiles known + */ +#define DK_UNKNOWN 0x00 /* Media inserted - type unknown */ + + +/* + * SFF 8090 Specification Version 3, media types 0x01 - 0xfffe are retained to + * maintain compatibility with SFF8090. The following define the + * optical media type. + */ +#define DK_REMOVABLE_DISK 0x02 /* Removable Disk */ +#define DK_MO_ERASABLE 0x03 /* MO Erasable */ +#define DK_MO_WRITEONCE 0x04 /* MO Write once */ +#define DK_AS_MO 0x05 /* AS MO */ +#define DK_CDROM 0x08 /* CDROM */ +#define DK_CDR 0x09 /* CD-R */ +#define DK_CDRW 0x0A /* CD-RW */ +#define DK_DVDROM 0x10 /* DVD-ROM */ +#define DK_DVDR 0x11 /* DVD-R */ +#define DK_DVDRAM 0x12 /* DVD_RAM or DVD-RW */ + +/* + * Media types for other rewritable magnetic media + */ +#define DK_FIXED_DISK 0x10001 /* Fixed disk SCSI or otherwise */ +#define DK_FLOPPY 0x10002 /* Floppy media */ +#define DK_ZIP 0x10003 /* IOMEGA ZIP media */ +#define DK_JAZ 0x10004 /* IOMEGA JAZ media */ + +#define DKIOCSETEFI (DKIOC|17) /* Set EFI info */ +#define DKIOCGETEFI (DKIOC|18) /* Get EFI info */ + +#define DKIOCPARTITION (DKIOC|9) /* Get partition info */ + +/* + * Ioctls to get/set volume capabilities related to Logical Volume Managers. + * They include the ability to get/set capabilities and to issue a read to a + * specific underlying device of a replicated device. + */ + +#define DKIOCGETVOLCAP (DKIOC | 25) /* Get volume capabilities */ +#define DKIOCSETVOLCAP (DKIOC | 26) /* Set volume capabilities */ +#define DKIOCDMR (DKIOC | 27) /* Issue a directed read */ + +typedef uint_t volcapinfo_t; + +typedef uint_t volcapset_t; + +#define DKV_ABR_CAP 0x00000001 /* Support Appl.Based Recovery */ +#define DKV_DMR_CAP 0x00000002 /* Support Directed Mirror Read */ + +typedef struct volcap { + volcapinfo_t vc_info; /* Capabilities available */ + volcapset_t vc_set; /* Capabilities set */ +} volcap_t; + +#define VOL_SIDENAME 256 + +typedef struct vol_directed_rd { + int vdr_flags; + offset_t vdr_offset; + size_t vdr_nbytes; + size_t vdr_bytesread; + void *vdr_data; + int vdr_side; + char vdr_side_name[VOL_SIDENAME]; +} vol_directed_rd_t; + +#define DKV_SIDE_INIT (-1) +#define DKV_DMR_NEXT_SIDE 0x00000001 +#define DKV_DMR_DONE 0x00000002 +#define DKV_DMR_ERROR 0x00000004 +#define DKV_DMR_SUCCESS 0x00000008 +#define DKV_DMR_SHORT 0x00000010 + +#ifdef _MULTI_DATAMODEL +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif +typedef struct vol_directed_rd32 { + int32_t vdr_flags; + offset_t vdr_offset; /* 64-bit element on 32-bit alignment */ + size32_t vdr_nbytes; + size32_t vdr_bytesread; + caddr32_t vdr_data; + int32_t vdr_side; + char vdr_side_name[VOL_SIDENAME]; +} vol_directed_rd32_t; +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif +#endif /* _MULTI_DATAMODEL */ + +/* + * The ioctl is used to fetch disk's device type, vendor ID, + * model number/product ID, firmware revision and serial number together. + * + * Currently there are two device types - DKD_ATA_TYPE which means the + * disk is driven by cmdk/ata or dad/uata driver, and DKD_SCSI_TYPE + * which means the disk is driven by sd/scsi hba driver. + */ +#define DKIOC_GETDISKID (DKIOC|46) + +/* These two labels are for dkd_dtype of dk_disk_id_t */ +#define DKD_ATA_TYPE 0x01 /* ATA disk or legacy mode SATA disk */ +#define DKD_SCSI_TYPE 0x02 /* SCSI disk or native mode SATA disk */ + +#define DKD_ATA_MODEL 40 /* model number length */ +#define DKD_ATA_FWVER 8 /* firmware revision length */ +#define DKD_ATA_SERIAL 20 /* serial number length */ + +#define DKD_SCSI_VENDOR 8 /* vendor ID length */ +#define DKD_SCSI_PRODUCT 16 /* product ID length */ +#define DKD_SCSI_REVLEVEL 4 /* revision level length */ +#define DKD_SCSI_SERIAL 12 /* serial number length */ + +/* + * The argument type for DKIOC_GETDISKID ioctl. + */ +typedef struct dk_disk_id { + uint_t dkd_dtype; + union { + struct { + char dkd_amodel[DKD_ATA_MODEL]; /* 40 bytes */ + char dkd_afwver[DKD_ATA_FWVER]; /* 8 bytes */ + char dkd_aserial[DKD_ATA_SERIAL]; /* 20 bytes */ + } ata_disk_id; + struct { + char dkd_svendor[DKD_SCSI_VENDOR]; /* 8 bytes */ + char dkd_sproduct[DKD_SCSI_PRODUCT]; /* 16 bytes */ + char dkd_sfwver[DKD_SCSI_REVLEVEL]; /* 4 bytes */ + char dkd_sserial[DKD_SCSI_SERIAL]; /* 12 bytes */ + } scsi_disk_id; + } disk_id; +} dk_disk_id_t; + +/* + * The ioctl is used to update the firmware of device. + */ +#define DKIOC_UPDATEFW (DKIOC|47) + +/* The argument type for DKIOC_UPDATEFW ioctl */ +typedef struct dk_updatefw { + caddr_t dku_ptrbuf; /* pointer to firmware buf */ + uint_t dku_size; /* firmware buf length */ + uint8_t dku_type; /* firmware update type */ +} dk_updatefw_t; + +#ifdef _SYSCALL32 +typedef struct dk_updatefw_32 { + caddr32_t dku_ptrbuf; /* pointer to firmware buf */ + uint_t dku_size; /* firmware buf length */ + uint8_t dku_type; /* firmware update type */ +} dk_updatefw_32_t; +#endif /* _SYSCALL32 */ + +/* + * firmware update type - temporary or permanent use + */ +#define FW_TYPE_TEMP 0x0 /* temporary use */ +#define FW_TYPE_PERM 0x1 /* permanent use */ + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DKIO_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/dklabel.h b/sys/contrib/opensolaris/uts/common/sys/dklabel.h new file mode 100644 index 0000000..92cb47a --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/dklabel.h @@ -0,0 +1,268 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1990-2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DKLABEL_H +#define _SYS_DKLABEL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/isa_defs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Miscellaneous defines + */ +#define DKL_MAGIC 0xDABE /* magic number */ +#define FKL_MAGIC 0xff /* magic number for DOS floppies */ + +#if defined(_SUNOS_VTOC_16) +#define NDKMAP 16 /* # of logical partitions */ +#define DK_LABEL_LOC 1 /* location of disk label */ +#elif defined(_SUNOS_VTOC_8) +#define NDKMAP 8 /* # of logical partitions */ +#define DK_LABEL_LOC 0 /* location of disk label */ +#else +#error "No VTOC format defined." +#endif + +#define LEN_DKL_ASCII 128 /* length of dkl_asciilabel */ +#define LEN_DKL_VVOL 8 /* length of v_volume */ +#define DK_LABEL_SIZE 512 /* size of disk label */ +#define DK_MAX_BLOCKS 0x7fffffff /* max # of blocks handled */ + +/* + * Reserve two cylinders on SCSI disks. + * One is for the backup disk label and the other is for the deviceid. + * + * IPI disks only reserve one cylinder, but they will go away soon. + * CDROMs do not reserve any cylinders. + */ +#define DK_ACYL 2 + +/* + * Format of a Sun disk label. + * Resides in cylinder 0, head 0, sector 0. + * + * sizeof (struct dk_label) should be 512 (the current sector size), + * but should the sector size increase, this structure should remain + * at the beginning of the sector. + */ + +/* + * partition headers: section 1 + * Returned in struct dk_allmap by ioctl DKIOC[SG]APART (dkio(7I)) + */ +struct dk_map { + daddr_t dkl_cylno; /* starting cylinder */ + daddr_t dkl_nblk; /* number of blocks; if == 0, */ + /* partition is undefined */ +}; + +/* + * partition headers: section 1 + * Fixed size for on-disk dk_label + */ +struct dk_map32 { + daddr32_t dkl_cylno; /* starting cylinder */ + daddr32_t dkl_nblk; /* number of blocks; if == 0, */ + /* partition is undefined */ +}; + +/* + * partition headers: section 2, + * brought over from AT&T SVr4 vtoc structure. + */ +struct dk_map2 { + uint16_t p_tag; /* ID tag of partition */ + uint16_t p_flag; /* permission flag */ +}; + +struct dkl_partition { + uint16_t p_tag; /* ID tag of partition */ + uint16_t p_flag; /* permision flags */ + daddr32_t p_start; /* start sector no of partition */ + int32_t p_size; /* # of blocks in partition */ +}; + + +/* + * VTOC inclusions from AT&T SVr4 + * Fixed sized types for on-disk VTOC + */ + +struct dk_vtoc { +#if defined(_SUNOS_VTOC_16) + uint32_t v_bootinfo[3]; /* info for mboot (unsupported) */ + uint32_t v_sanity; /* to verify vtoc sanity */ + uint32_t v_version; /* layout version */ + char v_volume[LEN_DKL_VVOL]; /* volume name */ + uint16_t v_sectorsz; /* sector size in bytes */ + uint16_t v_nparts; /* number of partitions */ + uint32_t v_reserved[10]; /* free space */ + struct dkl_partition v_part[NDKMAP]; /* partition headers */ + time32_t timestamp[NDKMAP]; /* partition timestamp (unsupported) */ + char v_asciilabel[LEN_DKL_ASCII]; /* for compatibility */ +#elif defined(_SUNOS_VTOC_8) + uint32_t v_version; /* layout version */ + char v_volume[LEN_DKL_VVOL]; /* volume name */ + uint16_t v_nparts; /* number of partitions */ + struct dk_map2 v_part[NDKMAP]; /* partition hdrs, sec 2 */ + uint32_t v_bootinfo[3]; /* info needed by mboot */ + uint32_t v_sanity; /* to verify vtoc sanity */ + uint32_t v_reserved[10]; /* free space */ + time32_t v_timestamp[NDKMAP]; /* partition timestamp */ +#else +#error "No VTOC format defined." +#endif +}; + +/* + * define the amount of disk label padding needed to make + * the entire structure occupy 512 bytes. + */ +#if defined(_SUNOS_VTOC_16) +#define LEN_DKL_PAD (DK_LABEL_SIZE - \ + ((sizeof (struct dk_vtoc) + \ + (4 * sizeof (uint32_t)) + \ + (12 * sizeof (uint16_t)) + \ + (2 * (sizeof (uint16_t)))))) +#elif defined(_SUNOS_VTOC_8) +#define LEN_DKL_PAD (DK_LABEL_SIZE \ + - ((LEN_DKL_ASCII) + \ + (sizeof (struct dk_vtoc)) + \ + (sizeof (struct dk_map32) * NDKMAP) + \ + (14 * (sizeof (uint16_t))) + \ + (2 * (sizeof (uint16_t))))) +#else +#error "No VTOC format defined." +#endif + + +struct dk_label { +#if defined(_SUNOS_VTOC_16) + struct dk_vtoc dkl_vtoc; /* vtoc inclusions from AT&T SVr4 */ + uint32_t dkl_pcyl; /* # of physical cylinders */ + uint32_t dkl_ncyl; /* # of data cylinders */ + uint16_t dkl_acyl; /* # of alternate cylinders */ + uint16_t dkl_bcyl; /* cyl offset (for fixed head area) */ + uint32_t dkl_nhead; /* # of heads */ + uint32_t dkl_nsect; /* # of data sectors per track */ + uint16_t dkl_intrlv; /* interleave factor */ + uint16_t dkl_skew; /* skew factor */ + uint16_t dkl_apc; /* alternates per cyl (SCSI only) */ + uint16_t dkl_rpm; /* revolutions per minute */ + uint16_t dkl_write_reinstruct; /* # sectors to skip, writes */ + uint16_t dkl_read_reinstruct; /* # sectors to skip, reads */ + uint16_t dkl_extra[4]; /* for compatible expansion */ + char dkl_pad[LEN_DKL_PAD]; /* unused part of 512 bytes */ +#elif defined(_SUNOS_VTOC_8) + char dkl_asciilabel[LEN_DKL_ASCII]; /* for compatibility */ + struct dk_vtoc dkl_vtoc; /* vtoc inclusions from AT&T SVr4 */ + uint16_t dkl_write_reinstruct; /* # sectors to skip, writes */ + uint16_t dkl_read_reinstruct; /* # sectors to skip, reads */ + char dkl_pad[LEN_DKL_PAD]; /* unused part of 512 bytes */ + uint16_t dkl_rpm; /* rotations per minute */ + uint16_t dkl_pcyl; /* # physical cylinders */ + uint16_t dkl_apc; /* alternates per cylinder */ + uint16_t dkl_obs1; /* obsolete */ + uint16_t dkl_obs2; /* obsolete */ + uint16_t dkl_intrlv; /* interleave factor */ + uint16_t dkl_ncyl; /* # of data cylinders */ + uint16_t dkl_acyl; /* # of alternate cylinders */ + uint16_t dkl_nhead; /* # of heads in this partition */ + uint16_t dkl_nsect; /* # of 512 byte sectors per track */ + uint16_t dkl_obs3; /* obsolete */ + uint16_t dkl_obs4; /* obsolete */ + struct dk_map32 dkl_map[NDKMAP]; /* logical partition headers */ +#else +#error "No VTOC format defined." +#endif + uint16_t dkl_magic; /* identifies this label format */ + uint16_t dkl_cksum; /* xor checksum of sector */ +}; + +#if defined(_SUNOS_VTOC_16) +#define dkl_asciilabel dkl_vtoc.v_asciilabel +#define v_timestamp timestamp + +#elif defined(_SUNOS_VTOC_8) + +/* + * These defines are for historic compatibility with old drivers. + */ +#define dkl_gap1 dkl_obs1 /* used to be gap1 */ +#define dkl_gap2 dkl_obs2 /* used to be gap2 */ +#define dkl_bhead dkl_obs3 /* used to be label head offset */ +#define dkl_ppart dkl_obs4 /* used to by physical partition */ +#else +#error "No VTOC format defined." +#endif + +struct fk_label { /* DOS floppy label */ + uchar_t fkl_type; + uchar_t fkl_magich; + uchar_t fkl_magicl; + uchar_t filler; +}; + +/* + * Layout of stored fabricated device id (on-disk) + */ +#define DK_DEVID_BLKSIZE (512) +#define DK_DEVID_SIZE (DK_DEVID_BLKSIZE - ((sizeof (uchar_t) * 7))) +#define DK_DEVID_REV_MSB (0) +#define DK_DEVID_REV_LSB (1) + +struct dk_devid { + uchar_t dkd_rev_hi; /* revision (MSB) */ + uchar_t dkd_rev_lo; /* revision (LSB) */ + uchar_t dkd_flags; /* flags (not used yet) */ + uchar_t dkd_devid[DK_DEVID_SIZE]; /* devid stored here */ + uchar_t dkd_checksum3; /* checksum (MSB) */ + uchar_t dkd_checksum2; + uchar_t dkd_checksum1; + uchar_t dkd_checksum0; /* checksum (LSB) */ +}; + +#define DKD_GETCHKSUM(dkd) ((dkd)->dkd_checksum3 << 24) + \ + ((dkd)->dkd_checksum2 << 16) + \ + ((dkd)->dkd_checksum1 << 8) + \ + ((dkd)->dkd_checksum0) + +#define DKD_FORMCHKSUM(c, dkd) (dkd)->dkd_checksum3 = hibyte(hiword((c))); \ + (dkd)->dkd_checksum2 = lobyte(hiword((c))); \ + (dkd)->dkd_checksum1 = hibyte(loword((c))); \ + (dkd)->dkd_checksum0 = lobyte(loword((c))); +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DKLABEL_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/dnlc.h b/sys/contrib/opensolaris/uts/common/sys/dnlc.h new file mode 100644 index 0000000..88b7509 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/dnlc.h @@ -0,0 +1,232 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _SYS_DNLC_H +#define _SYS_DNLC_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/kstat.h> + +/* + * DNLC - Directory name lookup cache. + * There are now two sorts of name caching: + * + * Standard dnlc: This original cache holds recent mappings + * of <directory vnode, name> to vnode mappings. + * + * Directory caches: Entire large directories can be cached, subject to + * memory availability and tunables. A directory cache + * anchor point must be provided in the xxnode for + * a directory. + */ + + +/* + * Standard dnlc + * ============= + */ + +/* + * This structure describes the elements in the cache of recent + * names looked up. + * + * Note namlen is a uchar_t to conserve space + * and alignment padding. The max length of any + * pathname component is defined as MAXNAMELEN + * which is 256 (including the terminating null). + * So provided this doesn't change, we don't include the null, + * we always use bcmp to compare strings, and we don't start + * storing full names, then we are ok. The space savings are worth it. + */ +typedef struct ncache { + struct ncache *hash_next; /* hash chain, MUST BE FIRST */ + struct ncache *hash_prev; + struct vnode *vp; /* vnode the name refers to */ + struct vnode *dp; /* vnode of parent of name */ + int hash; /* hash signature */ + uchar_t namlen; /* length of name */ + char name[1]; /* segment name - null terminated */ +} ncache_t; + +/* + * Hash table bucket structure of name cache entries for fast lookup. + */ +typedef struct nc_hash { + ncache_t *hash_next; + ncache_t *hash_prev; + kmutex_t hash_lock; +} nc_hash_t; + +/* + * Statistics on name cache + * Not protected by locks + */ +/* + * ncstats has been deprecated, due to the integer size of the counters + * which can easily overflow in the dnlc. + * It is maintained (at some expense) for compatability. + * The preferred interface is the kstat accessible nc_stats below, ehich + * is actually shared with directory caching. + */ +struct ncstats { + int hits; /* hits that we can really use */ + int misses; /* cache misses */ + int enters; /* number of enters done */ + int dbl_enters; /* number of enters tried when already cached */ + int long_enter; /* deprecated, no longer accounted */ + int long_look; /* deprecated, no longer accounted */ + int move_to_front; /* entry moved to front of hash chain */ + int purges; /* number of purges of cache */ +}; + +struct nc_stats { + kstat_named_t ncs_hits; /* cache hits */ + kstat_named_t ncs_misses; /* cache misses */ + kstat_named_t ncs_neg_hits; /* negative cache hits */ + kstat_named_t ncs_enters; /* enters */ + kstat_named_t ncs_dbl_enters; /* enters when entry already cached */ + kstat_named_t ncs_purge_total; /* total entries prurged */ + kstat_named_t ncs_purge_all; /* dnlc_purge() calls */ + kstat_named_t ncs_purge_vp; /* dnlc_purge_vp() calls */ + kstat_named_t ncs_purge_vfs; /* dnlc_purge_vfs() calls */ + kstat_named_t ncs_purge_fs1; /* dnlc_purge_fs1() calls */ + kstat_named_t ncs_pick_free; /* found a free ncache */ + kstat_named_t ncs_pick_heur; /* found ncache w/ NULL vpages */ + kstat_named_t ncs_pick_last; /* found last ncache on chain */ +}; + +/* + * The dnlc hashing function. + * Although really a kernel macro we export it to allow validation + * of ncache_t entries by mdb. Note, mdb can handle the ASSERT. + * + * 'hash' and 'namlen' must be l-values. A check is made to ensure + * the name length fits into an unsigned char (see ncache_t). + */ +#define DNLCHASH(name, dvp, hash, namlen) \ + { \ + char Xc, *Xcp; \ + hash = (int)((uintptr_t)(dvp)) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + (hash) = ((hash) << 4) + (hash) + Xc; \ + ASSERT((Xcp - (name)) <= ((1 << NBBY) - 1)); \ + (namlen) = Xcp - (name); \ + } + +#if defined(_KERNEL) + +#include <sys/vfs.h> +#include <sys/vnode.h> + +extern int ncsize; /* set in param_init() # of dnlc entries */ +extern vnode_t negative_cache_vnode; +#define DNLC_NO_VNODE &negative_cache_vnode + +void dnlc_update(vnode_t *, char *, vnode_t *); +vnode_t *dnlc_lookup(vnode_t *, char *); +int dnlc_purge_vfsp(vfs_t *, int); +void dnlc_remove(vnode_t *, char *); +void dnlc_reduce_cache(void *); + +#endif /* defined(_KERNEL) */ + + +/* + * Directory caching interfaces + * ============================ + */ + +/* + * Typically for large directories, the file names will be the same or + * at least similar lengths. So there's no point in anything more elaborate + * than a simple unordered linked list of free space entries. + * For small directories the name length distribution doesn't really matter. + */ +typedef struct dcfree { + uint64_t df_handle; /* fs supplied handle */ + struct dcfree *df_next; /* link to next free entry in bucket */ + uint_t df_len; /* length of free entry */ +} dcfree_t; + +typedef struct dcentry { + uint64_t de_handle; /* fs supplied and returned data */ + struct dcentry *de_next; /* link to next name entry in bucket */ + int de_hash; /* hash signature */ + uchar_t de_namelen; /* length of name excluding null */ + char de_name[1]; /* null terminated name */ +} dcentry_t; + +typedef struct dircache { + struct dircache *dc_next; /* chain - for purge purposes */ + struct dircache *dc_prev; /* chain - for purge purposes */ + int64_t dc_actime; /* dir access time, from lbolt64 */ + dcentry_t **dc_namehash; /* entry hash table pointer */ + dcfree_t **dc_freehash; /* free entry hash table pointer */ + uint_t dc_num_entries; /* no of named entries */ + uint_t dc_num_free; /* no of free space entries */ + uint_t dc_nhash_mask; /* name hash table size - 1 */ + uint_t dc_fhash_mask; /* free space hash table size - 1 */ + struct dcanchor *dc_anchor; /* back ptr to anchor */ + boolean_t dc_complete; /* cache complete boolean */ +} dircache_t; + +typedef struct dcanchor { + void *dca_dircache; /* pointer to directory cache */ + kmutex_t dca_lock; /* protects the directory cache */ +} dcanchor_t; + +/* + * Head struct for doubly linked chain of dircache_t + * The next and prev fields must match those of a dircache_t + */ +typedef struct { + dircache_t *dch_next; /* next in chain */ + dircache_t *dch_prev; /* prev in chain */ + kmutex_t dch_lock; /* lock for the chain */ +} dchead_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DNLC_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/errorq.h b/sys/contrib/opensolaris/uts/common/sys/errorq.h new file mode 100644 index 0000000..971b19e --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/errorq.h @@ -0,0 +1,83 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ERRORQ_H +#define _ERRORQ_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/nvpair.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct errorq errorq_t; +typedef struct errorq_elem errorq_elem_t; +typedef void (*errorq_func_t)(void *, const void *, const errorq_elem_t *); + +/* + * Public flags for errorq_create(): bit range 0-15 + */ +#define ERRORQ_VITAL 0x0001 /* drain queue automatically on system reset */ + +/* + * Public flags for errorq_dispatch(): + */ +#define ERRORQ_ASYNC 0 /* schedule async queue drain for caller */ +#define ERRORQ_SYNC 1 /* do not schedule drain; caller will drain */ + +#ifdef _KERNEL + +extern errorq_t *errorq_create(const char *, errorq_func_t, void *, + ulong_t, size_t, uint_t, uint_t); + +extern errorq_t *errorq_nvcreate(const char *, errorq_func_t, void *, + ulong_t, size_t, uint_t, uint_t); + +extern void errorq_destroy(errorq_t *); +extern void errorq_dispatch(errorq_t *, const void *, size_t, uint_t); +extern void errorq_drain(errorq_t *); +extern void errorq_init(void); +extern void errorq_panic(void); +extern errorq_elem_t *errorq_reserve(errorq_t *); +extern void errorq_commit(errorq_t *, errorq_elem_t *, uint_t); +extern void errorq_cancel(errorq_t *, errorq_elem_t *); +extern nvlist_t *errorq_elem_nvl(errorq_t *, const errorq_elem_t *); +extern nv_alloc_t *errorq_elem_nva(errorq_t *, const errorq_elem_t *); +extern void *errorq_elem_dup(errorq_t *, const errorq_elem_t *, + errorq_elem_t **); +extern void errorq_dump(); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ERRORQ_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/feature_tests.h b/sys/contrib/opensolaris/uts/common/sys/feature_tests.h new file mode 100644 index 0000000..bb79cb8 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/feature_tests.h @@ -0,0 +1,397 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FEATURE_TESTS_H +#define _SYS_FEATURE_TESTS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/ccompile.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Values of _POSIX_C_SOURCE + * + * undefined not a POSIX compilation + * 1 POSIX.1-1990 compilation + * 2 POSIX.2-1992 compilation + * 199309L POSIX.1b-1993 compilation (Real Time) + * 199506L POSIX.1c-1995 compilation (POSIX Threads) + * 200112L POSIX.1-2001 compilation (Austin Group Revision) + */ +#if defined(_POSIX_SOURCE) && !defined(_POSIX_C_SOURCE) +#define _POSIX_C_SOURCE 1 +#endif + +/* + * The feature test macros __XOPEN_OR_POSIX, _STRICT_STDC, and _STDC_C99 + * are Sun implementation specific macros created in order to compress + * common standards specified feature test macros for easier reading. + * These macros should not be used by the application developer as + * unexpected results may occur. Instead, the user should reference + * standards(5) for correct usage of the standards feature test macros. + * + * __XOPEN_OR_POSIX Used in cases where a symbol is defined by both + * X/Open or POSIX or in the negative, when neither + * X/Open or POSIX defines a symbol. + * + * _STRICT_STDC __STDC__ is specified by the C Standards and defined + * by the compiler. For Sun compilers the value of + * __STDC__ is either 1, 0, or not defined based on the + * compilation mode (see cc(1)). When the value of + * __STDC__ is 1 and in the absence of any other feature + * test macros, the namespace available to the application + * is limited to only those symbols defined by the C + * Standard. _STRICT_STDC provides a more readable means + * of identifying symbols defined by the standard, or in + * the negative, symbols that are extensions to the C + * Standard. See additional comments for GNU C differences. + * + * _STDC_C99 __STDC_VERSION__ is specified by the C standards and + * defined by the compiler and indicates the version of + * the C standard. A value of 199901L indicates a + * compiler that complies with ISO/IEC 9899:1999, other- + * wise known as the C99 standard. + */ + +#if defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE) +#define __XOPEN_OR_POSIX +#endif + +/* + * ISO/IEC 9899:1990 and it's revision, ISO/IEC 9899:1999 specify the + * following predefined macro name: + * + * __STDC__ The integer constant 1, intended to indicate a conforming + * implementation. + * + * Furthermore, a strictly conforming program shall use only those features + * of the language and library specified in these standards. A conforming + * implementation shall accept any strictly conforming program. + * + * Based on these requirements, Sun's C compiler defines __STDC__ to 1 for + * strictly conforming environments and __STDC__ to 0 for environments that + * use ANSI C semantics but allow extensions to the C standard. For non-ANSI + * C semantics, Sun's C compiler does not define __STDC__. + * + * The GNU C project interpretation is that __STDC__ should always be defined + * to 1 for compilation modes that accept ANSI C syntax regardless of whether + * or not extensions to the C standard are used. Violations of conforming + * behavior are conditionally flagged as warnings via the use of the + * -pedantic option. In addition to defining __STDC__ to 1, the GNU C + * compiler also defines __STRICT_ANSI__ as a means of specifying strictly + * conforming environments using the -ansi or -std=<standard> options. + * + * In the absence of any other compiler options, Sun and GNU set the value + * of __STDC__ as follows when using the following options: + * + * Value of __STDC__ __STRICT_ANSI__ + * + * cc -Xa (default) 0 undefined + * cc -Xt (transitional) 0 undefined + * cc -Xc (strictly conforming) 1 undefined + * cc -Xs (K&R C) undefined undefined + * + * gcc (default) 1 undefined + * gcc -ansi, -std={c89, c99,...) 1 defined + * gcc -traditional (K&R) undefined undefined + * + * The default compilation modes for Sun C compilers versus GNU C compilers + * results in a differing value for __STDC__ which results in a more + * restricted namespace when using Sun compilers. To allow both GNU and Sun + * interpretations to peacefully co-exist, we use the following Sun + * implementation _STRICT_STDC_ macro: + */ + +#if (__STDC__ - 0 == 1 && !defined(__GNUC__)) || \ + (defined(__GNUC__) && defined(__STRICT_ANSI__)) +#define _STRICT_STDC +#else +#undef _STRICT_STDC +#endif + +/* + * Compiler complies with ISO/IEC 9899:1999 + */ + +#if __STDC_VERSION__ - 0 >= 199901L +#ifndef _STDC_C99 +#define _STDC_C99 +#endif +#endif + +/* + * Large file interfaces: + * + * _LARGEFILE_SOURCE + * 1 large file-related additions to POSIX + * interfaces requested (fseeko, etc.) + * _LARGEFILE64_SOURCE + * 1 transitional large-file-related interfaces + * requested (seek64, stat64, etc.) + * + * The corresponding announcement macros are respectively: + * _LFS_LARGEFILE + * _LFS64_LARGEFILE + * (These are set in <unistd.h>.) + * + * Requesting _LARGEFILE64_SOURCE implies requesting _LARGEFILE_SOURCE as + * well. + * + * The large file interfaces are made visible regardless of the initial values + * of the feature test macros under certain circumstances: + * - If no explicit standards-conforming environment is requested (neither + * of _POSIX_SOURCE nor _XOPEN_SOURCE is defined and the value of + * __STDC__ does not imply standards conformance). + * - Extended system interfaces are explicitly requested (__EXTENSIONS__ + * is defined). + * - Access to in-kernel interfaces is requested (_KERNEL or _KMEMUSER is + * defined). (Note that this dependency is an artifact of the current + * kernel implementation and may change in future releases.) + */ +#if (!defined(_STRICT_STDC) && !defined(__XOPEN_OR_POSIX)) || \ + defined(_KERNEL) || defined(_KMEMUSER) || \ + defined(__EXTENSIONS__) +#undef _LARGEFILE64_SOURCE +#define _LARGEFILE64_SOURCE 1 +#endif +#if _LARGEFILE64_SOURCE - 0 == 1 +#undef _LARGEFILE_SOURCE +#define _LARGEFILE_SOURCE 1 +#endif + +/* + * Large file compilation environment control: + * + * The setting of _FILE_OFFSET_BITS controls the size of various file-related + * types and governs the mapping between file-related source function symbol + * names and the corresponding binary entry points. + * + * In the 32-bit environment, the default value is 32; if not set, set it to + * the default here, to simplify tests in other headers. + * + * In the 64-bit compilation environment, the only value allowed is 64. + */ +#if defined(_LP64) +#ifndef _FILE_OFFSET_BITS +#define _FILE_OFFSET_BITS 64 +#endif +#if _FILE_OFFSET_BITS - 0 != 64 +#error "invalid _FILE_OFFSET_BITS value specified" +#endif +#else /* _LP64 */ +#ifndef _FILE_OFFSET_BITS +#define _FILE_OFFSET_BITS 32 +#endif +#if _FILE_OFFSET_BITS - 0 != 32 && _FILE_OFFSET_BITS - 0 != 64 +#error "invalid _FILE_OFFSET_BITS value specified" +#endif +#endif /* _LP64 */ + +/* + * Use of _XOPEN_SOURCE + * + * The following X/Open specifications are supported: + * + * X/Open Portability Guide, Issue 3 (XPG3) + * X/Open CAE Specification, Issue 4 (XPG4) + * X/Open CAE Specification, Issue 4, Version 2 (XPG4v2) + * X/Open CAE Specification, Issue 5 (XPG5) + * Open Group Technical Standard, Issue 6 (XPG6), also referred to as + * IEEE Std. 1003.1-2001 and ISO/IEC 9945:2002. + * + * XPG4v2 is also referred to as UNIX 95 (SUS or SUSv1). + * XPG5 is also referred to as UNIX 98 or the Single Unix Specification, + * Version 2 (SUSv2) + * XPG6 is the result of a merge of the X/Open and POSIX specifications + * and as such is also referred to as IEEE Std. 1003.1-2001 in + * addition to UNIX 03 and SUSv3. + * + * When writing a conforming X/Open application, as per the specification + * requirements, the appropriate feature test macros must be defined at + * compile time. These are as follows. For more info, see standards(5). + * + * Feature Test Macro Specification + * ------------------------------------------------ ------------- + * _XOPEN_SOURCE XPG3 + * _XOPEN_SOURCE && _XOPEN_VERSION = 4 XPG4 + * _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED = 1 XPG4v2 + * _XOPEN_SOURCE = 500 XPG5 + * _XOPEN_SOURCE = 600 (or POSIX_C_SOURCE=200112L) XPG6 + * + * In order to simplify the guards within the headers, the following + * implementation private test macros have been created. Applications + * must NOT use these private test macros as unexpected results will + * occur. + * + * Note that in general, the use of these private macros is cumulative. + * For example, the use of _XPG3 with no other restrictions on the X/Open + * namespace will make the symbols visible for XPG3 through XPG6 + * compilation environments. The use of _XPG4_2 with no other X/Open + * namespace restrictions indicates that the symbols were introduced in + * XPG4v2 and are therefore visible for XPG4v2 through XPG6 compilation + * environments, but not for XPG3 or XPG4 compilation environments. + * + * _XPG3 X/Open Portability Guide, Issue 3 (XPG3) + * _XPG4 X/Open CAE Specification, Issue 4 (XPG4) + * _XPG4_2 X/Open CAE Specification, Issue 4, Version 2 (XPG4v2/UNIX 95/SUS) + * _XPG5 X/Open CAE Specification, Issue 5 (XPG5/UNIX 98/SUSv2) + * _XPG6 Open Group Technical Standard, Issue 6 (XPG6/UNIX 03/SUSv3) + */ + +/* X/Open Portability Guide, Issue 3 */ +#if defined(_XOPEN_SOURCE) && (_XOPEN_SOURCE - 0 < 500) && \ + (_XOPEN_VERSION - 0 < 4) && !defined(_XOPEN_SOURCE_EXTENDED) +#define _XPG3 +/* X/Open CAE Specification, Issue 4 */ +#elif (defined(_XOPEN_SOURCE) && _XOPEN_VERSION - 0 == 4) +#define _XPG4 +#define _XPG3 +/* X/Open CAE Specification, Issue 4, Version 2 */ +#elif (defined(_XOPEN_SOURCE) && _XOPEN_SOURCE_EXTENDED - 0 == 1) +#define _XPG4_2 +#define _XPG4 +#define _XPG3 +/* X/Open CAE Specification, Issue 5 */ +#elif (_XOPEN_SOURCE - 0 == 500) +#define _XPG5 +#define _XPG4_2 +#define _XPG4 +#define _XPG3 +#undef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 199506L +/* Open Group Technical Standard , Issue 6 */ +#elif (_XOPEN_SOURCE - 0 == 600) || (_POSIX_C_SOURCE - 0 == 200112L) +#define _XPG6 +#define _XPG5 +#define _XPG4_2 +#define _XPG4 +#define _XPG3 +#undef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200112L +#undef _XOPEN_SOURCE +#define _XOPEN_SOURCE 600 +#endif + +/* + * _XOPEN_VERSION is defined by the X/Open specifications and is not + * normally defined by the application, except in the case of an XPG4 + * application. On the implementation side, _XOPEN_VERSION defined with + * the value of 3 indicates an XPG3 application. _XOPEN_VERSION defined + * with the value of 4 indicates an XPG4 or XPG4v2 (UNIX 95) application. + * _XOPEN_VERSION defined with a value of 500 indicates an XPG5 (UNIX 98) + * application and with a value of 600 indicates an XPG6 (UNIX 03) + * application. The appropriate version is determined by the use of the + * feature test macros described earlier. The value of _XOPEN_VERSION + * defaults to 3 otherwise indicating support for XPG3 applications. + */ +#ifndef _XOPEN_VERSION +#ifdef _XPG6 +#define _XOPEN_VERSION 600 +#elif defined(_XPG5) +#define _XOPEN_VERSION 500 +#elif defined(_XPG4_2) +#define _XOPEN_VERSION 4 +#else +#define _XOPEN_VERSION 3 +#endif +#endif + +/* + * ANSI C and ISO 9899:1990 say the type long long doesn't exist in strictly + * conforming environments. ISO 9899:1999 says it does. + * + * The presence of _LONGLONG_TYPE says "long long exists" which is therefore + * defined in all but strictly conforming environments that disallow it. + */ +#if !defined(_STDC_C99) && defined(_STRICT_STDC) && !defined(__GNUC__) +/* + * Resist attempts to force the definition of long long in this case. + */ +#if defined(_LONGLONG_TYPE) +#error "No long long in strictly conforming ANSI C & 1990 ISO C environments" +#endif +#else +#if !defined(_LONGLONG_TYPE) +#define _LONGLONG_TYPE +#endif +#endif + +/* + * It is invalid to compile an XPG3, XPG4, XPG4v2, or XPG5 application + * using c99. The same is true for POSIX.1-1990, POSIX.2-1992, POSIX.1b, + * and POSIX.1c applications. Likewise, it is invalid to compile an XPG6 + * or a POSIX.1-2001 application with anything other than a c99 or later + * compiler. Therefore, we force an error in both cases. + */ +#if defined(_STDC_C99) && (defined(__XOPEN_OR_POSIX) && !defined(_XPG6)) +#error "Compiler or options invalid for pre-UNIX 03 X/Open applications \ + and pre-2001 POSIX applications" +#elif !defined(_STDC_C99) && \ + (defined(__XOPEN_OR_POSIX) && defined(_XPG6)) +#error "Compiler or options invalid; UNIX 03 and POSIX.1-2001 applications \ + require the use of c99" +#endif + +/* + * The following macro defines a value for the ISO C99 restrict + * keyword so that _RESTRICT_KYWD resolves to "restrict" if + * an ISO C99 compiler is used and "" (null string) if any other + * compiler is used. This allows for the use of single prototype + * declarations regardless of compiler version. + */ +#if (defined(__STDC__) && defined(_STDC_C99)) +#define _RESTRICT_KYWD restrict +#else +#define _RESTRICT_KYWD +#endif + +/* + * The following macro indicates header support for the ANSI C++ + * standard. The ISO/IEC designation for this is ISO/IEC FDIS 14882. + */ +#define _ISO_CPP_14882_1998 + +/* + * The following macro indicates header support for the C99 standard, + * ISO/IEC 9899:1999, Programming Languages - C. + */ +#define _ISO_C_9899_1999 + +/* + * The following macro indicates header support for DTrace. The value is an + * integer that corresponds to the major version number for DTrace. + */ +#define _DTRACE_VERSION 1 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FEATURE_TESTS_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h b/sys/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h new file mode 100644 index 0000000..aa5c7ee --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FM_FS_ZFS_H +#define _SYS_FM_FS_ZFS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_ERROR_CLASS "fs.zfs" + +#define FM_EREPORT_ZFS_CHECKSUM "checksum" +#define FM_EREPORT_ZFS_IO "io" +#define FM_EREPORT_ZFS_DATA "data" +#define FM_EREPORT_ZFS_POOL "zpool" +#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" +#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" +#define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data" +#define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas" +#define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum" +#define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small" +#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label" + +#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool" +#define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid" +#define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" +#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" + +#define FM_RESOURCE_OK "ok" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FM_FS_ZFS_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/fm/protocol.h b/sys/contrib/opensolaris/uts/common/sys/fm/protocol.h new file mode 100644 index 0000000..a9980fe --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/fm/protocol.h @@ -0,0 +1,301 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FM_PROTOCOL_H +#define _SYS_FM_PROTOCOL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL +#include <sys/varargs.h> +#include <sys/nvpair.h> +#else +#include <libnvpair.h> +#include <stdarg.h> +#endif + +/* FM common member names */ +#define FM_CLASS "class" +#define FM_VERSION "version" + +/* FM event class values */ +#define FM_EREPORT_CLASS "ereport" +#define FM_FAULT_CLASS "fault" +#define FM_RSRC_CLASS "resource" +#define FM_LIST_EVENT "list" + +/* FM list.* event class values */ +#define FM_LIST_SUSPECT_CLASS FM_LIST_EVENT ".suspect" +#define FM_LIST_ISOLATED_CLASS FM_LIST_EVENT ".isolated" +#define FM_LIST_REPAIRED_CLASS FM_LIST_EVENT ".repaired" + +/* ereport class subcategory values */ +#define FM_ERROR_CPU "cpu" +#define FM_ERROR_IO "io" + +/* ereport version and payload member names */ +#define FM_EREPORT_VERS0 0 +#define FM_EREPORT_VERSION FM_EREPORT_VERS0 + +/* ereport payload member names */ +#define FM_EREPORT_DETECTOR "detector" +#define FM_EREPORT_ENA "ena" + +/* list.* event payload member names */ +#define FM_LIST_EVENT_SIZE "list-sz" + +/* list.suspect, isolated, and repaired versions and payload member names */ +#define FM_SUSPECT_UUID "uuid" +#define FM_SUSPECT_DIAG_CODE "code" +#define FM_SUSPECT_DIAG_TIME "diag-time" +#define FM_SUSPECT_DE "de" +#define FM_SUSPECT_FAULT_LIST "fault-list" +#define FM_SUSPECT_FAULT_SZ "fault-list-sz" +#define FM_SUSPECT_FAULT_STATUS "fault-status" +#define FM_SUSPECT_MESSAGE "message" + +#define FM_SUSPECT_VERS0 0 +#define FM_SUSPECT_VERSION FM_SUSPECT_VERS0 + +/* fault event versions and payload member names */ +#define FM_FAULT_VERS0 0 +#define FM_FAULT_VERSION FM_FAULT_VERS0 + +#define FM_FAULT_ASRU "asru" +#define FM_FAULT_FRU "fru" +#define FM_FAULT_FRU_LABEL "fru-label" +#define FM_FAULT_CERTAINTY "certainty" +#define FM_FAULT_RESOURCE "resource" +#define FM_FAULT_LOCATION "location" + +/* resource event versions and payload member names */ +#define FM_RSRC_VERS0 0 +#define FM_RSRC_VERSION FM_RSRC_VERS0 +#define FM_RSRC_RESOURCE "resource" + +/* resource.fm.asru.* payload member names */ +#define FM_RSRC_ASRU_UUID "uuid" +#define FM_RSRC_ASRU_CODE "code" +#define FM_RSRC_ASRU_FAULTY "faulty" +#define FM_RSRC_ASRU_UNUSABLE "unusable" +#define FM_RSRC_ASRU_EVENT "event" + +/* resource.fm.xprt.* versions and payload member names */ +#define FM_RSRC_XPRT_VERS0 0 +#define FM_RSRC_XPRT_VERSION FM_RSRC_XPRT_VERS0 +#define FM_RSRC_XPRT_UUID "uuid" +#define FM_RSRC_XPRT_SUBCLASS "subclass" + +/* + * FM ENA Format Macros + */ +#define ENA_FORMAT_MASK 0x3 +#define ENA_FORMAT(ena) ((ena) & ENA_FORMAT_MASK) + +/* ENA format types */ +#define FM_ENA_FMT0 0 +#define FM_ENA_FMT1 1 +#define FM_ENA_FMT2 2 + +/* Format 1 */ +#define ENA_FMT1_GEN_MASK 0x00000000000003FCull +#define ENA_FMT1_ID_MASK 0xFFFFFFFFFFFFFC00ull +#define ENA_FMT1_CPUID_MASK 0x00000000000FFC00ull +#define ENA_FMT1_TIME_MASK 0xFFFFFFFFFFF00000ull +#define ENA_FMT1_GEN_SHFT 2 +#define ENA_FMT1_ID_SHFT 10 +#define ENA_FMT1_CPUID_SHFT ENA_FMT1_ID_SHFT +#define ENA_FMT1_TIME_SHFT 20 + +/* Format 2 */ +#define ENA_FMT2_GEN_MASK 0x00000000000003FCull +#define ENA_FMT2_ID_MASK 0xFFFFFFFFFFFFFC00ull +#define ENA_FMT2_TIME_MASK ENA_FMT2_ID_MASK +#define ENA_FMT2_GEN_SHFT 2 +#define ENA_FMT2_ID_SHFT 10 +#define ENA_FMT2_TIME_SHFT ENA_FMT2_ID_SHFT + +/* Common FMRI type names */ +#define FM_FMRI_AUTHORITY "authority" +#define FM_FMRI_SCHEME "scheme" +#define FM_FMRI_SVC_AUTHORITY "svc-authority" + +/* FMRI authority-type member names */ +#define FM_FMRI_AUTH_CHASSIS "chassis-id" +#define FM_FMRI_AUTH_PRODUCT "product-id" +#define FM_FMRI_AUTH_DOMAIN "domain-id" +#define FM_FMRI_AUTH_SERVER "server-id" +#define FM_FMRI_AUTH_HOST "host-id" + +#define FM_AUTH_VERS0 0 +#define FM_FMRI_AUTH_VERSION FM_AUTH_VERS0 + +/* scheme name values */ +#define FM_FMRI_SCHEME_FMD "fmd" +#define FM_FMRI_SCHEME_DEV "dev" +#define FM_FMRI_SCHEME_HC "hc" +#define FM_FMRI_SCHEME_SVC "svc" +#define FM_FMRI_SCHEME_CPU "cpu" +#define FM_FMRI_SCHEME_MEM "mem" +#define FM_FMRI_SCHEME_MOD "mod" +#define FM_FMRI_SCHEME_PKG "pkg" +#define FM_FMRI_SCHEME_LEGACY "legacy-hc" +#define FM_FMRI_SCHEME_ZFS "zfs" + +/* Scheme versions */ +#define FMD_SCHEME_VERSION0 0 +#define FM_FMD_SCHEME_VERSION FMD_SCHEME_VERSION0 +#define DEV_SCHEME_VERSION0 0 +#define FM_DEV_SCHEME_VERSION DEV_SCHEME_VERSION0 +#define FM_HC_VERS0 0 +#define FM_HC_SCHEME_VERSION FM_HC_VERS0 +#define CPU_SCHEME_VERSION0 0 +#define CPU_SCHEME_VERSION1 1 +#define FM_CPU_SCHEME_VERSION CPU_SCHEME_VERSION1 +#define MEM_SCHEME_VERSION0 0 +#define FM_MEM_SCHEME_VERSION MEM_SCHEME_VERSION0 +#define MOD_SCHEME_VERSION0 0 +#define FM_MOD_SCHEME_VERSION MOD_SCHEME_VERSION0 +#define PKG_SCHEME_VERSION0 0 +#define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0 +#define LEGACY_SCHEME_VERSION0 0 +#define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0 +#define ZFS_SCHEME_VERSION0 0 +#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0 + +/* hc scheme member names */ +#define FM_FMRI_HC_SERIAL_ID "serial" +#define FM_FMRI_HC_PART "part" +#define FM_FMRI_HC_REVISION "revision" +#define FM_FMRI_HC_ROOT "hc-root" +#define FM_FMRI_HC_LIST_SZ "hc-list-sz" +#define FM_FMRI_HC_LIST "hc-list" +#define FM_FMRI_HC_SPECIFIC "hc-specific" + +/* hc-list version and member names */ +#define FM_FMRI_HC_NAME "hc-name" +#define FM_FMRI_HC_ID "hc-id" + +#define HC_LIST_VERSION0 0 +#define FM_HC_LIST_VERSION HC_LIST_VERSION0 + +/* hc-specific member names */ +#define FM_FMRI_HC_SPECIFIC_OFFSET "offset" + +/* fmd module scheme member names */ +#define FM_FMRI_FMD_NAME "mod-name" +#define FM_FMRI_FMD_VERSION "mod-version" + +/* dev scheme member names */ +#define FM_FMRI_DEV_ID "devid" +#define FM_FMRI_DEV_PATH "device-path" + +/* pkg scheme member names */ +#define FM_FMRI_PKG_BASEDIR "pkg-basedir" +#define FM_FMRI_PKG_INST "pkg-inst" +#define FM_FMRI_PKG_VERSION "pkg-version" + +/* svc scheme member names */ +#define FM_FMRI_SVC_NAME "service-name" +#define FM_FMRI_SVC_VERSION "service-version" +#define FM_FMRI_SVC_INSTANCE "instance" +#define FM_FMRI_SVC_CONTRACT_ID "contract-id" + +/* svc-authority member names */ +#define FM_FMRI_SVC_AUTH_SCOPE "scope" +#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-FQN" + +/* cpu scheme member names */ +#define FM_FMRI_CPU_ID "cpuid" +#define FM_FMRI_CPU_SERIAL_ID "serial" +#define FM_FMRI_CPU_MASK "cpumask" +#define FM_FMRI_CPU_VID "cpuvid" +#define FM_FMRI_CPU_CPUFRU "cpufru" + +/* legacy-hc scheme member names */ +#define FM_FMRI_LEGACY_HC "component" +#define FM_FMRI_LEGACY_HC_PREFIX FM_FMRI_SCHEME_HC":///" \ + FM_FMRI_LEGACY_HC"=" + +/* mem scheme member names */ +#define FM_FMRI_MEM_UNUM "unum" +#define FM_FMRI_MEM_SERIAL_ID "serial" +#define FM_FMRI_MEM_PHYSADDR "physaddr" +#define FM_FMRI_MEM_MEMCONFIG "memconfig" +#define FM_FMRI_MEM_OFFSET "offset" + +/* mod scheme member names */ +#define FM_FMRI_MOD_PKG "mod-pkg" +#define FM_FMRI_MOD_NAME "mod-name" +#define FM_FMRI_MOD_ID "mod-id" +#define FM_FMRI_MOD_DESC "mod-desc" + +/* zfs scheme member names */ +#define FM_FMRI_ZFS_POOL "pool" +#define FM_FMRI_ZFS_VDEV "vdev" + +extern nv_alloc_t *fm_nva_xcreate(char *, size_t); +extern void fm_nva_xdestroy(nv_alloc_t *); + +extern nvlist_t *fm_nvlist_create(nv_alloc_t *); +extern void fm_nvlist_destroy(nvlist_t *, int); + +#define FM_NVA_FREE 0 /* free allocator on nvlist_destroy */ +#define FM_NVA_RETAIN 1 /* keep allocator on nvlist_destroy */ + +extern void fm_ereport_set(nvlist_t *, int, const char *, uint64_t, + const nvlist_t *, ...); +extern void fm_payload_set(nvlist_t *, ...); +extern int i_fm_payload_set(nvlist_t *, const char *, va_list); +extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *, + int, ...); +extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *, + const char *); +extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *); +extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t, + uint8_t *, const char *); +extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *, + const char *, uint64_t); +extern void fm_authority_set(nvlist_t *, int, const char *, const char *, + const char *, const char *); +extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t); + +extern uint64_t fm_ena_increment(uint64_t); +extern uint64_t fm_ena_generate(uint64_t, uchar_t); +extern uint64_t fm_ena_generation_get(uint64_t); +extern uchar_t fm_ena_format_get(uint64_t); +extern uint64_t fm_ena_id_get(uint64_t); +extern uint64_t fm_ena_time_get(uint64_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FM_PROTOCOL_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/fm/util.h b/sys/contrib/opensolaris/uts/common/sys/fm/util.h new file mode 100644 index 0000000..f65e0ab4 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/fm/util.h @@ -0,0 +1,103 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FM_UTIL_H +#define _SYS_FM_UTIL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/nvpair.h> +#include <sys/errorq.h> + +/* + * Shared user/kernel definitions for class length, error channel name, + * and kernel event publisher string. + */ +#define FM_MAX_CLASS 100 +#define FM_ERROR_CHAN "com.sun:fm:error" +#define FM_PUB "fm" + +/* + * ereport dump device transport support + * + * Ereports are written out to the dump device at a proscribed offset from the + * end, similar to in-transit log messages. The ereports are represented as a + * erpt_dump_t header followed by ed_size bytes of packed native nvlist data. + * + * NOTE: All of these constants and the header must be defined so they have the + * same representation for *both* 32-bit and 64-bit producers and consumers. + */ +#define ERPT_MAGIC 0xf00d4eddU +#define ERPT_MAX_ERRS 16 +#define ERPT_DATA_SZ (6 * 1024) +#define ERPT_EVCH_MAX 256 +#define ERPT_HIWAT 64 + +typedef struct erpt_dump { + uint32_t ed_magic; /* ERPT_MAGIC or zero to indicate end */ + uint32_t ed_chksum; /* checksum32() of packed nvlist data */ + uint32_t ed_size; /* ereport (nvl) fixed buf size */ + uint32_t ed_pad; /* reserved for future use */ + hrtime_t ed_hrt_nsec; /* hrtime of this ereport */ + hrtime_t ed_hrt_base; /* hrtime sample corresponding to ed_tod_base */ + struct { + uint64_t sec; /* seconds since gettimeofday() Epoch */ + uint64_t nsec; /* nanoseconds past ed_tod_base.sec */ + } ed_tod_base; +} erpt_dump_t; + +#ifdef _KERNEL +#include <sys/systm.h> + +#define FM_STK_DEPTH 20 /* maximum stack depth */ +#define FM_SYM_SZ 64 /* maximum symbol size */ +#define FM_ERR_PIL 2 /* PIL for ereport_errorq drain processing */ + +#define FM_EREPORT_PAYLOAD_NAME_STACK "stack" + +extern errorq_t *ereport_errorq; +extern void *ereport_dumpbuf; +extern size_t ereport_dumplen; + +extern void fm_init(void); +extern void fm_nvprint(nvlist_t *); +extern void fm_panic(const char *, ...); +extern void fm_banner(void); + +extern void fm_ereport_dump(void); +extern void fm_ereport_post(nvlist_t *, int); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FM_UTIL_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/contrib/opensolaris/uts/common/sys/fs/zfs.h new file mode 100644 index 0000000..715537d --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/fs/zfs.h @@ -0,0 +1,434 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_H +#define _SYS_FS_ZFS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/ioccom.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Types and constants shared between userland and the kernel. + */ + +/* + * Each dataset can be one of the following types. These constants can be + * combined into masks that can be passed to various functions. + */ +typedef enum { + ZFS_TYPE_FILESYSTEM = 0x1, + ZFS_TYPE_SNAPSHOT = 0x2, + ZFS_TYPE_VOLUME = 0x4, + ZFS_TYPE_POOL = 0x8 +} zfs_type_t; + +#define ZFS_TYPE_ANY \ + (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) + +/* + * Properties are identified by these constants and must be added to the + * end of this list to ensure that external conumsers are not affected + * by the change. The property list also determines how 'zfs get' will + * display them. If you make any changes to this list, be sure to update + * the property table in usr/src/common/zfs/zfs_prop.c. + */ +typedef enum { + ZFS_PROP_CONT = -2, + ZFS_PROP_INVAL = -1, + ZFS_PROP_TYPE, + ZFS_PROP_CREATION, + ZFS_PROP_USED, + ZFS_PROP_AVAILABLE, + ZFS_PROP_REFERENCED, + ZFS_PROP_COMPRESSRATIO, + ZFS_PROP_MOUNTED, + ZFS_PROP_ORIGIN, + ZFS_PROP_QUOTA, + ZFS_PROP_RESERVATION, + ZFS_PROP_VOLSIZE, + ZFS_PROP_VOLBLOCKSIZE, + ZFS_PROP_RECORDSIZE, + ZFS_PROP_MOUNTPOINT, + ZFS_PROP_SHARENFS, + ZFS_PROP_CHECKSUM, + ZFS_PROP_COMPRESSION, + ZFS_PROP_ATIME, + ZFS_PROP_DEVICES, + ZFS_PROP_EXEC, + ZFS_PROP_SETUID, + ZFS_PROP_READONLY, + ZFS_PROP_ZONED, + ZFS_PROP_SNAPDIR, + ZFS_PROP_ACLMODE, + ZFS_PROP_ACLINHERIT, + ZFS_PROP_CREATETXG, /* not exposed to the user */ + ZFS_PROP_NAME, /* not exposed to the user */ + ZFS_PROP_CANMOUNT, + ZFS_PROP_SHAREISCSI, + ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ + ZFS_PROP_XATTR, + ZFS_PROP_NUMCLONES, /* not exposed to the user */ + ZFS_PROP_COPIES, + ZFS_PROP_BOOTFS +} zfs_prop_t; + +typedef zfs_prop_t zpool_prop_t; + +#define ZFS_PROP_VALUE "value" +#define ZFS_PROP_SOURCE "source" + +typedef enum { + ZFS_SRC_NONE = 0x1, + ZFS_SRC_DEFAULT = 0x2, + ZFS_SRC_TEMPORARY = 0x4, + ZFS_SRC_LOCAL = 0x8, + ZFS_SRC_INHERITED = 0x10 +} zfs_source_t; + +#define ZFS_SRC_ALL 0x1f + +/* + * The following functions are shared between libzfs and the kernel. + */ +zfs_prop_t zfs_name_to_prop(const char *); +zpool_prop_t zpool_name_to_prop(const char *); +boolean_t zfs_prop_user(const char *); +int zfs_prop_readonly(zfs_prop_t); +const char *zfs_prop_default_string(zfs_prop_t); +const char *zfs_prop_to_name(zfs_prop_t); +const char *zpool_prop_to_name(zfs_prop_t); +uint64_t zfs_prop_default_numeric(zfs_prop_t); +int zfs_prop_inheritable(zfs_prop_t); +int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); +int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); + +/* + * Property Iterator + */ +typedef zfs_prop_t (*zfs_prop_f)(zfs_prop_t, void *); +typedef zfs_prop_f zpool_prop_f; +extern zfs_prop_t zfs_prop_iter(zfs_prop_f, void *, boolean_t); +extern zpool_prop_t zpool_prop_iter(zpool_prop_f, void *, boolean_t); + +/* + * On-disk version number. + */ +#define ZFS_VERSION_1 1ULL +#define ZFS_VERSION_2 2ULL +#define ZFS_VERSION_3 3ULL +#define ZFS_VERSION_4 4ULL +#define ZFS_VERSION_5 5ULL +#define ZFS_VERSION_6 6ULL +/* + * When bumping up ZFS_VERSION, make sure GRUB ZFS understand the on-disk + * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*}, + * and do the appropriate changes. + */ +#define ZFS_VERSION ZFS_VERSION_6 +#define ZFS_VERSION_STRING "6" + +/* + * Symbolic names for the changes that caused a ZFS_VERSION switch. + * Used in the code when checking for presence or absence of a feature. + * Feel free to define multiple symbolic names for each version if there + * were multiple changes to on-disk structures during that version. + * + * NOTE: When checking the current ZFS_VERSION in your code, be sure + * to use spa_version() since it reports the version of the + * last synced uberblock. Checking the in-flight version can + * be dangerous in some cases. + */ +#define ZFS_VERSION_INITIAL ZFS_VERSION_1 +#define ZFS_VERSION_DITTO_BLOCKS ZFS_VERSION_2 +#define ZFS_VERSION_SPARES ZFS_VERSION_3 +#define ZFS_VERSION_RAID6 ZFS_VERSION_3 +#define ZFS_VERSION_BPLIST_ACCOUNT ZFS_VERSION_3 +#define ZFS_VERSION_RAIDZ_DEFLATE ZFS_VERSION_3 +#define ZFS_VERSION_DNODE_BYTES ZFS_VERSION_3 +#define ZFS_VERSION_ZPOOL_HISTORY ZFS_VERSION_4 +#define ZFS_VERSION_GZIP_COMPRESSION ZFS_VERSION_5 +#define ZFS_VERSION_BOOTFS ZFS_VERSION_6 + +/* + * The following are configuration names used in the nvlist describing a pool's + * configuration. + */ +#define ZPOOL_CONFIG_VERSION "version" +#define ZPOOL_CONFIG_POOL_NAME "name" +#define ZPOOL_CONFIG_POOL_STATE "state" +#define ZPOOL_CONFIG_POOL_TXG "txg" +#define ZPOOL_CONFIG_POOL_GUID "pool_guid" +#define ZPOOL_CONFIG_CREATE_TXG "create_txg" +#define ZPOOL_CONFIG_TOP_GUID "top_guid" +#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" +#define ZPOOL_CONFIG_TYPE "type" +#define ZPOOL_CONFIG_CHILDREN "children" +#define ZPOOL_CONFIG_ID "id" +#define ZPOOL_CONFIG_GUID "guid" +#define ZPOOL_CONFIG_PATH "path" +#define ZPOOL_CONFIG_DEVID "devid" +#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" +#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" +#define ZPOOL_CONFIG_ASHIFT "ashift" +#define ZPOOL_CONFIG_ASIZE "asize" +#define ZPOOL_CONFIG_DTL "DTL" +#define ZPOOL_CONFIG_STATS "stats" +#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" +#define ZPOOL_CONFIG_OFFLINE "offline" +#define ZPOOL_CONFIG_ERRCOUNT "error_count" +#define ZPOOL_CONFIG_NOT_PRESENT "not_present" +#define ZPOOL_CONFIG_SPARES "spares" +#define ZPOOL_CONFIG_IS_SPARE "is_spare" +#define ZPOOL_CONFIG_NPARITY "nparity" + +#define VDEV_TYPE_ROOT "root" +#define VDEV_TYPE_MIRROR "mirror" +#define VDEV_TYPE_REPLACING "replacing" +#define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_DISK "disk" +#define VDEV_TYPE_FILE "file" +#define VDEV_TYPE_MISSING "missing" +#define VDEV_TYPE_SPARE "spare" + +/* + * This is needed in userland to report the minimum necessary device size. + */ +#define SPA_MINDEVSIZE (64ULL << 20) + +/* + * The location of the pool configuration repository, shared between kernel and + * userland. + */ +#define ZPOOL_CACHE_DIR "/etc/zfs" +#define ZPOOL_CACHE_FILE "zpool.cache" +#define ZPOOL_CACHE_TMP ".zpool.cache" + +#define ZPOOL_CACHE ZPOOL_CACHE_DIR "/" ZPOOL_CACHE_FILE + +/* + * vdev states are ordered from least to most healthy. + * A vdev that's CANT_OPEN or below is considered unusable. + */ +typedef enum vdev_state { + VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ + VDEV_STATE_CLOSED, /* Not currently open */ + VDEV_STATE_OFFLINE, /* Not allowed to open */ + VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ + VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ + VDEV_STATE_HEALTHY /* Presumed good */ +} vdev_state_t; + +/* + * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field + * of the vdev stats structure uses these constants to distinguish why. + */ +typedef enum vdev_aux { + VDEV_AUX_NONE, /* no error */ + VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ + VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ + VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ + VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ + VDEV_AUX_TOO_SMALL, /* vdev size is too small */ + VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ + VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ + VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ + VDEV_AUX_SPARED /* hot spare used in another pool */ +} vdev_aux_t; + +/* + * pool state. The following states are written to disk as part of the normal + * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE. The remaining states are + * software abstractions used at various levels to communicate pool state. + */ +typedef enum pool_state { + POOL_STATE_ACTIVE = 0, /* In active use */ + POOL_STATE_EXPORTED, /* Explicitly exported */ + POOL_STATE_DESTROYED, /* Explicitly destroyed */ + POOL_STATE_SPARE, /* Reserved for hot spare use */ + POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ + POOL_STATE_UNAVAIL, /* Internal libzfs state */ + POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ +} pool_state_t; + +/* + * Scrub types. + */ +typedef enum pool_scrub_type { + POOL_SCRUB_NONE, + POOL_SCRUB_RESILVER, + POOL_SCRUB_EVERYTHING, + POOL_SCRUB_TYPES +} pool_scrub_type_t; + +/* + * ZIO types. Needed to interpret vdev statistics below. + */ +typedef enum zio_type { + ZIO_TYPE_NULL = 0, + ZIO_TYPE_READ, + ZIO_TYPE_WRITE, + ZIO_TYPE_FREE, + ZIO_TYPE_CLAIM, + ZIO_TYPE_IOCTL, + ZIO_TYPES +} zio_type_t; + +/* + * Vdev statistics. Note: all fields should be 64-bit because this + * is passed between kernel and userland as an nvlist uint64 array. + */ +typedef struct vdev_stat { + hrtime_t vs_timestamp; /* time since vdev load */ + uint64_t vs_state; /* vdev state */ + uint64_t vs_aux; /* see vdev_aux_t */ + uint64_t vs_alloc; /* space allocated */ + uint64_t vs_space; /* total capacity */ + uint64_t vs_dspace; /* deflated capacity */ + uint64_t vs_rsize; /* replaceable dev size */ + uint64_t vs_ops[ZIO_TYPES]; /* operation count */ + uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */ + uint64_t vs_read_errors; /* read errors */ + uint64_t vs_write_errors; /* write errors */ + uint64_t vs_checksum_errors; /* checksum errors */ + uint64_t vs_self_healed; /* self-healed bytes */ + uint64_t vs_scrub_type; /* pool_scrub_type_t */ + uint64_t vs_scrub_complete; /* completed? */ + uint64_t vs_scrub_examined; /* bytes examined; top */ + uint64_t vs_scrub_repaired; /* bytes repaired; leaf */ + uint64_t vs_scrub_errors; /* errors during scrub */ + uint64_t vs_scrub_start; /* UTC scrub start time */ + uint64_t vs_scrub_end; /* UTC scrub end time */ +} vdev_stat_t; + +#define ZFS_DRIVER "zfs" +#define ZFS_DEV_NAME "zfs" +#define ZFS_DEV "/dev/" ZFS_DEV_NAME + +/* + * zvol paths. Irritatingly, the devfsadm interfaces want all these + * paths without the /dev prefix, but for some things, we want the + * /dev prefix. Below are the names without /dev. + */ +#define ZVOL_DEV_DIR "zvol" + +/* + * And here are the things we need with /dev, etc. in front of them. + */ +#define ZVOL_PSEUDO_DEV "/devices/pseudo/zvol@0:" +#define ZVOL_FULL_DEV_DIR "/dev/" ZVOL_DEV_DIR + +#define ZVOL_PROP_NAME "name" + +/* + * /dev/zfs ioctl numbers. + */ +typedef unsigned long zfs_ioc_t; + +#define ZFS_IOC(ioreq) ((ioreq) & 0xff) + +#define ZFS_IOC_POOL_CREATE _IOWR('Z', 0, struct zfs_cmd) +#define ZFS_IOC_POOL_DESTROY _IOWR('Z', 1, struct zfs_cmd) +#define ZFS_IOC_POOL_IMPORT _IOWR('Z', 2, struct zfs_cmd) +#define ZFS_IOC_POOL_EXPORT _IOWR('Z', 3, struct zfs_cmd) +#define ZFS_IOC_POOL_CONFIGS _IOWR('Z', 4, struct zfs_cmd) +#define ZFS_IOC_POOL_STATS _IOWR('Z', 5, struct zfs_cmd) +#define ZFS_IOC_POOL_TRYIMPORT _IOWR('Z', 6, struct zfs_cmd) +#define ZFS_IOC_POOL_SCRUB _IOWR('Z', 7, struct zfs_cmd) +#define ZFS_IOC_POOL_FREEZE _IOWR('Z', 8, struct zfs_cmd) +#define ZFS_IOC_POOL_UPGRADE _IOWR('Z', 9, struct zfs_cmd) +#define ZFS_IOC_POOL_GET_HISTORY _IOWR('Z', 10, struct zfs_cmd) +#define ZFS_IOC_POOL_LOG_HISTORY _IOWR('Z', 11, struct zfs_cmd) +#define ZFS_IOC_VDEV_ADD _IOWR('Z', 12, struct zfs_cmd) +#define ZFS_IOC_VDEV_REMOVE _IOWR('Z', 13, struct zfs_cmd) +#define ZFS_IOC_VDEV_ONLINE _IOWR('Z', 14, struct zfs_cmd) +#define ZFS_IOC_VDEV_OFFLINE _IOWR('Z', 15, struct zfs_cmd) +#define ZFS_IOC_VDEV_ATTACH _IOWR('Z', 16, struct zfs_cmd) +#define ZFS_IOC_VDEV_DETACH _IOWR('Z', 17, struct zfs_cmd) +#define ZFS_IOC_VDEV_SETPATH _IOWR('Z', 18, struct zfs_cmd) +#define ZFS_IOC_OBJSET_STATS _IOWR('Z', 19, struct zfs_cmd) +#define ZFS_IOC_DATASET_LIST_NEXT _IOWR('Z', 20, struct zfs_cmd) +#define ZFS_IOC_SNAPSHOT_LIST_NEXT _IOWR('Z', 21, struct zfs_cmd) +#define ZFS_IOC_SET_PROP _IOWR('Z', 22, struct zfs_cmd) +#define ZFS_IOC_CREATE_MINOR _IOWR('Z', 23, struct zfs_cmd) +#define ZFS_IOC_REMOVE_MINOR _IOWR('Z', 24, struct zfs_cmd) +#define ZFS_IOC_CREATE _IOWR('Z', 25, struct zfs_cmd) +#define ZFS_IOC_DESTROY _IOWR('Z', 26, struct zfs_cmd) +#define ZFS_IOC_ROLLBACK _IOWR('Z', 27, struct zfs_cmd) +#define ZFS_IOC_RENAME _IOWR('Z', 28, struct zfs_cmd) +#define ZFS_IOC_RECVBACKUP _IOWR('Z', 29, struct zfs_cmd) +#define ZFS_IOC_SENDBACKUP _IOWR('Z', 30, struct zfs_cmd) +#define ZFS_IOC_INJECT_FAULT _IOWR('Z', 31, struct zfs_cmd) +#define ZFS_IOC_CLEAR_FAULT _IOWR('Z', 32, struct zfs_cmd) +#define ZFS_IOC_INJECT_LIST_NEXT _IOWR('Z', 33, struct zfs_cmd) +#define ZFS_IOC_ERROR_LOG _IOWR('Z', 34, struct zfs_cmd) +#define ZFS_IOC_CLEAR _IOWR('Z', 35, struct zfs_cmd) +#define ZFS_IOC_PROMOTE _IOWR('Z', 36, struct zfs_cmd) +#define ZFS_IOC_DESTROY_SNAPS _IOWR('Z', 37, struct zfs_cmd) +#define ZFS_IOC_SNAPSHOT _IOWR('Z', 38, struct zfs_cmd) +#define ZFS_IOC_DSOBJ_TO_DSNAME _IOWR('Z', 39, struct zfs_cmd) +#define ZFS_IOC_OBJ_TO_PATH _IOWR('Z', 40, struct zfs_cmd) +#define ZFS_IOC_POOL_SET_PROPS _IOWR('Z', 41, struct zfs_cmd) +#define ZFS_IOC_POOL_GET_PROPS _IOWR('Z', 42, struct zfs_cmd) +#define ZFS_IOC_JAIL _IOWR('Z', 43, struct zfs_cmd) +#define ZFS_IOC_UNJAIL _IOWR('Z', 44, struct zfs_cmd) + +/* + * Internal SPA load state. Used by FMA diagnosis engine. + */ +typedef enum { + SPA_LOAD_NONE, /* no load in progress */ + SPA_LOAD_OPEN, /* normal open */ + SPA_LOAD_IMPORT, /* import in progress */ + SPA_LOAD_TRYIMPORT /* tryimport in progress */ +} spa_load_state_t; + +/* + * Bookmark name values. + */ +#define ZPOOL_ERR_LIST "error list" +#define ZPOOL_ERR_DATASET "dataset" +#define ZPOOL_ERR_OBJECT "object" + +#define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) + +/* + * The following are names used in the nvlist describing + * the pool's history log. + */ +#define ZPOOL_HIST_RECORD "history record" +#define ZPOOL_HIST_TIME "history time" +#define ZPOOL_HIST_CMD "history command" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/gfs.h b/sys/contrib/opensolaris/uts/common/sys/gfs.h new file mode 100644 index 0000000..8e70f29 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/gfs.h @@ -0,0 +1,139 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_GFS_H +#define _SYS_GFS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/vnode.h> +#include <sys/mutex.h> +#include <sys/dirent.h> +#include <sys/uio.h> +#include <sys/list.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define GFS_CACHE_VNODE 0x1 + +typedef struct gfs_dirent { + char *gfse_name; /* entry name */ + vnode_t *(*gfse_ctor)(vnode_t *); /* constructor */ + int gfse_flags; /* flags */ + list_node_t gfse_link; /* dynamic list */ + vnode_t *gfse_vnode; /* cached vnode */ +} gfs_dirent_t; + +typedef enum gfs_type { + GFS_DIR, + GFS_FILE +} gfs_type_t; + +typedef struct gfs_file { + vnode_t *gfs_vnode; /* current vnode */ + vnode_t *gfs_parent; /* parent vnode */ + size_t gfs_size; /* size of private data structure */ + gfs_type_t gfs_type; /* type of vnode */ + int gfs_index; /* index in parent dir */ + ino64_t gfs_ino; /* inode for this vnode */ +} gfs_file_t; + +typedef int (*gfs_readdir_cb)(vnode_t *, struct dirent64 *, int *, offset_t *, + offset_t *, void *); +typedef int (*gfs_lookup_cb)(vnode_t *, const char *, vnode_t **, ino64_t *); +typedef ino64_t (*gfs_inode_cb)(vnode_t *, int); + +typedef struct gfs_dir { + gfs_file_t gfsd_file; /* generic file attributes */ + gfs_dirent_t *gfsd_static; /* statically defined entries */ + int gfsd_nstatic; /* # static entries */ + kmutex_t gfsd_lock; /* protects entries */ + int gfsd_maxlen; /* maximum name length */ + gfs_readdir_cb gfsd_readdir; /* readdir() callback */ + gfs_lookup_cb gfsd_lookup; /* lookup() callback */ + gfs_inode_cb gfsd_inode; /* get an inode number */ +} gfs_dir_t; + +struct vfs; + +extern vnode_t *gfs_file_create(size_t, vnode_t *, vfs_t *, vnodeops_t *); +extern vnode_t *gfs_dir_create(size_t, vnode_t *, vfs_t *, vnodeops_t *, + gfs_dirent_t *, gfs_inode_cb, int, gfs_readdir_cb, gfs_lookup_cb); +extern vnode_t *gfs_root_create(size_t, vfs_t *, vnodeops_t *, ino64_t, + gfs_dirent_t *, gfs_inode_cb, int, gfs_readdir_cb, gfs_lookup_cb); +extern vnode_t *gfs_root_create_file(size_t, struct vfs *, vnodeops_t *, + ino64_t); + +extern void *gfs_file_inactive(vnode_t *); +extern void *gfs_dir_inactive(vnode_t *); + +extern int gfs_dir_lookup(vnode_t *, const char *, vnode_t **); +extern int gfs_dir_readdir(vnode_t *, uio_t *, int *, int *, u_long **, void *); + +#define gfs_dir_lock(gd) mutex_enter(&(gd)->gfsd_lock) +#define gfs_dir_unlock(gd) mutex_exit(&(gd)->gfsd_lock) + +#define gfs_file_parent(vp) (((gfs_file_t *)(vp)->v_data)->gfs_parent) + +#define gfs_file_index(vp) (((gfs_file_t *)(vp)->v_data)->gfs_index) +#define gfs_file_set_index(vp, idx) \ + (((gfs_file_t *)(vp)->v_data)->gfs_index = (idx)) + +#define gfs_file_inode(vp) (((gfs_file_t *)(vp)->v_data)->gfs_ino) +#define gfs_file_set_inode(vp, ino) \ + (((gfs_file_t *)(vp)->v_data)->gfs_ino = (ino)) + +typedef struct gfs_readdir_state { + struct dirent64 *grd_dirent; /* directory entry buffer */ + size_t grd_namlen; /* max file name length */ + size_t grd_ureclen; /* exported record size */ + ssize_t grd_oresid; /* original uio_resid */ + ino64_t grd_parent; /* inode of parent */ + ino64_t grd_self; /* inode of self */ +} gfs_readdir_state_t; + +extern int gfs_readdir_init(gfs_readdir_state_t *, int, int, uio_t *, ino64_t, + ino64_t); +extern int gfs_readdir_emit(gfs_readdir_state_t *, uio_t *, offset_t, ino64_t, + const char *, int *, u_long **); +extern int gfs_readdir_pred(gfs_readdir_state_t *, uio_t *, offset_t *, int *, + u_long **); +extern int gfs_readdir_fini(gfs_readdir_state_t *, int, int *, int); + +extern int gfs_lookup_dot(vnode_t **, vnode_t *, vnode_t *, const char *); + +extern int gfs_vop_readdir(struct vop_readdir_args *); +extern int gfs_vop_inactive(struct vop_inactive_args *); + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_GFS_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/isa_defs.h b/sys/contrib/opensolaris/uts/common/sys/isa_defs.h new file mode 100644 index 0000000..08fcbd0 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/isa_defs.h @@ -0,0 +1,479 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ISA_DEFS_H +#define _SYS_ISA_DEFS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This header file serves to group a set of well known defines and to + * set these for each instruction set architecture. These defines may + * be divided into two groups; characteristics of the processor and + * implementation choices for Solaris on a processor. + * + * Processor Characteristics: + * + * _LITTLE_ENDIAN / _BIG_ENDIAN: + * The natural byte order of the processor. A pointer to an int points + * to the least/most significant byte of that int. + * + * _STACK_GROWS_UPWARD / _STACK_GROWS_DOWNWARD: + * The processor specific direction of stack growth. A push onto the + * stack increases/decreases the stack pointer, so it stores data at + * successively higher/lower addresses. (Stackless machines ignored + * without regrets). + * + * _LONG_LONG_HTOL / _LONG_LONG_LTOH: + * A pointer to a long long points to the most/least significant long + * within that long long. + * + * _BIT_FIELDS_HTOL / _BIT_FIELDS_LTOH: + * The C compiler assigns bit fields from the high/low to the low/high end + * of an int (most to least significant vs. least to most significant). + * + * _IEEE_754: + * The processor (or supported implementations of the processor) + * supports the ieee-754 floating point standard. No other floating + * point standards are supported (or significant). Any other supported + * floating point formats are expected to be cased on the ISA processor + * symbol. + * + * _CHAR_IS_UNSIGNED / _CHAR_IS_SIGNED: + * The C Compiler implements objects of type `char' as `unsigned' or + * `signed' respectively. This is really an implementation choice of + * the compiler writer, but it is specified in the ABI and tends to + * be uniform across compilers for an instruction set architecture. + * Hence, it has the properties of a processor characteristic. + * + * _CHAR_ALIGNMENT / _SHORT_ALIGNMENT / _INT_ALIGNMENT / _LONG_ALIGNMENT / + * _LONG_LONG_ALIGNMENT / _DOUBLE_ALIGNMENT / _LONG_DOUBLE_ALIGNMENT / + * _POINTER_ALIGNMENT / _FLOAT_ALIGNMENT: + * The ABI defines alignment requirements of each of the primitive + * object types. Some, if not all, may be hardware requirements as + * well. The values are expressed in "byte-alignment" units. + * + * _MAX_ALIGNMENT: + * The most stringent alignment requirement as specified by the ABI. + * Equal to the maximum of all the above _XXX_ALIGNMENT values. + * + * _ALIGNMENT_REQUIRED: + * True or false (1 or 0) whether or not the hardware requires the ABI + * alignment. + * + * _LONG_LONG_ALIGNMENT_32 + * The 32-bit ABI supported by a 64-bit kernel may have different + * alignment requirements for primitive object types. The value of this + * identifier is expressed in "byte-alignment" units. + * + * _HAVE_CPUID_INSN + * This indicates that the architecture supports the 'cpuid' + * instruction as defined by Intel. (Intel allows other vendors + * to extend the instruction for their own purposes.) + * + * + * Implementation Choices: + * + * _ILP32 / _LP64: + * This specifies the compiler data type implementation as specified in + * the relevant ABI. The choice between these is strongly influenced + * by the underlying hardware, but is not absolutely tied to it. + * Currently only two data type models are supported: + * + * _ILP32: + * Int/Long/Pointer are 32 bits. This is the historical UNIX + * and Solaris implementation. Due to its historical standing, + * this is the default case. + * + * _LP64: + * Long/Pointer are 64 bits, Int is 32 bits. This is the chosen + * implementation for 64-bit ABIs such as SPARC V9. + * + * _I32LPx: + * A compilation environment where 'int' is 32-bit, and + * longs and pointers are simply the same size. + * + * In all cases, Char is 8 bits and Short is 16 bits. + * + * _SUNOS_VTOC_8 / _SUNOS_VTOC_16 / _SVR4_VTOC_16: + * This specifies the form of the disk VTOC (or label): + * + * _SUNOS_VTOC_8: + * This is a VTOC form which is upwardly compatible with the + * SunOS 4.x disk label and allows 8 partitions per disk. + * + * _SUNOS_VTOC_16: + * In this format the incore vtoc image matches the ondisk + * version. It allows 16 slices per disk, and is not + * compatible with the SunOS 4.x disk label. + * + * Note that these are not the only two VTOC forms possible and + * additional forms may be added. One possible form would be the + * SVr4 VTOC form. The symbol for that is reserved now, although + * it is not implemented. + * + * _SVR4_VTOC_16: + * This VTOC form is compatible with the System V Release 4 + * VTOC (as implemented on the SVr4 Intel and 3b ports) with + * 16 partitions per disk. + * + * + * _DMA_USES_PHYSADDR / _DMA_USES_VIRTADDR + * This describes the type of addresses used by system DMA: + * + * _DMA_USES_PHYSADDR: + * This type of DMA, used in the x86 implementation, + * requires physical addresses for DMA buffers. The 24-bit + * addresses used by some legacy boards is the source of the + * "low-memory" (<16MB) requirement for some devices using DMA. + * + * _DMA_USES_VIRTADDR: + * This method of DMA allows the use of virtual addresses for + * DMA transfers. + * + * _FIRMWARE_NEEDS_FDISK / _NO_FDISK_PRESENT + * This indicates the presence/absence of an fdisk table. + * + * _FIRMWARE_NEEDS_FDISK + * The fdisk table is required by system firmware. If present, + * it allows a disk to be subdivided into multiple fdisk + * partitions, each of which is equivalent to a separate, + * virtual disk. This enables the co-existence of multiple + * operating systems on a shared hard disk. + * + * _NO_FDISK_PRESENT + * If the fdisk table is absent, it is assumed that the entire + * media is allocated for a single operating system. + * + * _HAVE_TEM_FIRMWARE + * Defined if this architecture has the (fallback) option of + * using prom_* calls for doing I/O if a suitable kernel driver + * is not available to do it. + * + * _DONT_USE_1275_GENERIC_NAMES + * Controls whether or not device tree node names should + * comply with the IEEE 1275 "Generic Names" Recommended + * Practice. With _DONT_USE_GENERIC_NAMES, device-specific + * names identifying the particular device will be used. + * + * __i386_COMPAT + * This indicates whether the i386 ABI is supported as a *non-native* + * mode for the platform. When this symbol is defined: + * - 32-bit xstat-style system calls are enabled + * - 32-bit xmknod-style system calls are enabled + * - 32-bit system calls use i386 sizes -and- alignments + * + * Note that this is NOT defined for the i386 native environment! + * + * __x86 + * This is ONLY a synonym for defined(__i386) || defined(__amd64) + * which is useful only insofar as these two architectures share + * common attributes. Analogous to __sparc. + * + * _PSM_MODULES + * This indicates whether or not the implementation uses PSM + * modules for processor support, reading /etc/mach from inside + * the kernel to extract a list. + * + * _RTC_CONFIG + * This indicates whether or not the implementation uses /etc/rtc_config + * to configure the real-time clock in the kernel. + * + * _UNIX_KRTLD + * This indicates that the implementation uses a dynamically + * linked unix + krtld to form the core kernel image at boot + * time, or (in the absence of this symbol) a prelinked kernel image. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The following set of definitions characterize Solaris on AMD's + * 64-bit systems. + */ +#if defined(__x86_64) || defined(__amd64) + +#if !defined(__amd64) +#define __amd64 /* preferred guard */ +#endif + +#if !defined(__x86) +#define __x86 +#endif + +/* + * Define the appropriate "processor characteristics" + */ +#define _LITTLE_ENDIAN +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +/* + * Different alignment constraints for the i386 ABI in compatibility mode + */ +#define _LONG_LONG_ALIGNMENT_32 4 + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_LP64) +#define _LP64 +#endif +#if !defined(_I32LPx) && defined(_KERNEL) +#define _I32LPx +#endif +#define _MULTI_DATAMODEL +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define __i386_COMPAT +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +/* + * The feature test macro __i386 is generic for all processors implementing + * the Intel 386 instruction set or a superset of it. Specifically, this + * includes all members of the 386, 486, and Pentium family of processors. + */ +#elif defined(__i386) || defined(__i386__) + +#if !defined(__i386) +#define __i386 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +/* + * Define the appropriate "processor characteristics" + */ +#define _LITTLE_ENDIAN +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". + */ +#define _ILP32 +#if !defined(_I32LPx) && defined(_KERNEL) +#define _I32LPx +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +/* + * The following set of definitions characterize the Solaris on SPARC systems. + * + * The symbol __sparc indicates any of the SPARC family of processor + * architectures. This includes SPARC V7, SPARC V8 and SPARC V9. + * + * The symbol __sparcv8 indicates the 32-bit SPARC V8 architecture as defined + * by Version 8 of the SPARC Architecture Manual. (SPARC V7 is close enough + * to SPARC V8 for the former to be subsumed into the latter definition.) + * + * The symbol __sparcv9 indicates the 64-bit SPARC V9 architecture as defined + * by Version 9 of the SPARC Architecture Manual. + * + * The symbols __sparcv8 and __sparcv9 are mutually exclusive, and are only + * relevant when the symbol __sparc is defined. + */ +/* + * XXX Due to the existence of 5110166, "defined(__sparcv9)" needs to be added + * to support backwards builds. This workaround should be removed in s10_71. + */ +#elif defined(__sparc) || defined(__sparcv9) || defined(__sparc__) +#if !defined(__sparc) +#define __sparc +#endif + +/* + * You can be 32-bit or 64-bit, but not both at the same time. + */ +#if defined(__sparcv8) && defined(__sparcv9) +#error "SPARC Versions 8 and 9 are mutually exclusive choices" +#endif + +/* + * Existing compilers do not set __sparcv8. Years will transpire before + * the compilers can be depended on to set the feature test macro. In + * the interim, we'll set it here on the basis of historical behaviour; + * if you haven't asked for SPARC V9, then you must've meant SPARC V8. + */ +#if !defined(__sparcv9) && !defined(__sparcv8) +#define __sparcv8 +#endif + +/* + * Define the appropriate "processor characteristics" shared between + * all Solaris on SPARC systems. + */ +#define _BIG_ENDIAN +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_HTOL +#define _BIT_FIELDS_HTOL +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _ALIGNMENT_REQUIRED 1 + +/* + * Define the appropriate "implementation choices" shared between versions. + */ +#define _SUNOS_VTOC_8 +#define _DMA_USES_VIRTADDR +#define _NO_FDISK_PRESENT +#define _HAVE_TEM_FIRMWARE +#define _UNIX_KRTLD + +/* + * The following set of definitions characterize the implementation of + * 32-bit Solaris on SPARC V8 systems. + */ +#if defined(__sparcv8) + +/* + * Define the appropriate "processor characteristics" + */ +#define _LONG_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 8 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 8 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#define _ILP32 +#if !defined(_I32LPx) && defined(_KERNEL) +#define _I32LPx +#endif + +/* + * The following set of definitions characterize the implementation of + * 64-bit Solaris on SPARC V9 systems. + */ +#elif defined(__sparcv9) + +/* + * Define the appropriate "processor characteristics" + */ +#define _LONG_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#define _LP64 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _MULTI_DATAMODEL + +#else +#error "unknown SPARC version" +#endif + +/* + * #error is strictly ansi-C, but works as well as anything for K&R systems. + */ +#else +#error "ISA not supported" +#endif + +#if defined(_ILP32) && defined(_LP64) +#error "Both _ILP32 and _LP64 are defined" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ISA_DEFS_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/list.h b/sys/contrib/opensolaris/uts/common/sys/list.h new file mode 100644 index 0000000..7e9d9aa --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/list.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_LIST_H +#define _SYS_LIST_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/list_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct list_node list_node_t; +typedef struct list list_t; + +void list_create(list_t *, size_t, size_t); +void list_destroy(list_t *); + +void list_insert_after(list_t *, void *, void *); +void list_insert_before(list_t *, void *, void *); +void list_insert_head(list_t *, void *); +void list_insert_tail(list_t *, void *); +void list_remove(list_t *, void *); +void list_move_tail(list_t *, list_t *); + +void *list_head(list_t *); +void *list_tail(list_t *); +void *list_next(list_t *, void *); +void *list_prev(list_t *, void *); + +int list_link_active(list_node_t *); +int list_is_empty(list_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIST_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/list_impl.h b/sys/contrib/opensolaris/uts/common/sys/list_impl.h new file mode 100644 index 0000000..9c42f88 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/list_impl.h @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_LIST_IMPL_H +#define _SYS_LIST_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct list_node { + struct list_node *list_next; + struct list_node *list_prev; +}; + +struct list { + size_t list_size; + size_t list_offset; + struct list_node list_head; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIST_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/note.h b/sys/contrib/opensolaris/uts/common/sys/note.h new file mode 100644 index 0000000..2cb7fd8 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/note.h @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1994 by Sun Microsystems, Inc. + */ + +/* + * sys/note.h: interface for annotating source with info for tools + * + * This is the underlying interface; NOTE (/usr/include/note.h) is the + * preferred interface, but all exported header files should include this + * file directly and use _NOTE so as not to take "NOTE" from the user's + * namespace. For consistency, *all* kernel source should use _NOTE. + * + * By default, annotations expand to nothing. This file implements + * that. Tools using annotations will interpose a different version + * of this file that will expand annotations as needed. + */ + +#ifndef _SYS_NOTE_H +#define _SYS_NOTE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _NOTE +#define _NOTE(s) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_NOTE_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/nvpair.h b/sys/contrib/opensolaris/uts/common/sys/nvpair.h new file mode 100644 index 0000000..306e30f --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/nvpair.h @@ -0,0 +1,260 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_NVPAIR_H +#define _SYS_NVPAIR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/errno.h> + +#if defined(_KERNEL) && !defined(_BOOT) +#include <sys/kmem.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + DATA_TYPE_UNKNOWN = 0, + DATA_TYPE_BOOLEAN, + DATA_TYPE_BYTE, + DATA_TYPE_INT16, + DATA_TYPE_UINT16, + DATA_TYPE_INT32, + DATA_TYPE_UINT32, + DATA_TYPE_INT64, + DATA_TYPE_UINT64, + DATA_TYPE_STRING, + DATA_TYPE_BYTE_ARRAY, + DATA_TYPE_INT16_ARRAY, + DATA_TYPE_UINT16_ARRAY, + DATA_TYPE_INT32_ARRAY, + DATA_TYPE_UINT32_ARRAY, + DATA_TYPE_INT64_ARRAY, + DATA_TYPE_UINT64_ARRAY, + DATA_TYPE_STRING_ARRAY, + DATA_TYPE_HRTIME, + DATA_TYPE_NVLIST, + DATA_TYPE_NVLIST_ARRAY, + DATA_TYPE_BOOLEAN_VALUE, + DATA_TYPE_INT8, + DATA_TYPE_UINT8, + DATA_TYPE_BOOLEAN_ARRAY, + DATA_TYPE_INT8_ARRAY, + DATA_TYPE_UINT8_ARRAY +} data_type_t; + +typedef struct nvpair { + int32_t nvp_size; /* size of this nvpair */ + int16_t nvp_name_sz; /* length of name string */ + int16_t nvp_reserve; /* not used */ + int32_t nvp_value_elem; /* number of elements for array types */ + data_type_t nvp_type; /* type of value */ + /* name string */ + /* aligned ptr array for string arrays */ + /* aligned array of data for value */ +} nvpair_t; + +/* nvlist header */ +typedef struct nvlist { + int32_t nvl_version; + uint32_t nvl_nvflag; /* persistent flags */ + uint64_t nvl_priv; /* ptr to private data if not packed */ + uint32_t nvl_flag; + int32_t nvl_pad; /* currently not used, for alignment */ +} nvlist_t; + +/* nvp implementation version */ +#define NV_VERSION 0 + +/* nvlist pack encoding */ +#define NV_ENCODE_NATIVE 0 +#define NV_ENCODE_XDR 1 + +/* nvlist persistent unique name flags, stored in nvl_nvflags */ +#define NV_UNIQUE_NAME 0x1 +#define NV_UNIQUE_NAME_TYPE 0x2 + +/* nvlist lookup pairs related flags */ +#define NV_FLAG_NOENTOK 0x1 + +/* convenience macros */ +#define NV_ALIGN(x) (((ulong_t)(x) + 7ul) & ~7ul) +#define NV_ALIGN4(x) (((x) + 3) & ~3) + +#define NVP_SIZE(nvp) ((nvp)->nvp_size) +#define NVP_NAME(nvp) ((char *)(nvp) + sizeof (nvpair_t)) +#define NVP_TYPE(nvp) ((nvp)->nvp_type) +#define NVP_NELEM(nvp) ((nvp)->nvp_value_elem) +#define NVP_VALUE(nvp) ((char *)(nvp) + NV_ALIGN(sizeof (nvpair_t) \ + + (nvp)->nvp_name_sz)) + +#define NVL_VERSION(nvl) ((nvl)->nvl_version) +#define NVL_SIZE(nvl) ((nvl)->nvl_size) +#define NVL_FLAG(nvl) ((nvl)->nvl_flag) + +/* NV allocator framework */ +typedef struct nv_alloc_ops nv_alloc_ops_t; + +typedef struct nv_alloc { + const nv_alloc_ops_t *nva_ops; + void *nva_arg; +} nv_alloc_t; + +struct nv_alloc_ops { + int (*nv_ao_init)(nv_alloc_t *, __va_list); + void (*nv_ao_fini)(nv_alloc_t *); + void *(*nv_ao_alloc)(nv_alloc_t *, size_t); + void (*nv_ao_free)(nv_alloc_t *, void *, size_t); + void (*nv_ao_reset)(nv_alloc_t *); +}; + +extern const nv_alloc_ops_t *nv_fixed_ops; +extern nv_alloc_t *nv_alloc_nosleep; + +#if defined(_KERNEL) && !defined(_BOOT) +extern nv_alloc_t *nv_alloc_sleep; +#endif + +int nv_alloc_init(nv_alloc_t *, const nv_alloc_ops_t *, /* args */ ...); +void nv_alloc_reset(nv_alloc_t *); +void nv_alloc_fini(nv_alloc_t *); + +/* list management */ +int nvlist_alloc(nvlist_t **, uint_t, int); +void nvlist_free(nvlist_t *); +int nvlist_size(nvlist_t *, size_t *, int); +int nvlist_pack(nvlist_t *, char **, size_t *, int, int); +int nvlist_unpack(char *, size_t, nvlist_t **, int); +int nvlist_dup(nvlist_t *, nvlist_t **, int); +int nvlist_merge(nvlist_t *, nvlist_t *, int); + +int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *); +int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *); +int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *); +int nvlist_xdup(nvlist_t *, nvlist_t **, nv_alloc_t *); +nv_alloc_t *nvlist_lookup_nv_alloc(nvlist_t *); + +int nvlist_add_nvpair(nvlist_t *, nvpair_t *); +int nvlist_add_boolean(nvlist_t *, const char *); +int nvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); +int nvlist_add_byte(nvlist_t *, const char *, uchar_t); +int nvlist_add_int8(nvlist_t *, const char *, int8_t); +int nvlist_add_uint8(nvlist_t *, const char *, uint8_t); +int nvlist_add_int16(nvlist_t *, const char *, int16_t); +int nvlist_add_uint16(nvlist_t *, const char *, uint16_t); +int nvlist_add_int32(nvlist_t *, const char *, int32_t); +int nvlist_add_uint32(nvlist_t *, const char *, uint32_t); +int nvlist_add_int64(nvlist_t *, const char *, int64_t); +int nvlist_add_uint64(nvlist_t *, const char *, uint64_t); +int nvlist_add_string(nvlist_t *, const char *, const char *); +int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); +int nvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t); +int nvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t); +int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); +int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); +int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); +int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); +int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); +int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); +int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); +int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); +int nvlist_add_string_array(nvlist_t *, const char *, char *const *, uint_t); +int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); +int nvlist_add_hrtime(nvlist_t *, const char *, hrtime_t); + +int nvlist_remove(nvlist_t *, const char *, data_type_t); +int nvlist_remove_all(nvlist_t *, const char *); + +int nvlist_lookup_boolean(nvlist_t *, const char *); +int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *); +int nvlist_lookup_byte(nvlist_t *, const char *, uchar_t *); +int nvlist_lookup_int8(nvlist_t *, const char *, int8_t *); +int nvlist_lookup_uint8(nvlist_t *, const char *, uint8_t *); +int nvlist_lookup_int16(nvlist_t *, const char *, int16_t *); +int nvlist_lookup_uint16(nvlist_t *, const char *, uint16_t *); +int nvlist_lookup_int32(nvlist_t *, const char *, int32_t *); +int nvlist_lookup_uint32(nvlist_t *, const char *, uint32_t *); +int nvlist_lookup_int64(nvlist_t *, const char *, int64_t *); +int nvlist_lookup_uint64(nvlist_t *, const char *, uint64_t *); +int nvlist_lookup_string(nvlist_t *, const char *, char **); +int nvlist_lookup_nvlist(nvlist_t *, const char *, nvlist_t **); +int nvlist_lookup_boolean_array(nvlist_t *, const char *, + boolean_t **, uint_t *); +int nvlist_lookup_byte_array(nvlist_t *, const char *, uchar_t **, uint_t *); +int nvlist_lookup_int8_array(nvlist_t *, const char *, int8_t **, uint_t *); +int nvlist_lookup_uint8_array(nvlist_t *, const char *, uint8_t **, uint_t *); +int nvlist_lookup_int16_array(nvlist_t *, const char *, int16_t **, uint_t *); +int nvlist_lookup_uint16_array(nvlist_t *, const char *, uint16_t **, uint_t *); +int nvlist_lookup_int32_array(nvlist_t *, const char *, int32_t **, uint_t *); +int nvlist_lookup_uint32_array(nvlist_t *, const char *, uint32_t **, uint_t *); +int nvlist_lookup_int64_array(nvlist_t *, const char *, int64_t **, uint_t *); +int nvlist_lookup_uint64_array(nvlist_t *, const char *, uint64_t **, uint_t *); +int nvlist_lookup_string_array(nvlist_t *, const char *, char ***, uint_t *); +int nvlist_lookup_nvlist_array(nvlist_t *, const char *, + nvlist_t ***, uint_t *); +int nvlist_lookup_hrtime(nvlist_t *, const char *, hrtime_t *); +int nvlist_lookup_pairs(nvlist_t *nvl, int, ...); + +/* processing nvpair */ +nvpair_t *nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *); +char *nvpair_name(nvpair_t *); +data_type_t nvpair_type(nvpair_t *); +int nvpair_value_boolean_value(nvpair_t *, boolean_t *); +int nvpair_value_byte(nvpair_t *, uchar_t *); +int nvpair_value_int8(nvpair_t *, int8_t *); +int nvpair_value_uint8(nvpair_t *, uint8_t *); +int nvpair_value_int16(nvpair_t *, int16_t *); +int nvpair_value_uint16(nvpair_t *, uint16_t *); +int nvpair_value_int32(nvpair_t *, int32_t *); +int nvpair_value_uint32(nvpair_t *, uint32_t *); +int nvpair_value_int64(nvpair_t *, int64_t *); +int nvpair_value_uint64(nvpair_t *, uint64_t *); +int nvpair_value_string(nvpair_t *, char **); +int nvpair_value_nvlist(nvpair_t *, nvlist_t **); +int nvpair_value_boolean_array(nvpair_t *, boolean_t **, uint_t *); +int nvpair_value_byte_array(nvpair_t *, uchar_t **, uint_t *); +int nvpair_value_int8_array(nvpair_t *, int8_t **, uint_t *); +int nvpair_value_uint8_array(nvpair_t *, uint8_t **, uint_t *); +int nvpair_value_int16_array(nvpair_t *, int16_t **, uint_t *); +int nvpair_value_uint16_array(nvpair_t *, uint16_t **, uint_t *); +int nvpair_value_int32_array(nvpair_t *, int32_t **, uint_t *); +int nvpair_value_uint32_array(nvpair_t *, uint32_t **, uint_t *); +int nvpair_value_int64_array(nvpair_t *, int64_t **, uint_t *); +int nvpair_value_uint64_array(nvpair_t *, uint64_t **, uint_t *); +int nvpair_value_string_array(nvpair_t *, char ***, uint_t *); +int nvpair_value_nvlist_array(nvpair_t *, nvlist_t ***, uint_t *); +int nvpair_value_hrtime(nvpair_t *, hrtime_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_NVPAIR_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/nvpair_impl.h b/sys/contrib/opensolaris/uts/common/sys/nvpair_impl.h new file mode 100644 index 0000000..f12dbbf --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/nvpair_impl.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _NVPAIR_IMPL_H +#define _NVPAIR_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/nvpair.h> + +/* + * The structures here provided for information and debugging purposes only + * may be changed in the future. + */ + +/* + * implementation linked list for pre-packed data + */ +typedef struct i_nvp i_nvp_t; + +struct i_nvp { + union { + uint64_t _nvi_align; /* ensure alignment */ + struct { + i_nvp_t *_nvi_next; /* pointer to next nvpair */ + i_nvp_t *_nvi_prev; /* pointer to prev nvpair */ + } _nvi; + } _nvi_un; + nvpair_t nvi_nvp; /* nvpair */ +}; +#define nvi_next _nvi_un._nvi._nvi_next +#define nvi_prev _nvi_un._nvi._nvi_prev + +typedef struct { + i_nvp_t *nvp_list; /* linked list of nvpairs */ + i_nvp_t *nvp_last; /* last nvpair */ + i_nvp_t *nvp_curr; /* current walker nvpair */ + nv_alloc_t *nvp_nva; /* pluggable allocator */ + uint32_t nvp_stat; /* internal state */ +} nvpriv_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _NVPAIR_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/processor.h b/sys/contrib/opensolaris/uts/common/sys/processor.h new file mode 100644 index 0000000..063f7dacb --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/processor.h @@ -0,0 +1,146 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T + * All Rights Reserved + * + */ + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_PROCESSOR_H +#define _SYS_PROCESSOR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/procset.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Definitions for p_online, processor_info & lgrp system calls. + */ + +/* + * Type for an lgrpid + */ +typedef uint16_t lgrpid_t; + +/* + * Type for processor name (CPU number). + */ +typedef int processorid_t; +typedef int chipid_t; + +/* + * Flags and return values for p_online(2), and pi_state for processor_info(2). + * These flags are *not* for in-kernel examination of CPU states. + * See <sys/cpuvar.h> for appropriate informational functions. + */ +#define P_OFFLINE 0x0001 /* processor is offline, as quiet as possible */ +#define P_ONLINE 0x0002 /* processor is online */ +#define P_STATUS 0x0003 /* value passed to p_online to request status */ +#define P_FAULTED 0x0004 /* processor is offline, in faulted state */ +#define P_POWEROFF 0x0005 /* processor is powered off */ +#define P_NOINTR 0x0006 /* processor is online, but no I/O interrupts */ +#define P_SPARE 0x0007 /* processor is offline, can be reactivated */ +#define P_BAD P_FAULTED /* unused but defined by USL */ +#define P_FORCED 0x10000000 /* force processor offline */ + +/* + * String names for processor states defined above. + */ +#define PS_OFFLINE "off-line" +#define PS_ONLINE "on-line" +#define PS_FAULTED "faulted" +#define PS_POWEROFF "powered-off" +#define PS_NOINTR "no-intr" +#define PS_SPARE "spare" + +/* + * Structure filled in by processor_info(2). + * + * The string fields are guaranteed to contain a NULL. + * + * The pi_fputypes field contains a (possibly empty) comma-separated + * list of floating point identifier strings. + */ +#define PI_TYPELEN 16 /* max size of CPU type string */ +#define PI_FPUTYPE 32 /* max size of FPU types string */ + +typedef struct { + int pi_state; /* processor state, see above */ + char pi_processor_type[PI_TYPELEN]; /* ASCII CPU type */ + char pi_fputypes[PI_FPUTYPE]; /* ASCII FPU types */ + int pi_clock; /* CPU clock freq in MHz */ +} processor_info_t; + +/* + * Binding values for processor_bind(2) + */ +#define PBIND_NONE -1 /* LWP/thread is not bound */ +#define PBIND_QUERY -2 /* don't set, just return the binding */ + +/* + * User-level system call interface prototypes + */ +#ifndef _KERNEL +#ifdef __STDC__ + +extern int p_online(processorid_t processorid, int flag); +extern int processor_info(processorid_t processorid, + processor_info_t *infop); +extern int processor_bind(idtype_t idtype, id_t id, + processorid_t processorid, processorid_t *obind); +extern processorid_t getcpuid(void); +extern lgrpid_t gethomelgroup(void); + +#else + +extern int p_online(); +extern int processor_info(); +extern int processor_bind(); +extern processorid_t getcpuid(); +extern lgrpid_t gethomelgroup(); + +#endif /* __STDC__ */ + +#else /* _KERNEL */ + +/* + * Internal interface prototypes + */ +extern int p_online_internal(processorid_t, int, int *); + +#endif /* !_KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PROCESSOR_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/procset.h b/sys/contrib/opensolaris/uts/common/sys/procset.h new file mode 100644 index 0000000..c3b5867 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/procset.h @@ -0,0 +1,160 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#ifndef _SYS_PROCSET_H +#define _SYS_PROCSET_H + +#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.6 */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/feature_tests.h> +#include <sys/types.h> +#include <sys/signal.h> + +/* + * This file defines the data needed to specify a set of + * processes. These types are used by the sigsend, sigsendset, + * priocntl, priocntlset, waitid, evexit, and evexitset system + * calls. + */ +#define P_INITPID 1 +#define P_INITUID 0 +#define P_INITPGID 0 + + +/* + * The following defines the values for an identifier type. It + * specifies the interpretation of an id value. An idtype and + * id together define a simple set of processes. + */ +typedef enum +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) + idtype /* pollutes XPG4.2 namespace */ +#endif + { + P_PID, /* A process identifier. */ + P_PPID, /* A parent process identifier. */ + P_PGID, /* A process group (job control group) */ + /* identifier. */ + P_SID, /* A session identifier. */ + P_CID, /* A scheduling class identifier. */ + P_UID, /* A user identifier. */ + P_GID, /* A group identifier. */ + P_ALL, /* All processes. */ + P_LWPID, /* An LWP identifier. */ + P_TASKID, /* A task identifier. */ + P_PROJID, /* A project identifier. */ + P_POOLID, /* A pool identifier. */ + P_ZONEID, /* A zone identifier. */ + P_CTID, /* A (process) contract identifier. */ + P_CPUID, /* CPU identifier. */ + P_PSETID /* Processor set identifier */ +} idtype_t; + + +/* + * The following defines the operations which can be performed to + * combine two simple sets of processes to form another set of + * processes. + */ +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) +typedef enum idop { + POP_DIFF, /* Set difference. The processes which */ + /* are in the left operand set and not */ + /* in the right operand set. */ + POP_AND, /* Set disjunction. The processes */ + /* which are in both the left and right */ + /* operand sets. */ + POP_OR, /* Set conjunction. The processes */ + /* which are in either the left or the */ + /* right operand sets (or both). */ + POP_XOR /* Set exclusive or. The processes */ + /* which are in either the left or */ + /* right operand sets but not in both. */ +} idop_t; + + +/* + * The following structure is used to define a set of processes. + * The set is defined in terms of two simple sets of processes + * and an operator which operates on these two operand sets. + */ +typedef struct procset { + idop_t p_op; /* The operator connection the */ + /* following two operands each */ + /* of which is a simple set of */ + /* processes. */ + + idtype_t p_lidtype; + /* The type of the left operand */ + /* simple set. */ + id_t p_lid; /* The id of the left operand. */ + + idtype_t p_ridtype; + /* The type of the right */ + /* operand simple set. */ + id_t p_rid; /* The id of the right operand. */ +} procset_t; + +/* + * The following macro can be used to initialize a procset_t + * structure. + */ +#define setprocset(psp, op, ltype, lid, rtype, rid) \ + (psp)->p_op = (op); \ + (psp)->p_lidtype = (ltype); \ + (psp)->p_lid = (lid); \ + (psp)->p_ridtype = (rtype); \ + (psp)->p_rid = (rid); + +#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */ + +#ifdef _KERNEL + +struct proc; + +extern int dotoprocs(procset_t *, int (*)(), char *); +extern int dotolwp(procset_t *, int (*)(), char *); +extern int procinset(struct proc *, procset_t *); +extern int sigsendproc(struct proc *, sigsend_t *); +extern int sigsendset(procset_t *, sigsend_t *); +extern boolean_t cur_inset_only(procset_t *); +extern id_t getmyid(idtype_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PROCSET_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/sdt.h b/sys/contrib/opensolaris/uts/common/sys/sdt.h new file mode 100644 index 0000000..da695c9 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/sdt.h @@ -0,0 +1,176 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SDT_H +#define _SYS_SDT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _KERNEL + +#define DTRACE_PROBE(provider, name) { \ + extern void __dtrace_##provider##___##name(void); \ + __dtrace_##provider##___##name(); \ +} + +#define DTRACE_PROBE1(provider, name, arg1) { \ + extern void __dtrace_##provider##___##name(unsigned long); \ + __dtrace_##provider##___##name((unsigned long)arg1); \ +} + +#define DTRACE_PROBE2(provider, name, arg1, arg2) { \ + extern void __dtrace_##provider##___##name(unsigned long, \ + unsigned long); \ + __dtrace_##provider##___##name((unsigned long)arg1, \ + (unsigned long)arg2); \ +} + +#define DTRACE_PROBE3(provider, name, arg1, arg2, arg3) { \ + extern void __dtrace_##provider##___##name(unsigned long, \ + unsigned long, unsigned long); \ + __dtrace_##provider##___##name((unsigned long)arg1, \ + (unsigned long)arg2, (unsigned long)arg3); \ +} + +#define DTRACE_PROBE4(provider, name, arg1, arg2, arg3, arg4) { \ + extern void __dtrace_##provider##___##name(unsigned long, \ + unsigned long, unsigned long, unsigned long); \ + __dtrace_##provider##___##name((unsigned long)arg1, \ + (unsigned long)arg2, (unsigned long)arg3, \ + (unsigned long)arg4); \ +} + +#define DTRACE_PROBE5(provider, name, arg1, arg2, arg3, arg4, arg5) { \ + extern void __dtrace_##provider##___##name(unsigned long, \ + unsigned long, unsigned long, unsigned long, unsigned long);\ + __dtrace_##provider##___##name((unsigned long)arg1, \ + (unsigned long)arg2, (unsigned long)arg3, \ + (unsigned long)arg4, (unsigned long)arg5); \ +} + +#else /* _KERNEL */ + +#define DTRACE_PROBE(name) { \ + extern void __dtrace_probe_##name(void); \ + __dtrace_probe_##name(); \ +} + +#define DTRACE_PROBE1(name, type1, arg1) { \ + extern void __dtrace_probe_##name(uintptr_t); \ + __dtrace_probe_##name((uintptr_t)(arg1)); \ +} + +#define DTRACE_PROBE2(name, type1, arg1, type2, arg2) { \ + extern void __dtrace_probe_##name(uintptr_t, uintptr_t); \ + __dtrace_probe_##name((uintptr_t)(arg1), (uintptr_t)(arg2)); \ +} + +#define DTRACE_PROBE3(name, type1, arg1, type2, arg2, type3, arg3) { \ + extern void __dtrace_probe_##name(uintptr_t, uintptr_t, uintptr_t); \ + __dtrace_probe_##name((uintptr_t)(arg1), (uintptr_t)(arg2), \ + (uintptr_t)(arg3)); \ +} + +#define DTRACE_PROBE4(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4) { \ + extern void __dtrace_probe_##name(uintptr_t, uintptr_t, \ + uintptr_t, uintptr_t); \ + __dtrace_probe_##name((uintptr_t)(arg1), (uintptr_t)(arg2), \ + (uintptr_t)(arg3), (uintptr_t)(arg4)); \ +} + +#define DTRACE_SCHED(name) \ + DTRACE_PROBE(__sched_##name); + +#define DTRACE_SCHED1(name, type1, arg1) \ + DTRACE_PROBE1(__sched_##name, type1, arg1); + +#define DTRACE_SCHED2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__sched_##name, type1, arg1, type2, arg2); + +#define DTRACE_SCHED3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__sched_##name, type1, arg1, type2, arg2, type3, arg3); + +#define DTRACE_SCHED4(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__sched_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4); + +#define DTRACE_PROC(name) \ + DTRACE_PROBE(__proc_##name); + +#define DTRACE_PROC1(name, type1, arg1) \ + DTRACE_PROBE1(__proc_##name, type1, arg1); + +#define DTRACE_PROC2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__proc_##name, type1, arg1, type2, arg2); + +#define DTRACE_PROC3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__proc_##name, type1, arg1, type2, arg2, type3, arg3); + +#define DTRACE_PROC4(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__proc_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4); + +#define DTRACE_IO(name) \ + DTRACE_PROBE(__io_##name); + +#define DTRACE_IO1(name, type1, arg1) \ + DTRACE_PROBE1(__io_##name, type1, arg1); + +#define DTRACE_IO2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__io_##name, type1, arg1, type2, arg2); + +#define DTRACE_IO3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__io_##name, type1, arg1, type2, arg2, type3, arg3); + +#define DTRACE_IO4(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__io_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4); + +#define DTRACE_SYSEVENT2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__sysevent_##name, type1, arg1, type2, arg2); + +#endif /* _KERNEL */ + +extern const char *sdt_prefix; + +typedef struct sdt_probedesc { + char *sdpd_name; /* name of this probe */ + unsigned long sdpd_offset; /* offset of call in text */ + struct sdt_probedesc *sdpd_next; /* next static probe */ +} sdt_probedesc_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SDT_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/synch.h b/sys/contrib/opensolaris/uts/common/sys/synch.h new file mode 100644 index 0000000..8f52d72 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/synch.h @@ -0,0 +1,161 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SYNCH_H +#define _SYS_SYNCH_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifndef _ASM +#include <sys/types.h> +#include <sys/int_types.h> +#endif /* _ASM */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _ASM +/* + * Thread and LWP mutexes have the same type + * definitions. + * + * NOTE: + * + * POSIX requires that <pthread.h> define the structures pthread_mutex_t + * and pthread_cond_t. Although these structures are identical to mutex_t + * (lwp_mutex_t) and cond_t (lwp_cond_t), defined here, a typedef of these + * types would require including <synch.h> in <pthread.h>, pulling in + * non-posix symbols/constants, violating POSIX namespace restrictions. Hence, + * pthread_mutex_t/pthread_cond_t have been redefined (in <sys/types.h>). + * Any modifications done to mutex_t/lwp_mutex_t or cond_t/lwp_cond_t must + * also be done to pthread_mutex_t/pthread_cond_t. + */ +typedef struct _lwp_mutex { + struct { + uint16_t flag1; + uint8_t flag2; + uint8_t ceiling; + union { + uint16_t bcptype; + struct { + uint8_t count_type1; + uint8_t count_type2; + } mtype_rcount; + } mbcp_type_un; + uint16_t magic; + } flags; + union { + struct { + uint8_t pad[8]; + } lock64; + struct { + uint32_t ownerpid; + uint32_t lockword; + } lock32; + upad64_t owner64; + } lock; + upad64_t data; +} lwp_mutex_t; + +/* + * Thread and LWP condition variables have the same + * type definition. + * NOTE: + * The layout of the following structure should be kept in sync with the + * layout of pthread_cond_t in sys/types.h. See NOTE above for lwp_mutex_t. + */ +typedef struct _lwp_cond { + struct { + uint8_t flag[4]; + uint16_t type; + uint16_t magic; + } flags; + upad64_t data; +} lwp_cond_t; + +/* + * LWP semaphores + */ +typedef struct _lwp_sema { + uint32_t count; /* semaphore count */ + uint16_t type; + uint16_t magic; + uint8_t flags[8]; /* last byte reserved for waiters */ + upad64_t data; /* optional data */ +} lwp_sema_t; + +/* + * Thread and LWP rwlocks have the same type definition. + * NOTE: The layout of this structure should be kept in sync with the layout + * of the correponding structure of pthread_rwlock_t in sys/types.h. + * Also, because we have to deal with C++, there is an identical structure + * for rwlock_t in head/sync.h that we cannot change. + */ +typedef struct _lwp_rwlock { + int32_t readers; /* -1 == writer else # of readers */ + uint16_t type; + uint16_t magic; + lwp_mutex_t mutex; /* used to indicate ownership */ + lwp_cond_t readercv; /* unused */ + lwp_cond_t writercv; /* unused */ +} lwp_rwlock_t; + +#endif /* _ASM */ +/* + * Definitions of synchronization types. + */ +#define USYNC_THREAD 0x00 /* private to a process */ +#define USYNC_PROCESS 0x01 /* shared by processes */ + +/* Keep the following 3 fields in sync with pthread.h */ +#define LOCK_NORMAL 0x00 /* same as USYNC_THREAD */ +#define LOCK_ERRORCHECK 0x02 /* error check lock */ +#define LOCK_RECURSIVE 0x04 /* recursive lock */ + +#define USYNC_PROCESS_ROBUST 0x08 /* shared by processes robustly */ + +/* Keep the following 5 fields in sync with pthread.h */ + +#define LOCK_PRIO_NONE 0x00 +#define LOCK_PRIO_INHERIT 0x10 +#define LOCK_PRIO_PROTECT 0x20 +#define LOCK_STALL_NP 0x00 +#define LOCK_ROBUST_NP 0x40 + +/* + * lwp_mutex_t flags + */ +#define LOCK_OWNERDEAD 0x1 +#define LOCK_NOTRECOVERABLE 0x2 +#define LOCK_INITED 0x4 +#define LOCK_UNMAPPED 0x8 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYNCH_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/sysevent.h b/sys/contrib/opensolaris/uts/common/sys/sysevent.h new file mode 100644 index 0000000..0a61e41 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/sysevent.h @@ -0,0 +1,227 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SYSEVENT_H +#define _SYS_SYSEVENT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/nvpair.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef NULL +#if defined(_LP64) && !defined(__cplusplus) +#define NULL 0L +#else +#define NULL 0 +#endif +#endif + +/* Internal registration class and subclass */ +#define EC_ALL "register_all_classes" +#define EC_SUB_ALL "register_all_subclasses" + +/* + * Event allocation/enqueuing sleep/nosleep flags + */ +#define SE_SLEEP 0 +#define SE_NOSLEEP 1 + +/* Framework error codes */ +#define SE_EINVAL 1 /* Invalid argument */ +#define SE_ENOMEM 2 /* Unable to allocate memory */ +#define SE_EQSIZE 3 /* Maximum event q size exceeded */ +#define SE_EFAULT 4 /* Copy fault */ +#define SE_NOTFOUND 5 /* Attribute not found */ +#define SE_NO_TRANSPORT 6 /* sysevent transport down */ + +/* Internal data types */ + +#define SE_DATA_TYPE_BYTE DATA_TYPE_BYTE +#define SE_DATA_TYPE_INT16 DATA_TYPE_INT16 +#define SE_DATA_TYPE_UINT16 DATA_TYPE_UINT16 +#define SE_DATA_TYPE_INT32 DATA_TYPE_INT32 +#define SE_DATA_TYPE_UINT32 DATA_TYPE_UINT32 +#define SE_DATA_TYPE_INT64 DATA_TYPE_INT64 +#define SE_DATA_TYPE_UINT64 DATA_TYPE_UINT64 +#define SE_DATA_TYPE_STRING DATA_TYPE_STRING +#define SE_DATA_TYPE_BYTES DATA_TYPE_BYTE_ARRAY +#define SE_DATA_TYPE_TIME DATA_TYPE_HRTIME + +#define SE_KERN_PID 0 + +#define SUNW_VENDOR "SUNW" +#define SE_USR_PUB "usr:" +#define SE_KERN_PUB "kern:" +#define SUNW_KERN_PUB SUNW_VENDOR":"SE_KERN_PUB +#define SUNW_USR_PUB SUNW_VENDOR":"SE_USR_PUB + +/* + * Event header and attribute value limits + */ +#define MAX_ATTR_NAME 1024 +#define MAX_STRING_SZ 1024 +#define MAX_BYTE_ARRAY 1024 + +#define MAX_CLASS_LEN 64 +#define MAX_SUBCLASS_LEN 64 +#define MAX_PUB_LEN 128 +#define MAX_CHNAME_LEN 128 +#define MAX_SUBID_LEN 16 + +/* + * Limit for the event payload size + */ +#define MAX_EV_SIZE_LEN (SHRT_MAX/4) + +/* Opaque sysevent_t data type */ +typedef void *sysevent_t; + +/* Opaque channel bind data type */ +typedef void evchan_t; + +/* sysevent attribute list */ +typedef nvlist_t sysevent_attr_list_t; + +/* sysevent attribute name-value pair */ +typedef nvpair_t sysevent_attr_t; + +/* Unique event identifier */ +typedef struct sysevent_id { + uint64_t eid_seq; + hrtime_t eid_ts; +} sysevent_id_t; + +/* Event attribute value structures */ +typedef struct sysevent_bytes { + int32_t size; + uchar_t *data; +} sysevent_bytes_t; + +typedef struct sysevent_value { + int32_t value_type; /* data type */ + union { + uchar_t sv_byte; + int16_t sv_int16; + uint16_t sv_uint16; + int32_t sv_int32; + uint32_t sv_uint32; + int64_t sv_int64; + uint64_t sv_uint64; + hrtime_t sv_time; + char *sv_string; + sysevent_bytes_t sv_bytes; + } value; +} sysevent_value_t; + +/* + * The following flags determine the memory allocation semantics to use for + * kernel event buffer allocation by userland and kernel versions of + * sysevent_evc_publish(). + * + * EVCH_SLEEP and EVCH_NOSLEEP respectively map to KM_SLEEP and KM_NOSLEEP. + * EVCH_TRYHARD is a kernel-only publish flag that allow event allocation + * routines to use use alternate kmem caches in situations where free memory + * may be low. Kernel callers of sysevent_evc_publish() must set flags to + * one of EVCH_SLEEP, EVCH_NOSLEEP or EVCH_TRYHARD. Userland callers of + * sysevent_evc_publish() must set flags to one of EVCH_SLEEP or EVCH_NOSLEEP. + * + * EVCH_QWAIT determines whether or not we should wait for slots in the event + * queue at publication time. EVCH_QWAIT may be used by kernel and userland + * publishers and must be used in conjunction with any of one of EVCH_SLEEP, + * EVCH_NOSLEEP or EVCH_TRYHARD (kernel-only). + */ + +#define EVCH_NOSLEEP 0x0001 /* No sleep on kmem_alloc() */ +#define EVCH_SLEEP 0x0002 /* Sleep on kmem_alloc() */ +#define EVCH_TRYHARD 0x0004 /* May use alternate kmem cache for alloc */ +#define EVCH_QWAIT 0x0008 /* Wait for slot in event queue */ + +/* + * Meaning of flags for subscribe/unsubscribe. Bits 0 to 7 are dedicated to + * the consolidation private interface. + */ +#define EVCH_SUB_KEEP 0x0001 +#define EVCH_ALLSUB "all_subs" + +/* + * Meaning of flags parameter of channel bind function + */ +#define EVCH_CREAT 0x0001 /* Create a channel if not present */ +#define EVCH_HOLD_PEND 0x0002 +#define EVCH_B_FLAGS 0x0003 /* All valid bits */ + +/* + * Meaning of commands of evc_control function + */ +#define EVCH_GET_CHAN_LEN_MAX 1 /* Get event queue length limit */ +#define EVCH_GET_CHAN_LEN 2 /* Get event queue length */ +#define EVCH_SET_CHAN_LEN 3 /* Set event queue length */ +#define EVCH_CMD_LAST EVCH_SET_CHAN_LEN /* Last command */ + +/* + * Event channel interface definitions + */ +int sysevent_evc_bind(const char *, evchan_t **, uint32_t); +void sysevent_evc_unbind(evchan_t *); +int sysevent_evc_subscribe(evchan_t *, const char *, const char *, + int (*)(sysevent_t *, void *), void *, uint32_t); +void sysevent_evc_unsubscribe(evchan_t *, const char *); +int sysevent_evc_publish(evchan_t *, const char *, const char *, + const char *, const char *, nvlist_t *, uint32_t); +int sysevent_evc_control(evchan_t *, int, ...); + +#ifdef _KERNEL + +/* + * Kernel log_event interfaces. + */ +int log_sysevent(sysevent_t *, int, sysevent_id_t *); + +sysevent_t *sysevent_alloc(char *, char *, char *, int); +void sysevent_free(sysevent_t *); +int sysevent_add_attr(sysevent_attr_list_t **, char *, sysevent_value_t *, int); +void sysevent_free_attr(sysevent_attr_list_t *); +int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *); +void sysevent_detach_attributes(sysevent_t *); +char *sysevent_get_class_name(sysevent_t *); +char *sysevent_get_subclass_name(sysevent_t *); +uint64_t sysevent_get_seq(sysevent_t *); +void sysevent_get_time(sysevent_t *, hrtime_t *); +size_t sysevent_get_size(sysevent_t *); +char *sysevent_get_pub(sysevent_t *); +int sysevent_get_attr_list(sysevent_t *, nvlist_t **); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSEVENT_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/sysmacros.h b/sys/contrib/opensolaris/uts/common/sys/sysmacros.h new file mode 100644 index 0000000..d772084 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/sysmacros.h @@ -0,0 +1,287 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SYSMACROS_H +#define _SYS_SYSMACROS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Some macros for units conversion + */ +/* + * Disk blocks (sectors) and bytes. + */ +#define dtob(DD) ((DD) << DEV_BSHIFT) +#define btod(BB) (((BB) + DEV_BSIZE - 1) >> DEV_BSHIFT) +#define btodt(BB) ((BB) >> DEV_BSHIFT) +#define lbtod(BB) (((offset_t)(BB) + DEV_BSIZE - 1) >> DEV_BSHIFT) + +/* common macros */ +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) < (b) ? (b) : (a)) +#endif +#ifndef ABS +#define ABS(a) ((a) < 0 ? -(a) : (a)) +#endif + +#ifdef _KERNEL + +/* + * Convert a single byte to/from binary-coded decimal (BCD). + */ +extern unsigned char byte_to_bcd[256]; +extern unsigned char bcd_to_byte[256]; + +#define BYTE_TO_BCD(x) byte_to_bcd[(x) & 0xff] +#define BCD_TO_BYTE(x) bcd_to_byte[(x) & 0xff] + +#endif /* _KERNEL */ + +/* + * WARNING: The device number macros defined here should not be used by device + * drivers or user software. Device drivers should use the device functions + * defined in the DDI/DKI interface (see also ddi.h). Application software + * should make use of the library routines available in makedev(3). A set of + * new device macros are provided to operate on the expanded device number + * format supported in SVR4. Macro versions of the DDI device functions are + * provided for use by kernel proper routines only. Macro routines bmajor(), + * major(), minor(), emajor(), eminor(), and makedev() will be removed or + * their definitions changed at the next major release following SVR4. + */ + +#define O_BITSMAJOR 7 /* # of SVR3 major device bits */ +#define O_BITSMINOR 8 /* # of SVR3 minor device bits */ +#define O_MAXMAJ 0x7f /* SVR3 max major value */ +#define O_MAXMIN 0xff /* SVR3 max minor value */ + + +#define L_BITSMAJOR32 14 /* # of SVR4 major device bits */ +#define L_BITSMINOR32 18 /* # of SVR4 minor device bits */ +#define L_MAXMAJ32 0x3fff /* SVR4 max major value */ +#define L_MAXMIN32 0x3ffff /* MAX minor for 3b2 software drivers. */ + /* For 3b2 hardware devices the minor is */ + /* restricted to 256 (0-255) */ + +#ifdef _LP64 +#define L_BITSMAJOR 32 /* # of major device bits in 64-bit Solaris */ +#define L_BITSMINOR 32 /* # of minor device bits in 64-bit Solaris */ +#define L_MAXMAJ 0xfffffffful /* max major value */ +#define L_MAXMIN 0xfffffffful /* max minor value */ +#else +#define L_BITSMAJOR L_BITSMAJOR32 +#define L_BITSMINOR L_BITSMINOR32 +#define L_MAXMAJ L_MAXMAJ32 +#define L_MAXMIN L_MAXMIN32 +#endif + +#ifdef _KERNEL + +/* major part of a device internal to the kernel */ + +#define major(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ) +#define bmajor(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ) + +/* get internal major part of expanded device number */ + +#define getmajor(x) (major_t)((((dev_t)(x)) >> L_BITSMINOR) & L_MAXMAJ) + +/* minor part of a device internal to the kernel */ + +#define minor(x) (minor_t)((x) & O_MAXMIN) + +/* get internal minor part of expanded device number */ + +#define getminor(x) (minor_t)((x) & L_MAXMIN) + +#else + +/* major part of a device external from the kernel (same as emajor below) */ + +#define major(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ) + +/* minor part of a device external from the kernel (same as eminor below) */ + +#define minor(x) (minor_t)((x) & O_MAXMIN) + +#endif /* _KERNEL */ + +/* create old device number */ + +#define makedev(x, y) (unsigned short)(((x) << O_BITSMINOR) | ((y) & O_MAXMIN)) + +/* make an new device number */ + +#define makedevice(x, y) (dev_t)(((dev_t)(x) << L_BITSMINOR) | ((y) & L_MAXMIN)) + + +/* + * emajor() allows kernel/driver code to print external major numbers + * eminor() allows kernel/driver code to print external minor numbers + */ + +#define emajor(x) \ + (major_t)(((unsigned int)(x) >> O_BITSMINOR) > O_MAXMAJ) ? \ + NODEV : (((unsigned int)(x) >> O_BITSMINOR) & O_MAXMAJ) + +#define eminor(x) \ + (minor_t)((x) & O_MAXMIN) + +/* + * get external major and minor device + * components from expanded device number + */ +#define getemajor(x) (major_t)((((dev_t)(x) >> L_BITSMINOR) > L_MAXMAJ) ? \ + NODEV : (((dev_t)(x) >> L_BITSMINOR) & L_MAXMAJ)) +#define geteminor(x) (minor_t)((x) & L_MAXMIN) + +/* + * These are versions of the kernel routines for compressing and + * expanding long device numbers that don't return errors. + */ +#if (L_BITSMAJOR32 == L_BITSMAJOR) && (L_BITSMINOR32 == L_BITSMINOR) + +#define DEVCMPL(x) (x) +#define DEVEXPL(x) (x) + +#else + +#define DEVCMPL(x) \ + (dev32_t)((((x) >> L_BITSMINOR) > L_MAXMAJ32 || \ + ((x) & L_MAXMIN) > L_MAXMIN32) ? NODEV32 : \ + ((((x) >> L_BITSMINOR) << L_BITSMINOR32) | ((x) & L_MAXMIN32))) + +#define DEVEXPL(x) \ + (((x) == NODEV32) ? NODEV : \ + makedevice(((x) >> L_BITSMINOR32) & L_MAXMAJ32, (x) & L_MAXMIN32)) + +#endif /* L_BITSMAJOR32 ... */ + +/* convert to old (SVR3.2) dev format */ + +#define cmpdev(x) \ + (o_dev_t)((((x) >> L_BITSMINOR) > O_MAXMAJ || \ + ((x) & L_MAXMIN) > O_MAXMIN) ? NODEV : \ + ((((x) >> L_BITSMINOR) << O_BITSMINOR) | ((x) & O_MAXMIN))) + +/* convert to new (SVR4) dev format */ + +#define expdev(x) \ + (dev_t)(((dev_t)(((x) >> O_BITSMINOR) & O_MAXMAJ) << L_BITSMINOR) | \ + ((x) & O_MAXMIN)) + +/* + * Macro for checking power of 2 address alignment. + */ +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) + +/* + * Macros for counting and rounding. + */ +#define howmany(x, y) (((x)+((y)-1))/(y)) +#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) + +/* + * Macro to determine if value is a power of 2 + */ +#define ISP2(x) (((x) & ((x) - 1)) == 0) + +/* + * Macros for various sorts of alignment and rounding when the alignment + * is known to be a power of 2. + */ +#define P2ALIGN(x, align) ((x) & -(align)) +#define P2PHASE(x, align) ((x) & ((align) - 1)) +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) +#define P2ROUNDUP(x, align) (-(-(x) & -(align))) +#define P2END(x, align) (-(~(x) & -(align))) +#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align))) +#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) +/* + * Determine whether two numbers have the same high-order bit. + */ +#define P2SAMEHIGHBIT(x, y) (((x) ^ (y)) < ((x) & (y))) + +/* + * Typed version of the P2* macros. These macros should be used to ensure + * that the result is correctly calculated based on the data type of (x), + * which is passed in as the last argument, regardless of the data + * type of the alignment. For example, if (x) is of type uint64_t, + * and we want to round it up to a page boundary using "PAGESIZE" as + * the alignment, we can do either + * P2ROUNDUP(x, (uint64_t)PAGESIZE) + * or + * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) + */ +#define P2ALIGN_TYPED(x, align, type) \ + ((type)(x) & -(type)(align)) +#define P2PHASE_TYPED(x, align, type) \ + ((type)(x) & ((type)(align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define P2ROUNDUP_TYPED(x, align, type) \ + (-(-(type)(x) & -(type)(align))) +#define P2END_TYPED(x, align, type) \ + (-(~(type)(x) & -(type)(align))) +#define P2PHASEUP_TYPED(x, align, phase, type) \ + ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) +#define P2CROSS_TYPED(x, y, align, type) \ + (((type)(x) ^ (type)(y)) > (type)(align) - 1) +#define P2SAMEHIGHBIT_TYPED(x, y, type) \ + (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) + +/* + * Macros to atomically increment/decrement a variable. mutex and var + * must be pointers. + */ +#define INCR_COUNT(var, mutex) mutex_enter(mutex), (*(var))++, mutex_exit(mutex) +#define DECR_COUNT(var, mutex) mutex_enter(mutex), (*(var))--, mutex_exit(mutex) + +#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof) + +/* avoid any possibility of clashing with <stddef.h> version */ + +#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSMACROS_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/vfs.h b/sys/contrib/opensolaris/uts/common/sys/vfs.h new file mode 100644 index 0000000..0834cf1 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/vfs.h @@ -0,0 +1,569 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ifndef _SYS_VFS_H +#define _SYS_VFS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/cred.h> +#include <sys/vnode.h> +#include <sys/statvfs.h> +#include <sys/refstr.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Data associated with mounted file systems. + */ + +/* + * Operations vector. This is used internal to the kernel; file systems + * supply their list of operations via vfs_setfsops(). + */ + +typedef struct vfsops vfsops_t; + +/* + * File system identifier. Should be unique (at least per machine). + */ +typedef struct { + int val[2]; /* file system id type */ +} fsid_t; + +/* + * File identifier. Should be unique per filesystem on a single + * machine. This is typically called by a stateless file server + * in order to generate "file handles". + * + * Do not change the definition of struct fid ... fid_t without + * letting the CacheFS group know about it! They will have to do at + * least two things, in the same change that changes this structure: + * 1. change CFSVERSION in usr/src/uts/common/sys/fs/cachefs_fs.h + * 2. put the old version # in the canupgrade array + * in cachfs_upgrade() in usr/src/cmd/fs.d/cachefs/fsck/fsck.c + * This is necessary because CacheFS stores FIDs on disk. + * + * Many underlying file systems cast a struct fid into other + * file system dependent structures which may require 4 byte alignment. + * Because a fid starts with a short it may not be 4 byte aligned, the + * fid_pad will force the alignment. + */ +#define MAXFIDSZ 64 +#define OLD_MAXFIDSZ 16 + +typedef struct fid { + union { + long fid_pad; + struct { + ushort_t len; /* length of data in bytes */ + char data[MAXFIDSZ]; /* data (variable len) */ + } _fid; + } un; +} fid_t; + +#ifdef _SYSCALL32 +/* + * Solaris 64 - use old-style cache format with 32-bit aligned fid for on-disk + * struct compatibility. + */ +typedef struct fid32 { + union { + int32_t fid_pad; + struct { + uint16_t len; /* length of data in bytes */ + char data[MAXFIDSZ]; /* data (variable len) */ + } _fid; + } un; +} fid32_t; +#else /* not _SYSCALL32 */ +#define fid32 fid +typedef fid_t fid32_t; +#endif /* _SYSCALL32 */ + +#define fid_len un._fid.len +#define fid_data un._fid.data + +/* + * Structure defining a mount option for a filesystem. + * option names are found in mntent.h + */ +typedef struct mntopt { + char *mo_name; /* option name */ + char **mo_cancel; /* list of options cancelled by this one */ + char *mo_arg; /* argument string for this option */ + int mo_flags; /* flags for this mount option */ + void *mo_data; /* filesystem specific data */ +} mntopt_t; + +/* + * Flags that apply to mount options + */ + +#define MO_SET 0x01 /* option is set */ +#define MO_NODISPLAY 0x02 /* option not listed in mnttab */ +#define MO_HASVALUE 0x04 /* option takes a value */ +#define MO_IGNORE 0x08 /* option ignored by parser */ +#define MO_DEFAULT MO_SET /* option is on by default */ +#define MO_TAG 0x10 /* flags a tag set by user program */ +#define MO_EMPTY 0x20 /* empty space in option table */ + +#define VFS_NOFORCEOPT 0x01 /* honor MO_IGNORE (don't set option) */ +#define VFS_DISPLAY 0x02 /* Turn off MO_NODISPLAY bit for opt */ +#define VFS_NODISPLAY 0x04 /* Turn on MO_NODISPLAY bit for opt */ +#define VFS_CREATEOPT 0x08 /* Create the opt if it's not there */ + +/* + * Structure holding mount option strings for the mounted file system. + */ +typedef struct mntopts { + uint_t mo_count; /* number of entries in table */ + mntopt_t *mo_list; /* list of mount options */ +} mntopts_t; + +/* + * The kstat structures associated with the vopstats are kept in an + * AVL tree. This is to avoid the case where a file system does not + * use a unique fsid_t for each vfs (e.g., namefs). In order to do + * this, we need a structure that the AVL tree can use that also + * references the kstat. + * Note that the vks_fsid is generated from the value reported by + * VFS_STATVFS(). + */ +typedef struct vskstat_anchor { + avl_node_t vsk_node; /* Required for use by AVL routines */ + kstat_t *vsk_ksp; /* kstat structure for vopstats */ + ulong_t vsk_fsid; /* fsid associated w/this FS */ +} vsk_anchor_t; + +extern avl_tree_t vskstat_tree; +extern kmutex_t vskstat_tree_lock; + +/* + * Structure per mounted file system. Each mounted file system has + * an array of operations and an instance record. + * + * The file systems are kept on a doubly linked circular list headed by + * "rootvfs". + * File system implementations should not access this list; + * it's intended for use only in the kernel's vfs layer. + * + * Each zone also has its own list of mounts, containing filesystems mounted + * somewhere within the filesystem tree rooted at the zone's rootpath. The + * list is doubly linked to match the global list. + * + * mnttab locking: the in-kernel mnttab uses the vfs_mntpt, vfs_resource and + * vfs_mntopts fields in the vfs_t. mntpt and resource are refstr_ts that + * are set at mount time and can only be modified during a remount. + * It is safe to read these fields if you can prevent a remount on the vfs, + * or through the convenience funcs vfs_getmntpoint() and vfs_getresource(). + * The mntopts field may only be accessed through the provided convenience + * functions, as it is protected by the vfs list lock. Modifying a mount + * option requires grabbing the vfs list write lock, which can be a very + * high latency lock. + */ +struct zone; /* from zone.h */ +struct fem_head; /* from fem.h */ + +/* + * Private vfs data, NOT to be used by a file system implementation. + */ +typedef struct vfs_impl { + struct fem_head *vi_femhead; /* fs monitoring */ + /* + * Support for statistics on the vnode operations + */ + vsk_anchor_t *vi_vskap; /* anchor for vopstats' kstat */ + vopstats_t *vi_fstypevsp; /* ptr to per-fstype vopstats */ + vopstats_t vi_vopstats; /* per-mount vnode op stats */ +} vfs_impl_t; + +typedef struct vfs { + struct vfs *vfs_next; /* next VFS in VFS list */ + struct vfs *vfs_prev; /* prev VFS in VFS list */ + +/* vfs_op should not be used directly. Accessor functions are provided */ + vfsops_t *vfs_op; /* operations on VFS */ + + struct vnode *vfs_vnodecovered; /* vnode mounted on */ + uint_t vfs_flag; /* flags */ + uint_t vfs_bsize; /* native block size */ + int vfs_fstype; /* file system type index */ + fsid_t vfs_fsid; /* file system id */ + void *vfs_data; /* private data */ + dev_t vfs_dev; /* device of mounted VFS */ + ulong_t vfs_bcount; /* I/O count (accounting) */ + struct vfs *vfs_list; /* sync list pointer */ + struct vfs *vfs_hash; /* hash list pointer */ + ksema_t vfs_reflock; /* mount/unmount/sync lock */ + uint_t vfs_count; /* vfs reference count */ + mntopts_t vfs_mntopts; /* options mounted with */ + refstr_t *vfs_resource; /* mounted resource name */ + refstr_t *vfs_mntpt; /* mount point name */ + time_t vfs_mtime; /* time we were mounted */ + vfs_impl_t *vfs_implp; /* impl specific data */ + /* + * Zones support. Note that the zone that "owns" the mount isn't + * necessarily the same as the zone in which the zone is visible. + * That is, vfs_zone and (vfs_zone_next|vfs_zone_prev) may refer to + * different zones. + */ + struct zone *vfs_zone; /* zone that owns the mount */ + struct vfs *vfs_zone_next; /* next VFS visible in zone */ + struct vfs *vfs_zone_prev; /* prev VFS visible in zone */ +} vfs_t; + +#define vfs_femhead vfs_implp->vi_femhead +#define vfs_vskap vfs_implp->vi_vskap +#define vfs_fstypevsp vfs_implp->vi_fstypevsp +#define vfs_vopstats vfs_implp->vi_vopstats + +/* + * VFS flags. + */ +#define VFS_RDONLY 0x01 /* read-only vfs */ +#define VFS_NOMNTTAB 0x02 /* vfs not seen in mnttab */ +#define VFS_NOSETUID 0x08 /* setuid disallowed */ +#define VFS_REMOUNT 0x10 /* modify mount options only */ +#define VFS_NOTRUNC 0x20 /* does not truncate long file names */ +#define VFS_UNLINKABLE 0x40 /* unlink(2) can be applied to root */ +#define VFS_PXFS 0x80 /* clustering: global fs proxy vfs */ +#define VFS_UNMOUNTED 0x100 /* file system has been unmounted */ +#define VFS_NBMAND 0x200 /* allow non-blocking mandatory locks */ +#define VFS_XATTR 0x400 /* fs supports extended attributes */ +#define VFS_NODEVICES 0x800 /* device-special files disallowed */ +#define VFS_NOEXEC 0x1000 /* executables disallowed */ +#define VFS_STATS 0x2000 /* file system can collect stats */ + +#define VFS_NORESOURCE "unspecified_resource" +#define VFS_NOMNTPT "unspecified_mountpoint" + +/* + * Argument structure for mount(2). + * + * Flags are defined in <sys/mount.h>. + * + * Note that if the MS_SYSSPACE bit is set in flags, the pointer fields in + * this structure are to be interpreted as kernel addresses. File systems + * should be prepared for this possibility. + */ +struct mounta { + char *spec; + char *dir; + int flags; + char *fstype; + char *dataptr; + int datalen; + char *optptr; + int optlen; +}; + +/* + * Reasons for calling the vfs_mountroot() operation. + */ +enum whymountroot { ROOT_INIT, ROOT_REMOUNT, ROOT_UNMOUNT}; +typedef enum whymountroot whymountroot_t; + +/* + * Reasons for calling the VFS_VNSTATE(): + */ +enum vntrans { + VNTRANS_EXISTS, + VNTRANS_IDLED, + VNTRANS_RECLAIMED, + VNTRANS_DESTROYED +}; +typedef enum vntrans vntrans_t; + +/* + * VFS_OPS defines all the vfs operations. It is used to define + * the vfsops structure (below) and the fs_func_p union (vfs_opreg.h). + */ +#define VFS_OPS \ + int (*vfs_mount)(vfs_t *, vnode_t *, struct mounta *, cred_t *); \ + int (*vfs_unmount)(vfs_t *, int, cred_t *); \ + int (*vfs_root)(vfs_t *, vnode_t **); \ + int (*vfs_statvfs)(vfs_t *, statvfs64_t *); \ + int (*vfs_sync)(vfs_t *, short, cred_t *); \ + int (*vfs_vget)(vfs_t *, vnode_t **, fid_t *); \ + int (*vfs_mountroot)(vfs_t *, enum whymountroot); \ + void (*vfs_freevfs)(vfs_t *); \ + int (*vfs_vnstate)(vfs_t *, vnode_t *, vntrans_t) /* NB: No ";" */ + +/* + * Operations supported on virtual file system. + */ +struct vfsops { + VFS_OPS; /* Signature of all vfs operations (vfsops) */ +}; + +extern int fsop_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +extern int fsop_unmount(vfs_t *, int, cred_t *); +extern int fsop_root(vfs_t *, vnode_t **); +extern int fsop_statfs(vfs_t *, statvfs64_t *); +extern int fsop_sync(vfs_t *, short, cred_t *); +extern int fsop_vget(vfs_t *, vnode_t **, fid_t *); +extern int fsop_mountroot(vfs_t *, enum whymountroot); +extern void fsop_freefs(vfs_t *); +extern int fsop_sync_by_kind(int, short, cred_t *); +extern int fsop_vnstate(vfs_t *, vnode_t *, vntrans_t); + +#define VFS_MOUNT(vfsp, mvp, uap, cr) fsop_mount(vfsp, mvp, uap, cr) +#define VFS_UNMOUNT(vfsp, flag, cr) fsop_unmount(vfsp, flag, cr) +#define VFS_ROOT(vfsp, vpp) fsop_root(vfsp, vpp) +#define VFS_STATVFS(vfsp, sp) fsop_statfs(vfsp, sp) +#define VFS_SYNC(vfsp, flag, cr) fsop_sync(vfsp, flag, cr) +#define VFS_VGET(vfsp, vpp, fidp) fsop_vget(vfsp, vpp, fidp) +#define VFS_MOUNTROOT(vfsp, init) fsop_mountroot(vfsp, init) +#define VFS_FREEVFS(vfsp) fsop_freefs(vfsp) +#define VFS_VNSTATE(vfsp, vn, ns) fsop_vnstate(vfsp, vn, ns) + +#define VFSNAME_MOUNT "mount" +#define VFSNAME_UNMOUNT "unmount" +#define VFSNAME_ROOT "root" +#define VFSNAME_STATVFS "statvfs" +#define VFSNAME_SYNC "sync" +#define VFSNAME_VGET "vget" +#define VFSNAME_MOUNTROOT "mountroot" +#define VFSNAME_FREEVFS "freevfs" +#define VFSNAME_VNSTATE "vnstate" +/* + * Filesystem type switch table. + */ + +typedef struct vfssw { + char *vsw_name; /* type name -- max len _ST_FSTYPSZ */ + int (*vsw_init) (int, char *); + /* init routine (for non-loadable fs only) */ + int vsw_flag; /* flags */ + mntopts_t vsw_optproto; /* mount options table prototype */ + uint_t vsw_count; /* count of references */ + kmutex_t vsw_lock; /* lock to protect vsw_count */ + vfsops_t vsw_vfsops; /* filesystem operations vector */ +} vfssw_t; + +/* + * Filesystem type definition record. All file systems must export a record + * of this type through their modlfs structure. + */ + +typedef struct vfsdef_v3 { + int def_version; /* structure version, must be first */ + char *name; /* filesystem type name */ + int (*init) (int, char *); /* init routine */ + int flags; /* filesystem flags */ + mntopts_t *optproto; /* mount options table prototype */ +} vfsdef_v3; + +typedef struct vfsdef_v3 vfsdef_t; + +enum { + VFSDEF_VERSION = 3 +}; + +/* + * flags for vfssw and vfsdef + */ +#define VSW_HASPROTO 0x01 /* struct has a mount options prototype */ +#define VSW_CANRWRO 0x02 /* file system can transition from rw to ro */ +#define VSW_CANREMOUNT 0x04 /* file system supports remounts */ +#define VSW_NOTZONESAFE 0x08 /* zone_enter(2) should fail for these files */ +#define VSW_VOLATILEDEV 0x10 /* vfs_dev can change each time fs is mounted */ +#define VSW_STATS 0x20 /* file system can collect stats */ + +#define VSW_INSTALLED 0x8000 /* this vsw is associated with a file system */ + +#if defined(_KERNEL) +/* + * Public operations. + */ +struct umounta; +struct statvfsa; +struct fstatvfsa; + +void vfs_freevfsops(vfsops_t *); +int vfs_freevfsops_by_type(int); +void vfs_setops(vfs_t *, vfsops_t *); +vfsops_t *vfs_getops(vfs_t *vfsp); +int vfs_matchops(vfs_t *, vfsops_t *); +int vfs_can_sync(vfs_t *vfsp); +void vfs_init(vfs_t *vfsp, vfsops_t *, void *); +void vfsimpl_setup(vfs_t *vfsp); +void vfsimpl_teardown(vfs_t *vfsp); +void vn_exists(vnode_t *); +void vn_idle(vnode_t *); +void vn_reclaim(vnode_t *); +void vn_invalid(vnode_t *); + +int rootconf(void); +int svm_rootconf(void); +int domount(char *, struct mounta *, vnode_t *, struct cred *, + struct vfs **); +int dounmount(struct vfs *, int, cred_t *); +int vfs_lock(struct vfs *); +int vfs_rlock(struct vfs *); +void vfs_lock_wait(struct vfs *); +void vfs_rlock_wait(struct vfs *); +void vfs_unlock(struct vfs *); +int vfs_lock_held(struct vfs *); +struct _kthread *vfs_lock_owner(struct vfs *); +void sync(void); +void vfs_sync(int); +void vfs_mountroot(void); +void vfs_add(vnode_t *, struct vfs *, int); +void vfs_remove(struct vfs *); + +/* The following functions are not for general use by filesystems */ + +void vfs_createopttbl(mntopts_t *, const char *); +void vfs_copyopttbl(const mntopts_t *, mntopts_t *); +void vfs_mergeopttbl(const mntopts_t *, const mntopts_t *, mntopts_t *); +void vfs_freeopttbl(mntopts_t *); +void vfs_parsemntopts(mntopts_t *, char *, int); +int vfs_buildoptionstr(const mntopts_t *, char *, int); +struct mntopt *vfs_hasopt(const mntopts_t *, const char *); +void vfs_mnttab_modtimeupd(void); + +void vfs_clearmntopt(struct vfs *, const char *); +void vfs_setmntopt(struct vfs *, const char *, const char *, int); +void vfs_setresource(struct vfs *, const char *); +void vfs_setmntpoint(struct vfs *, const char *); +refstr_t *vfs_getresource(const struct vfs *); +refstr_t *vfs_getmntpoint(const struct vfs *); +int vfs_optionisset(const struct vfs *, const char *, char **); +int vfs_settag(uint_t, uint_t, const char *, const char *, cred_t *); +int vfs_clrtag(uint_t, uint_t, const char *, const char *, cred_t *); +void vfs_syncall(void); +void vfs_syncprogress(void); +void vfsinit(void); +void vfs_unmountall(void); +void vfs_make_fsid(fsid_t *, dev_t, int); +void vfs_addmip(dev_t, struct vfs *); +void vfs_delmip(struct vfs *); +int vfs_devismounted(dev_t); +int vfs_devmounting(dev_t, struct vfs *); +int vfs_opsinuse(vfsops_t *); +struct vfs *getvfs(fsid_t *); +struct vfs *vfs_dev2vfsp(dev_t); +struct vfs *vfs_mntpoint2vfsp(const char *); +struct vfssw *allocate_vfssw(char *); +struct vfssw *vfs_getvfssw(char *); +struct vfssw *vfs_getvfsswbyname(char *); +struct vfssw *vfs_getvfsswbyvfsops(vfsops_t *); +void vfs_refvfssw(struct vfssw *); +void vfs_unrefvfssw(struct vfssw *); +uint_t vf_to_stf(uint_t); +void vfs_mnttab_modtime(timespec_t *); +void vfs_mnttab_poll(timespec_t *, struct pollhead **); + +void vfs_list_lock(void); +void vfs_list_read_lock(void); +void vfs_list_unlock(void); +void vfs_list_add(struct vfs *); +void vfs_list_remove(struct vfs *); +void vfs_hold(vfs_t *vfsp); +void vfs_rele(vfs_t *vfsp); +void fs_freevfs(vfs_t *); +void vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype); + +int vfs_zone_change_safe(vfs_t *); + +#define VFSHASH(maj, min) (((int)((maj)+(min))) & (vfshsz - 1)) +#define VFS_ON_LIST(vfsp) \ + ((vfsp)->vfs_next != (vfsp) && (vfsp)->vfs_next != NULL) + +/* + * Globals. + */ + +extern struct vfssw vfssw[]; /* table of filesystem types */ +extern krwlock_t vfssw_lock; +extern char rootfstype[]; /* name of root fstype */ +extern const int nfstype; /* # of elements in vfssw array */ +extern vfsops_t *EIO_vfsops; /* operations for vfs being torn-down */ + +/* + * The following variables are private to the the kernel's vfs layer. File + * system implementations should not access them. + */ +extern struct vfs *rootvfs; /* ptr to root vfs structure */ +typedef struct { + struct vfs *rvfs_head; /* head vfs in chain */ + kmutex_t rvfs_lock; /* mutex protecting this chain */ + uint32_t rvfs_len; /* length of this chain */ +} rvfs_t; +extern rvfs_t *rvfs_list; +extern int vfshsz; /* # of elements in rvfs_head array */ +extern const mntopts_t vfs_mntopts; /* globally recognized options */ + +#endif /* defined(_KERNEL) */ + +#define VFS_HOLD(vfsp) { \ + vfs_hold(vfsp); \ +} + +#define VFS_RELE(vfsp) { \ + vfs_rele(vfsp); \ +} + +#define VFS_INIT(vfsp, op, data) { \ + vfs_init((vfsp), (op), (data)); \ + vfsimpl_setup((vfsp)); \ +} + + +#define VFS_INSTALLED(vfsswp) (((vfsswp)->vsw_flag & VSW_INSTALLED) != 0) +#define ALLOCATED_VFSSW(vswp) ((vswp)->vsw_name[0] != '\0') +#define RLOCK_VFSSW() (rw_enter(&vfssw_lock, RW_READER)) +#define RUNLOCK_VFSSW() (rw_exit(&vfssw_lock)) +#define WLOCK_VFSSW() (rw_enter(&vfssw_lock, RW_WRITER)) +#define WUNLOCK_VFSSW() (rw_exit(&vfssw_lock)) +#define VFSSW_LOCKED() (RW_LOCK_HELD(&vfssw_lock)) +#define VFSSW_WRITE_LOCKED() (RW_WRITE_HELD(&vfssw_lock)) +/* + * VFS_SYNC flags. + */ +#define SYNC_ATTR 0x01 /* sync attributes only */ +#define SYNC_CLOSE 0x02 /* close open file */ +#define SYNC_ALL 0x04 /* force to sync all fs */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VFS_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/vmem.h b/sys/contrib/opensolaris/uts/common/sys/vmem.h new file mode 100644 index 0000000..1cd2f30 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/vmem.h @@ -0,0 +1,142 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VMEM_H +#define _SYS_VMEM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * Per-allocation flags + */ +#define VM_SLEEP 0x00000000 /* same as KM_SLEEP */ +#define VM_NOSLEEP 0x00000001 /* same as KM_NOSLEEP */ +#define VM_PANIC 0x00000002 /* same as KM_PANIC */ +#define VM_PUSHPAGE 0x00000004 /* same as KM_PUSHPAGE */ +#define VM_KMFLAGS 0x000000ff /* flags that must match KM_* flags */ + +#define VM_BESTFIT 0x00000100 +#define VM_FIRSTFIT 0x00000200 +#define VM_NEXTFIT 0x00000400 + +/* + * The following flags are restricted for use only within the kernel. + * VM_MEMLOAD is for use by the HAT to avoid infinite recursion. + * VM_NORELOC is used by the kernel when static VA->PA mappings are required. + */ +#define VM_MEMLOAD 0x00000800 +#define VM_NORELOC 0x00001000 +/* + * VM_ABORT requests that vmem_alloc() *ignore* the VM_SLEEP/VM_NOSLEEP flags + * and forgo reaping if the allocation or attempted import, fails. This + * flag is a segkmem-specific flag, and should not be used by anyone else. + */ +#define VM_ABORT 0x00002000 + +#define VM_FLAGS 0x0000FFFF + +/* + * Arena creation flags + */ +#define VMC_POPULATOR 0x00010000 +#define VMC_NO_QCACHE 0x00020000 /* cannot use quantum caches */ +#define VMC_IDENTIFIER 0x00040000 /* not backed by memory */ +/* + * internal use only; the import function uses the vmem_ximport_t interface + * and may increase the request size if it so desires + */ +#define VMC_XALLOC 0x00080000 +#define VMC_FLAGS 0xFFFF0000 + +/* + * Public segment types + */ +#define VMEM_ALLOC 0x01 +#define VMEM_FREE 0x02 + +/* + * Implementation-private segment types + */ +#define VMEM_SPAN 0x10 +#define VMEM_ROTOR 0x20 +#define VMEM_WALKER 0x40 + +/* + * VMEM_REENTRANT indicates to vmem_walk() that the callback routine may + * call back into the arena being walked, so vmem_walk() must drop the + * arena lock before each callback. The caveat is that since the arena + * isn't locked, its state can change. Therefore it is up to the callback + * routine to handle cases where the segment isn't of the expected type. + * For example, we use this to walk heap_arena when generating a crash dump; + * see segkmem_dump() for sample usage. + */ +#define VMEM_REENTRANT 0x80000000 + +typedef struct vmem vmem_t; +typedef void *(vmem_alloc_t)(vmem_t *, size_t, int); +typedef void (vmem_free_t)(vmem_t *, void *, size_t); + +/* + * Alternate import style; the requested size is passed in a pointer, + * which can be increased by the import function if desired. + */ +typedef void *(vmem_ximport_t)(vmem_t *, size_t *, int); + +#ifdef _KERNEL +extern vmem_t *vmem_init(const char *, void *, size_t, size_t, + vmem_alloc_t *, vmem_free_t *); +extern void vmem_update(void *); +extern int vmem_is_populator(); +extern size_t vmem_seg_size; +#endif + +extern vmem_t *vmem_create(const char *, void *, size_t, size_t, + vmem_alloc_t *, vmem_free_t *, vmem_t *, size_t, int); +extern vmem_t *vmem_xcreate(const char *, void *, size_t, size_t, + vmem_ximport_t *, vmem_free_t *, vmem_t *, size_t, int); +extern void vmem_destroy(vmem_t *); +extern void *vmem_alloc(vmem_t *, size_t, int); +extern void *vmem_xalloc(vmem_t *, size_t, size_t, size_t, size_t, + void *, void *, int); +extern void vmem_free(vmem_t *, void *, size_t); +extern void vmem_xfree(vmem_t *, void *, size_t); +extern void *vmem_add(vmem_t *, void *, size_t, int); +extern int vmem_contains(vmem_t *, void *, size_t); +extern void vmem_walk(vmem_t *, int, void (*)(void *, void *, size_t), void *); +extern size_t vmem_size(vmem_t *, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VMEM_H */ diff --git a/sys/contrib/opensolaris/uts/common/sys/zmod.h b/sys/contrib/opensolaris/uts/common/sys/zmod.h new file mode 100644 index 0000000..ba02672 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/sys/zmod.h @@ -0,0 +1,68 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZMOD_H +#define _ZMOD_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * zmod - RFC-1950-compatible decompression routines + * + * This file provides the public interfaces to zmod, an in-kernel RFC 1950 + * decompression library. More information about the implementation of these + * interfaces can be found in the usr/src/uts/common/zmod/ directory. + */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) + +extern int z_uncompress(void *, size_t *, const void *, size_t); +extern int z_compress(void *, size_t *, const void *, size_t); +extern int z_compress_level(void *, size_t *, const void *, size_t, int); +extern const char *z_strerror(int); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZMOD_H */ diff --git a/sys/contrib/opensolaris/uts/common/zmod/adler32.c b/sys/contrib/opensolaris/uts/common/zmod/adler32.c new file mode 100644 index 0000000..59d8463 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/adler32.c @@ -0,0 +1,149 @@ +/* adler32.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2004 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#define ZLIB_INTERNAL +#include "zlib.h" + +#define BASE 65521UL /* largest prime smaller than 65536 */ +#define NMAX 5552 +/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ + +#define DO1(buf,i) {adler += (buf)[i]; sum2 += adler;} +#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1); +#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2); +#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); +#define DO16(buf) DO8(buf,0); DO8(buf,8); + +/* use NO_DIVIDE if your processor does not do division in hardware */ +#ifdef NO_DIVIDE +# define MOD(a) \ + do { \ + if (a >= (BASE << 16)) a -= (BASE << 16); \ + if (a >= (BASE << 15)) a -= (BASE << 15); \ + if (a >= (BASE << 14)) a -= (BASE << 14); \ + if (a >= (BASE << 13)) a -= (BASE << 13); \ + if (a >= (BASE << 12)) a -= (BASE << 12); \ + if (a >= (BASE << 11)) a -= (BASE << 11); \ + if (a >= (BASE << 10)) a -= (BASE << 10); \ + if (a >= (BASE << 9)) a -= (BASE << 9); \ + if (a >= (BASE << 8)) a -= (BASE << 8); \ + if (a >= (BASE << 7)) a -= (BASE << 7); \ + if (a >= (BASE << 6)) a -= (BASE << 6); \ + if (a >= (BASE << 5)) a -= (BASE << 5); \ + if (a >= (BASE << 4)) a -= (BASE << 4); \ + if (a >= (BASE << 3)) a -= (BASE << 3); \ + if (a >= (BASE << 2)) a -= (BASE << 2); \ + if (a >= (BASE << 1)) a -= (BASE << 1); \ + if (a >= BASE) a -= BASE; \ + } while (0) +# define MOD4(a) \ + do { \ + if (a >= (BASE << 4)) a -= (BASE << 4); \ + if (a >= (BASE << 3)) a -= (BASE << 3); \ + if (a >= (BASE << 2)) a -= (BASE << 2); \ + if (a >= (BASE << 1)) a -= (BASE << 1); \ + if (a >= BASE) a -= BASE; \ + } while (0) +#else +# define MOD(a) a %= BASE +# define MOD4(a) a %= BASE +#endif + +/* ========================================================================= */ +uLong ZEXPORT adler32(adler, buf, len) + uLong adler; + const Bytef *buf; + uInt len; +{ + unsigned long sum2; + unsigned n; + + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (len == 1) { + adler += buf[0]; + if (adler >= BASE) + adler -= BASE; + sum2 += adler; + if (sum2 >= BASE) + sum2 -= BASE; + return adler | (sum2 << 16); + } + + /* initial Adler-32 value (deferred check for len == 1 speed) */ + if (buf == Z_NULL) + return 1L; + + /* in case short lengths are provided, keep it somewhat fast */ + if (len < 16) { + while (len--) { + adler += *buf++; + sum2 += adler; + } + if (adler >= BASE) + adler -= BASE; + MOD4(sum2); /* only added so many BASE's */ + return adler | (sum2 << 16); + } + + /* do length NMAX blocks -- requires just one modulo operation */ + while (len >= NMAX) { + len -= NMAX; + n = NMAX / 16; /* NMAX is divisible by 16 */ + do { + DO16(buf); /* 16 sums unrolled */ + buf += 16; + } while (--n); + MOD(adler); + MOD(sum2); + } + + /* do remaining bytes (less than NMAX, still just one modulo) */ + if (len) { /* avoid modulos if none remaining */ + while (len >= 16) { + len -= 16; + DO16(buf); + buf += 16; + } + while (len--) { + adler += *buf++; + sum2 += adler; + } + MOD(adler); + MOD(sum2); + } + + /* return recombined sums */ + return adler | (sum2 << 16); +} + +/* ========================================================================= */ +uLong ZEXPORT adler32_combine(adler1, adler2, len2) + uLong adler1; + uLong adler2; + z_off_t len2; +{ + unsigned long sum1; + unsigned long sum2; + unsigned rem; + + /* the derivation of this formula is left as an exercise for the reader */ + rem = (unsigned)(len2 % BASE); + sum1 = adler1 & 0xffff; + sum2 = rem * sum1; + MOD(sum2); + sum1 += (adler2 & 0xffff) + BASE - 1; + sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem; + if (sum1 > BASE) sum1 -= BASE; + if (sum1 > BASE) sum1 -= BASE; + if (sum2 > (BASE << 1)) sum2 -= (BASE << 1); + if (sum2 > BASE) sum2 -= BASE; + return sum1 | (sum2 << 16); +} diff --git a/sys/contrib/opensolaris/uts/common/zmod/crc32.c b/sys/contrib/opensolaris/uts/common/zmod/crc32.c new file mode 100644 index 0000000..61ad581 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/crc32.c @@ -0,0 +1,428 @@ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* crc32.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-2005 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster + * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing + * tables for updating the shift register in one step with three exclusive-ors + * instead of four steps with four exclusive-ors. This results in about a + * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore + protection on the static variables used to control the first-use generation + of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should + first call get_crc_table() to initialize the tables before allowing more than + one thread to use crc32(). + */ + +#ifdef MAKECRCH +# include <stdio.h> +# ifndef DYNAMIC_CRC_TABLE +# define DYNAMIC_CRC_TABLE +# endif /* !DYNAMIC_CRC_TABLE */ +#endif /* MAKECRCH */ + +#include "zutil.h" /* for STDC and FAR definitions */ + +#define local static + +/* Find a four-byte integer type for crc32_little() and crc32_big(). */ +#ifndef NOBYFOUR +# ifdef STDC /* need ANSI C limits.h to determine sizes */ +# include <limits.h> +# define BYFOUR +# if (UINT_MAX == 0xffffffffUL) + typedef unsigned int u4; +# else +# if (ULONG_MAX == 0xffffffffUL) + typedef unsigned long u4; +# else +# if (USHRT_MAX == 0xffffffffUL) + typedef unsigned short u4; +# else +# undef BYFOUR /* can't find a four-byte integer type! */ +# endif +# endif +# endif +# endif /* STDC */ +#endif /* !NOBYFOUR */ + +/* Definitions for doing the crc four data bytes at a time. */ +#ifdef BYFOUR +# define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \ + (((w)&0xff00)<<8)+(((w)&0xff)<<24)) + local unsigned long crc32_little OF((unsigned long, + const unsigned char FAR *, unsigned)); + local unsigned long crc32_big OF((unsigned long, + const unsigned char FAR *, unsigned)); +# define TBLS 8 +#else +# define TBLS 1 +#endif /* BYFOUR */ + +/* Local functions for crc concatenation */ +local unsigned long gf2_matrix_times OF((unsigned long *mat, + unsigned long vec)); +local void gf2_matrix_square OF((unsigned long *square, unsigned long *mat)); + +#ifdef DYNAMIC_CRC_TABLE + +local volatile int crc_table_empty = 1; +local unsigned long FAR crc_table[TBLS][256]; +local void make_crc_table OF((void)); +#ifdef MAKECRCH + local void write_table OF((FILE *, const unsigned long FAR *)); +#endif /* MAKECRCH */ +/* + Generate tables for a byte-wise 32-bit CRC calculation on the polynomial: + x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1. + + Polynomials over GF(2) are represented in binary, one bit per coefficient, + with the lowest powers in the most significant bit. Then adding polynomials + is just exclusive-or, and multiplying a polynomial by x is a right shift by + one. If we call the above polynomial p, and represent a byte as the + polynomial q, also with the lowest power in the most significant bit (so the + byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p, + where a mod b means the remainder after dividing a by b. + + This calculation is done using the shift-register method of multiplying and + taking the remainder. The register is initialized to zero, and for each + incoming bit, x^32 is added mod p to the register if the bit is a one (where + x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by + x (which is shifting right by one and adding x^32 mod p if the bit shifted + out is a one). We start with the highest power (least significant bit) of + q and repeat for all eight bits of q. + + The first table is simply the CRC of all possible eight bit values. This is + all the information needed to generate CRCs on data a byte at a time for all + combinations of CRC register values and incoming bytes. The remaining tables + allow for word-at-a-time CRC calculation for both big-endian and little- + endian machines, where a word is four bytes. +*/ +local void make_crc_table() +{ + unsigned long c; + int n, k; + unsigned long poly; /* polynomial exclusive-or pattern */ + /* terms of polynomial defining this crc (except x^32): */ + static volatile int first = 1; /* flag to limit concurrent making */ + static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26}; + + /* See if another task is already doing this (not thread-safe, but better + than nothing -- significantly reduces duration of vulnerability in + case the advice about DYNAMIC_CRC_TABLE is ignored) */ + if (first) { + first = 0; + + /* make exclusive-or pattern from polynomial (0xedb88320UL) */ + poly = 0UL; + for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++) + poly |= 1UL << (31 - p[n]); + + /* generate a crc for every 8-bit value */ + for (n = 0; n < 256; n++) { + c = (unsigned long)n; + for (k = 0; k < 8; k++) + c = c & 1 ? poly ^ (c >> 1) : c >> 1; + crc_table[0][n] = c; + } + +#ifdef BYFOUR + /* generate crc for each value followed by one, two, and three zeros, + and then the byte reversal of those as well as the first table */ + for (n = 0; n < 256; n++) { + c = crc_table[0][n]; + crc_table[4][n] = REV(c); + for (k = 1; k < 4; k++) { + c = crc_table[0][c & 0xff] ^ (c >> 8); + crc_table[k][n] = c; + crc_table[k + 4][n] = REV(c); + } + } +#endif /* BYFOUR */ + + crc_table_empty = 0; + } + else { /* not first */ + /* wait for the other guy to finish (not efficient, but rare) */ + while (crc_table_empty) + ; + } + +#ifdef MAKECRCH + /* write out CRC tables to crc32.h */ + { + FILE *out; + + out = fopen("crc32.h", "w"); + if (out == NULL) return; + fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n"); + fprintf(out, " * Generated automatically by crc32.c\n */\n\n"); + fprintf(out, "local const unsigned long FAR "); + fprintf(out, "crc_table[TBLS][256] =\n{\n {\n"); + write_table(out, crc_table[0]); +# ifdef BYFOUR + fprintf(out, "#ifdef BYFOUR\n"); + for (k = 1; k < 8; k++) { + fprintf(out, " },\n {\n"); + write_table(out, crc_table[k]); + } + fprintf(out, "#endif\n"); +# endif /* BYFOUR */ + fprintf(out, " }\n};\n"); + fclose(out); + } +#endif /* MAKECRCH */ +} + +#ifdef MAKECRCH +local void write_table(out, table) + FILE *out; + const unsigned long FAR *table; +{ + int n; + + for (n = 0; n < 256; n++) + fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : " ", table[n], + n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", ")); +} +#endif /* MAKECRCH */ + +#else /* !DYNAMIC_CRC_TABLE */ +/* ======================================================================== + * Tables of CRC-32s of all single-byte values, made by make_crc_table(). + */ +#include "crc32.h" +#endif /* DYNAMIC_CRC_TABLE */ + +/* ========================================================================= + * This function can be used by asm versions of crc32() + */ +const unsigned long FAR * ZEXPORT get_crc_table() +{ +#ifdef DYNAMIC_CRC_TABLE + if (crc_table_empty) + make_crc_table(); +#endif /* DYNAMIC_CRC_TABLE */ + return (const unsigned long FAR *)crc_table; +} + +/* ========================================================================= */ +#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8) +#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 + +/* ========================================================================= */ +unsigned long ZEXPORT crc32(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + if (buf == Z_NULL) return 0UL; + +#ifdef DYNAMIC_CRC_TABLE + if (crc_table_empty) + make_crc_table(); +#endif /* DYNAMIC_CRC_TABLE */ + +#ifdef BYFOUR + if (sizeof(void *) == sizeof(ptrdiff_t)) { + u4 endian; + + endian = 1; + if (*((unsigned char *)(&endian))) + return crc32_little(crc, buf, len); + else + return crc32_big(crc, buf, len); + } +#endif /* BYFOUR */ + crc = crc ^ 0xffffffffUL; + while (len >= 8) { + DO8; + len -= 8; + } + if (len) do { + DO1; + } while (--len); + return crc ^ 0xffffffffUL; +} + +#ifdef BYFOUR + +/* ========================================================================= */ +#define DOLIT4 c ^= *buf4++; \ + c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ + crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] +#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 + +/* ========================================================================= */ +local unsigned long crc32_little(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + register u4 c; + register const u4 FAR *buf4; + + c = (u4)crc; + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + len--; + } + + buf4 = (const u4 FAR *)(const void FAR *)buf; + while (len >= 32) { + DOLIT32; + len -= 32; + } + while (len >= 4) { + DOLIT4; + len -= 4; + } + buf = (const unsigned char FAR *)buf4; + + if (len) do { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + } while (--len); + c = ~c; + return (unsigned long)c; +} + +/* ========================================================================= */ +#define DOBIG4 c ^= *++buf4; \ + c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ + crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] +#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 + +/* ========================================================================= */ +local unsigned long crc32_big(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + register u4 c; + register const u4 FAR *buf4; + + c = REV((u4)crc); + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + len--; + } + + buf4 = (const u4 FAR *)(const void FAR *)buf; + buf4--; + while (len >= 32) { + DOBIG32; + len -= 32; + } + while (len >= 4) { + DOBIG4; + len -= 4; + } + buf4++; + buf = (const unsigned char FAR *)buf4; + + if (len) do { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + } while (--len); + c = ~c; + return (unsigned long)(REV(c)); +} + +#endif /* BYFOUR */ + +#define GF2_DIM 32 /* dimension of GF(2) vectors (length of CRC) */ + +/* ========================================================================= */ +local unsigned long gf2_matrix_times(mat, vec) + unsigned long *mat; + unsigned long vec; +{ + unsigned long sum; + + sum = 0; + while (vec) { + if (vec & 1) + sum ^= *mat; + vec >>= 1; + mat++; + } + return sum; +} + +/* ========================================================================= */ +local void gf2_matrix_square(square, mat) + unsigned long *square; + unsigned long *mat; +{ + int n; + + for (n = 0; n < GF2_DIM; n++) + square[n] = gf2_matrix_times(mat, mat[n]); +} + +/* ========================================================================= */ +uLong ZEXPORT crc32_combine(crc1, crc2, len2) + uLong crc1; + uLong crc2; + z_off_t len2; +{ + int n; + unsigned long row; + unsigned long even[GF2_DIM]; /* even-power-of-two zeros operator */ + unsigned long odd[GF2_DIM]; /* odd-power-of-two zeros operator */ + + /* degenerate case */ + if (len2 == 0) + return crc1; + + /* put operator for one zero bit in odd */ + odd[0] = 0xedb88320UL; /* CRC-32 polynomial */ + row = 1; + for (n = 1; n < GF2_DIM; n++) { + odd[n] = row; + row <<= 1; + } + + /* put operator for two zero bits in even */ + gf2_matrix_square(even, odd); + + /* put operator for four zero bits in odd */ + gf2_matrix_square(odd, even); + + /* apply len2 zeros to crc1 (first square will put the operator for one + zero byte, eight zero bits, in even) */ + do { + /* apply zeros operator for this bit of len2 */ + gf2_matrix_square(even, odd); + if (len2 & 1) + crc1 = gf2_matrix_times(even, crc1); + len2 >>= 1; + + /* if no more bits set, then done */ + if (len2 == 0) + break; + + /* another iteration of the loop with odd and even swapped */ + gf2_matrix_square(odd, even); + if (len2 & 1) + crc1 = gf2_matrix_times(odd, crc1); + len2 >>= 1; + + /* if no more bits set, then done */ + } while (len2 != 0); + + /* return combined crc */ + crc1 ^= crc2; + return crc1; +} diff --git a/sys/contrib/opensolaris/uts/common/zmod/crc32.h b/sys/contrib/opensolaris/uts/common/zmod/crc32.h new file mode 100644 index 0000000..495c83e --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/crc32.h @@ -0,0 +1,443 @@ +/* crc32.h -- tables for rapid CRC calculation + * Generated automatically by crc32.c + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +local const unsigned long FAR crc_table[TBLS][256] = +{ + { + 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL, + 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL, + 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL, + 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL, + 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL, + 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL, + 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL, + 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL, + 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL, + 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL, + 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL, + 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL, + 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL, + 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL, + 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL, + 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL, + 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL, + 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL, + 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL, + 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL, + 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL, + 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL, + 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL, + 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL, + 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL, + 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL, + 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL, + 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL, + 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL, + 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL, + 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL, + 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL, + 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL, + 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL, + 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL, + 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL, + 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL, + 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL, + 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL, + 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL, + 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL, + 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL, + 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL, + 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL, + 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL, + 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL, + 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL, + 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL, + 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL, + 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL, + 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL, + 0x2d02ef8dUL +#ifdef BYFOUR + }, + { + 0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL, + 0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL, + 0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL, + 0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL, + 0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL, + 0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL, + 0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL, + 0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL, + 0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL, + 0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL, + 0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL, + 0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL, + 0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL, + 0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL, + 0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL, + 0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL, + 0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL, + 0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL, + 0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL, + 0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL, + 0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL, + 0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL, + 0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL, + 0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL, + 0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL, + 0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL, + 0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL, + 0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL, + 0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL, + 0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL, + 0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL, + 0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL, + 0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL, + 0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL, + 0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL, + 0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL, + 0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL, + 0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL, + 0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL, + 0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL, + 0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL, + 0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL, + 0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL, + 0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL, + 0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL, + 0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL, + 0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL, + 0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL, + 0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL, + 0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL, + 0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL, + 0x9324fd72UL + }, + { + 0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL, + 0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL, + 0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL, + 0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL, + 0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL, + 0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL, + 0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL, + 0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL, + 0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL, + 0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL, + 0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL, + 0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL, + 0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL, + 0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL, + 0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL, + 0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL, + 0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL, + 0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL, + 0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL, + 0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL, + 0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL, + 0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL, + 0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL, + 0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL, + 0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL, + 0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL, + 0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL, + 0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL, + 0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL, + 0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL, + 0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL, + 0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL, + 0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL, + 0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL, + 0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL, + 0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL, + 0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL, + 0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL, + 0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL, + 0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL, + 0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL, + 0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL, + 0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL, + 0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL, + 0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL, + 0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL, + 0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL, + 0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL, + 0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL, + 0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL, + 0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL, + 0xbe9834edUL + }, + { + 0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL, + 0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL, + 0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL, + 0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL, + 0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL, + 0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL, + 0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL, + 0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL, + 0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL, + 0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL, + 0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL, + 0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL, + 0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL, + 0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL, + 0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL, + 0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL, + 0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL, + 0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL, + 0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL, + 0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL, + 0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL, + 0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL, + 0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL, + 0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL, + 0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL, + 0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL, + 0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL, + 0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL, + 0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL, + 0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL, + 0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL, + 0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL, + 0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL, + 0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL, + 0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL, + 0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL, + 0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL, + 0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL, + 0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL, + 0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL, + 0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL, + 0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL, + 0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL, + 0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL, + 0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL, + 0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL, + 0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL, + 0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL, + 0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL, + 0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL, + 0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL, + 0xde0506f1UL + }, + { + 0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL, + 0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL, + 0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL, + 0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL, + 0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL, + 0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL, + 0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL, + 0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL, + 0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL, + 0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL, + 0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 0x23c4b356UL, 0x9995bacfUL, + 0x0fa5bdb8UL, 0x9eb80228UL, 0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL, + 0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL, + 0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL, + 0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 0x34f9000fUL, 0x8ea80996UL, + 0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL, + 0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL, + 0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL, + 0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL, + 0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL, + 0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL, + 0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 0x732d0444UL, 0xe51d0333UL, + 0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL, + 0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL, + 0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL, + 0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL, + 0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL, + 0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL, + 0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL, + 0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL, + 0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL, + 0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL, + 0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL, + 0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL, + 0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL, + 0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL, + 0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL, + 0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL, + 0x8b9ed92cUL, 0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL, + 0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL, + 0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL, + 0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL, + 0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL, + 0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL, + 0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL, + 0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL, + 0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL, + 0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL, + 0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL, + 0x9306d7cdUL, 0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL, + 0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL, + 0x8def022dUL + }, + { + 0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL, + 0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL, + 0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL, + 0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL, + 0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL, + 0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL, + 0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL, + 0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL, + 0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL, + 0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL, + 0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL, + 0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL, + 0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 0x3c2ade58UL, 0x054f79f0UL, + 0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL, + 0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 0x4cf4bb21UL, 0x8fa7960aUL, + 0xce968d13UL, 0x0900cc5cUL, 0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL, + 0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL, + 0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL, + 0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL, + 0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL, + 0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL, + 0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL, + 0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL, + 0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL, + 0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL, + 0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 0x4b98833bUL, 0x0aa99822UL, + 0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL, + 0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL, + 0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL, + 0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL, + 0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL, + 0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL, + 0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL, + 0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL, + 0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL, + 0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL, + 0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL, + 0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL, + 0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL, + 0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL, + 0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL, + 0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL, + 0x9da70eb3UL, 0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL, + 0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL, + 0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL, + 0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL, + 0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL, + 0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL, + 0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL, + 0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL, + 0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL, + 0x72fd2493UL + }, + { + 0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL, + 0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL, + 0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL, + 0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL, + 0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL, + 0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL, + 0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL, + 0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL, + 0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL, + 0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL, + 0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 0x7b27a022UL, 0x2299e620UL, + 0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL, + 0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL, + 0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL, + 0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL, + 0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL, + 0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL, + 0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL, + 0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 0xe3bd6464UL, 0xba032266UL, + 0x8d69e067UL, 0x20cbd748UL, 0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL, + 0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL, + 0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL, + 0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL, + 0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL, + 0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL, + 0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL, + 0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL, + 0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL, + 0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL, + 0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 0x1b7ad8fbUL, + 0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL, + 0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL, + 0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL, + 0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL, + 0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL, + 0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL, + 0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL, + 0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL, + 0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL, + 0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL, + 0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL, + 0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL, + 0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL, + 0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL, + 0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL, + 0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL, + 0x12ac6fadUL, 0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL, + 0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL, + 0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL, + 0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL, + 0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL, + 0xed3498beUL + }, + { + 0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL, + 0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL, + 0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL, + 0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL, + 0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL, + 0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL, + 0xac20c6b0UL, 0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL, + 0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL, + 0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 0x3f283377UL, 0x8610e4eaUL, + 0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL, + 0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL, + 0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL, + 0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL, + 0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL, + 0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL, + 0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL, + 0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL, + 0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL, + 0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL, + 0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL, + 0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL, + 0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL, + 0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 0xf9c783d3UL, 0x176836c1UL, + 0x720f8a79UL, 0xcb375de4UL, 0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL, + 0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL, + 0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL, + 0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL, + 0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL, + 0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL, + 0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL, + 0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL, + 0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL, + 0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL, + 0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL, + 0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL, + 0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL, + 0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL, + 0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL, + 0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL, + 0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL, + 0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL, + 0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL, + 0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL, + 0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL, + 0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL, + 0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL, + 0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL, + 0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL, + 0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL, + 0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL, + 0xc3f6dbe9UL, 0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL, + 0xf10605deUL +#endif + } +}; diff --git a/sys/contrib/opensolaris/uts/common/zmod/deflate.c b/sys/contrib/opensolaris/uts/common/zmod/deflate.c new file mode 100644 index 0000000..7847e40 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/deflate.c @@ -0,0 +1,1742 @@ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* deflate.c -- compress data using the deflation algorithm + * Copyright (C) 1995-2005 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ALGORITHM + * + * The "deflation" process depends on being able to identify portions + * of the input text which are identical to earlier input (within a + * sliding window trailing behind the input currently being processed). + * + * The most straightforward technique turns out to be the fastest for + * most input files: try all possible matches and select the longest. + * The key feature of this algorithm is that insertions into the string + * dictionary are very simple and thus fast, and deletions are avoided + * completely. Insertions are performed at each input character, whereas + * string matches are performed only when the previous match ends. So it + * is preferable to spend more time in matches to allow very fast string + * insertions and avoid deletions. The matching algorithm for small + * strings is inspired from that of Rabin & Karp. A brute force approach + * is used to find longer strings when a small match has been found. + * A similar algorithm is used in comic (by Jan-Mark Wams) and freeze + * (by Leonid Broukhis). + * A previous version of this file used a more sophisticated algorithm + * (by Fiala and Greene) which is guaranteed to run in linear amortized + * time, but has a larger average cost, uses more memory and is patented. + * However the F&G algorithm may be faster for some highly redundant + * files if the parameter max_chain_length (described below) is too large. + * + * ACKNOWLEDGEMENTS + * + * The idea of lazy evaluation of matches is due to Jan-Mark Wams, and + * I found it in 'freeze' written by Leonid Broukhis. + * Thanks to many people for bug reports and testing. + * + * REFERENCES + * + * Deutsch, L.P.,"DEFLATE Compressed Data Format Specification". + * Available in http://www.ietf.org/rfc/rfc1951.txt + * + * A description of the Rabin and Karp algorithm is given in the book + * "Algorithms" by R. Sedgewick, Addison-Wesley, p252. + * + * Fiala,E.R., and Greene,D.H. + * Data Compression with Finite Windows, Comm.ACM, 32,4 (1989) 490-595 + * + */ + +#include "deflate.h" + +static const char deflate_copyright[] = + " deflate 1.2.3 Copyright 1995-2005 Jean-loup Gailly "; +/* + If you use the zlib library in a product, an acknowledgment is welcome + in the documentation of your product. If for some reason you cannot + include such an acknowledgment, I would appreciate that you keep this + copyright string in the executable of your product. + */ + +/* =========================================================================== + * Function prototypes. + */ +typedef enum { + need_more, /* block not completed, need more input or more output */ + block_done, /* block flush performed */ + finish_started, /* finish started, need only more output at next deflate */ + finish_done /* finish done, accept no more input or output */ +} block_state; + +typedef block_state (*compress_func) OF((deflate_state *s, int flush)); +/* Compression function. Returns the block state after the call. */ + +local void fill_window OF((deflate_state *s)); +local block_state deflate_stored OF((deflate_state *s, int flush)); +local block_state deflate_fast OF((deflate_state *s, int flush)); +#ifndef FASTEST +local block_state deflate_slow OF((deflate_state *s, int flush)); +#endif +local void lm_init OF((deflate_state *s)); +local void putShortMSB OF((deflate_state *s, uInt b)); +local void flush_pending OF((z_streamp strm)); +local int read_buf OF((z_streamp strm, Bytef *buf, unsigned size)); +#ifndef FASTEST +#ifdef ASMV + void match_init OF((void)); /* asm code initialization */ + uInt longest_match OF((deflate_state *s, IPos cur_match)); +#else +local uInt longest_match OF((deflate_state *s, IPos cur_match)); +#endif +#endif +local uInt longest_match_fast OF((deflate_state *s, IPos cur_match)); + +#ifdef DEBUG +local void check_match OF((deflate_state *s, IPos start, IPos match, + int length)); +#endif + +/* =========================================================================== + * Local data + */ + +#define NIL 0 +/* Tail of hash chains */ + +#ifndef TOO_FAR +# define TOO_FAR 4096 +#endif +/* Matches of length 3 are discarded if their distance exceeds TOO_FAR */ + +#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) +/* Minimum amount of lookahead, except at the end of the input file. + * See deflate.c for comments about the MIN_MATCH+1. + */ + +/* Values for max_lazy_match, good_match and max_chain_length, depending on + * the desired pack level (0..9). The values given below have been tuned to + * exclude worst case performance for pathological files. Better values may be + * found for specific files. + */ +typedef struct config_s { + ush good_length; /* reduce lazy search above this match length */ + ush max_lazy; /* do not perform lazy search above this match length */ + ush nice_length; /* quit search above this match length */ + ush max_chain; + compress_func func; +} config; + +#ifdef FASTEST +local const config configuration_table[2] = { +/* good lazy nice chain */ +/* 0 */ {0, 0, 0, 0, deflate_stored}, /* store only */ +/* 1 */ {4, 4, 8, 4, deflate_fast}}; /* max speed, no lazy matches */ +#else +local const config configuration_table[10] = { +/* good lazy nice chain */ +/* 0 */ {0, 0, 0, 0, deflate_stored}, /* store only */ +/* 1 */ {4, 4, 8, 4, deflate_fast}, /* max speed, no lazy matches */ +/* 2 */ {4, 5, 16, 8, deflate_fast}, +/* 3 */ {4, 6, 32, 32, deflate_fast}, + +/* 4 */ {4, 4, 16, 16, deflate_slow}, /* lazy matches */ +/* 5 */ {8, 16, 32, 32, deflate_slow}, +/* 6 */ {8, 16, 128, 128, deflate_slow}, +/* 7 */ {8, 32, 128, 256, deflate_slow}, +/* 8 */ {32, 128, 258, 1024, deflate_slow}, +/* 9 */ {32, 258, 258, 4096, deflate_slow}}; /* max compression */ +#endif + +/* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4 + * For deflate_fast() (levels <= 3) good is ignored and lazy has a different + * meaning. + */ + +#define EQUAL 0 +/* result of memcmp for equal strings */ + +#ifndef NO_DUMMY_DECL +struct static_tree_desc_s {int dummy;}; /* for buggy compilers */ +#endif + +/* =========================================================================== + * Update a hash value with the given input byte + * IN assertion: all calls to to UPDATE_HASH are made with consecutive + * input characters, so that a running hash key can be computed from the + * previous key instead of complete recalculation each time. + */ +#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask) + + +/* =========================================================================== + * Insert string str in the dictionary and set match_head to the previous head + * of the hash chain (the most recent string with same hash key). Return + * the previous length of the hash chain. + * If this file is compiled with -DFASTEST, the compression level is forced + * to 1, and no hash chains are maintained. + * IN assertion: all calls to to INSERT_STRING are made with consecutive + * input characters and the first MIN_MATCH bytes of str are valid + * (except for the last MIN_MATCH-1 bytes of the input file). + */ +#ifdef FASTEST +#define INSERT_STRING(s, str, match_head) \ + (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \ + match_head = s->head[s->ins_h], \ + s->head[s->ins_h] = (Pos)(str)) +#else +#define INSERT_STRING(s, str, match_head) \ + (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \ + match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \ + s->head[s->ins_h] = (Pos)(str)) +#endif + +/* =========================================================================== + * Initialize the hash table (avoiding 64K overflow for 16 bit systems). + * prev[] will be initialized on the fly. + */ +#define CLEAR_HASH(s) \ + s->head[s->hash_size-1] = NIL; \ + (void) zmemzero((Bytef *)s->head, \ + (unsigned)(s->hash_size-1)*sizeof(*s->head)); + +/* ========================================================================= */ +int ZEXPORT deflateInit_(strm, level, version, stream_size) + z_streamp strm; + int level; + const char *version; + int stream_size; +{ + return deflateInit2_(strm, level, Z_DEFLATED, MAX_WBITS, DEF_MEM_LEVEL, + Z_DEFAULT_STRATEGY, version, stream_size); + /* To do: ignore strm->next_in if we use it as window */ +} + +/* ========================================================================= */ +int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy, + version, stream_size) + z_streamp strm; + int level; + int method; + int windowBits; + int memLevel; + int strategy; + const char *version; + int stream_size; +{ + deflate_state *s; + int wrap = 1; + static const char my_version[] = ZLIB_VERSION; + + ushf *overlay; + /* We overlay pending_buf and d_buf+l_buf. This works since the average + * output size for (length,distance) codes is <= 24 bits. + */ + + if (version == Z_NULL || version[0] != my_version[0] || + stream_size != sizeof(z_stream)) { + return Z_VERSION_ERROR; + } + if (strm == Z_NULL) return Z_STREAM_ERROR; + + strm->msg = Z_NULL; + if (strm->zalloc == (alloc_func)0) { + strm->zalloc = zcalloc; + strm->opaque = (voidpf)0; + } + if (strm->zfree == (free_func)0) strm->zfree = zcfree; + +#ifdef FASTEST + if (level != 0) level = 1; +#else + if (level == Z_DEFAULT_COMPRESSION) level = 6; +#endif + + if (windowBits < 0) { /* suppress zlib wrapper */ + wrap = 0; + windowBits = -windowBits; + } +#ifdef GZIP + else if (windowBits > 15) { + wrap = 2; /* write gzip wrapper instead */ + windowBits -= 16; + } +#endif + if (memLevel < 1 || memLevel > MAX_MEM_LEVEL || method != Z_DEFLATED || + windowBits < 8 || windowBits > 15 || level < 0 || level > 9 || + strategy < 0 || strategy > Z_FIXED) { + return Z_STREAM_ERROR; + } + if (windowBits == 8) windowBits = 9; /* until 256-byte window bug fixed */ + s = (deflate_state *) ZALLOC(strm, 1, sizeof(deflate_state)); + if (s == Z_NULL) return Z_MEM_ERROR; + strm->state = (struct internal_state FAR *)s; + s->strm = strm; + + s->wrap = wrap; + s->gzhead = Z_NULL; + s->w_bits = windowBits; + s->w_size = 1 << s->w_bits; + s->w_mask = s->w_size - 1; + + s->hash_bits = memLevel + 7; + s->hash_size = 1 << s->hash_bits; + s->hash_mask = s->hash_size - 1; + s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH); + + s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte)); + s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos)); + s->head = (Posf *) ZALLOC(strm, s->hash_size, sizeof(Pos)); + + s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */ + + overlay = (ushf *) ZALLOC(strm, s->lit_bufsize, sizeof(ush)+2); + s->pending_buf = (uchf *) overlay; + s->pending_buf_size = (ulg)s->lit_bufsize * (sizeof(ush)+2L); + + if (s->window == Z_NULL || s->prev == Z_NULL || s->head == Z_NULL || + s->pending_buf == Z_NULL) { + s->status = FINISH_STATE; + strm->msg = (char*)ERR_MSG(Z_MEM_ERROR); + (void) deflateEnd (strm); + return Z_MEM_ERROR; + } + s->d_buf = overlay + s->lit_bufsize/sizeof(ush); + s->l_buf = s->pending_buf + (1+sizeof(ush))*s->lit_bufsize; + + s->level = level; + s->strategy = strategy; + s->method = (Byte)method; + + return deflateReset(strm); +} + +/* ========================================================================= */ +int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength) + z_streamp strm; + const Bytef *dictionary; + uInt dictLength; +{ + deflate_state *s; + uInt length = dictLength; + uInt n; + IPos hash_head = 0; + + if (strm == Z_NULL || strm->state == Z_NULL || dictionary == Z_NULL || + strm->state->wrap == 2 || + (strm->state->wrap == 1 && strm->state->status != INIT_STATE)) + return Z_STREAM_ERROR; + + s = strm->state; + if (s->wrap) + strm->adler = adler32(strm->adler, dictionary, dictLength); + + if (length < MIN_MATCH) return Z_OK; + if (length > MAX_DIST(s)) { + length = MAX_DIST(s); + dictionary += dictLength - length; /* use the tail of the dictionary */ + } + (void) zmemcpy(s->window, dictionary, length); + s->strstart = length; + s->block_start = (long)length; + + /* Insert all strings in the hash table (except for the last two bytes). + * s->lookahead stays null, so s->ins_h will be recomputed at the next + * call of fill_window. + */ + s->ins_h = s->window[0]; + UPDATE_HASH(s, s->ins_h, s->window[1]); + for (n = 0; n <= length - MIN_MATCH; n++) { + INSERT_STRING(s, n, hash_head); + } + if (hash_head) hash_head = 0; /* to make compiler happy */ + return Z_OK; +} + +/* ========================================================================= */ +int ZEXPORT deflateReset (strm) + z_streamp strm; +{ + deflate_state *s; + + if (strm == Z_NULL || strm->state == Z_NULL || + strm->zalloc == (alloc_func)0 || strm->zfree == (free_func)0) { + return Z_STREAM_ERROR; + } + + strm->total_in = strm->total_out = 0; + strm->msg = Z_NULL; /* use zfree if we ever allocate msg dynamically */ + strm->data_type = Z_UNKNOWN; + + s = (deflate_state *)strm->state; + s->pending = 0; + s->pending_out = s->pending_buf; + + if (s->wrap < 0) { + s->wrap = -s->wrap; /* was made negative by deflate(..., Z_FINISH); */ + } + s->status = s->wrap ? INIT_STATE : BUSY_STATE; + strm->adler = +#ifdef GZIP + s->wrap == 2 ? crc32(0L, Z_NULL, 0) : +#endif + adler32(0L, Z_NULL, 0); + s->last_flush = Z_NO_FLUSH; + + _tr_init(s); + lm_init(s); + + return Z_OK; +} + +/* ========================================================================= */ +int ZEXPORT deflateSetHeader (strm, head) + z_streamp strm; + gz_headerp head; +{ + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + if (strm->state->wrap != 2) return Z_STREAM_ERROR; + strm->state->gzhead = head; + return Z_OK; +} + +/* ========================================================================= */ +int ZEXPORT deflatePrime (strm, bits, value) + z_streamp strm; + int bits; + int value; +{ + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + strm->state->bi_valid = bits; + strm->state->bi_buf = (ush)(value & ((1 << bits) - 1)); + return Z_OK; +} + +/* ========================================================================= */ +int ZEXPORT deflateParams(strm, level, strategy) + z_streamp strm; + int level; + int strategy; +{ + deflate_state *s; + compress_func func; + int err = Z_OK; + + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + s = strm->state; + +#ifdef FASTEST + if (level != 0) level = 1; +#else + if (level == Z_DEFAULT_COMPRESSION) level = 6; +#endif + if (level < 0 || level > 9 || strategy < 0 || strategy > Z_FIXED) { + return Z_STREAM_ERROR; + } + func = configuration_table[s->level].func; + + if (func != configuration_table[level].func && strm->total_in != 0) { + /* Flush the last buffer: */ + err = deflate(strm, Z_PARTIAL_FLUSH); + } + if (s->level != level) { + s->level = level; + s->max_lazy_match = configuration_table[level].max_lazy; + s->good_match = configuration_table[level].good_length; + s->nice_match = configuration_table[level].nice_length; + s->max_chain_length = configuration_table[level].max_chain; + } + s->strategy = strategy; + return err; +} + +/* ========================================================================= */ +int ZEXPORT deflateTune(strm, good_length, max_lazy, nice_length, max_chain) + z_streamp strm; + int good_length; + int max_lazy; + int nice_length; + int max_chain; +{ + deflate_state *s; + + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + s = strm->state; + s->good_match = good_length; + s->max_lazy_match = max_lazy; + s->nice_match = nice_length; + s->max_chain_length = max_chain; + return Z_OK; +} + +/* ========================================================================= + * For the default windowBits of 15 and memLevel of 8, this function returns + * a close to exact, as well as small, upper bound on the compressed size. + * They are coded as constants here for a reason--if the #define's are + * changed, then this function needs to be changed as well. The return + * value for 15 and 8 only works for those exact settings. + * + * For any setting other than those defaults for windowBits and memLevel, + * the value returned is a conservative worst case for the maximum expansion + * resulting from using fixed blocks instead of stored blocks, which deflate + * can emit on compressed data for some combinations of the parameters. + * + * This function could be more sophisticated to provide closer upper bounds + * for every combination of windowBits and memLevel, as well as wrap. + * But even the conservative upper bound of about 14% expansion does not + * seem onerous for output buffer allocation. + */ +uLong ZEXPORT deflateBound(strm, sourceLen) + z_streamp strm; + uLong sourceLen; +{ + deflate_state *s; + uLong destLen; + + /* conservative upper bound */ + destLen = sourceLen + + ((sourceLen + 7) >> 3) + ((sourceLen + 63) >> 6) + 11; + + /* if can't get parameters, return conservative bound */ + if (strm == Z_NULL || strm->state == Z_NULL) + return destLen; + + /* if not default parameters, return conservative bound */ + s = strm->state; + if (s->w_bits != 15 || s->hash_bits != 8 + 7) + return destLen; + + /* default settings: return tight bound for that case */ + return compressBound(sourceLen); +} + +/* ========================================================================= + * Put a short in the pending buffer. The 16-bit value is put in MSB order. + * IN assertion: the stream state is correct and there is enough room in + * pending_buf. + */ +local void putShortMSB (s, b) + deflate_state *s; + uInt b; +{ + put_byte(s, (Byte)(b >> 8)); + put_byte(s, (Byte)(b & 0xff)); +} + +/* ========================================================================= + * Flush as much pending output as possible. All deflate() output goes + * through this function so some applications may wish to modify it + * to avoid allocating a large strm->next_out buffer and copying into it. + * (See also read_buf()). + */ +local void flush_pending(strm) + z_streamp strm; +{ + unsigned len = strm->state->pending; + + if (len > strm->avail_out) len = strm->avail_out; + if (len == 0) return; + + zmemcpy(strm->next_out, strm->state->pending_out, len); + strm->next_out += len; + strm->state->pending_out += len; + strm->total_out += len; + strm->avail_out -= len; + strm->state->pending -= len; + if (strm->state->pending == 0) { + strm->state->pending_out = strm->state->pending_buf; + } +} + +/* ========================================================================= */ +int ZEXPORT deflate (strm, flush) + z_streamp strm; + int flush; +{ + int old_flush; /* value of flush param for previous deflate call */ + deflate_state *s; + + if (strm == Z_NULL || strm->state == Z_NULL || + flush > Z_FINISH || flush < 0) { + return Z_STREAM_ERROR; + } + s = strm->state; + + if (strm->next_out == Z_NULL || + (strm->next_in == Z_NULL && strm->avail_in != 0) || + (s->status == FINISH_STATE && flush != Z_FINISH)) { + ERR_RETURN(strm, Z_STREAM_ERROR); + } + if (strm->avail_out == 0) ERR_RETURN(strm, Z_BUF_ERROR); + + s->strm = strm; /* just in case */ + old_flush = s->last_flush; + s->last_flush = flush; + + /* Write the header */ + if (s->status == INIT_STATE) { +#ifdef GZIP + if (s->wrap == 2) { + strm->adler = crc32(0L, Z_NULL, 0); + put_byte(s, 31); + put_byte(s, 139); + put_byte(s, 8); + if (s->gzhead == NULL) { + put_byte(s, 0); + put_byte(s, 0); + put_byte(s, 0); + put_byte(s, 0); + put_byte(s, 0); + put_byte(s, s->level == 9 ? 2 : + (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2 ? + 4 : 0)); + put_byte(s, OS_CODE); + s->status = BUSY_STATE; + } + else { + put_byte(s, (s->gzhead->text ? 1 : 0) + + (s->gzhead->hcrc ? 2 : 0) + + (s->gzhead->extra == Z_NULL ? 0 : 4) + + (s->gzhead->name == Z_NULL ? 0 : 8) + + (s->gzhead->comment == Z_NULL ? 0 : 16) + ); + put_byte(s, (Byte)(s->gzhead->time & 0xff)); + put_byte(s, (Byte)((s->gzhead->time >> 8) & 0xff)); + put_byte(s, (Byte)((s->gzhead->time >> 16) & 0xff)); + put_byte(s, (Byte)((s->gzhead->time >> 24) & 0xff)); + put_byte(s, s->level == 9 ? 2 : + (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2 ? + 4 : 0)); + put_byte(s, s->gzhead->os & 0xff); + if (s->gzhead->extra != NULL) { + put_byte(s, s->gzhead->extra_len & 0xff); + put_byte(s, (s->gzhead->extra_len >> 8) & 0xff); + } + if (s->gzhead->hcrc) + strm->adler = crc32(strm->adler, s->pending_buf, + s->pending); + s->gzindex = 0; + s->status = EXTRA_STATE; + } + } + else +#endif + { + uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8; + uInt level_flags; + + if (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2) + level_flags = 0; + else if (s->level < 6) + level_flags = 1; + else if (s->level == 6) + level_flags = 2; + else + level_flags = 3; + header |= (level_flags << 6); + if (s->strstart != 0) header |= PRESET_DICT; + header += 31 - (header % 31); + + s->status = BUSY_STATE; + putShortMSB(s, header); + + /* Save the adler32 of the preset dictionary: */ + if (s->strstart != 0) { + putShortMSB(s, (uInt)(strm->adler >> 16)); + putShortMSB(s, (uInt)(strm->adler & 0xffff)); + } + strm->adler = adler32(0L, Z_NULL, 0); + } + } +#ifdef GZIP + if (s->status == EXTRA_STATE) { + if (s->gzhead->extra != NULL) { + uInt beg = s->pending; /* start of bytes to update crc */ + + while (s->gzindex < (s->gzhead->extra_len & 0xffff)) { + if (s->pending == s->pending_buf_size) { + if (s->gzhead->hcrc && s->pending > beg) + strm->adler = crc32(strm->adler, s->pending_buf + beg, + s->pending - beg); + flush_pending(strm); + beg = s->pending; + if (s->pending == s->pending_buf_size) + break; + } + put_byte(s, s->gzhead->extra[s->gzindex]); + s->gzindex++; + } + if (s->gzhead->hcrc && s->pending > beg) + strm->adler = crc32(strm->adler, s->pending_buf + beg, + s->pending - beg); + if (s->gzindex == s->gzhead->extra_len) { + s->gzindex = 0; + s->status = NAME_STATE; + } + } + else + s->status = NAME_STATE; + } + if (s->status == NAME_STATE) { + if (s->gzhead->name != NULL) { + uInt beg = s->pending; /* start of bytes to update crc */ + int val; + + do { + if (s->pending == s->pending_buf_size) { + if (s->gzhead->hcrc && s->pending > beg) + strm->adler = crc32(strm->adler, s->pending_buf + beg, + s->pending - beg); + flush_pending(strm); + beg = s->pending; + if (s->pending == s->pending_buf_size) { + val = 1; + break; + } + } + val = s->gzhead->name[s->gzindex++]; + put_byte(s, val); + } while (val != 0); + if (s->gzhead->hcrc && s->pending > beg) + strm->adler = crc32(strm->adler, s->pending_buf + beg, + s->pending - beg); + if (val == 0) { + s->gzindex = 0; + s->status = COMMENT_STATE; + } + } + else + s->status = COMMENT_STATE; + } + if (s->status == COMMENT_STATE) { + if (s->gzhead->comment != NULL) { + uInt beg = s->pending; /* start of bytes to update crc */ + int val; + + do { + if (s->pending == s->pending_buf_size) { + if (s->gzhead->hcrc && s->pending > beg) + strm->adler = crc32(strm->adler, s->pending_buf + beg, + s->pending - beg); + flush_pending(strm); + beg = s->pending; + if (s->pending == s->pending_buf_size) { + val = 1; + break; + } + } + val = s->gzhead->comment[s->gzindex++]; + put_byte(s, val); + } while (val != 0); + if (s->gzhead->hcrc && s->pending > beg) + strm->adler = crc32(strm->adler, s->pending_buf + beg, + s->pending - beg); + if (val == 0) + s->status = HCRC_STATE; + } + else + s->status = HCRC_STATE; + } + if (s->status == HCRC_STATE) { + if (s->gzhead->hcrc) { + if (s->pending + 2 > s->pending_buf_size) + flush_pending(strm); + if (s->pending + 2 <= s->pending_buf_size) { + put_byte(s, (Byte)(strm->adler & 0xff)); + put_byte(s, (Byte)((strm->adler >> 8) & 0xff)); + strm->adler = crc32(0L, Z_NULL, 0); + s->status = BUSY_STATE; + } + } + else + s->status = BUSY_STATE; + } +#endif + + /* Flush as much pending output as possible */ + if (s->pending != 0) { + flush_pending(strm); + if (strm->avail_out == 0) { + /* Since avail_out is 0, deflate will be called again with + * more output space, but possibly with both pending and + * avail_in equal to zero. There won't be anything to do, + * but this is not an error situation so make sure we + * return OK instead of BUF_ERROR at next call of deflate: + */ + s->last_flush = -1; + return Z_OK; + } + + /* Make sure there is something to do and avoid duplicate consecutive + * flushes. For repeated and useless calls with Z_FINISH, we keep + * returning Z_STREAM_END instead of Z_BUF_ERROR. + */ + } else if (strm->avail_in == 0 && flush <= old_flush && + flush != Z_FINISH) { + ERR_RETURN(strm, Z_BUF_ERROR); + } + + /* User must not provide more input after the first FINISH: */ + if (s->status == FINISH_STATE && strm->avail_in != 0) { + ERR_RETURN(strm, Z_BUF_ERROR); + } + + /* Start a new block or continue the current one. + */ + if (strm->avail_in != 0 || s->lookahead != 0 || + (flush != Z_NO_FLUSH && s->status != FINISH_STATE)) { + block_state bstate; + + bstate = (*(configuration_table[s->level].func))(s, flush); + + if (bstate == finish_started || bstate == finish_done) { + s->status = FINISH_STATE; + } + if (bstate == need_more || bstate == finish_started) { + if (strm->avail_out == 0) { + s->last_flush = -1; /* avoid BUF_ERROR next call, see above */ + } + return Z_OK; + /* If flush != Z_NO_FLUSH && avail_out == 0, the next call + * of deflate should use the same flush parameter to make sure + * that the flush is complete. So we don't have to output an + * empty block here, this will be done at next call. This also + * ensures that for a very small output buffer, we emit at most + * one empty block. + */ + } + if (bstate == block_done) { + if (flush == Z_PARTIAL_FLUSH) { + _tr_align(s); + } else { /* FULL_FLUSH or SYNC_FLUSH */ + _tr_stored_block(s, (char*)0, 0L, 0); + /* For a full flush, this empty block will be recognized + * as a special marker by inflate_sync(). + */ + if (flush == Z_FULL_FLUSH) { + CLEAR_HASH(s); /* forget history */ + } + } + flush_pending(strm); + if (strm->avail_out == 0) { + s->last_flush = -1; /* avoid BUF_ERROR at next call, see above */ + return Z_OK; + } + } + } + Assert(strm->avail_out > 0, "bug2"); + + if (flush != Z_FINISH) return Z_OK; + if (s->wrap <= 0) return Z_STREAM_END; + + /* Write the trailer */ +#ifdef GZIP + if (s->wrap == 2) { + put_byte(s, (Byte)(strm->adler & 0xff)); + put_byte(s, (Byte)((strm->adler >> 8) & 0xff)); + put_byte(s, (Byte)((strm->adler >> 16) & 0xff)); + put_byte(s, (Byte)((strm->adler >> 24) & 0xff)); + put_byte(s, (Byte)(strm->total_in & 0xff)); + put_byte(s, (Byte)((strm->total_in >> 8) & 0xff)); + put_byte(s, (Byte)((strm->total_in >> 16) & 0xff)); + put_byte(s, (Byte)((strm->total_in >> 24) & 0xff)); + } + else +#endif + { + putShortMSB(s, (uInt)(strm->adler >> 16)); + putShortMSB(s, (uInt)(strm->adler & 0xffff)); + } + flush_pending(strm); + /* If avail_out is zero, the application will call deflate again + * to flush the rest. + */ + if (s->wrap > 0) s->wrap = -s->wrap; /* write the trailer only once! */ + return s->pending != 0 ? Z_OK : Z_STREAM_END; +} + +/* ========================================================================= */ +int ZEXPORT deflateEnd (strm) + z_streamp strm; +{ + int status; + + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + + status = strm->state->status; + if (status != INIT_STATE && + status != EXTRA_STATE && + status != NAME_STATE && + status != COMMENT_STATE && + status != HCRC_STATE && + status != BUSY_STATE && + status != FINISH_STATE) { + return Z_STREAM_ERROR; + } + + /* Deallocate in reverse order of allocations: */ + TRY_FREE(strm, strm->state->pending_buf); + TRY_FREE(strm, strm->state->head); + TRY_FREE(strm, strm->state->prev); + TRY_FREE(strm, strm->state->window); + + ZFREE(strm, strm->state); + strm->state = Z_NULL; + + return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK; +} + +/* ========================================================================= + * Copy the source state to the destination state. + * To simplify the source, this is not supported for 16-bit MSDOS (which + * doesn't have enough memory anyway to duplicate compression states). + */ +int ZEXPORT deflateCopy (dest, source) + z_streamp dest; + z_streamp source; +{ +#ifdef MAXSEG_64K + return Z_STREAM_ERROR; +#else + deflate_state *ds; + deflate_state *ss; + ushf *overlay; + + + if (source == Z_NULL || dest == Z_NULL || source->state == Z_NULL) { + return Z_STREAM_ERROR; + } + + ss = source->state; + + zmemcpy(dest, source, sizeof(z_stream)); + + ds = (deflate_state *) ZALLOC(dest, 1, sizeof(deflate_state)); + if (ds == Z_NULL) return Z_MEM_ERROR; + dest->state = (struct internal_state FAR *) ds; + zmemcpy(ds, ss, sizeof(deflate_state)); + ds->strm = dest; + + ds->window = (Bytef *) ZALLOC(dest, ds->w_size, 2*sizeof(Byte)); + ds->prev = (Posf *) ZALLOC(dest, ds->w_size, sizeof(Pos)); + ds->head = (Posf *) ZALLOC(dest, ds->hash_size, sizeof(Pos)); + overlay = (ushf *) ZALLOC(dest, ds->lit_bufsize, sizeof(ush)+2); + ds->pending_buf = (uchf *) overlay; + + if (ds->window == Z_NULL || ds->prev == Z_NULL || ds->head == Z_NULL || + ds->pending_buf == Z_NULL) { + deflateEnd (dest); + return Z_MEM_ERROR; + } + /* following zmemcpy do not work for 16-bit MSDOS */ + zmemcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(Byte)); + zmemcpy(ds->prev, ss->prev, ds->w_size * sizeof(Pos)); + zmemcpy(ds->head, ss->head, ds->hash_size * sizeof(Pos)); + zmemcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size); + + ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf); + ds->d_buf = overlay + ds->lit_bufsize/sizeof(ush); + ds->l_buf = ds->pending_buf + (1+sizeof(ush))*ds->lit_bufsize; + + ds->l_desc.dyn_tree = ds->dyn_ltree; + ds->d_desc.dyn_tree = ds->dyn_dtree; + ds->bl_desc.dyn_tree = ds->bl_tree; + + return Z_OK; +#endif /* MAXSEG_64K */ +} + +/* =========================================================================== + * Read a new buffer from the current input stream, update the adler32 + * and total number of bytes read. All deflate() input goes through + * this function so some applications may wish to modify it to avoid + * allocating a large strm->next_in buffer and copying from it. + * (See also flush_pending()). + */ +local int read_buf(strm, buf, size) + z_streamp strm; + Bytef *buf; + unsigned size; +{ + unsigned len = strm->avail_in; + + if (len > size) len = size; + if (len == 0) return 0; + + strm->avail_in -= len; + + if (strm->state->wrap == 1) { + strm->adler = adler32(strm->adler, strm->next_in, len); + } +#ifdef GZIP + else if (strm->state->wrap == 2) { + strm->adler = crc32(strm->adler, strm->next_in, len); + } +#endif + zmemcpy(buf, strm->next_in, len); + strm->next_in += len; + strm->total_in += len; + + return (int)len; +} + +/* =========================================================================== + * Initialize the "longest match" routines for a new zlib stream + */ +local void lm_init (s) + deflate_state *s; +{ + s->window_size = (ulg)2L*s->w_size; + + CLEAR_HASH(s); + + /* Set the default configuration parameters: + */ + s->max_lazy_match = configuration_table[s->level].max_lazy; + s->good_match = configuration_table[s->level].good_length; + s->nice_match = configuration_table[s->level].nice_length; + s->max_chain_length = configuration_table[s->level].max_chain; + + s->strstart = 0; + s->block_start = 0L; + s->lookahead = 0; + s->match_length = s->prev_length = MIN_MATCH-1; + s->match_available = 0; + s->ins_h = 0; +#ifndef FASTEST +#ifdef ASMV + match_init(); /* initialize the asm code */ +#endif +#endif +} + +#ifndef FASTEST +/* =========================================================================== + * Set match_start to the longest match starting at the given string and + * return its length. Matches shorter or equal to prev_length are discarded, + * in which case the result is equal to prev_length and match_start is + * garbage. + * IN assertions: cur_match is the head of the hash chain for the current + * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1 + * OUT assertion: the match length is not greater than s->lookahead. + */ +#ifndef ASMV +/* For 80x86 and 680x0, an optimized version will be provided in match.asm or + * match.S. The code will be functionally equivalent. + */ +local uInt longest_match(s, cur_match) + deflate_state *s; + IPos cur_match; /* current match */ +{ + unsigned chain_length = s->max_chain_length;/* max hash chain length */ + register Bytef *scan = s->window + s->strstart; /* current string */ + register Bytef *match; /* matched string */ + register int len; /* length of current match */ + int best_len = s->prev_length; /* best match length so far */ + int nice_match = s->nice_match; /* stop if match long enough */ + IPos limit = s->strstart > (IPos)MAX_DIST(s) ? + s->strstart - (IPos)MAX_DIST(s) : NIL; + /* Stop when cur_match becomes <= limit. To simplify the code, + * we prevent matches with the string of window index 0. + */ + Posf *prev = s->prev; + uInt wmask = s->w_mask; + +#ifdef UNALIGNED_OK + /* Compare two bytes at a time. Note: this is not always beneficial. + * Try with and without -DUNALIGNED_OK to check. + */ + register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1; + register ush scan_start = *(ushf*)scan; + register ush scan_end = *(ushf*)(scan+best_len-1); +#else + register Bytef *strend = s->window + s->strstart + MAX_MATCH; + register Byte scan_end1 = scan[best_len-1]; + register Byte scan_end = scan[best_len]; +#endif + + /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. + * It is easy to get rid of this optimization if necessary. + */ + Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); + + /* Do not waste too much time if we already have a good match: */ + if (s->prev_length >= s->good_match) { + chain_length >>= 2; + } + /* Do not look for matches beyond the end of the input. This is necessary + * to make deflate deterministic. + */ + if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; + + Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); + + do { + Assert(cur_match < s->strstart, "no future"); + match = s->window + cur_match; + + /* Skip to next match if the match length cannot increase + * or if the match length is less than 2. Note that the checks below + * for insufficient lookahead only occur occasionally for performance + * reasons. Therefore uninitialized memory will be accessed, and + * conditional jumps will be made that depend on those values. + * However the length of the match is limited to the lookahead, so + * the output of deflate is not affected by the uninitialized values. + */ +#if (defined(UNALIGNED_OK) && MAX_MATCH == 258) + /* This code assumes sizeof(unsigned short) == 2. Do not use + * UNALIGNED_OK if your compiler uses a different size. + */ + if (*(ushf*)(match+best_len-1) != scan_end || + *(ushf*)match != scan_start) continue; + + /* It is not necessary to compare scan[2] and match[2] since they are + * always equal when the other bytes match, given that the hash keys + * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at + * strstart+3, +5, ... up to strstart+257. We check for insufficient + * lookahead only every 4th comparison; the 128th check will be made + * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is + * necessary to put more guard bytes at the end of the window, or + * to check more often for insufficient lookahead. + */ + Assert(scan[2] == match[2], "scan[2]?"); + scan++, match++; + do { + } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) && + *(ushf*)(scan+=2) == *(ushf*)(match+=2) && + *(ushf*)(scan+=2) == *(ushf*)(match+=2) && + *(ushf*)(scan+=2) == *(ushf*)(match+=2) && + scan < strend); + /* The funny "do {}" generates better code on most compilers */ + + /* Here, scan <= window+strstart+257 */ + Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); + if (*scan == *match) scan++; + + len = (MAX_MATCH - 1) - (int)(strend-scan); + scan = strend - (MAX_MATCH-1); + +#else /* UNALIGNED_OK */ + + if (match[best_len] != scan_end || + match[best_len-1] != scan_end1 || + *match != *scan || + *++match != scan[1]) continue; + + /* The check at best_len-1 can be removed because it will be made + * again later. (This heuristic is not always a win.) + * It is not necessary to compare scan[2] and match[2] since they + * are always equal when the other bytes match, given that + * the hash keys are equal and that HASH_BITS >= 8. + */ + scan += 2, match++; + Assert(*scan == *match, "match[2]?"); + + /* We check for insufficient lookahead only every 8th comparison; + * the 256th check will be made at strstart+258. + */ + do { + } while (*++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + scan < strend); + + Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); + + len = MAX_MATCH - (int)(strend - scan); + scan = strend - MAX_MATCH; + +#endif /* UNALIGNED_OK */ + + if (len > best_len) { + s->match_start = cur_match; + best_len = len; + if (len >= nice_match) break; +#ifdef UNALIGNED_OK + scan_end = *(ushf*)(scan+best_len-1); +#else + scan_end1 = scan[best_len-1]; + scan_end = scan[best_len]; +#endif + } + } while ((cur_match = prev[cur_match & wmask]) > limit + && --chain_length != 0); + + if ((uInt)best_len <= s->lookahead) return (uInt)best_len; + return s->lookahead; +} +#endif /* ASMV */ +#endif /* FASTEST */ + +/* --------------------------------------------------------------------------- + * Optimized version for level == 1 or strategy == Z_RLE only + */ +local uInt longest_match_fast(s, cur_match) + deflate_state *s; + IPos cur_match; /* current match */ +{ + register Bytef *scan = s->window + s->strstart; /* current string */ + register Bytef *match; /* matched string */ + register int len; /* length of current match */ + register Bytef *strend = s->window + s->strstart + MAX_MATCH; + + /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. + * It is easy to get rid of this optimization if necessary. + */ + Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); + + Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); + + Assert(cur_match < s->strstart, "no future"); + + match = s->window + cur_match; + + /* Return failure if the match length is less than 2: + */ + if (match[0] != scan[0] || match[1] != scan[1]) return MIN_MATCH-1; + + /* The check at best_len-1 can be removed because it will be made + * again later. (This heuristic is not always a win.) + * It is not necessary to compare scan[2] and match[2] since they + * are always equal when the other bytes match, given that + * the hash keys are equal and that HASH_BITS >= 8. + */ + scan += 2, match += 2; + Assert(*scan == *match, "match[2]?"); + + /* We check for insufficient lookahead only every 8th comparison; + * the 256th check will be made at strstart+258. + */ + do { + } while (*++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + scan < strend); + + Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); + + len = MAX_MATCH - (int)(strend - scan); + + if (len < MIN_MATCH) return MIN_MATCH - 1; + + s->match_start = cur_match; + return (uInt)len <= s->lookahead ? (uInt)len : s->lookahead; +} + +#ifdef DEBUG +/* =========================================================================== + * Check that the match at match_start is indeed a match. + */ +local void check_match(s, start, match, length) + deflate_state *s; + IPos start, match; + int length; +{ + /* check that the match is indeed a match */ + if (zmemcmp(s->window + match, + s->window + start, length) != EQUAL) { + fprintf(stderr, " start %u, match %u, length %d\n", + start, match, length); + do { + fprintf(stderr, "%c%c", s->window[match++], s->window[start++]); + } while (--length != 0); + z_error("invalid match"); + } + if (z_verbose > 1) { + fprintf(stderr,"\\[%d,%d]", start-match, length); + do { putc(s->window[start++], stderr); } while (--length != 0); + } +} +#else +# define check_match(s, start, match, length) +#endif /* DEBUG */ + +/* =========================================================================== + * Fill the window when the lookahead becomes insufficient. + * Updates strstart and lookahead. + * + * IN assertion: lookahead < MIN_LOOKAHEAD + * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD + * At least one byte has been read, or avail_in == 0; reads are + * performed for at least two bytes (required for the zip translate_eol + * option -- not supported here). + */ +local void fill_window(s) + deflate_state *s; +{ + register unsigned n, m; + register Posf *p; + unsigned more; /* Amount of free space at the end of the window. */ + uInt wsize = s->w_size; + + do { + more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart); + + /* Deal with !@#$% 64K limit: */ + if (sizeof(int) <= 2) { + if (more == 0 && s->strstart == 0 && s->lookahead == 0) { + more = wsize; + + } else if (more == (unsigned)(-1)) { + /* Very unlikely, but possible on 16 bit machine if + * strstart == 0 && lookahead == 1 (input done a byte at time) + */ + more--; + } + } + + /* If the window is almost full and there is insufficient lookahead, + * move the upper half to the lower one to make room in the upper half. + */ + if (s->strstart >= wsize+MAX_DIST(s)) { + + zmemcpy(s->window, s->window+wsize, (unsigned)wsize); + s->match_start -= wsize; + s->strstart -= wsize; /* we now have strstart >= MAX_DIST */ + s->block_start -= (long) wsize; + + /* Slide the hash table (could be avoided with 32 bit values + at the expense of memory usage). We slide even when level == 0 + to keep the hash table consistent if we switch back to level > 0 + later. (Using level 0 permanently is not an optimal usage of + zlib, so we don't care about this pathological case.) + */ + /* %%% avoid this when Z_RLE */ + n = s->hash_size; + p = &s->head[n]; + do { + m = *--p; + *p = (Pos)(m >= wsize ? m-wsize : NIL); + } while (--n); + + n = wsize; +#ifndef FASTEST + p = &s->prev[n]; + do { + m = *--p; + *p = (Pos)(m >= wsize ? m-wsize : NIL); + /* If n is not on any hash chain, prev[n] is garbage but + * its value will never be used. + */ + } while (--n); +#endif + more += wsize; + } + if (s->strm->avail_in == 0) return; + + /* If there was no sliding: + * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 && + * more == window_size - lookahead - strstart + * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1) + * => more >= window_size - 2*WSIZE + 2 + * In the BIG_MEM or MMAP case (not yet supported), + * window_size == input_size + MIN_LOOKAHEAD && + * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD. + * Otherwise, window_size == 2*WSIZE so more >= 2. + * If there was sliding, more >= WSIZE. So in all cases, more >= 2. + */ + Assert(more >= 2, "more < 2"); + + n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more); + s->lookahead += n; + + /* Initialize the hash value now that we have some input: */ + if (s->lookahead >= MIN_MATCH) { + s->ins_h = s->window[s->strstart]; + UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); +#if MIN_MATCH != 3 + Call UPDATE_HASH() MIN_MATCH-3 more times +#endif + } + /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage, + * but this is not important since only literal bytes will be emitted. + */ + + } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0); +} + +/* =========================================================================== + * Flush the current block, with given end-of-file flag. + * IN assertion: strstart is set to the end of the current match. + */ +#define FLUSH_BLOCK_ONLY(s, eof) { \ + _tr_flush_block(s, (s->block_start >= 0L ? \ + (charf *)&s->window[(unsigned)s->block_start] : \ + (charf *)Z_NULL), \ + (ulg)((long)s->strstart - s->block_start), \ + (eof)); \ + s->block_start = s->strstart; \ + flush_pending(s->strm); \ + Tracev((stderr,"[FLUSH]")); \ +} + +/* Same but force premature exit if necessary. */ +#define FLUSH_BLOCK(s, eof) { \ + FLUSH_BLOCK_ONLY(s, eof); \ + if (s->strm->avail_out == 0) return (eof) ? finish_started : need_more; \ +} + +/* =========================================================================== + * Copy without compression as much as possible from the input stream, return + * the current block state. + * This function does not insert new strings in the dictionary since + * uncompressible data is probably not useful. This function is used + * only for the level=0 compression option. + * NOTE: this function should be optimized to avoid extra copying from + * window to pending_buf. + */ +local block_state deflate_stored(s, flush) + deflate_state *s; + int flush; +{ + /* Stored blocks are limited to 0xffff bytes, pending_buf is limited + * to pending_buf_size, and each stored block has a 5 byte header: + */ + ulg max_block_size = 0xffff; + ulg max_start; + + if (max_block_size > s->pending_buf_size - 5) { + max_block_size = s->pending_buf_size - 5; + } + + /* Copy as much as possible from input to output: */ + for (;;) { + /* Fill the window as much as possible: */ + if (s->lookahead <= 1) { + + Assert(s->strstart < s->w_size+MAX_DIST(s) || + s->block_start >= (long)s->w_size, "slide too late"); + + fill_window(s); + if (s->lookahead == 0 && flush == Z_NO_FLUSH) return need_more; + + if (s->lookahead == 0) break; /* flush the current block */ + } + Assert(s->block_start >= 0L, "block gone"); + + s->strstart += s->lookahead; + s->lookahead = 0; + + /* Emit a stored block if pending_buf will be full: */ + max_start = s->block_start + max_block_size; + if (s->strstart == 0 || (ulg)s->strstart >= max_start) { + /* strstart == 0 is possible when wraparound on 16-bit machine */ + s->lookahead = (uInt)(s->strstart - max_start); + s->strstart = (uInt)max_start; + FLUSH_BLOCK(s, 0); + } + /* Flush if we may have to slide, otherwise block_start may become + * negative and the data will be gone: + */ + if (s->strstart - (uInt)s->block_start >= MAX_DIST(s)) { + FLUSH_BLOCK(s, 0); + } + } + FLUSH_BLOCK(s, flush == Z_FINISH); + return flush == Z_FINISH ? finish_done : block_done; +} + +/* =========================================================================== + * Compress as much as possible from the input stream, return the current + * block state. + * This function does not perform lazy evaluation of matches and inserts + * new strings in the dictionary only for unmatched strings or for short + * matches. It is used only for the fast compression options. + */ +local block_state deflate_fast(s, flush) + deflate_state *s; + int flush; +{ + IPos hash_head = NIL; /* head of the hash chain */ + int bflush; /* set if current block must be flushed */ + + for (;;) { + /* Make sure that we always have enough lookahead, except + * at the end of the input file. We need MAX_MATCH bytes + * for the next match, plus MIN_MATCH bytes to insert the + * string following the next match. + */ + if (s->lookahead < MIN_LOOKAHEAD) { + fill_window(s); + if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { + return need_more; + } + if (s->lookahead == 0) break; /* flush the current block */ + } + + /* Insert the string window[strstart .. strstart+2] in the + * dictionary, and set hash_head to the head of the hash chain: + */ + if (s->lookahead >= MIN_MATCH) { + INSERT_STRING(s, s->strstart, hash_head); + } + + /* Find the longest match, discarding those <= prev_length. + * At this point we have always match_length < MIN_MATCH + */ + if (hash_head != NIL && s->strstart - hash_head <= MAX_DIST(s)) { + /* To simplify the code, we prevent matches with the string + * of window index 0 (in particular we have to avoid a match + * of the string with itself at the start of the input file). + */ +#ifdef FASTEST + if ((s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) || + (s->strategy == Z_RLE && s->strstart - hash_head == 1)) { + s->match_length = longest_match_fast (s, hash_head); + } +#else + if (s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) { + s->match_length = longest_match (s, hash_head); + } else if (s->strategy == Z_RLE && s->strstart - hash_head == 1) { + s->match_length = longest_match_fast (s, hash_head); + } +#endif + /* longest_match() or longest_match_fast() sets match_start */ + } + if (s->match_length >= MIN_MATCH) { + check_match(s, s->strstart, s->match_start, s->match_length); + + _tr_tally_dist(s, s->strstart - s->match_start, + s->match_length - MIN_MATCH, bflush); + + s->lookahead -= s->match_length; + + /* Insert new strings in the hash table only if the match length + * is not too large. This saves time but degrades compression. + */ +#ifndef FASTEST + if (s->match_length <= s->max_insert_length && + s->lookahead >= MIN_MATCH) { + s->match_length--; /* string at strstart already in table */ + do { + s->strstart++; + INSERT_STRING(s, s->strstart, hash_head); + /* strstart never exceeds WSIZE-MAX_MATCH, so there are + * always MIN_MATCH bytes ahead. + */ + } while (--s->match_length != 0); + s->strstart++; + } else +#endif + { + s->strstart += s->match_length; + s->match_length = 0; + s->ins_h = s->window[s->strstart]; + UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); +#if MIN_MATCH != 3 + Call UPDATE_HASH() MIN_MATCH-3 more times +#endif + /* If lookahead < MIN_MATCH, ins_h is garbage, but it does not + * matter since it will be recomputed at next deflate call. + */ + } + } else { + /* No match, output a literal byte */ + Tracevv((stderr,"%c", s->window[s->strstart])); + _tr_tally_lit (s, s->window[s->strstart], bflush); + s->lookahead--; + s->strstart++; + } + if (bflush) FLUSH_BLOCK(s, 0); + } + FLUSH_BLOCK(s, flush == Z_FINISH); + return flush == Z_FINISH ? finish_done : block_done; +} + +#ifndef FASTEST +/* =========================================================================== + * Same as above, but achieves better compression. We use a lazy + * evaluation for matches: a match is finally adopted only if there is + * no better match at the next window position. + */ +local block_state deflate_slow(s, flush) + deflate_state *s; + int flush; +{ + IPos hash_head = NIL; /* head of hash chain */ + int bflush; /* set if current block must be flushed */ + + /* Process the input block. */ + for (;;) { + /* Make sure that we always have enough lookahead, except + * at the end of the input file. We need MAX_MATCH bytes + * for the next match, plus MIN_MATCH bytes to insert the + * string following the next match. + */ + if (s->lookahead < MIN_LOOKAHEAD) { + fill_window(s); + if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { + return need_more; + } + if (s->lookahead == 0) break; /* flush the current block */ + } + + /* Insert the string window[strstart .. strstart+2] in the + * dictionary, and set hash_head to the head of the hash chain: + */ + if (s->lookahead >= MIN_MATCH) { + INSERT_STRING(s, s->strstart, hash_head); + } + + /* Find the longest match, discarding those <= prev_length. + */ + s->prev_length = s->match_length, s->prev_match = s->match_start; + s->match_length = MIN_MATCH-1; + + if (hash_head != NIL && s->prev_length < s->max_lazy_match && + s->strstart - hash_head <= MAX_DIST(s)) { + /* To simplify the code, we prevent matches with the string + * of window index 0 (in particular we have to avoid a match + * of the string with itself at the start of the input file). + */ + if (s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) { + s->match_length = longest_match (s, hash_head); + } else if (s->strategy == Z_RLE && s->strstart - hash_head == 1) { + s->match_length = longest_match_fast (s, hash_head); + } + /* longest_match() or longest_match_fast() sets match_start */ + + if (s->match_length <= 5 && (s->strategy == Z_FILTERED +#if TOO_FAR <= 32767 + || (s->match_length == MIN_MATCH && + s->strstart - s->match_start > TOO_FAR) +#endif + )) { + + /* If prev_match is also MIN_MATCH, match_start is garbage + * but we will ignore the current match anyway. + */ + s->match_length = MIN_MATCH-1; + } + } + /* If there was a match at the previous step and the current + * match is not better, output the previous match: + */ + if (s->prev_length >= MIN_MATCH && s->match_length <= s->prev_length) { + uInt max_insert = s->strstart + s->lookahead - MIN_MATCH; + /* Do not insert strings in hash table beyond this. */ + + check_match(s, s->strstart-1, s->prev_match, s->prev_length); + + _tr_tally_dist(s, s->strstart -1 - s->prev_match, + s->prev_length - MIN_MATCH, bflush); + + /* Insert in hash table all strings up to the end of the match. + * strstart-1 and strstart are already inserted. If there is not + * enough lookahead, the last two strings are not inserted in + * the hash table. + */ + s->lookahead -= s->prev_length-1; + s->prev_length -= 2; + do { + if (++s->strstart <= max_insert) { + INSERT_STRING(s, s->strstart, hash_head); + } + } while (--s->prev_length != 0); + s->match_available = 0; + s->match_length = MIN_MATCH-1; + s->strstart++; + + if (bflush) FLUSH_BLOCK(s, 0); + + } else if (s->match_available) { + /* If there was no match at the previous position, output a + * single literal. If there was a match but the current match + * is longer, truncate the previous match to a single literal. + */ + Tracevv((stderr,"%c", s->window[s->strstart-1])); + _tr_tally_lit(s, s->window[s->strstart-1], bflush); + if (bflush) { + FLUSH_BLOCK_ONLY(s, 0); + } + s->strstart++; + s->lookahead--; + if (s->strm->avail_out == 0) return need_more; + } else { + /* There is no previous match to compare with, wait for + * the next step to decide. + */ + s->match_available = 1; + s->strstart++; + s->lookahead--; + } + } + Assert (flush != Z_NO_FLUSH, "no flush?"); + if (s->match_available) { + Tracevv((stderr,"%c", s->window[s->strstart-1])); + _tr_tally_lit(s, s->window[s->strstart-1], bflush); + s->match_available = 0; + } + FLUSH_BLOCK(s, flush == Z_FINISH); + return flush == Z_FINISH ? finish_done : block_done; +} +#endif /* FASTEST */ + +#if 0 +/* =========================================================================== + * For Z_RLE, simply look for runs of bytes, generate matches only of distance + * one. Do not maintain a hash table. (It will be regenerated if this run of + * deflate switches away from Z_RLE.) + */ +local block_state deflate_rle(s, flush) + deflate_state *s; + int flush; +{ + int bflush; /* set if current block must be flushed */ + uInt run; /* length of run */ + uInt max; /* maximum length of run */ + uInt prev; /* byte at distance one to match */ + Bytef *scan; /* scan for end of run */ + + for (;;) { + /* Make sure that we always have enough lookahead, except + * at the end of the input file. We need MAX_MATCH bytes + * for the longest encodable run. + */ + if (s->lookahead < MAX_MATCH) { + fill_window(s); + if (s->lookahead < MAX_MATCH && flush == Z_NO_FLUSH) { + return need_more; + } + if (s->lookahead == 0) break; /* flush the current block */ + } + + /* See how many times the previous byte repeats */ + run = 0; + if (s->strstart > 0) { /* if there is a previous byte, that is */ + max = s->lookahead < MAX_MATCH ? s->lookahead : MAX_MATCH; + scan = s->window + s->strstart - 1; + prev = *scan++; + do { + if (*scan++ != prev) + break; + } while (++run < max); + } + + /* Emit match if have run of MIN_MATCH or longer, else emit literal */ + if (run >= MIN_MATCH) { + check_match(s, s->strstart, s->strstart - 1, run); + _tr_tally_dist(s, 1, run - MIN_MATCH, bflush); + s->lookahead -= run; + s->strstart += run; + } else { + /* No match, output a literal byte */ + Tracevv((stderr,"%c", s->window[s->strstart])); + _tr_tally_lit (s, s->window[s->strstart], bflush); + s->lookahead--; + s->strstart++; + } + if (bflush) FLUSH_BLOCK(s, 0); + } + FLUSH_BLOCK(s, flush == Z_FINISH); + return flush == Z_FINISH ? finish_done : block_done; +} +#endif diff --git a/sys/contrib/opensolaris/uts/common/zmod/deflate.h b/sys/contrib/opensolaris/uts/common/zmod/deflate.h new file mode 100644 index 0000000..d01a3c1 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/deflate.h @@ -0,0 +1,331 @@ +/* deflate.h -- internal compression state + * Copyright (C) 1995-2004 Jean-loup Gailly + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. + */ + +#ifndef _DEFLATE_H +#define _DEFLATE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "zutil.h" + +/* define NO_GZIP when compiling if you want to disable gzip header and + trailer creation by deflate(). NO_GZIP would be used to avoid linking in + the crc code when it is not needed. For shared libraries, gzip encoding + should be left enabled. */ +#ifndef NO_GZIP +# define GZIP +#endif + +/* =========================================================================== + * Internal compression state. + */ + +#define LENGTH_CODES 29 +/* number of length codes, not counting the special END_BLOCK code */ + +#define LITERALS 256 +/* number of literal bytes 0..255 */ + +#define L_CODES (LITERALS+1+LENGTH_CODES) +/* number of Literal or Length codes, including the END_BLOCK code */ + +#define D_CODES 30 +/* number of distance codes */ + +#define BL_CODES 19 +/* number of codes used to transfer the bit lengths */ + +#define HEAP_SIZE (2*L_CODES+1) +/* maximum heap size */ + +#define MAX_BITS 15 +/* All codes must not exceed MAX_BITS bits */ + +#define INIT_STATE 42 +#define EXTRA_STATE 69 +#define NAME_STATE 73 +#define COMMENT_STATE 91 +#define HCRC_STATE 103 +#define BUSY_STATE 113 +#define FINISH_STATE 666 +/* Stream status */ + + +/* Data structure describing a single value and its code string. */ +typedef struct ct_data_s { + union { + ush freq; /* frequency count */ + ush code; /* bit string */ + } fc; + union { + ush dad; /* father node in Huffman tree */ + ush len; /* length of bit string */ + } dl; +} FAR ct_data; + +#define Freq fc.freq +#define Code fc.code +#define Dad dl.dad +#define Len dl.len + +typedef struct static_tree_desc_s static_tree_desc; + +typedef struct tree_desc_s { + ct_data *dyn_tree; /* the dynamic tree */ + int max_code; /* largest code with non zero frequency */ + static_tree_desc *stat_desc; /* the corresponding static tree */ +} FAR tree_desc; + +typedef ush Pos; +typedef Pos FAR Posf; +typedef unsigned IPos; + +/* A Pos is an index in the character window. We use short instead of int to + * save space in the various tables. IPos is used only for parameter passing. + */ + +typedef struct internal_state { + z_streamp strm; /* pointer back to this zlib stream */ + int status; /* as the name implies */ + Bytef *pending_buf; /* output still pending */ + ulg pending_buf_size; /* size of pending_buf */ + Bytef *pending_out; /* next pending byte to output to the stream */ + uInt pending; /* nb of bytes in the pending buffer */ + int wrap; /* bit 0 true for zlib, bit 1 true for gzip */ + gz_headerp gzhead; /* gzip header information to write */ + uInt gzindex; /* where in extra, name, or comment */ + Byte method; /* STORED (for zip only) or DEFLATED */ + int last_flush; /* value of flush param for previous deflate call */ + + /* used by deflate.c: */ + + uInt w_size; /* LZ77 window size (32K by default) */ + uInt w_bits; /* log2(w_size) (8..16) */ + uInt w_mask; /* w_size - 1 */ + + Bytef *window; + /* Sliding window. Input bytes are read into the second half of the window, + * and move to the first half later to keep a dictionary of at least wSize + * bytes. With this organization, matches are limited to a distance of + * wSize-MAX_MATCH bytes, but this ensures that IO is always + * performed with a length multiple of the block size. Also, it limits + * the window size to 64K, which is quite useful on MSDOS. + * To do: use the user input buffer as sliding window. + */ + + ulg window_size; + /* Actual size of window: 2*wSize, except when the user input buffer + * is directly used as sliding window. + */ + + Posf *prev; + /* Link to older string with same hash index. To limit the size of this + * array to 64K, this link is maintained only for the last 32K strings. + * An index in this array is thus a window index modulo 32K. + */ + + Posf *head; /* Heads of the hash chains or NIL. */ + + uInt ins_h; /* hash index of string to be inserted */ + uInt hash_size; /* number of elements in hash table */ + uInt hash_bits; /* log2(hash_size) */ + uInt hash_mask; /* hash_size-1 */ + + uInt hash_shift; + /* Number of bits by which ins_h must be shifted at each input + * step. It must be such that after MIN_MATCH steps, the oldest + * byte no longer takes part in the hash key, that is: + * hash_shift * MIN_MATCH >= hash_bits + */ + + long block_start; + /* Window position at the beginning of the current output block. Gets + * negative when the window is moved backwards. + */ + + uInt match_length; /* length of best match */ + IPos prev_match; /* previous match */ + int match_available; /* set if previous match exists */ + uInt strstart; /* start of string to insert */ + uInt match_start; /* start of matching string */ + uInt lookahead; /* number of valid bytes ahead in window */ + + uInt prev_length; + /* Length of the best match at previous step. Matches not greater than this + * are discarded. This is used in the lazy match evaluation. + */ + + uInt max_chain_length; + /* To speed up deflation, hash chains are never searched beyond this + * length. A higher limit improves compression ratio but degrades the + * speed. + */ + + uInt max_lazy_match; + /* Attempt to find a better match only when the current match is strictly + * smaller than this value. This mechanism is used only for compression + * levels >= 4. + */ +# define max_insert_length max_lazy_match + /* Insert new strings in the hash table only if the match length is not + * greater than this length. This saves time but degrades compression. + * max_insert_length is used only for compression levels <= 3. + */ + + int level; /* compression level (1..9) */ + int strategy; /* favor or force Huffman coding*/ + + uInt good_match; + /* Use a faster search when the previous match is longer than this */ + + int nice_match; /* Stop searching when current match exceeds this */ + + /* used by trees.c: */ + /* Didn't use ct_data typedef below to supress compiler warning */ + struct ct_data_s dyn_ltree[HEAP_SIZE]; /* literal and length tree */ + struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */ + struct ct_data_s bl_tree[2*BL_CODES+1]; /* Huffman tree for bit lengths */ + + struct tree_desc_s l_desc; /* desc. for literal tree */ + struct tree_desc_s d_desc; /* desc. for distance tree */ + struct tree_desc_s bl_desc; /* desc. for bit length tree */ + + ush bl_count[MAX_BITS+1]; + /* number of codes at each bit length for an optimal tree */ + + int heap[2*L_CODES+1]; /* heap used to build the Huffman trees */ + int heap_len; /* number of elements in the heap */ + int heap_max; /* element of largest frequency */ + /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used. + * The same heap array is used to build all trees. + */ + + uch depth[2*L_CODES+1]; + /* Depth of each subtree used as tie breaker for trees of equal frequency + */ + + uchf *l_buf; /* buffer for literals or lengths */ + + uInt lit_bufsize; + /* Size of match buffer for literals/lengths. There are 4 reasons for + * limiting lit_bufsize to 64K: + * - frequencies can be kept in 16 bit counters + * - if compression is not successful for the first block, all input + * data is still in the window so we can still emit a stored block even + * when input comes from standard input. (This can also be done for + * all blocks if lit_bufsize is not greater than 32K.) + * - if compression is not successful for a file smaller than 64K, we can + * even emit a stored file instead of a stored block (saving 5 bytes). + * This is applicable only for zip (not gzip or zlib). + * - creating new Huffman trees less frequently may not provide fast + * adaptation to changes in the input data statistics. (Take for + * example a binary file with poorly compressible code followed by + * a highly compressible string table.) Smaller buffer sizes give + * fast adaptation but have of course the overhead of transmitting + * trees more frequently. + * - I can't count above 4 + */ + + uInt last_lit; /* running index in l_buf */ + + ushf *d_buf; + /* Buffer for distances. To simplify the code, d_buf and l_buf have + * the same number of elements. To use different lengths, an extra flag + * array would be necessary. + */ + + ulg opt_len; /* bit length of current block with optimal trees */ + ulg static_len; /* bit length of current block with static trees */ + uInt matches; /* number of string matches in current block */ + int last_eob_len; /* bit length of EOB code for last block */ + +#ifdef DEBUG + ulg compressed_len; /* total bit length of compressed file mod 2^32 */ + ulg bits_sent; /* bit length of compressed data sent mod 2^32 */ +#endif + + ush bi_buf; + /* Output buffer. bits are inserted starting at the bottom (least + * significant bits). + */ + int bi_valid; + /* Number of valid bits in bi_buf. All bits above the last valid bit + * are always zero. + */ + +} FAR deflate_state; + +/* Output a byte on the stream. + * IN assertion: there is enough room in pending_buf. + */ +#define put_byte(s, c) {s->pending_buf[s->pending++] = (c);} + + +#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) +/* Minimum amount of lookahead, except at the end of the input file. + * See deflate.c for comments about the MIN_MATCH+1. + */ + +#define MAX_DIST(s) ((s)->w_size-MIN_LOOKAHEAD) +/* In order to simplify the code, particularly on 16 bit machines, match + * distances are limited to MAX_DIST instead of WSIZE. + */ + + /* in trees.c */ +void _tr_init OF((deflate_state *s)); +int _tr_tally OF((deflate_state *s, unsigned dist, unsigned lc)); +void _tr_flush_block OF((deflate_state *s, charf *buf, ulg stored_len, + int eof)); +void _tr_align OF((deflate_state *s)); +void _tr_stored_block OF((deflate_state *s, charf *buf, ulg stored_len, + int eof)); + +#define d_code(dist) \ + ((dist) < 256 ? _dist_code[dist] : _dist_code[256+((dist)>>7)]) +/* Mapping from a distance to a distance code. dist is the distance - 1 and + * must not have side effects. _dist_code[256] and _dist_code[257] are never + * used. + */ + +#ifndef DEBUG +/* Inline versions of _tr_tally for speed: */ + +#if defined(GEN_TREES_H) || !defined(STDC) + extern uch _length_code[]; + extern uch _dist_code[]; +#else + extern const uch _length_code[]; + extern const uch _dist_code[]; +#endif + +# define _tr_tally_lit(s, c, flush) \ + { uch cc = (c); \ + s->d_buf[s->last_lit] = 0; \ + s->l_buf[s->last_lit++] = cc; \ + s->dyn_ltree[cc].Freq++; \ + flush = (s->last_lit == s->lit_bufsize-1); \ + } +# define _tr_tally_dist(s, distance, length, flush) \ + { uch len = (length); \ + ush dist = (distance); \ + s->d_buf[s->last_lit] = dist; \ + s->l_buf[s->last_lit++] = len; \ + dist--; \ + s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \ + s->dyn_dtree[d_code(dist)].Freq++; \ + flush = (s->last_lit == s->lit_bufsize-1); \ + } +#else +# define _tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c) +# define _tr_tally_dist(s, distance, length, flush) \ + flush = _tr_tally(s, distance, length) +#endif + +#endif /* _DEFLATE_H */ diff --git a/sys/contrib/opensolaris/uts/common/zmod/inffast.c b/sys/contrib/opensolaris/uts/common/zmod/inffast.c new file mode 100644 index 0000000..a6dcf3f --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/inffast.c @@ -0,0 +1,320 @@ +/* inffast.c -- fast decoding + * Copyright (C) 1995-2004 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "zutil.h" +#include "inftrees.h" +#include "inflate.h" +#include "inffast.h" + +#ifndef ASMINF + +/* Allow machine dependent optimization for post-increment or pre-increment. + Based on testing to date, + Pre-increment preferred for: + - PowerPC G3 (Adler) + - MIPS R5000 (Randers-Pehrson) + Post-increment preferred for: + - none + No measurable difference: + - Pentium III (Anderson) + - M68060 (Nikl) + */ +#ifdef POSTINC +# define OFF 0 +# define PUP(a) *(a)++ +#else +# define OFF 1 +# define PUP(a) *++(a) +#endif + +/* + Decode literal, length, and distance codes and write out the resulting + literal and match bytes until either not enough input or output is + available, an end-of-block is encountered, or a data error is encountered. + When large enough input and output buffers are supplied to inflate(), for + example, a 16K input buffer and a 64K output buffer, more than 95% of the + inflate execution time is spent in this routine. + + Entry assumptions: + + state->mode == LEN + strm->avail_in >= 6 + strm->avail_out >= 258 + start >= strm->avail_out + state->bits < 8 + + On return, state->mode is one of: + + LEN -- ran out of enough output space or enough available input + TYPE -- reached end of block code, inflate() to interpret next block + BAD -- error in block data + + Notes: + + - The maximum input bits used by a length/distance pair is 15 bits for the + length code, 5 bits for the length extra, 15 bits for the distance code, + and 13 bits for the distance extra. This totals 48 bits, or six bytes. + Therefore if strm->avail_in >= 6, then there is enough input to avoid + checking for available input while decoding. + + - The maximum bytes that a single length/distance pair can output is 258 + bytes, which is the maximum length that can be coded. inflate_fast() + requires strm->avail_out >= 258 for each loop to avoid checking for + output space. + */ +void inflate_fast(strm, start) +z_streamp strm; +unsigned start; /* inflate()'s starting value for strm->avail_out */ +{ + struct inflate_state FAR *state; + unsigned char FAR *in; /* local strm->next_in */ + unsigned char FAR *last; /* while in < last, enough input available */ + unsigned char FAR *out; /* local strm->next_out */ + unsigned char FAR *beg; /* inflate()'s initial strm->next_out */ + unsigned char FAR *end; /* while out < end, enough space available */ +#ifdef INFLATE_STRICT + unsigned dmax; /* maximum distance from zlib header */ +#endif + unsigned wsize; /* window size or zero if not using window */ + unsigned whave; /* valid bytes in the window */ + unsigned write; /* window write index */ + unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */ + unsigned long hold; /* local strm->hold */ + unsigned bits; /* local strm->bits */ + code const FAR *lcode; /* local strm->lencode */ + code const FAR *dcode; /* local strm->distcode */ + unsigned lmask; /* mask for first level of length codes */ + unsigned dmask; /* mask for first level of distance codes */ + code this; /* retrieved table entry */ + unsigned op; /* code bits, operation, extra bits, or */ + /* window position, window bytes to copy */ + unsigned len; /* match length, unused bytes */ + unsigned dist; /* match distance */ + unsigned char FAR *from; /* where to copy match from */ + + /* copy state to local variables */ + state = (struct inflate_state FAR *)strm->state; + in = strm->next_in - OFF; + last = in + (strm->avail_in - 5); + out = strm->next_out - OFF; + beg = out - (start - strm->avail_out); + end = out + (strm->avail_out - 257); +#ifdef INFLATE_STRICT + dmax = state->dmax; +#endif + wsize = state->wsize; + whave = state->whave; + write = state->write; + window = state->window; + hold = state->hold; + bits = state->bits; + lcode = state->lencode; + dcode = state->distcode; + lmask = (1U << state->lenbits) - 1; + dmask = (1U << state->distbits) - 1; + + /* decode literals and length/distances until end-of-block or not enough + input data or output space */ + do { + if (bits < 15) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + } + this = lcode[hold & lmask]; + dolen: + op = (unsigned)(this.bits); + hold >>= op; + bits -= op; + op = (unsigned)(this.op); + if (op == 0) { /* literal */ + Tracevv((stderr, this.val >= 0x20 && this.val < 0x7f ? + "inflate: literal '%c'\n" : + "inflate: literal 0x%02x\n", this.val)); + PUP(out) = (unsigned char)(this.val); + } + else if (op & 16) { /* length base */ + len = (unsigned)(this.val); + op &= 15; /* number of extra bits */ + if (op) { + if (bits < op) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + } + len += (unsigned)hold & ((1U << op) - 1); + hold >>= op; + bits -= op; + } + Tracevv((stderr, "inflate: length %u\n", len)); + if (bits < 15) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + } + this = dcode[hold & dmask]; + dodist: + op = (unsigned)(this.bits); + hold >>= op; + bits -= op; + op = (unsigned)(this.op); + if (op & 16) { /* distance base */ + dist = (unsigned)(this.val); + op &= 15; /* number of extra bits */ + if (bits < op) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + if (bits < op) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + } + } + dist += (unsigned)hold & ((1U << op) - 1); +#ifdef INFLATE_STRICT + if (dist > dmax) { + strm->msg = (char *)"invalid distance too far back"; + state->mode = BAD; + break; + } +#endif + hold >>= op; + bits -= op; + Tracevv((stderr, "inflate: distance %u\n", dist)); + op = (unsigned)(out - beg); /* max distance in output */ + if (dist > op) { /* see if copy from window */ + op = dist - op; /* distance back in window */ + if (op > whave) { + strm->msg = (char *)"invalid distance too far back"; + state->mode = BAD; + break; + } + from = window - OFF; + if (write == 0) { /* very common case */ + from += wsize - op; + if (op < len) { /* some from window */ + len -= op; + do { + PUP(out) = PUP(from); + } while (--op); + from = out - dist; /* rest from output */ + } + } + else if (write < op) { /* wrap around window */ + from += wsize + write - op; + op -= write; + if (op < len) { /* some from end of window */ + len -= op; + do { + PUP(out) = PUP(from); + } while (--op); + from = window - OFF; + if (write < len) { /* some from start of window */ + op = write; + len -= op; + do { + PUP(out) = PUP(from); + } while (--op); + from = out - dist; /* rest from output */ + } + } + } + else { /* contiguous in window */ + from += write - op; + if (op < len) { /* some from window */ + len -= op; + do { + PUP(out) = PUP(from); + } while (--op); + from = out - dist; /* rest from output */ + } + } + while (len > 2) { + PUP(out) = PUP(from); + PUP(out) = PUP(from); + PUP(out) = PUP(from); + len -= 3; + } + if (len) { + PUP(out) = PUP(from); + if (len > 1) + PUP(out) = PUP(from); + } + } + else { + from = out - dist; /* copy direct from output */ + do { /* minimum length is three */ + PUP(out) = PUP(from); + PUP(out) = PUP(from); + PUP(out) = PUP(from); + len -= 3; + } while (len > 2); + if (len) { + PUP(out) = PUP(from); + if (len > 1) + PUP(out) = PUP(from); + } + } + } + else if ((op & 64) == 0) { /* 2nd level distance code */ + this = dcode[this.val + (hold & ((1U << op) - 1))]; + goto dodist; + } + else { + strm->msg = (char *)"invalid distance code"; + state->mode = BAD; + break; + } + } + else if ((op & 64) == 0) { /* 2nd level length code */ + this = lcode[this.val + (hold & ((1U << op) - 1))]; + goto dolen; + } + else if (op & 32) { /* end-of-block */ + Tracevv((stderr, "inflate: end of block\n")); + state->mode = TYPE; + break; + } + else { + strm->msg = (char *)"invalid literal/length code"; + state->mode = BAD; + break; + } + } while (in < last && out < end); + + /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ + len = bits >> 3; + in -= len; + bits -= len << 3; + hold &= (1U << bits) - 1; + + /* update state and return */ + strm->next_in = in + OFF; + strm->next_out = out + OFF; + strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); + strm->avail_out = (unsigned)(out < end ? + 257 + (end - out) : 257 - (out - end)); + state->hold = hold; + state->bits = bits; + return; +} + +/* + inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe): + - Using bit fields for code structure + - Different op definition to avoid & for extra bits (do & for table bits) + - Three separate decoding do-loops for direct, window, and write == 0 + - Special case for distance > 1 copies to do overlapped load and store copy + - Explicit branch predictions (based on measured branch probabilities) + - Deferring match copy and interspersed it with decoding subsequent codes + - Swapping literal/length else + - Swapping window/direct else + - Larger unrolled copy loops (three is about right) + - Moving len -= 3 statement into middle of loop + */ + +#endif /* !ASMINF */ diff --git a/sys/contrib/opensolaris/uts/common/zmod/inffast.h b/sys/contrib/opensolaris/uts/common/zmod/inffast.h new file mode 100644 index 0000000..2d214ef --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/inffast.h @@ -0,0 +1,13 @@ +/* inffast.h -- header to use inffast.c + * Copyright (C) 1995-2003 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. + */ + +void inflate_fast OF((z_streamp strm, unsigned start)); diff --git a/sys/contrib/opensolaris/uts/common/zmod/inffixed.h b/sys/contrib/opensolaris/uts/common/zmod/inffixed.h new file mode 100644 index 0000000..ed55df8 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/inffixed.h @@ -0,0 +1,96 @@ + /* inffixed.h -- table for decoding fixed codes + * Generated automatically by makefixed(). + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + + /* WARNING: this file should *not* be used by applications. It + is part of the implementation of the compression library and + is subject to change. Applications should only use zlib.h. + */ + + static const code lenfix[512] = { + {96,7,0},{0,8,80},{0,8,16},{20,8,115},{18,7,31},{0,8,112},{0,8,48}, + {0,9,192},{16,7,10},{0,8,96},{0,8,32},{0,9,160},{0,8,0},{0,8,128}, + {0,8,64},{0,9,224},{16,7,6},{0,8,88},{0,8,24},{0,9,144},{19,7,59}, + {0,8,120},{0,8,56},{0,9,208},{17,7,17},{0,8,104},{0,8,40},{0,9,176}, + {0,8,8},{0,8,136},{0,8,72},{0,9,240},{16,7,4},{0,8,84},{0,8,20}, + {21,8,227},{19,7,43},{0,8,116},{0,8,52},{0,9,200},{17,7,13},{0,8,100}, + {0,8,36},{0,9,168},{0,8,4},{0,8,132},{0,8,68},{0,9,232},{16,7,8}, + {0,8,92},{0,8,28},{0,9,152},{20,7,83},{0,8,124},{0,8,60},{0,9,216}, + {18,7,23},{0,8,108},{0,8,44},{0,9,184},{0,8,12},{0,8,140},{0,8,76}, + {0,9,248},{16,7,3},{0,8,82},{0,8,18},{21,8,163},{19,7,35},{0,8,114}, + {0,8,50},{0,9,196},{17,7,11},{0,8,98},{0,8,34},{0,9,164},{0,8,2}, + {0,8,130},{0,8,66},{0,9,228},{16,7,7},{0,8,90},{0,8,26},{0,9,148}, + {20,7,67},{0,8,122},{0,8,58},{0,9,212},{18,7,19},{0,8,106},{0,8,42}, + {0,9,180},{0,8,10},{0,8,138},{0,8,74},{0,9,244},{16,7,5},{0,8,86}, + {0,8,22},{64,8,0},{19,7,51},{0,8,118},{0,8,54},{0,9,204},{17,7,15}, + {0,8,102},{0,8,38},{0,9,172},{0,8,6},{0,8,134},{0,8,70},{0,9,236}, + {16,7,9},{0,8,94},{0,8,30},{0,9,156},{20,7,99},{0,8,126},{0,8,62}, + {0,9,220},{18,7,27},{0,8,110},{0,8,46},{0,9,188},{0,8,14},{0,8,142}, + {0,8,78},{0,9,252},{96,7,0},{0,8,81},{0,8,17},{21,8,131},{18,7,31}, + {0,8,113},{0,8,49},{0,9,194},{16,7,10},{0,8,97},{0,8,33},{0,9,162}, + {0,8,1},{0,8,129},{0,8,65},{0,9,226},{16,7,6},{0,8,89},{0,8,25}, + {0,9,146},{19,7,59},{0,8,121},{0,8,57},{0,9,210},{17,7,17},{0,8,105}, + {0,8,41},{0,9,178},{0,8,9},{0,8,137},{0,8,73},{0,9,242},{16,7,4}, + {0,8,85},{0,8,21},{16,8,258},{19,7,43},{0,8,117},{0,8,53},{0,9,202}, + {17,7,13},{0,8,101},{0,8,37},{0,9,170},{0,8,5},{0,8,133},{0,8,69}, + {0,9,234},{16,7,8},{0,8,93},{0,8,29},{0,9,154},{20,7,83},{0,8,125}, + {0,8,61},{0,9,218},{18,7,23},{0,8,109},{0,8,45},{0,9,186},{0,8,13}, + {0,8,141},{0,8,77},{0,9,250},{16,7,3},{0,8,83},{0,8,19},{21,8,195}, + {19,7,35},{0,8,115},{0,8,51},{0,9,198},{17,7,11},{0,8,99},{0,8,35}, + {0,9,166},{0,8,3},{0,8,131},{0,8,67},{0,9,230},{16,7,7},{0,8,91}, + {0,8,27},{0,9,150},{20,7,67},{0,8,123},{0,8,59},{0,9,214},{18,7,19}, + {0,8,107},{0,8,43},{0,9,182},{0,8,11},{0,8,139},{0,8,75},{0,9,246}, + {16,7,5},{0,8,87},{0,8,23},{64,8,0},{19,7,51},{0,8,119},{0,8,55}, + {0,9,206},{17,7,15},{0,8,103},{0,8,39},{0,9,174},{0,8,7},{0,8,135}, + {0,8,71},{0,9,238},{16,7,9},{0,8,95},{0,8,31},{0,9,158},{20,7,99}, + {0,8,127},{0,8,63},{0,9,222},{18,7,27},{0,8,111},{0,8,47},{0,9,190}, + {0,8,15},{0,8,143},{0,8,79},{0,9,254},{96,7,0},{0,8,80},{0,8,16}, + {20,8,115},{18,7,31},{0,8,112},{0,8,48},{0,9,193},{16,7,10},{0,8,96}, + {0,8,32},{0,9,161},{0,8,0},{0,8,128},{0,8,64},{0,9,225},{16,7,6}, + {0,8,88},{0,8,24},{0,9,145},{19,7,59},{0,8,120},{0,8,56},{0,9,209}, + {17,7,17},{0,8,104},{0,8,40},{0,9,177},{0,8,8},{0,8,136},{0,8,72}, + {0,9,241},{16,7,4},{0,8,84},{0,8,20},{21,8,227},{19,7,43},{0,8,116}, + {0,8,52},{0,9,201},{17,7,13},{0,8,100},{0,8,36},{0,9,169},{0,8,4}, + {0,8,132},{0,8,68},{0,9,233},{16,7,8},{0,8,92},{0,8,28},{0,9,153}, + {20,7,83},{0,8,124},{0,8,60},{0,9,217},{18,7,23},{0,8,108},{0,8,44}, + {0,9,185},{0,8,12},{0,8,140},{0,8,76},{0,9,249},{16,7,3},{0,8,82}, + {0,8,18},{21,8,163},{19,7,35},{0,8,114},{0,8,50},{0,9,197},{17,7,11}, + {0,8,98},{0,8,34},{0,9,165},{0,8,2},{0,8,130},{0,8,66},{0,9,229}, + {16,7,7},{0,8,90},{0,8,26},{0,9,149},{20,7,67},{0,8,122},{0,8,58}, + {0,9,213},{18,7,19},{0,8,106},{0,8,42},{0,9,181},{0,8,10},{0,8,138}, + {0,8,74},{0,9,245},{16,7,5},{0,8,86},{0,8,22},{64,8,0},{19,7,51}, + {0,8,118},{0,8,54},{0,9,205},{17,7,15},{0,8,102},{0,8,38},{0,9,173}, + {0,8,6},{0,8,134},{0,8,70},{0,9,237},{16,7,9},{0,8,94},{0,8,30}, + {0,9,157},{20,7,99},{0,8,126},{0,8,62},{0,9,221},{18,7,27},{0,8,110}, + {0,8,46},{0,9,189},{0,8,14},{0,8,142},{0,8,78},{0,9,253},{96,7,0}, + {0,8,81},{0,8,17},{21,8,131},{18,7,31},{0,8,113},{0,8,49},{0,9,195}, + {16,7,10},{0,8,97},{0,8,33},{0,9,163},{0,8,1},{0,8,129},{0,8,65}, + {0,9,227},{16,7,6},{0,8,89},{0,8,25},{0,9,147},{19,7,59},{0,8,121}, + {0,8,57},{0,9,211},{17,7,17},{0,8,105},{0,8,41},{0,9,179},{0,8,9}, + {0,8,137},{0,8,73},{0,9,243},{16,7,4},{0,8,85},{0,8,21},{16,8,258}, + {19,7,43},{0,8,117},{0,8,53},{0,9,203},{17,7,13},{0,8,101},{0,8,37}, + {0,9,171},{0,8,5},{0,8,133},{0,8,69},{0,9,235},{16,7,8},{0,8,93}, + {0,8,29},{0,9,155},{20,7,83},{0,8,125},{0,8,61},{0,9,219},{18,7,23}, + {0,8,109},{0,8,45},{0,9,187},{0,8,13},{0,8,141},{0,8,77},{0,9,251}, + {16,7,3},{0,8,83},{0,8,19},{21,8,195},{19,7,35},{0,8,115},{0,8,51}, + {0,9,199},{17,7,11},{0,8,99},{0,8,35},{0,9,167},{0,8,3},{0,8,131}, + {0,8,67},{0,9,231},{16,7,7},{0,8,91},{0,8,27},{0,9,151},{20,7,67}, + {0,8,123},{0,8,59},{0,9,215},{18,7,19},{0,8,107},{0,8,43},{0,9,183}, + {0,8,11},{0,8,139},{0,8,75},{0,9,247},{16,7,5},{0,8,87},{0,8,23}, + {64,8,0},{19,7,51},{0,8,119},{0,8,55},{0,9,207},{17,7,15},{0,8,103}, + {0,8,39},{0,9,175},{0,8,7},{0,8,135},{0,8,71},{0,9,239},{16,7,9}, + {0,8,95},{0,8,31},{0,9,159},{20,7,99},{0,8,127},{0,8,63},{0,9,223}, + {18,7,27},{0,8,111},{0,8,47},{0,9,191},{0,8,15},{0,8,143},{0,8,79}, + {0,9,255} + }; + + static const code distfix[32] = { + {16,5,1},{23,5,257},{19,5,17},{27,5,4097},{17,5,5},{25,5,1025}, + {21,5,65},{29,5,16385},{16,5,3},{24,5,513},{20,5,33},{28,5,8193}, + {18,5,9},{26,5,2049},{22,5,129},{64,5,0},{16,5,2},{23,5,385}, + {19,5,25},{27,5,6145},{17,5,7},{25,5,1537},{21,5,97},{29,5,24577}, + {16,5,4},{24,5,769},{20,5,49},{28,5,12289},{18,5,13},{26,5,3073}, + {22,5,193},{64,5,0} + }; diff --git a/sys/contrib/opensolaris/uts/common/zmod/inflate.c b/sys/contrib/opensolaris/uts/common/zmod/inflate.c new file mode 100644 index 0000000..023e7a1 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/inflate.c @@ -0,0 +1,1395 @@ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* inflate.c -- zlib decompression + * Copyright (C) 1995-2005 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Change history: + * + * 1.2.beta0 24 Nov 2002 + * - First version -- complete rewrite of inflate to simplify code, avoid + * creation of window when not needed, minimize use of window when it is + * needed, make inffast.c even faster, implement gzip decoding, and to + * improve code readability and style over the previous zlib inflate code + * + * 1.2.beta1 25 Nov 2002 + * - Use pointers for available input and output checking in inffast.c + * - Remove input and output counters in inffast.c + * - Change inffast.c entry and loop from avail_in >= 7 to >= 6 + * - Remove unnecessary second byte pull from length extra in inffast.c + * - Unroll direct copy to three copies per loop in inffast.c + * + * 1.2.beta2 4 Dec 2002 + * - Change external routine names to reduce potential conflicts + * - Correct filename to inffixed.h for fixed tables in inflate.c + * - Make hbuf[] unsigned char to match parameter type in inflate.c + * - Change strm->next_out[-state->offset] to *(strm->next_out - state->offset) + * to avoid negation problem on Alphas (64 bit) in inflate.c + * + * 1.2.beta3 22 Dec 2002 + * - Add comments on state->bits assertion in inffast.c + * - Add comments on op field in inftrees.h + * - Fix bug in reuse of allocated window after inflateReset() + * - Remove bit fields--back to byte structure for speed + * - Remove distance extra == 0 check in inflate_fast()--only helps for lengths + * - Change post-increments to pre-increments in inflate_fast(), PPC biased? + * - Add compile time option, POSTINC, to use post-increments instead (Intel?) + * - Make MATCH copy in inflate() much faster for when inflate_fast() not used + * - Use local copies of stream next and avail values, as well as local bit + * buffer and bit count in inflate()--for speed when inflate_fast() not used + * + * 1.2.beta4 1 Jan 2003 + * - Split ptr - 257 statements in inflate_table() to avoid compiler warnings + * - Move a comment on output buffer sizes from inffast.c to inflate.c + * - Add comments in inffast.c to introduce the inflate_fast() routine + * - Rearrange window copies in inflate_fast() for speed and simplification + * - Unroll last copy for window match in inflate_fast() + * - Use local copies of window variables in inflate_fast() for speed + * - Pull out common write == 0 case for speed in inflate_fast() + * - Make op and len in inflate_fast() unsigned for consistency + * - Add FAR to lcode and dcode declarations in inflate_fast() + * - Simplified bad distance check in inflate_fast() + * - Added inflateBackInit(), inflateBack(), and inflateBackEnd() in new + * source file infback.c to provide a call-back interface to inflate for + * programs like gzip and unzip -- uses window as output buffer to avoid + * window copying + * + * 1.2.beta5 1 Jan 2003 + * - Improved inflateBack() interface to allow the caller to provide initial + * input in strm. + * - Fixed stored blocks bug in inflateBack() + * + * 1.2.beta6 4 Jan 2003 + * - Added comments in inffast.c on effectiveness of POSTINC + * - Typecasting all around to reduce compiler warnings + * - Changed loops from while (1) or do {} while (1) to for (;;), again to + * make compilers happy + * - Changed type of window in inflateBackInit() to unsigned char * + * + * 1.2.beta7 27 Jan 2003 + * - Changed many types to unsigned or unsigned short to avoid warnings + * - Added inflateCopy() function + * + * 1.2.0 9 Mar 2003 + * - Changed inflateBack() interface to provide separate opaque descriptors + * for the in() and out() functions + * - Changed inflateBack() argument and in_func typedef to swap the length + * and buffer address return values for the input function + * - Check next_in and next_out for Z_NULL on entry to inflate() + * + * The history for versions after 1.2.0 are in ChangeLog in zlib distribution. + */ + +#include "zutil.h" +#include "inftrees.h" +#include "inflate.h" +#include "inffast.h" + +#ifdef MAKEFIXED +# ifndef BUILDFIXED +# define BUILDFIXED +# endif +#endif + +/* function prototypes */ +local void fixedtables OF((struct inflate_state FAR *state)); +local int updatewindow OF((z_streamp strm, unsigned out)); +#ifdef BUILDFIXED + void makefixed OF((void)); +#endif +local unsigned syncsearch OF((unsigned FAR *have, unsigned char FAR *buf, + unsigned len)); + +int ZEXPORT inflateReset(strm) +z_streamp strm; +{ + struct inflate_state FAR *state; + + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)strm->state; + strm->total_in = strm->total_out = state->total = 0; + strm->msg = Z_NULL; + strm->adler = 1; /* to support ill-conceived Java test suite */ + state->mode = HEAD; + state->last = 0; + state->havedict = 0; + state->dmax = 32768U; + state->head = Z_NULL; + state->wsize = 0; + state->whave = 0; + state->write = 0; + state->hold = 0; + state->bits = 0; + state->lencode = state->distcode = state->next = state->codes; + Tracev((stderr, "inflate: reset\n")); + return Z_OK; +} + +int ZEXPORT inflatePrime(strm, bits, value) +z_streamp strm; +int bits; +int value; +{ + struct inflate_state FAR *state; + + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)strm->state; + if (bits > 16 || state->bits + bits > 32) return Z_STREAM_ERROR; + value &= (1L << bits) - 1; + state->hold += value << state->bits; + state->bits += bits; + return Z_OK; +} + +int ZEXPORT inflateInit2_(strm, windowBits, version, stream_size) +z_streamp strm; +int windowBits; +const char *version; +int stream_size; +{ + struct inflate_state FAR *state; + + if (version == Z_NULL || version[0] != ZLIB_VERSION[0] || + stream_size != (int)(sizeof(z_stream))) + return Z_VERSION_ERROR; + if (strm == Z_NULL) return Z_STREAM_ERROR; + strm->msg = Z_NULL; /* in case we return an error */ + if (strm->zalloc == (alloc_func)0) { + strm->zalloc = zcalloc; + strm->opaque = (voidpf)0; + } + if (strm->zfree == (free_func)0) strm->zfree = zcfree; + state = (struct inflate_state FAR *) + ZALLOC(strm, 1, sizeof(struct inflate_state)); + if (state == Z_NULL) return Z_MEM_ERROR; + Tracev((stderr, "inflate: allocated\n")); + strm->state = (struct internal_state FAR *)state; + if (windowBits < 0) { + state->wrap = 0; + windowBits = -windowBits; + } + else { + state->wrap = (windowBits >> 4) + 1; +#ifdef GUNZIP + if (windowBits < 48) windowBits &= 15; +#endif + } + if (windowBits < 8 || windowBits > 15) { + ZFREE(strm, state); + strm->state = Z_NULL; + return Z_STREAM_ERROR; + } + state->wbits = (unsigned)windowBits; + state->window = Z_NULL; + return inflateReset(strm); +} + +int ZEXPORT inflateInit_(strm, version, stream_size) +z_streamp strm; +const char *version; +int stream_size; +{ + return inflateInit2_(strm, DEF_WBITS, version, stream_size); +} + +/* + Return state with length and distance decoding tables and index sizes set to + fixed code decoding. Normally this returns fixed tables from inffixed.h. + If BUILDFIXED is defined, then instead this routine builds the tables the + first time it's called, and returns those tables the first time and + thereafter. This reduces the size of the code by about 2K bytes, in + exchange for a little execution time. However, BUILDFIXED should not be + used for threaded applications, since the rewriting of the tables and virgin + may not be thread-safe. + */ +local void fixedtables(state) +struct inflate_state FAR *state; +{ +#ifdef BUILDFIXED + static int virgin = 1; + static code *lenfix, *distfix; + static code fixed[544]; + + /* build fixed huffman tables if first call (may not be thread safe) */ + if (virgin) { + unsigned sym, bits; + static code *next; + + /* literal/length table */ + sym = 0; + while (sym < 144) state->lens[sym++] = 8; + while (sym < 256) state->lens[sym++] = 9; + while (sym < 280) state->lens[sym++] = 7; + while (sym < 288) state->lens[sym++] = 8; + next = fixed; + lenfix = next; + bits = 9; + inflate_table(LENS, state->lens, 288, &(next), &(bits), state->work); + + /* distance table */ + sym = 0; + while (sym < 32) state->lens[sym++] = 5; + distfix = next; + bits = 5; + inflate_table(DISTS, state->lens, 32, &(next), &(bits), state->work); + + /* do this just once */ + virgin = 0; + } +#else /* !BUILDFIXED */ +# include "inffixed.h" +#endif /* BUILDFIXED */ + state->lencode = lenfix; + state->lenbits = 9; + state->distcode = distfix; + state->distbits = 5; +} + +#ifdef MAKEFIXED +#include <stdio.h> + +/* + Write out the inffixed.h that is #include'd above. Defining MAKEFIXED also + defines BUILDFIXED, so the tables are built on the fly. makefixed() writes + those tables to stdout, which would be piped to inffixed.h. A small program + can simply call makefixed to do this: + + void makefixed(void); + + int main(void) + { + makefixed(); + return 0; + } + + Then that can be linked with zlib built with MAKEFIXED defined and run: + + a.out > inffixed.h + */ +void makefixed() +{ + unsigned low, size; + struct inflate_state state; + + fixedtables(&state); + puts(" /* inffixed.h -- table for decoding fixed codes"); + puts(" * Generated automatically by makefixed()."); + puts(" */"); + puts(""); + puts(" /* WARNING: this file should *not* be used by applications."); + puts(" It is part of the implementation of this library and is"); + puts(" subject to change. Applications should only use zlib.h."); + puts(" */"); + puts(""); + size = 1U << 9; + printf(" static const code lenfix[%u] = {", size); + low = 0; + for (;;) { + if ((low % 7) == 0) printf("\n "); + printf("{%u,%u,%d}", state.lencode[low].op, state.lencode[low].bits, + state.lencode[low].val); + if (++low == size) break; + putchar(','); + } + puts("\n };"); + size = 1U << 5; + printf("\n static const code distfix[%u] = {", size); + low = 0; + for (;;) { + if ((low % 6) == 0) printf("\n "); + printf("{%u,%u,%d}", state.distcode[low].op, state.distcode[low].bits, + state.distcode[low].val); + if (++low == size) break; + putchar(','); + } + puts("\n };"); +} +#endif /* MAKEFIXED */ + +/* + Update the window with the last wsize (normally 32K) bytes written before + returning. If window does not exist yet, create it. This is only called + when a window is already in use, or when output has been written during this + inflate call, but the end of the deflate stream has not been reached yet. + It is also called to create a window for dictionary data when a dictionary + is loaded. + + Providing output buffers larger than 32K to inflate() should provide a speed + advantage, since only the last 32K of output is copied to the sliding window + upon return from inflate(), and since all distances after the first 32K of + output will fall in the output data, making match copies simpler and faster. + The advantage may be dependent on the size of the processor's data caches. + */ +local int updatewindow(strm, out) +z_streamp strm; +unsigned out; +{ + struct inflate_state FAR *state; + unsigned copy, dist; + + state = (struct inflate_state FAR *)strm->state; + + /* if it hasn't been done already, allocate space for the window */ + if (state->window == Z_NULL) { + state->window = (unsigned char FAR *) + ZALLOC(strm, 1U << state->wbits, + sizeof(unsigned char)); + if (state->window == Z_NULL) return 1; + } + + /* if window not in use yet, initialize */ + if (state->wsize == 0) { + state->wsize = 1U << state->wbits; + state->write = 0; + state->whave = 0; + } + + /* copy state->wsize or less output bytes into the circular window */ + copy = out - strm->avail_out; + if (copy >= state->wsize) { + zmemcpy(state->window, strm->next_out - state->wsize, state->wsize); + state->write = 0; + state->whave = state->wsize; + } + else { + dist = state->wsize - state->write; + if (dist > copy) dist = copy; + zmemcpy(state->window + state->write, strm->next_out - copy, dist); + copy -= dist; + if (copy) { + zmemcpy(state->window, strm->next_out - copy, copy); + state->write = copy; + state->whave = state->wsize; + } + else { + state->write += dist; + if (state->write == state->wsize) state->write = 0; + if (state->whave < state->wsize) state->whave += dist; + } + } + return 0; +} + +/* Macros for inflate(): */ + +/* check function to use adler32() for zlib or crc32() for gzip */ +#ifdef GUNZIP +# define UPDATE(check, buf, len) \ + (state->flags ? crc32(check, buf, len) : adler32(check, buf, len)) +#else +# define UPDATE(check, buf, len) adler32(check, buf, len) +#endif + +/* check macros for header crc */ +#ifdef GUNZIP +# define CRC2(check, word) \ + do { \ + hbuf[0] = (unsigned char)(word); \ + hbuf[1] = (unsigned char)((word) >> 8); \ + check = crc32(check, hbuf, 2); \ + } while (0) + +# define CRC4(check, word) \ + do { \ + hbuf[0] = (unsigned char)(word); \ + hbuf[1] = (unsigned char)((word) >> 8); \ + hbuf[2] = (unsigned char)((word) >> 16); \ + hbuf[3] = (unsigned char)((word) >> 24); \ + check = crc32(check, hbuf, 4); \ + } while (0) +#endif + +/* Load registers with state in inflate() for speed */ +#define LOAD() \ + do { \ + put = strm->next_out; \ + left = strm->avail_out; \ + next = strm->next_in; \ + have = strm->avail_in; \ + hold = state->hold; \ + bits = state->bits; \ + } while (0) + +/* Restore state from registers in inflate() */ +#define RESTORE() \ + do { \ + strm->next_out = put; \ + strm->avail_out = left; \ + strm->next_in = next; \ + strm->avail_in = have; \ + state->hold = hold; \ + state->bits = bits; \ + } while (0) + +/* Clear the input bit accumulator */ +#define INITBITS() \ + do { \ + hold = 0; \ + bits = 0; \ + } while (0) + +/* Get a byte of input into the bit accumulator, or return from inflate() + if there is no input available. */ +#define PULLBYTE() \ + do { \ + if (have == 0) goto inf_leave; \ + have--; \ + hold += (unsigned long)(*next++) << bits; \ + bits += 8; \ + } while (0) + +/* Assure that there are at least n bits in the bit accumulator. If there is + not enough available input to do that, then return from inflate(). */ +#define NEEDBITS(n) \ + do { \ + while (bits < (unsigned)(n)) \ + PULLBYTE(); \ + } while (0) + +/* Return the low n bits of the bit accumulator (n < 16) */ +#define BITS(n) \ + ((unsigned)hold & ((1U << (n)) - 1)) + +/* Remove n bits from the bit accumulator */ +#define DROPBITS(n) \ + do { \ + hold >>= (n); \ + bits -= (unsigned)(n); \ + } while (0) + +/* Remove zero to seven bits as needed to go to a byte boundary */ +#define BYTEBITS() \ + do { \ + hold >>= bits & 7; \ + bits -= bits & 7; \ + } while (0) + +/* Reverse the bytes in a 32-bit value */ +#define REVERSE(q) \ + ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \ + (((q) & 0xff00) << 8) + (((q) & 0xff) << 24)) + +/* + inflate() uses a state machine to process as much input data and generate as + much output data as possible before returning. The state machine is + structured roughly as follows: + + for (;;) switch (state) { + ... + case STATEn: + if (not enough input data or output space to make progress) + return; + ... make progress ... + state = STATEm; + break; + ... + } + + so when inflate() is called again, the same case is attempted again, and + if the appropriate resources are provided, the machine proceeds to the + next state. The NEEDBITS() macro is usually the way the state evaluates + whether it can proceed or should return. NEEDBITS() does the return if + the requested bits are not available. The typical use of the BITS macros + is: + + NEEDBITS(n); + ... do something with BITS(n) ... + DROPBITS(n); + + where NEEDBITS(n) either returns from inflate() if there isn't enough + input left to load n bits into the accumulator, or it continues. BITS(n) + gives the low n bits in the accumulator. When done, DROPBITS(n) drops + the low n bits off the accumulator. INITBITS() clears the accumulator + and sets the number of available bits to zero. BYTEBITS() discards just + enough bits to put the accumulator on a byte boundary. After BYTEBITS() + and a NEEDBITS(8), then BITS(8) would return the next byte in the stream. + + NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return + if there is no input available. The decoding of variable length codes uses + PULLBYTE() directly in order to pull just enough bytes to decode the next + code, and no more. + + Some states loop until they get enough input, making sure that enough + state information is maintained to continue the loop where it left off + if NEEDBITS() returns in the loop. For example, want, need, and keep + would all have to actually be part of the saved state in case NEEDBITS() + returns: + + case STATEw: + while (want < need) { + NEEDBITS(n); + keep[want++] = BITS(n); + DROPBITS(n); + } + state = STATEx; + case STATEx: + + As shown above, if the next state is also the next case, then the break + is omitted. + + A state may also return if there is not enough output space available to + complete that state. Those states are copying stored data, writing a + literal byte, and copying a matching string. + + When returning, a "goto inf_leave" is used to update the total counters, + update the check value, and determine whether any progress has been made + during that inflate() call in order to return the proper return code. + Progress is defined as a change in either strm->avail_in or strm->avail_out. + When there is a window, goto inf_leave will update the window with the last + output written. If a goto inf_leave occurs in the middle of decompression + and there is no window currently, goto inf_leave will create one and copy + output to the window for the next call of inflate(). + + In this implementation, the flush parameter of inflate() only affects the + return code (per zlib.h). inflate() always writes as much as possible to + strm->next_out, given the space available and the provided input--the effect + documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers + the allocation of and copying into a sliding window until necessary, which + provides the effect documented in zlib.h for Z_FINISH when the entire input + stream available. So the only thing the flush parameter actually does is: + when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it + will return Z_BUF_ERROR if it has not reached the end of the stream. + */ + +int ZEXPORT inflate(strm, flush) +z_streamp strm; +int flush; +{ + struct inflate_state FAR *state; + unsigned char FAR *next; /* next input */ + unsigned char FAR *put; /* next output */ + unsigned have, left; /* available input and output */ + unsigned long hold; /* bit buffer */ + unsigned bits; /* bits in bit buffer */ + unsigned in, out; /* save starting available input and output */ + unsigned copy; /* number of stored or match bytes to copy */ + unsigned char FAR *from; /* where to copy match bytes from */ + code this; /* current decoding table entry */ + code last; /* parent table entry */ + unsigned len; /* length to copy for repeats, bits to drop */ + int ret; /* return code */ +#ifdef GUNZIP + unsigned char hbuf[4]; /* buffer for gzip header crc calculation */ +#endif + static const unsigned short order[19] = /* permutation of code lengths */ + {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; + + if (strm == Z_NULL || strm->state == Z_NULL || strm->next_out == Z_NULL || + (strm->next_in == Z_NULL && strm->avail_in != 0)) + return Z_STREAM_ERROR; + + state = (struct inflate_state FAR *)strm->state; + if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ + LOAD(); + in = have; + out = left; + ret = Z_OK; + for (;;) + switch (state->mode) { + case HEAD: + if (state->wrap == 0) { + state->mode = TYPEDO; + break; + } + NEEDBITS(16); +#ifdef GUNZIP + if ((state->wrap & 2) && hold == 0x8b1f) { /* gzip header */ + state->check = crc32(0L, Z_NULL, 0); + CRC2(state->check, hold); + INITBITS(); + state->mode = FLAGS; + break; + } + state->flags = 0; /* expect zlib header */ + if (state->head != Z_NULL) + state->head->done = -1; + if (!(state->wrap & 1) || /* check if zlib header allowed */ +#else + if ( +#endif + ((BITS(8) << 8) + (hold >> 8)) % 31) { + strm->msg = (char *)"incorrect header check"; + state->mode = BAD; + break; + } + if (BITS(4) != Z_DEFLATED) { + strm->msg = (char *)"unknown compression method"; + state->mode = BAD; + break; + } + DROPBITS(4); + len = BITS(4) + 8; + if (len > state->wbits) { + strm->msg = (char *)"invalid window size"; + state->mode = BAD; + break; + } + state->dmax = 1U << len; + Tracev((stderr, "inflate: zlib header ok\n")); + strm->adler = state->check = adler32(0L, Z_NULL, 0); + state->mode = hold & 0x200 ? DICTID : TYPE; + INITBITS(); + break; +#ifdef GUNZIP + case FLAGS: + NEEDBITS(16); + state->flags = (int)(hold); + if ((state->flags & 0xff) != Z_DEFLATED) { + strm->msg = (char *)"unknown compression method"; + state->mode = BAD; + break; + } + if (state->flags & 0xe000) { + strm->msg = (char *)"unknown header flags set"; + state->mode = BAD; + break; + } + if (state->head != Z_NULL) + state->head->text = (int)((hold >> 8) & 1); + if (state->flags & 0x0200) CRC2(state->check, hold); + INITBITS(); + state->mode = TIME; + /*FALLTHRU*/ + case TIME: + NEEDBITS(32); + if (state->head != Z_NULL) + state->head->time = hold; + if (state->flags & 0x0200) CRC4(state->check, hold); + INITBITS(); + state->mode = OS; + /*FALLTHRU*/ + case OS: + NEEDBITS(16); + if (state->head != Z_NULL) { + state->head->xflags = (int)(hold & 0xff); + state->head->os = (int)(hold >> 8); + } + if (state->flags & 0x0200) CRC2(state->check, hold); + INITBITS(); + state->mode = EXLEN; + /*FALLTHRU*/ + case EXLEN: + if (state->flags & 0x0400) { + NEEDBITS(16); + state->length = (unsigned)(hold); + if (state->head != Z_NULL) + state->head->extra_len = (unsigned)hold; + if (state->flags & 0x0200) CRC2(state->check, hold); + INITBITS(); + } + else if (state->head != Z_NULL) + state->head->extra = Z_NULL; + state->mode = EXTRA; + /*FALLTHRU*/ + case EXTRA: + if (state->flags & 0x0400) { + copy = state->length; + if (copy > have) copy = have; + if (copy) { + if (state->head != Z_NULL && + state->head->extra != Z_NULL) { + len = state->head->extra_len - state->length; + zmemcpy(state->head->extra + len, next, + len + copy > state->head->extra_max ? + state->head->extra_max - len : copy); + } + if (state->flags & 0x0200) + state->check = crc32(state->check, next, copy); + have -= copy; + next += copy; + state->length -= copy; + } + if (state->length) goto inf_leave; + } + state->length = 0; + state->mode = NAME; + /*FALLTHRU*/ + case NAME: + if (state->flags & 0x0800) { + if (have == 0) goto inf_leave; + copy = 0; + do { + len = (unsigned)(next[copy++]); + if (state->head != Z_NULL && + state->head->name != Z_NULL && + state->length < state->head->name_max) + state->head->name[state->length++] = len; + } while (len && copy < have); + if (state->flags & 0x0200) + state->check = crc32(state->check, next, copy); + have -= copy; + next += copy; + if (len) goto inf_leave; + } + else if (state->head != Z_NULL) + state->head->name = Z_NULL; + state->length = 0; + state->mode = COMMENT; + /*FALLTHRU*/ + case COMMENT: + if (state->flags & 0x1000) { + if (have == 0) goto inf_leave; + copy = 0; + do { + len = (unsigned)(next[copy++]); + if (state->head != Z_NULL && + state->head->comment != Z_NULL && + state->length < state->head->comm_max) + state->head->comment[state->length++] = len; + } while (len && copy < have); + if (state->flags & 0x0200) + state->check = crc32(state->check, next, copy); + have -= copy; + next += copy; + if (len) goto inf_leave; + } + else if (state->head != Z_NULL) + state->head->comment = Z_NULL; + state->mode = HCRC; + /*FALLTHRU*/ + case HCRC: + if (state->flags & 0x0200) { + NEEDBITS(16); + if (hold != (state->check & 0xffff)) { + strm->msg = (char *)"header crc mismatch"; + state->mode = BAD; + break; + } + INITBITS(); + } + if (state->head != Z_NULL) { + state->head->hcrc = (int)((state->flags >> 9) & 1); + state->head->done = 1; + } + strm->adler = state->check = crc32(0L, Z_NULL, 0); + state->mode = TYPE; + break; +#endif + case DICTID: + NEEDBITS(32); + strm->adler = state->check = REVERSE(hold); + INITBITS(); + state->mode = DICT; + /*FALLTHRU*/ + case DICT: + if (state->havedict == 0) { + RESTORE(); + return Z_NEED_DICT; + } + strm->adler = state->check = adler32(0L, Z_NULL, 0); + state->mode = TYPE; + /*FALLTHRU*/ + case TYPE: + if (flush == Z_BLOCK) goto inf_leave; + /*FALLTHRU*/ + case TYPEDO: + if (state->last) { + BYTEBITS(); + state->mode = CHECK; + break; + } + NEEDBITS(3); + state->last = BITS(1); + DROPBITS(1); + switch (BITS(2)) { + case 0: /* stored block */ + Tracev((stderr, "inflate: stored block%s\n", + state->last ? " (last)" : "")); + state->mode = STORED; + break; + case 1: /* fixed block */ + fixedtables(state); + Tracev((stderr, "inflate: fixed codes block%s\n", + state->last ? " (last)" : "")); + state->mode = LEN; /* decode codes */ + break; + case 2: /* dynamic block */ + Tracev((stderr, "inflate: dynamic codes block%s\n", + state->last ? " (last)" : "")); + state->mode = TABLE; + break; + case 3: + strm->msg = (char *)"invalid block type"; + state->mode = BAD; + } + DROPBITS(2); + break; + case STORED: + BYTEBITS(); /* go to byte boundary */ + NEEDBITS(32); + if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { + strm->msg = (char *)"invalid stored block lengths"; + state->mode = BAD; + break; + } + state->length = (unsigned)hold & 0xffff; + Tracev((stderr, "inflate: stored length %u\n", + state->length)); + INITBITS(); + state->mode = COPY; + /*FALLTHRU*/ + case COPY: + copy = state->length; + if (copy) { + if (copy > have) copy = have; + if (copy > left) copy = left; + if (copy == 0) goto inf_leave; + zmemcpy(put, next, copy); + have -= copy; + next += copy; + left -= copy; + put += copy; + state->length -= copy; + break; + } + Tracev((stderr, "inflate: stored end\n")); + state->mode = TYPE; + break; + case TABLE: + NEEDBITS(14); + state->nlen = BITS(5) + 257; + DROPBITS(5); + state->ndist = BITS(5) + 1; + DROPBITS(5); + state->ncode = BITS(4) + 4; + DROPBITS(4); +#ifndef PKZIP_BUG_WORKAROUND + if (state->nlen > 286 || state->ndist > 30) { + strm->msg = (char *)"too many length or distance symbols"; + state->mode = BAD; + break; + } +#endif + Tracev((stderr, "inflate: table sizes ok\n")); + state->have = 0; + state->mode = LENLENS; + /*FALLTHRU*/ + case LENLENS: + while (state->have < state->ncode) { + NEEDBITS(3); + state->lens[order[state->have++]] = (unsigned short)BITS(3); + DROPBITS(3); + } + while (state->have < 19) + state->lens[order[state->have++]] = 0; + state->next = state->codes; + state->lencode = (code const FAR *)(state->next); + state->lenbits = 7; + ret = inflate_table(CODES, state->lens, 19, &(state->next), + &(state->lenbits), state->work); + if (ret) { + strm->msg = (char *)"invalid code lengths set"; + state->mode = BAD; + break; + } + Tracev((stderr, "inflate: code lengths ok\n")); + state->have = 0; + state->mode = CODELENS; + /*FALLTHRU*/ + case CODELENS: + while (state->have < state->nlen + state->ndist) { + for (;;) { + this = state->lencode[BITS(state->lenbits)]; + if ((unsigned)(this.bits) <= bits) break; + PULLBYTE(); + } + if (this.val < 16) { + NEEDBITS(this.bits); + DROPBITS(this.bits); + state->lens[state->have++] = this.val; + } + else { + if (this.val == 16) { + NEEDBITS(this.bits + 2); + DROPBITS(this.bits); + if (state->have == 0) { + strm->msg = (char *)"invalid bit length repeat"; + state->mode = BAD; + break; + } + len = state->lens[state->have - 1]; + copy = 3 + BITS(2); + DROPBITS(2); + } + else if (this.val == 17) { + NEEDBITS(this.bits + 3); + DROPBITS(this.bits); + len = 0; + copy = 3 + BITS(3); + DROPBITS(3); + } + else { + NEEDBITS(this.bits + 7); + DROPBITS(this.bits); + len = 0; + copy = 11 + BITS(7); + DROPBITS(7); + } + if (state->have + copy > state->nlen + state->ndist) { + strm->msg = (char *)"invalid bit length repeat"; + state->mode = BAD; + break; + } + while (copy--) + state->lens[state->have++] = (unsigned short)len; + } + } + + /* handle error breaks in while */ + if (state->mode == BAD) break; + + /* build code tables */ + state->next = state->codes; + state->lencode = (code const FAR *)(state->next); + state->lenbits = 9; + ret = inflate_table(LENS, state->lens, state->nlen, &(state->next), + &(state->lenbits), state->work); + if (ret) { + strm->msg = (char *)"invalid literal/lengths set"; + state->mode = BAD; + break; + } + state->distcode = (code const FAR *)(state->next); + state->distbits = 6; + ret = inflate_table(DISTS, state->lens + state->nlen, state->ndist, + &(state->next), &(state->distbits), state->work); + if (ret) { + strm->msg = (char *)"invalid distances set"; + state->mode = BAD; + break; + } + Tracev((stderr, "inflate: codes ok\n")); + state->mode = LEN; + /*FALLTHRU*/ + case LEN: + if (have >= 6 && left >= 258) { + RESTORE(); + inflate_fast(strm, out); + LOAD(); + break; + } + for (;;) { + this = state->lencode[BITS(state->lenbits)]; + if ((unsigned)(this.bits) <= bits) break; + PULLBYTE(); + } + if (this.op && (this.op & 0xf0) == 0) { + last = this; + for (;;) { + this = state->lencode[last.val + + (BITS(last.bits + last.op) >> last.bits)]; + if ((unsigned)(last.bits + this.bits) <= bits) break; + PULLBYTE(); + } + DROPBITS(last.bits); + } + DROPBITS(this.bits); + state->length = (unsigned)this.val; + if ((int)(this.op) == 0) { + Tracevv((stderr, this.val >= 0x20 && this.val < 0x7f ? + "inflate: literal '%c'\n" : + "inflate: literal 0x%02x\n", this.val)); + state->mode = LIT; + break; + } + if (this.op & 32) { + Tracevv((stderr, "inflate: end of block\n")); + state->mode = TYPE; + break; + } + if (this.op & 64) { + strm->msg = (char *)"invalid literal/length code"; + state->mode = BAD; + break; + } + state->extra = (unsigned)(this.op) & 15; + state->mode = LENEXT; + /*FALLTHRU*/ + case LENEXT: + if (state->extra) { + NEEDBITS(state->extra); + state->length += BITS(state->extra); + DROPBITS(state->extra); + } + Tracevv((stderr, "inflate: length %u\n", state->length)); + state->mode = DIST; + /*FALLTHRU*/ + case DIST: + for (;;) { + this = state->distcode[BITS(state->distbits)]; + if ((unsigned)(this.bits) <= bits) break; + PULLBYTE(); + } + if ((this.op & 0xf0) == 0) { + last = this; + for (;;) { + this = state->distcode[last.val + + (BITS(last.bits + last.op) >> last.bits)]; + if ((unsigned)(last.bits + this.bits) <= bits) break; + PULLBYTE(); + } + DROPBITS(last.bits); + } + DROPBITS(this.bits); + if (this.op & 64) { + strm->msg = (char *)"invalid distance code"; + state->mode = BAD; + break; + } + state->offset = (unsigned)this.val; + state->extra = (unsigned)(this.op) & 15; + state->mode = DISTEXT; + /*FALLTHRU*/ + case DISTEXT: + if (state->extra) { + NEEDBITS(state->extra); + state->offset += BITS(state->extra); + DROPBITS(state->extra); + } +#ifdef INFLATE_STRICT + if (state->offset > state->dmax) { + strm->msg = (char *)"invalid distance too far back"; + state->mode = BAD; + break; + } +#endif + if (state->offset > state->whave + out - left) { + strm->msg = (char *)"invalid distance too far back"; + state->mode = BAD; + break; + } + Tracevv((stderr, "inflate: distance %u\n", state->offset)); + state->mode = MATCH; + /*FALLTHRU*/ + case MATCH: + if (left == 0) goto inf_leave; + copy = out - left; + if (state->offset > copy) { /* copy from window */ + copy = state->offset - copy; + if (copy > state->write) { + copy -= state->write; + from = state->window + (state->wsize - copy); + } + else + from = state->window + (state->write - copy); + if (copy > state->length) copy = state->length; + } + else { /* copy from output */ + from = put - state->offset; + copy = state->length; + } + if (copy > left) copy = left; + left -= copy; + state->length -= copy; + do { + *put++ = *from++; + } while (--copy); + if (state->length == 0) state->mode = LEN; + break; + case LIT: + if (left == 0) goto inf_leave; + *put++ = (unsigned char)(state->length); + left--; + state->mode = LEN; + break; + case CHECK: + if (state->wrap) { + NEEDBITS(32); + out -= left; + strm->total_out += out; + state->total += out; + if (out) + strm->adler = state->check = + UPDATE(state->check, put - out, out); + out = left; + if (( +#ifdef GUNZIP + state->flags ? hold : +#endif + REVERSE(hold)) != state->check) { + strm->msg = (char *)"incorrect data check"; + state->mode = BAD; + break; + } + INITBITS(); + Tracev((stderr, "inflate: check matches trailer\n")); + } +#ifdef GUNZIP + state->mode = LENGTH; + /*FALLTHRU*/ + case LENGTH: + if (state->wrap && state->flags) { + NEEDBITS(32); + if (hold != (state->total & 0xffffffffUL)) { + strm->msg = (char *)"incorrect length check"; + state->mode = BAD; + break; + } + INITBITS(); + Tracev((stderr, "inflate: length matches trailer\n")); + } +#endif + state->mode = DONE; + /*FALLTHRU*/ + case DONE: + ret = Z_STREAM_END; + goto inf_leave; + case BAD: + ret = Z_DATA_ERROR; + goto inf_leave; + case MEM: + return Z_MEM_ERROR; + case SYNC: + default: + return Z_STREAM_ERROR; + } + + /* + Return from inflate(), updating the total counts and the check value. + If there was no progress during the inflate() call, return a buffer + error. Call updatewindow() to create and/or update the window state. + Note: a memory error from inflate() is non-recoverable. + */ + inf_leave: + RESTORE(); + if (state->wsize || (state->mode < CHECK && out != strm->avail_out)) + if (updatewindow(strm, out)) { + state->mode = MEM; + return Z_MEM_ERROR; + } + in -= strm->avail_in; + out -= strm->avail_out; + strm->total_in += in; + strm->total_out += out; + state->total += out; + if (state->wrap && out) + strm->adler = state->check = + UPDATE(state->check, strm->next_out - out, out); + strm->data_type = state->bits + (state->last ? 64 : 0) + + (state->mode == TYPE ? 128 : 0); + if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) + ret = Z_BUF_ERROR; + return ret; +} + +int ZEXPORT inflateEnd(strm) +z_streamp strm; +{ + struct inflate_state FAR *state; + if (strm == Z_NULL || strm->state == Z_NULL || strm->zfree == (free_func)0) + return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)strm->state; + if (state->window != Z_NULL) ZFREE(strm, state->window); + ZFREE(strm, strm->state); + strm->state = Z_NULL; + Tracev((stderr, "inflate: end\n")); + return Z_OK; +} + +int ZEXPORT inflateSetDictionary(strm, dictionary, dictLength) +z_streamp strm; +const Bytef *dictionary; +uInt dictLength; +{ + struct inflate_state FAR *state; + unsigned long id; + + /* check state */ + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)strm->state; + if (state->wrap != 0 && state->mode != DICT) + return Z_STREAM_ERROR; + + /* check for correct dictionary id */ + if (state->mode == DICT) { + id = adler32(0L, Z_NULL, 0); + id = adler32(id, dictionary, dictLength); + if (id != state->check) + return Z_DATA_ERROR; + } + + /* copy dictionary to window */ + if (updatewindow(strm, strm->avail_out)) { + state->mode = MEM; + return Z_MEM_ERROR; + } + if (dictLength > state->wsize) { + zmemcpy(state->window, dictionary + dictLength - state->wsize, + state->wsize); + state->whave = state->wsize; + } + else { + zmemcpy(state->window + state->wsize - dictLength, dictionary, + dictLength); + state->whave = dictLength; + } + state->havedict = 1; + Tracev((stderr, "inflate: dictionary set\n")); + return Z_OK; +} + +int ZEXPORT inflateGetHeader(strm, head) +z_streamp strm; +gz_headerp head; +{ + struct inflate_state FAR *state; + + /* check state */ + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)strm->state; + if ((state->wrap & 2) == 0) return Z_STREAM_ERROR; + + /* save header structure */ + state->head = head; + head->done = 0; + return Z_OK; +} + +/* + Search buf[0..len-1] for the pattern: 0, 0, 0xff, 0xff. Return when found + or when out of input. When called, *have is the number of pattern bytes + found in order so far, in 0..3. On return *have is updated to the new + state. If on return *have equals four, then the pattern was found and the + return value is how many bytes were read including the last byte of the + pattern. If *have is less than four, then the pattern has not been found + yet and the return value is len. In the latter case, syncsearch() can be + called again with more data and the *have state. *have is initialized to + zero for the first call. + */ +local unsigned syncsearch(have, buf, len) +unsigned FAR *have; +unsigned char FAR *buf; +unsigned len; +{ + unsigned got; + unsigned next; + + got = *have; + next = 0; + while (next < len && got < 4) { + if ((int)(buf[next]) == (got < 2 ? 0 : 0xff)) + got++; + else if (buf[next]) + got = 0; + else + got = 4 - got; + next++; + } + *have = got; + return next; +} + +int ZEXPORT inflateSync(strm) +z_streamp strm; +{ + unsigned len; /* number of bytes to look at or looked at */ + unsigned long in, out; /* temporary to save total_in and total_out */ + unsigned char buf[4]; /* to restore bit buffer to byte string */ + struct inflate_state FAR *state; + + /* check parameters */ + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)strm->state; + if (strm->avail_in == 0 && state->bits < 8) return Z_BUF_ERROR; + + /* if first time, start search in bit buffer */ + if (state->mode != SYNC) { + state->mode = SYNC; + state->hold <<= state->bits & 7; + state->bits -= state->bits & 7; + len = 0; + while (state->bits >= 8) { + buf[len++] = (unsigned char)(state->hold); + state->hold >>= 8; + state->bits -= 8; + } + state->have = 0; + (void) syncsearch(&(state->have), buf, len); + } + + /* search available input */ + len = syncsearch(&(state->have), strm->next_in, strm->avail_in); + strm->avail_in -= len; + strm->next_in += len; + strm->total_in += len; + + /* return no joy or set up to restart inflate() on a new block */ + if (state->have != 4) return Z_DATA_ERROR; + in = strm->total_in; out = strm->total_out; + (void) inflateReset(strm); + strm->total_in = in; strm->total_out = out; + state->mode = TYPE; + return Z_OK; +} + +/* + Returns true if inflate is currently at the end of a block generated by + Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP + implementation to provide an additional safety check. PPP uses + Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored + block. When decompressing, PPP checks that at the end of input packet, + inflate is waiting for these length bytes. + */ +int ZEXPORT inflateSyncPoint(strm) +z_streamp strm; +{ + struct inflate_state FAR *state; + + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)strm->state; + return state->mode == STORED && state->bits == 0; +} + +int ZEXPORT inflateCopy(dest, source) +z_streamp dest; +z_streamp source; +{ + struct inflate_state FAR *state; + struct inflate_state FAR *copy; + unsigned char FAR *window; + unsigned wsize; + + /* check input */ + if (dest == Z_NULL || source == Z_NULL || source->state == Z_NULL || + source->zalloc == (alloc_func)0 || source->zfree == (free_func)0) + return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)source->state; + + /* allocate space */ + copy = (struct inflate_state FAR *) + ZALLOC(source, 1, sizeof(struct inflate_state)); + if (copy == Z_NULL) return Z_MEM_ERROR; + window = Z_NULL; + if (state->window != Z_NULL) { + window = (unsigned char FAR *) + ZALLOC(source, 1U << state->wbits, sizeof(unsigned char)); + if (window == Z_NULL) { + ZFREE(source, copy); + return Z_MEM_ERROR; + } + } + + /* copy state */ + zmemcpy(dest, source, sizeof(z_stream)); + zmemcpy(copy, state, sizeof(struct inflate_state)); + if (state->lencode >= state->codes && + state->lencode <= state->codes + ENOUGH - 1) { + copy->lencode = copy->codes + (state->lencode - state->codes); + copy->distcode = copy->codes + (state->distcode - state->codes); + } + copy->next = copy->codes + (state->next - state->codes); + if (window != Z_NULL) { + wsize = 1U << state->wbits; + zmemcpy(window, state->window, wsize); + } + copy->window = window; + dest->state = (struct internal_state FAR *)copy; + return Z_OK; +} diff --git a/sys/contrib/opensolaris/uts/common/zmod/inflate.h b/sys/contrib/opensolaris/uts/common/zmod/inflate.h new file mode 100644 index 0000000..4d28b22 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/inflate.h @@ -0,0 +1,117 @@ +/* inflate.h -- internal inflate state definition + * Copyright (C) 1995-2004 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. + */ + +/* define NO_GZIP when compiling if you want to disable gzip header and + trailer decoding by inflate(). NO_GZIP would be used to avoid linking in + the crc code when it is not needed. For shared libraries, gzip decoding + should be left enabled. */ +#ifndef NO_GZIP +# define GUNZIP +#endif + +/* Possible inflate modes between inflate() calls */ +typedef enum { + HEAD, /* i: waiting for magic header */ + FLAGS, /* i: waiting for method and flags (gzip) */ + TIME, /* i: waiting for modification time (gzip) */ + OS, /* i: waiting for extra flags and operating system (gzip) */ + EXLEN, /* i: waiting for extra length (gzip) */ + EXTRA, /* i: waiting for extra bytes (gzip) */ + NAME, /* i: waiting for end of file name (gzip) */ + COMMENT, /* i: waiting for end of comment (gzip) */ + HCRC, /* i: waiting for header crc (gzip) */ + DICTID, /* i: waiting for dictionary check value */ + DICT, /* waiting for inflateSetDictionary() call */ + TYPE, /* i: waiting for type bits, including last-flag bit */ + TYPEDO, /* i: same, but skip check to exit inflate on new block */ + STORED, /* i: waiting for stored size (length and complement) */ + COPY, /* i/o: waiting for input or output to copy stored block */ + TABLE, /* i: waiting for dynamic block table lengths */ + LENLENS, /* i: waiting for code length code lengths */ + CODELENS, /* i: waiting for length/lit and distance code lengths */ + LEN, /* i: waiting for length/lit code */ + LENEXT, /* i: waiting for length extra bits */ + DIST, /* i: waiting for distance code */ + DISTEXT, /* i: waiting for distance extra bits */ + MATCH, /* o: waiting for output space to copy string */ + LIT, /* o: waiting for output space to write literal */ + CHECK, /* i: waiting for 32-bit check value */ + LENGTH, /* i: waiting for 32-bit length (gzip) */ + DONE, /* finished check, done -- remain here until reset */ + BAD, /* got a data error -- remain here until reset */ + MEM, /* got an inflate() memory error -- remain here until reset */ + SYNC /* looking for synchronization bytes to restart inflate() */ +} inflate_mode; + +/* + State transitions between above modes - + + (most modes can go to the BAD or MEM mode -- not shown for clarity) + + Process header: + HEAD -> (gzip) or (zlib) + (gzip) -> FLAGS -> TIME -> OS -> EXLEN -> EXTRA -> NAME + NAME -> COMMENT -> HCRC -> TYPE + (zlib) -> DICTID or TYPE + DICTID -> DICT -> TYPE + Read deflate blocks: + TYPE -> STORED or TABLE or LEN or CHECK + STORED -> COPY -> TYPE + TABLE -> LENLENS -> CODELENS -> LEN + Read deflate codes: + LEN -> LENEXT or LIT or TYPE + LENEXT -> DIST -> DISTEXT -> MATCH -> LEN + LIT -> LEN + Process trailer: + CHECK -> LENGTH -> DONE + */ + +/* state maintained between inflate() calls. Approximately 7K bytes. */ +struct inflate_state { + inflate_mode mode; /* current inflate mode */ + int last; /* true if processing last block */ + int wrap; /* bit 0 true for zlib, bit 1 true for gzip */ + int havedict; /* true if dictionary provided */ + int flags; /* gzip header method and flags (0 if zlib) */ + unsigned dmax; /* zlib header max distance (INFLATE_STRICT) */ + unsigned long check; /* protected copy of check value */ + unsigned long total; /* protected copy of output count */ + gz_headerp head; /* where to save gzip header information */ + /* sliding window */ + unsigned wbits; /* log base 2 of requested window size */ + unsigned wsize; /* window size or zero if not using window */ + unsigned whave; /* valid bytes in the window */ + unsigned write; /* window write index */ + unsigned char FAR *window; /* allocated sliding window, if needed */ + /* bit accumulator */ + unsigned long hold; /* input bit accumulator */ + unsigned bits; /* number of bits in "in" */ + /* for string and stored block copying */ + unsigned length; /* literal or length of data to copy */ + unsigned offset; /* distance back to copy string from */ + /* for table and code decoding */ + unsigned extra; /* extra bits needed */ + /* fixed and dynamic code tables */ + code const FAR *lencode; /* starting table for length/literal codes */ + code const FAR *distcode; /* starting table for distance codes */ + unsigned lenbits; /* index bits for lencode */ + unsigned distbits; /* index bits for distcode */ + /* dynamic table building */ + unsigned ncode; /* number of code length code lengths */ + unsigned nlen; /* number of length code lengths */ + unsigned ndist; /* number of distance code lengths */ + unsigned have; /* number of code lengths in lens[] */ + code FAR *next; /* next available space in codes[] */ + unsigned short lens[320]; /* temporary storage for code lengths */ + unsigned short work[288]; /* work area for code table building */ + code codes[ENOUGH]; /* space for code tables */ +}; diff --git a/sys/contrib/opensolaris/uts/common/zmod/inftrees.c b/sys/contrib/opensolaris/uts/common/zmod/inftrees.c new file mode 100644 index 0000000..2d37167 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/inftrees.c @@ -0,0 +1,331 @@ +/* inftrees.c -- generate Huffman trees for efficient decoding + * Copyright (C) 1995-2005 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "zutil.h" +#include "inftrees.h" + +#define MAXBITS 15 + +static const char inflate_copyright[] = + " inflate 1.2.3 Copyright 1995-2005 Mark Adler "; +/* + If you use the zlib library in a product, an acknowledgment is welcome + in the documentation of your product. If for some reason you cannot + include such an acknowledgment, I would appreciate that you keep this + copyright string in the executable of your product. + */ + +/* + Build a set of tables to decode the provided canonical Huffman code. + The code lengths are lens[0..codes-1]. The result starts at *table, + whose indices are 0..2^bits-1. work is a writable array of at least + lens shorts, which is used as a work area. type is the type of code + to be generated, CODES, LENS, or DISTS. On return, zero is success, + -1 is an invalid code, and +1 means that ENOUGH isn't enough. table + on return points to the next available entry's address. bits is the + requested root table index bits, and on return it is the actual root + table index bits. It will differ if the request is greater than the + longest code or if it is less than the shortest code. + */ +int inflate_table(type, lens, codes, table, bits, work) +codetype type; +unsigned short FAR *lens; +unsigned codes; +code FAR * FAR *table; +unsigned FAR *bits; +unsigned short FAR *work; +{ + unsigned len; /* a code's length in bits */ + unsigned sym; /* index of code symbols */ + unsigned min, max; /* minimum and maximum code lengths */ + unsigned root; /* number of index bits for root table */ + unsigned curr; /* number of index bits for current table */ + unsigned drop; /* code bits to drop for sub-table */ + int left; /* number of prefix codes available */ + unsigned used; /* code entries in table used */ + unsigned huff; /* Huffman code */ + unsigned incr; /* for incrementing code, index */ + unsigned fill; /* index for replicating entries */ + unsigned low; /* low bits for current root entry */ + unsigned mask; /* mask for low root bits */ + code this; /* table entry for duplication */ + code FAR *next; /* next available space in table */ + const unsigned short FAR *base; /* base value table to use */ + const unsigned short FAR *extra; /* extra bits table to use */ + int end; /* use base and extra for symbol > end */ + unsigned short count[MAXBITS+1]; /* number of codes of each length */ + unsigned short offs[MAXBITS+1]; /* offsets in table for each length */ + static const unsigned short lbase[31] = { /* Length codes 257..285 base */ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; + static const unsigned short lext[31] = { /* Length codes 257..285 extra */ + 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, + 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 201, 196}; + static const unsigned short dbase[32] = { /* Distance codes 0..29 base */ + 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, + 8193, 12289, 16385, 24577, 0, 0}; + static const unsigned short dext[32] = { /* Distance codes 0..29 extra */ + 16, 16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, + 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, + 28, 28, 29, 29, 64, 64}; + + /* + Process a set of code lengths to create a canonical Huffman code. The + code lengths are lens[0..codes-1]. Each length corresponds to the + symbols 0..codes-1. The Huffman code is generated by first sorting the + symbols by length from short to long, and retaining the symbol order + for codes with equal lengths. Then the code starts with all zero bits + for the first code of the shortest length, and the codes are integer + increments for the same length, and zeros are appended as the length + increases. For the deflate format, these bits are stored backwards + from their more natural integer increment ordering, and so when the + decoding tables are built in the large loop below, the integer codes + are incremented backwards. + + This routine assumes, but does not check, that all of the entries in + lens[] are in the range 0..MAXBITS. The caller must assure this. + 1..MAXBITS is interpreted as that code length. zero means that that + symbol does not occur in this code. + + The codes are sorted by computing a count of codes for each length, + creating from that a table of starting indices for each length in the + sorted table, and then entering the symbols in order in the sorted + table. The sorted table is work[], with that space being provided by + the caller. + + The length counts are used for other purposes as well, i.e. finding + the minimum and maximum length codes, determining if there are any + codes at all, checking for a valid set of lengths, and looking ahead + at length counts to determine sub-table sizes when building the + decoding tables. + */ + + /* accumulate lengths for codes (assumes lens[] all in 0..MAXBITS) */ + for (len = 0; len <= MAXBITS; len++) + count[len] = 0; + for (sym = 0; sym < codes; sym++) + count[lens[sym]]++; + + /* bound code lengths, force root to be within code lengths */ + root = *bits; + for (max = MAXBITS; max >= 1; max--) + if (count[max] != 0) break; + if (root > max) root = max; + if (max == 0) { /* no symbols to code at all */ + this.op = (unsigned char)64; /* invalid code marker */ + this.bits = (unsigned char)1; + this.val = (unsigned short)0; + *(*table)++ = this; /* make a table to force an error */ + *(*table)++ = this; + *bits = 1; + return 0; /* no symbols, but wait for decoding to report error */ + } + for (min = 1; min <= MAXBITS; min++) + if (count[min] != 0) break; + if (root < min) root = min; + + /* check for an over-subscribed or incomplete set of lengths */ + left = 1; + for (len = 1; len <= MAXBITS; len++) { + left <<= 1; + left -= count[len]; + if (left < 0) return -1; /* over-subscribed */ + } + if (left > 0 && (type == CODES || max != 1)) + return -1; /* incomplete set */ + + /* generate offsets into symbol table for each length for sorting */ + offs[1] = 0; + for (len = 1; len < MAXBITS; len++) + offs[len + 1] = offs[len] + count[len]; + + /* sort symbols by length, by symbol order within each length */ + for (sym = 0; sym < codes; sym++) + if (lens[sym] != 0) work[offs[lens[sym]]++] = (unsigned short)sym; + + /* + Create and fill in decoding tables. In this loop, the table being + filled is at next and has curr index bits. The code being used is huff + with length len. That code is converted to an index by dropping drop + bits off of the bottom. For codes where len is less than drop + curr, + those top drop + curr - len bits are incremented through all values to + fill the table with replicated entries. + + root is the number of index bits for the root table. When len exceeds + root, sub-tables are created pointed to by the root entry with an index + of the low root bits of huff. This is saved in low to check for when a + new sub-table should be started. drop is zero when the root table is + being filled, and drop is root when sub-tables are being filled. + + When a new sub-table is needed, it is necessary to look ahead in the + code lengths to determine what size sub-table is needed. The length + counts are used for this, and so count[] is decremented as codes are + entered in the tables. + + used keeps track of how many table entries have been allocated from the + provided *table space. It is checked when a LENS table is being made + against the space in *table, ENOUGH, minus the maximum space needed by + the worst case distance code, MAXD. This should never happen, but the + sufficiency of ENOUGH has not been proven exhaustively, hence the check. + This assumes that when type == LENS, bits == 9. + + sym increments through all symbols, and the loop terminates when + all codes of length max, i.e. all codes, have been processed. This + routine permits incomplete codes, so another loop after this one fills + in the rest of the decoding tables with invalid code markers. + */ + + /* set up for code type */ + switch (type) { + case CODES: + base = extra = work; /* dummy value--not used */ + end = 19; + break; + case LENS: + base = lbase; + base -= 257; + extra = lext; + extra -= 257; + end = 256; + break; + default: /* DISTS */ + base = dbase; + extra = dext; + end = -1; + } + + /* initialize state for loop */ + huff = 0; /* starting code */ + sym = 0; /* starting code symbol */ + len = min; /* starting code length */ + next = *table; /* current table to fill in */ + curr = root; /* current table index bits */ + drop = 0; /* current bits to drop from code for index */ + low = (unsigned)(-1); /* trigger new sub-table when len > root */ + used = 1U << root; /* use root table entries */ + mask = used - 1; /* mask for comparing low */ + + /* check available table space */ + if (type == LENS && used >= ENOUGH - MAXD) + return 1; + + /* process all codes and make table entries */ + for (;;) { + /* create table entry */ + this.bits = (unsigned char)(len - drop); + if ((int)(work[sym]) < end) { + this.op = (unsigned char)0; + this.val = work[sym]; + } + else if ((int)(work[sym]) > end) { + this.op = (unsigned char)(extra[work[sym]]); + this.val = base[work[sym]]; + } + else { + this.op = (unsigned char)(32 + 64); /* end of block */ + this.val = 0; + } + + /* replicate for those indices with low len bits equal to huff */ + incr = 1U << (len - drop); + fill = 1U << curr; + min = fill; /* save offset to next table */ + do { + fill -= incr; + next[(huff >> drop) + fill] = this; + } while (fill != 0); + + /* backwards increment the len-bit code huff */ + incr = 1U << (len - 1); + while (huff & incr) + incr >>= 1; + if (incr != 0) { + huff &= incr - 1; + huff += incr; + } + else + huff = 0; + + /* go to next symbol, update count, len */ + sym++; + if (--(count[len]) == 0) { + if (len == max) break; + len = lens[work[sym]]; + } + + /* create new sub-table if needed */ + if (len > root && (huff & mask) != low) { + /* if first time, transition to sub-tables */ + if (drop == 0) + drop = root; + + /* increment past last table */ + next += min; /* here min is 1 << curr */ + + /* determine length of next table */ + curr = len - drop; + left = (int)(1 << curr); + while (curr + drop < max) { + left -= count[curr + drop]; + if (left <= 0) break; + curr++; + left <<= 1; + } + + /* check for enough space */ + used += 1U << curr; + if (type == LENS && used >= ENOUGH - MAXD) + return 1; + + /* point entry in root table to sub-table */ + low = huff & mask; + (*table)[low].op = (unsigned char)curr; + (*table)[low].bits = (unsigned char)root; + (*table)[low].val = (unsigned short)(next - *table); + } + } + + /* + Fill in rest of table for incomplete codes. This loop is similar to the + loop above in incrementing huff for table indices. It is assumed that + len is equal to curr + drop, so there is no loop needed to increment + through high index bits. When the current sub-table is filled, the loop + drops back to the root table to fill in any remaining entries there. + */ + this.op = (unsigned char)64; /* invalid code marker */ + this.bits = (unsigned char)(len - drop); + this.val = (unsigned short)0; + while (huff != 0) { + /* when done with sub-table, drop back to root table */ + if (drop != 0 && (huff & mask) != low) { + drop = 0; + len = root; + next = *table; + this.bits = (unsigned char)len; + } + + /* put invalid code marker in table */ + next[huff >> drop] = this; + + /* backwards increment the len-bit code huff */ + incr = 1U << (len - 1); + while (huff & incr) + incr >>= 1; + if (incr != 0) { + huff &= incr - 1; + huff += incr; + } + else + huff = 0; + } + + /* set return parameters */ + *table += used; + *bits = root; + return 0; +} diff --git a/sys/contrib/opensolaris/uts/common/zmod/inftrees.h b/sys/contrib/opensolaris/uts/common/zmod/inftrees.h new file mode 100644 index 0000000..546e8c0 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/inftrees.h @@ -0,0 +1,57 @@ +/* inftrees.h -- header to use inftrees.c + * Copyright (C) 1995-2005 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. + */ + +/* Structure for decoding tables. Each entry provides either the + information needed to do the operation requested by the code that + indexed that table entry, or it provides a pointer to another + table that indexes more bits of the code. op indicates whether + the entry is a pointer to another table, a literal, a length or + distance, an end-of-block, or an invalid code. For a table + pointer, the low four bits of op is the number of index bits of + that table. For a length or distance, the low four bits of op + is the number of extra bits to get after the code. bits is + the number of bits in this code or part of the code to drop off + of the bit buffer. val is the actual byte to output in the case + of a literal, the base length or distance, or the offset from + the current table to the next table. Each entry is four bytes. */ +typedef struct { + unsigned char op; /* operation, extra bits, table bits */ + unsigned char bits; /* bits in this part of the code */ + unsigned short val; /* offset in table or code value */ +} code; + +/* op values as set by inflate_table(): + 00000000 - literal + 0000tttt - table link, tttt != 0 is the number of table index bits + 0001eeee - length or distance, eeee is the number of extra bits + 01100000 - end of block + 01000000 - invalid code + */ + +/* Maximum size of dynamic tree. The maximum found in a long but non- + exhaustive search was 1444 code structures (852 for length/literals + and 592 for distances, the latter actually the result of an + exhaustive search). The true maximum is not known, but the value + below is more than safe. */ +#define ENOUGH 2048 +#define MAXD 592 + +/* Type of code to build for inftable() */ +typedef enum { + CODES, + LENS, + DISTS +} codetype; + +extern int inflate_table OF((codetype type, unsigned short FAR *lens, + unsigned codes, code FAR * FAR *table, + unsigned FAR *bits, unsigned short FAR *work)); diff --git a/sys/contrib/opensolaris/uts/common/zmod/trees.c b/sys/contrib/opensolaris/uts/common/zmod/trees.c new file mode 100644 index 0000000..ce0cebc --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/trees.c @@ -0,0 +1,1219 @@ +/* trees.c -- output deflated data using Huffman coding + * Copyright (C) 1995-2005 Jean-loup Gailly + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ALGORITHM + * + * The "deflation" process uses several Huffman trees. The more + * common source values are represented by shorter bit sequences. + * + * Each code tree is stored in a compressed form which is itself + * a Huffman encoding of the lengths of all the code strings (in + * ascending order by source values). The actual code strings are + * reconstructed from the lengths in the inflate process, as described + * in the deflate specification. + * + * REFERENCES + * + * Deutsch, L.P.,"'Deflate' Compressed Data Format Specification". + * Available in ftp.uu.net:/pub/archiving/zip/doc/deflate-1.1.doc + * + * Storer, James A. + * Data Compression: Methods and Theory, pp. 49-50. + * Computer Science Press, 1988. ISBN 0-7167-8156-5. + * + * Sedgewick, R. + * Algorithms, p290. + * Addison-Wesley, 1983. ISBN 0-201-06672-6. + */ + +/* #define GEN_TREES_H */ + +#include "deflate.h" + +#ifdef DEBUG +# include <ctype.h> +#endif + +/* =========================================================================== + * Constants + */ + +#define MAX_BL_BITS 7 +/* Bit length codes must not exceed MAX_BL_BITS bits */ + +#define END_BLOCK 256 +/* end of block literal code */ + +#define REP_3_6 16 +/* repeat previous bit length 3-6 times (2 bits of repeat count) */ + +#define REPZ_3_10 17 +/* repeat a zero length 3-10 times (3 bits of repeat count) */ + +#define REPZ_11_138 18 +/* repeat a zero length 11-138 times (7 bits of repeat count) */ + +local const int extra_lbits[LENGTH_CODES] /* extra bits for each length code */ + = {0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0}; + +local const int extra_dbits[D_CODES] /* extra bits for each distance code */ + = {0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13}; + +local const int extra_blbits[BL_CODES]/* extra bits for each bit length code */ + = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,7}; + +local const uch bl_order[BL_CODES] + = {16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15}; +/* The lengths of the bit length codes are sent in order of decreasing + * probability, to avoid transmitting the lengths for unused bit length codes. + */ + +#define Buf_size (8 * 2*sizeof(char)) +/* Number of bits used within bi_buf. (bi_buf might be implemented on + * more than 16 bits on some systems.) + */ + +/* =========================================================================== + * Local data. These are initialized only once. + */ + +#define DIST_CODE_LEN 512 /* see definition of array dist_code below */ + +#if defined(GEN_TREES_H) || !defined(STDC) +/* non ANSI compilers may not accept trees.h */ + +local ct_data static_ltree[L_CODES+2]; +/* The static literal tree. Since the bit lengths are imposed, there is no + * need for the L_CODES extra codes used during heap construction. However + * The codes 286 and 287 are needed to build a canonical tree (see _tr_init + * below). + */ + +local ct_data static_dtree[D_CODES]; +/* The static distance tree. (Actually a trivial tree since all codes use + * 5 bits.) + */ + +uch _dist_code[DIST_CODE_LEN]; +/* Distance codes. The first 256 values correspond to the distances + * 3 .. 258, the last 256 values correspond to the top 8 bits of + * the 15 bit distances. + */ + +uch _length_code[MAX_MATCH-MIN_MATCH+1]; +/* length code for each normalized match length (0 == MIN_MATCH) */ + +local int base_length[LENGTH_CODES]; +/* First normalized length for each code (0 = MIN_MATCH) */ + +local int base_dist[D_CODES]; +/* First normalized distance for each code (0 = distance of 1) */ + +#else +# include "trees.h" +#endif /* GEN_TREES_H */ + +struct static_tree_desc_s { + const ct_data *static_tree; /* static tree or NULL */ + const intf *extra_bits; /* extra bits for each code or NULL */ + int extra_base; /* base index for extra_bits */ + int elems; /* max number of elements in the tree */ + int max_length; /* max bit length for the codes */ +}; + +local static_tree_desc static_l_desc = +{static_ltree, extra_lbits, LITERALS+1, L_CODES, MAX_BITS}; + +local static_tree_desc static_d_desc = +{static_dtree, extra_dbits, 0, D_CODES, MAX_BITS}; + +local static_tree_desc static_bl_desc = +{(const ct_data *)0, extra_blbits, 0, BL_CODES, MAX_BL_BITS}; + +/* =========================================================================== + * Local (static) routines in this file. + */ + +local void tr_static_init OF((void)); +local void init_block OF((deflate_state *s)); +local void pqdownheap OF((deflate_state *s, ct_data *tree, int k)); +local void gen_bitlen OF((deflate_state *s, tree_desc *desc)); +local void gen_codes OF((ct_data *tree, int max_code, ushf *bl_count)); +local void build_tree OF((deflate_state *s, tree_desc *desc)); +local void scan_tree OF((deflate_state *s, ct_data *tree, int max_code)); +local void send_tree OF((deflate_state *s, ct_data *tree, int max_code)); +local int build_bl_tree OF((deflate_state *s)); +local void send_all_trees OF((deflate_state *s, int lcodes, int dcodes, + int blcodes)); +local void compress_block OF((deflate_state *s, ct_data *ltree, + ct_data *dtree)); +local void set_data_type OF((deflate_state *s)); +local unsigned bi_reverse OF((unsigned value, int length)); +local void bi_windup OF((deflate_state *s)); +local void bi_flush OF((deflate_state *s)); +local void copy_block OF((deflate_state *s, charf *buf, unsigned len, + int header)); + +#ifdef GEN_TREES_H +local void gen_trees_header OF((void)); +#endif + +#ifndef DEBUG +# define send_code(s, c, tree) send_bits(s, tree[c].Code, tree[c].Len) + /* Send a code of the given tree. c and tree must not have side effects */ + +#else /* DEBUG */ +# define send_code(s, c, tree) \ + { if (z_verbose>2) fprintf(stderr,"\ncd %3d ",(c)); \ + send_bits(s, tree[c].Code, tree[c].Len); } +#endif + +/* =========================================================================== + * Output a short LSB first on the stream. + * IN assertion: there is enough room in pendingBuf. + */ +#define put_short(s, w) { \ + put_byte(s, (uch)((w) & 0xff)); \ + put_byte(s, (uch)((ush)(w) >> 8)); \ +} + +/* =========================================================================== + * Send a value on a given number of bits. + * IN assertion: length <= 16 and value fits in length bits. + */ +#ifdef DEBUG +local void send_bits OF((deflate_state *s, int value, int length)); + +local void send_bits(s, value, length) + deflate_state *s; + int value; /* value to send */ + int length; /* number of bits */ +{ + Tracevv((stderr," l %2d v %4x ", length, value)); + Assert(length > 0 && length <= 15, "invalid length"); + s->bits_sent += (ulg)length; + + /* If not enough room in bi_buf, use (valid) bits from bi_buf and + * (16 - bi_valid) bits from value, leaving (width - (16-bi_valid)) + * unused bits in value. + */ + if (s->bi_valid > (int)Buf_size - length) { + s->bi_buf |= (value << s->bi_valid); + put_short(s, s->bi_buf); + s->bi_buf = (ush)value >> (Buf_size - s->bi_valid); + s->bi_valid += length - Buf_size; + } else { + s->bi_buf |= value << s->bi_valid; + s->bi_valid += length; + } +} +#else /* !DEBUG */ + +#define send_bits(s, value, length) \ +{ int len = length;\ + if (s->bi_valid > (int)Buf_size - len) {\ + int val = value;\ + s->bi_buf |= (val << s->bi_valid);\ + put_short(s, s->bi_buf);\ + s->bi_buf = (ush)val >> (Buf_size - s->bi_valid);\ + s->bi_valid += len - Buf_size;\ + } else {\ + s->bi_buf |= (value) << s->bi_valid;\ + s->bi_valid += len;\ + }\ +} +#endif /* DEBUG */ + + +/* the arguments must not have side effects */ + +/* =========================================================================== + * Initialize the various 'constant' tables. + */ +local void tr_static_init() +{ +#if defined(GEN_TREES_H) || !defined(STDC) + static int static_init_done = 0; + int n; /* iterates over tree elements */ + int bits; /* bit counter */ + int length; /* length value */ + int code; /* code value */ + int dist; /* distance index */ + ush bl_count[MAX_BITS+1]; + /* number of codes at each bit length for an optimal tree */ + + if (static_init_done) return; + + /* For some embedded targets, global variables are not initialized: */ + static_l_desc.static_tree = static_ltree; + static_l_desc.extra_bits = extra_lbits; + static_d_desc.static_tree = static_dtree; + static_d_desc.extra_bits = extra_dbits; + static_bl_desc.extra_bits = extra_blbits; + + /* Initialize the mapping length (0..255) -> length code (0..28) */ + length = 0; + for (code = 0; code < LENGTH_CODES-1; code++) { + base_length[code] = length; + for (n = 0; n < (1<<extra_lbits[code]); n++) { + _length_code[length++] = (uch)code; + } + } + Assert (length == 256, "tr_static_init: length != 256"); + /* Note that the length 255 (match length 258) can be represented + * in two different ways: code 284 + 5 bits or code 285, so we + * overwrite length_code[255] to use the best encoding: + */ + _length_code[length-1] = (uch)code; + + /* Initialize the mapping dist (0..32K) -> dist code (0..29) */ + dist = 0; + for (code = 0 ; code < 16; code++) { + base_dist[code] = dist; + for (n = 0; n < (1<<extra_dbits[code]); n++) { + _dist_code[dist++] = (uch)code; + } + } + Assert (dist == 256, "tr_static_init: dist != 256"); + dist >>= 7; /* from now on, all distances are divided by 128 */ + for ( ; code < D_CODES; code++) { + base_dist[code] = dist << 7; + for (n = 0; n < (1<<(extra_dbits[code]-7)); n++) { + _dist_code[256 + dist++] = (uch)code; + } + } + Assert (dist == 256, "tr_static_init: 256+dist != 512"); + + /* Construct the codes of the static literal tree */ + for (bits = 0; bits <= MAX_BITS; bits++) bl_count[bits] = 0; + n = 0; + while (n <= 143) static_ltree[n++].Len = 8, bl_count[8]++; + while (n <= 255) static_ltree[n++].Len = 9, bl_count[9]++; + while (n <= 279) static_ltree[n++].Len = 7, bl_count[7]++; + while (n <= 287) static_ltree[n++].Len = 8, bl_count[8]++; + /* Codes 286 and 287 do not exist, but we must include them in the + * tree construction to get a canonical Huffman tree (longest code + * all ones) + */ + gen_codes((ct_data *)static_ltree, L_CODES+1, bl_count); + + /* The static distance tree is trivial: */ + for (n = 0; n < D_CODES; n++) { + static_dtree[n].Len = 5; + static_dtree[n].Code = bi_reverse((unsigned)n, 5); + } + static_init_done = 1; + +# ifdef GEN_TREES_H + gen_trees_header(); +# endif +#endif /* defined(GEN_TREES_H) || !defined(STDC) */ +} + +/* =========================================================================== + * Genererate the file trees.h describing the static trees. + */ +#ifdef GEN_TREES_H +# ifndef DEBUG +# include <stdio.h> +# endif + +# define SEPARATOR(i, last, width) \ + ((i) == (last)? "\n};\n\n" : \ + ((i) % (width) == (width)-1 ? ",\n" : ", ")) + +void gen_trees_header() +{ + FILE *header = fopen("trees.h", "w"); + int i; + + Assert (header != NULL, "Can't open trees.h"); + fprintf(header, + "/* header created automatically with -DGEN_TREES_H */\n\n"); + + fprintf(header, "local const ct_data static_ltree[L_CODES+2] = {\n"); + for (i = 0; i < L_CODES+2; i++) { + fprintf(header, "{{%3u},{%3u}}%s", static_ltree[i].Code, + static_ltree[i].Len, SEPARATOR(i, L_CODES+1, 5)); + } + + fprintf(header, "local const ct_data static_dtree[D_CODES] = {\n"); + for (i = 0; i < D_CODES; i++) { + fprintf(header, "{{%2u},{%2u}}%s", static_dtree[i].Code, + static_dtree[i].Len, SEPARATOR(i, D_CODES-1, 5)); + } + + fprintf(header, "const uch _dist_code[DIST_CODE_LEN] = {\n"); + for (i = 0; i < DIST_CODE_LEN; i++) { + fprintf(header, "%2u%s", _dist_code[i], + SEPARATOR(i, DIST_CODE_LEN-1, 20)); + } + + fprintf(header, "const uch _length_code[MAX_MATCH-MIN_MATCH+1]= {\n"); + for (i = 0; i < MAX_MATCH-MIN_MATCH+1; i++) { + fprintf(header, "%2u%s", _length_code[i], + SEPARATOR(i, MAX_MATCH-MIN_MATCH, 20)); + } + + fprintf(header, "local const int base_length[LENGTH_CODES] = {\n"); + for (i = 0; i < LENGTH_CODES; i++) { + fprintf(header, "%1u%s", base_length[i], + SEPARATOR(i, LENGTH_CODES-1, 20)); + } + + fprintf(header, "local const int base_dist[D_CODES] = {\n"); + for (i = 0; i < D_CODES; i++) { + fprintf(header, "%5u%s", base_dist[i], + SEPARATOR(i, D_CODES-1, 10)); + } + + fclose(header); +} +#endif /* GEN_TREES_H */ + +/* =========================================================================== + * Initialize the tree data structures for a new zlib stream. + */ +void _tr_init(s) + deflate_state *s; +{ + tr_static_init(); + + s->l_desc.dyn_tree = s->dyn_ltree; + s->l_desc.stat_desc = &static_l_desc; + + s->d_desc.dyn_tree = s->dyn_dtree; + s->d_desc.stat_desc = &static_d_desc; + + s->bl_desc.dyn_tree = s->bl_tree; + s->bl_desc.stat_desc = &static_bl_desc; + + s->bi_buf = 0; + s->bi_valid = 0; + s->last_eob_len = 8; /* enough lookahead for inflate */ +#ifdef DEBUG + s->compressed_len = 0L; + s->bits_sent = 0L; +#endif + + /* Initialize the first block of the first file: */ + init_block(s); +} + +/* =========================================================================== + * Initialize a new block. + */ +local void init_block(s) + deflate_state *s; +{ + int n; /* iterates over tree elements */ + + /* Initialize the trees. */ + for (n = 0; n < L_CODES; n++) s->dyn_ltree[n].Freq = 0; + for (n = 0; n < D_CODES; n++) s->dyn_dtree[n].Freq = 0; + for (n = 0; n < BL_CODES; n++) s->bl_tree[n].Freq = 0; + + s->dyn_ltree[END_BLOCK].Freq = 1; + s->opt_len = s->static_len = 0L; + s->last_lit = s->matches = 0; +} + +#define SMALLEST 1 +/* Index within the heap array of least frequent node in the Huffman tree */ + + +/* =========================================================================== + * Remove the smallest element from the heap and recreate the heap with + * one less element. Updates heap and heap_len. + */ +#define pqremove(s, tree, top) \ +{\ + top = s->heap[SMALLEST]; \ + s->heap[SMALLEST] = s->heap[s->heap_len--]; \ + pqdownheap(s, tree, SMALLEST); \ +} + +/* =========================================================================== + * Compares to subtrees, using the tree depth as tie breaker when + * the subtrees have equal frequency. This minimizes the worst case length. + */ +#define smaller(tree, n, m, depth) \ + (tree[n].Freq < tree[m].Freq || \ + (tree[n].Freq == tree[m].Freq && depth[n] <= depth[m])) + +/* =========================================================================== + * Restore the heap property by moving down the tree starting at node k, + * exchanging a node with the smallest of its two sons if necessary, stopping + * when the heap property is re-established (each father smaller than its + * two sons). + */ +local void pqdownheap(s, tree, k) + deflate_state *s; + ct_data *tree; /* the tree to restore */ + int k; /* node to move down */ +{ + int v = s->heap[k]; + int j = k << 1; /* left son of k */ + while (j <= s->heap_len) { + /* Set j to the smallest of the two sons: */ + if (j < s->heap_len && + smaller(tree, s->heap[j+1], s->heap[j], s->depth)) { + j++; + } + /* Exit if v is smaller than both sons */ + if (smaller(tree, v, s->heap[j], s->depth)) break; + + /* Exchange v with the smallest son */ + s->heap[k] = s->heap[j]; k = j; + + /* And continue down the tree, setting j to the left son of k */ + j <<= 1; + } + s->heap[k] = v; +} + +/* =========================================================================== + * Compute the optimal bit lengths for a tree and update the total bit length + * for the current block. + * IN assertion: the fields freq and dad are set, heap[heap_max] and + * above are the tree nodes sorted by increasing frequency. + * OUT assertions: the field len is set to the optimal bit length, the + * array bl_count contains the frequencies for each bit length. + * The length opt_len is updated; static_len is also updated if stree is + * not null. + */ +local void gen_bitlen(s, desc) + deflate_state *s; + tree_desc *desc; /* the tree descriptor */ +{ + ct_data *tree = desc->dyn_tree; + int max_code = desc->max_code; + const ct_data *stree = desc->stat_desc->static_tree; + const intf *extra = desc->stat_desc->extra_bits; + int base = desc->stat_desc->extra_base; + int max_length = desc->stat_desc->max_length; + int h; /* heap index */ + int n, m; /* iterate over the tree elements */ + int bits; /* bit length */ + int xbits; /* extra bits */ + ush f; /* frequency */ + int overflow = 0; /* number of elements with bit length too large */ + + for (bits = 0; bits <= MAX_BITS; bits++) s->bl_count[bits] = 0; + + /* In a first pass, compute the optimal bit lengths (which may + * overflow in the case of the bit length tree). + */ + tree[s->heap[s->heap_max]].Len = 0; /* root of the heap */ + + for (h = s->heap_max+1; h < HEAP_SIZE; h++) { + n = s->heap[h]; + bits = tree[tree[n].Dad].Len + 1; + if (bits > max_length) bits = max_length, overflow++; + tree[n].Len = (ush)bits; + /* We overwrite tree[n].Dad which is no longer needed */ + + if (n > max_code) continue; /* not a leaf node */ + + s->bl_count[bits]++; + xbits = 0; + if (n >= base) xbits = extra[n-base]; + f = tree[n].Freq; + s->opt_len += (ulg)f * (bits + xbits); + if (stree) s->static_len += (ulg)f * (stree[n].Len + xbits); + } + if (overflow == 0) return; + + Trace((stderr,"\nbit length overflow\n")); + /* This happens for example on obj2 and pic of the Calgary corpus */ + + /* Find the first bit length which could increase: */ + do { + bits = max_length-1; + while (s->bl_count[bits] == 0) bits--; + s->bl_count[bits]--; /* move one leaf down the tree */ + s->bl_count[bits+1] += 2; /* move one overflow item as its brother */ + s->bl_count[max_length]--; + /* The brother of the overflow item also moves one step up, + * but this does not affect bl_count[max_length] + */ + overflow -= 2; + } while (overflow > 0); + + /* Now recompute all bit lengths, scanning in increasing frequency. + * h is still equal to HEAP_SIZE. (It is simpler to reconstruct all + * lengths instead of fixing only the wrong ones. This idea is taken + * from 'ar' written by Haruhiko Okumura.) + */ + for (bits = max_length; bits != 0; bits--) { + n = s->bl_count[bits]; + while (n != 0) { + m = s->heap[--h]; + if (m > max_code) continue; + if ((unsigned) tree[m].Len != (unsigned) bits) { + Trace((stderr,"code %d bits %d->%d\n", m, tree[m].Len, bits)); + s->opt_len += ((long)bits - (long)tree[m].Len) + *(long)tree[m].Freq; + tree[m].Len = (ush)bits; + } + n--; + } + } +} + +/* =========================================================================== + * Generate the codes for a given tree and bit counts (which need not be + * optimal). + * IN assertion: the array bl_count contains the bit length statistics for + * the given tree and the field len is set for all tree elements. + * OUT assertion: the field code is set for all tree elements of non + * zero code length. + */ +local void gen_codes (tree, max_code, bl_count) + ct_data *tree; /* the tree to decorate */ + int max_code; /* largest code with non zero frequency */ + ushf *bl_count; /* number of codes at each bit length */ +{ + ush next_code[MAX_BITS+1]; /* next code value for each bit length */ + ush code = 0; /* running code value */ + int bits; /* bit index */ + int n; /* code index */ + + /* The distribution counts are first used to generate the code values + * without bit reversal. + */ + for (bits = 1; bits <= MAX_BITS; bits++) { + next_code[bits] = code = (code + bl_count[bits-1]) << 1; + } + /* Check that the bit counts in bl_count are consistent. The last code + * must be all ones. + */ + Assert (code + bl_count[MAX_BITS]-1 == (1<<MAX_BITS)-1, + "inconsistent bit counts"); + Tracev((stderr,"\ngen_codes: max_code %d ", max_code)); + + for (n = 0; n <= max_code; n++) { + int len = tree[n].Len; + if (len == 0) continue; + /* Now reverse the bits */ + tree[n].Code = bi_reverse(next_code[len]++, len); + + Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ", + n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1)); + } +} + +/* =========================================================================== + * Construct one Huffman tree and assigns the code bit strings and lengths. + * Update the total bit length for the current block. + * IN assertion: the field freq is set for all tree elements. + * OUT assertions: the fields len and code are set to the optimal bit length + * and corresponding code. The length opt_len is updated; static_len is + * also updated if stree is not null. The field max_code is set. + */ +local void build_tree(s, desc) + deflate_state *s; + tree_desc *desc; /* the tree descriptor */ +{ + ct_data *tree = desc->dyn_tree; + const ct_data *stree = desc->stat_desc->static_tree; + int elems = desc->stat_desc->elems; + int n, m; /* iterate over heap elements */ + int max_code = -1; /* largest code with non zero frequency */ + int node; /* new node being created */ + + /* Construct the initial heap, with least frequent element in + * heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n+1]. + * heap[0] is not used. + */ + s->heap_len = 0, s->heap_max = HEAP_SIZE; + + for (n = 0; n < elems; n++) { + if (tree[n].Freq != 0) { + s->heap[++(s->heap_len)] = max_code = n; + s->depth[n] = 0; + } else { + tree[n].Len = 0; + } + } + + /* The pkzip format requires that at least one distance code exists, + * and that at least one bit should be sent even if there is only one + * possible code. So to avoid special checks later on we force at least + * two codes of non zero frequency. + */ + while (s->heap_len < 2) { + node = s->heap[++(s->heap_len)] = (max_code < 2 ? ++max_code : 0); + tree[node].Freq = 1; + s->depth[node] = 0; + s->opt_len--; if (stree) s->static_len -= stree[node].Len; + /* node is 0 or 1 so it does not have extra bits */ + } + desc->max_code = max_code; + + /* The elements heap[heap_len/2+1 .. heap_len] are leaves of the tree, + * establish sub-heaps of increasing lengths: + */ + for (n = s->heap_len/2; n >= 1; n--) pqdownheap(s, tree, n); + + /* Construct the Huffman tree by repeatedly combining the least two + * frequent nodes. + */ + node = elems; /* next internal node of the tree */ + do { + pqremove(s, tree, n); /* n = node of least frequency */ + m = s->heap[SMALLEST]; /* m = node of next least frequency */ + + s->heap[--(s->heap_max)] = n; /* keep the nodes sorted by frequency */ + s->heap[--(s->heap_max)] = m; + + /* Create a new node father of n and m */ + tree[node].Freq = tree[n].Freq + tree[m].Freq; + s->depth[node] = (uch)((s->depth[n] >= s->depth[m] ? + s->depth[n] : s->depth[m]) + 1); + tree[n].Dad = tree[m].Dad = (ush)node; +#ifdef DUMP_BL_TREE + if (tree == s->bl_tree) { + fprintf(stderr,"\nnode %d(%d), sons %d(%d) %d(%d)", + node, tree[node].Freq, n, tree[n].Freq, m, tree[m].Freq); + } +#endif + /* and insert the new node in the heap */ + s->heap[SMALLEST] = node++; + pqdownheap(s, tree, SMALLEST); + + } while (s->heap_len >= 2); + + s->heap[--(s->heap_max)] = s->heap[SMALLEST]; + + /* At this point, the fields freq and dad are set. We can now + * generate the bit lengths. + */ + gen_bitlen(s, (tree_desc *)desc); + + /* The field len is now set, we can generate the bit codes */ + gen_codes ((ct_data *)tree, max_code, s->bl_count); +} + +/* =========================================================================== + * Scan a literal or distance tree to determine the frequencies of the codes + * in the bit length tree. + */ +local void scan_tree (s, tree, max_code) + deflate_state *s; + ct_data *tree; /* the tree to be scanned */ + int max_code; /* and its largest code of non zero frequency */ +{ + int n; /* iterates over all tree elements */ + int prevlen = -1; /* last emitted length */ + int curlen; /* length of current code */ + int nextlen = tree[0].Len; /* length of next code */ + int count = 0; /* repeat count of the current code */ + int max_count = 7; /* max repeat count */ + int min_count = 4; /* min repeat count */ + + if (nextlen == 0) max_count = 138, min_count = 3; + tree[max_code+1].Len = (ush)0xffff; /* guard */ + + for (n = 0; n <= max_code; n++) { + curlen = nextlen; nextlen = tree[n+1].Len; + if (++count < max_count && curlen == nextlen) { + continue; + } else if (count < min_count) { + s->bl_tree[curlen].Freq += count; + } else if (curlen != 0) { + if (curlen != prevlen) s->bl_tree[curlen].Freq++; + s->bl_tree[REP_3_6].Freq++; + } else if (count <= 10) { + s->bl_tree[REPZ_3_10].Freq++; + } else { + s->bl_tree[REPZ_11_138].Freq++; + } + count = 0; prevlen = curlen; + if (nextlen == 0) { + max_count = 138, min_count = 3; + } else if (curlen == nextlen) { + max_count = 6, min_count = 3; + } else { + max_count = 7, min_count = 4; + } + } +} + +/* =========================================================================== + * Send a literal or distance tree in compressed form, using the codes in + * bl_tree. + */ +local void send_tree (s, tree, max_code) + deflate_state *s; + ct_data *tree; /* the tree to be scanned */ + int max_code; /* and its largest code of non zero frequency */ +{ + int n; /* iterates over all tree elements */ + int prevlen = -1; /* last emitted length */ + int curlen; /* length of current code */ + int nextlen = tree[0].Len; /* length of next code */ + int count = 0; /* repeat count of the current code */ + int max_count = 7; /* max repeat count */ + int min_count = 4; /* min repeat count */ + + /* tree[max_code+1].Len = -1; */ /* guard already set */ + if (nextlen == 0) max_count = 138, min_count = 3; + + for (n = 0; n <= max_code; n++) { + curlen = nextlen; nextlen = tree[n+1].Len; + if (++count < max_count && curlen == nextlen) { + continue; + } else if (count < min_count) { + do { send_code(s, curlen, s->bl_tree); } while (--count != 0); + + } else if (curlen != 0) { + if (curlen != prevlen) { + send_code(s, curlen, s->bl_tree); count--; + } + Assert(count >= 3 && count <= 6, " 3_6?"); + send_code(s, REP_3_6, s->bl_tree); send_bits(s, count-3, 2); + + } else if (count <= 10) { + send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count-3, 3); + + } else { + send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count-11, 7); + } + count = 0; prevlen = curlen; + if (nextlen == 0) { + max_count = 138, min_count = 3; + } else if (curlen == nextlen) { + max_count = 6, min_count = 3; + } else { + max_count = 7, min_count = 4; + } + } +} + +/* =========================================================================== + * Construct the Huffman tree for the bit lengths and return the index in + * bl_order of the last bit length code to send. + */ +local int build_bl_tree(s) + deflate_state *s; +{ + int max_blindex; /* index of last bit length code of non zero freq */ + + /* Determine the bit length frequencies for literal and distance trees */ + scan_tree(s, (ct_data *)s->dyn_ltree, s->l_desc.max_code); + scan_tree(s, (ct_data *)s->dyn_dtree, s->d_desc.max_code); + + /* Build the bit length tree: */ + build_tree(s, (tree_desc *)(&(s->bl_desc))); + /* opt_len now includes the length of the tree representations, except + * the lengths of the bit lengths codes and the 5+5+4 bits for the counts. + */ + + /* Determine the number of bit length codes to send. The pkzip format + * requires that at least 4 bit length codes be sent. (appnote.txt says + * 3 but the actual value used is 4.) + */ + for (max_blindex = BL_CODES-1; max_blindex >= 3; max_blindex--) { + if (s->bl_tree[bl_order[max_blindex]].Len != 0) break; + } + /* Update opt_len to include the bit length tree and counts */ + s->opt_len += 3*(max_blindex+1) + 5+5+4; + Tracev((stderr, "\ndyn trees: dyn %ld, stat %ld", + s->opt_len, s->static_len)); + + return max_blindex; +} + +/* =========================================================================== + * Send the header for a block using dynamic Huffman trees: the counts, the + * lengths of the bit length codes, the literal tree and the distance tree. + * IN assertion: lcodes >= 257, dcodes >= 1, blcodes >= 4. + */ +local void send_all_trees(s, lcodes, dcodes, blcodes) + deflate_state *s; + int lcodes, dcodes, blcodes; /* number of codes for each tree */ +{ + int rank; /* index in bl_order */ + + Assert (lcodes >= 257 && dcodes >= 1 && blcodes >= 4, "not enough codes"); + Assert (lcodes <= L_CODES && dcodes <= D_CODES && blcodes <= BL_CODES, + "too many codes"); + Tracev((stderr, "\nbl counts: ")); + send_bits(s, lcodes-257, 5); /* not +255 as stated in appnote.txt */ + send_bits(s, dcodes-1, 5); + send_bits(s, blcodes-4, 4); /* not -3 as stated in appnote.txt */ + for (rank = 0; rank < blcodes; rank++) { + Tracev((stderr, "\nbl code %2d ", bl_order[rank])); + send_bits(s, s->bl_tree[bl_order[rank]].Len, 3); + } + Tracev((stderr, "\nbl tree: sent %ld", s->bits_sent)); + + send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1); /* literal tree */ + Tracev((stderr, "\nlit tree: sent %ld", s->bits_sent)); + + send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1); /* distance tree */ + Tracev((stderr, "\ndist tree: sent %ld", s->bits_sent)); +} + +/* =========================================================================== + * Send a stored block + */ +void _tr_stored_block(s, buf, stored_len, eof) + deflate_state *s; + charf *buf; /* input block */ + ulg stored_len; /* length of input block */ + int eof; /* true if this is the last block for a file */ +{ + send_bits(s, (STORED_BLOCK<<1)+eof, 3); /* send block type */ +#ifdef DEBUG + s->compressed_len = (s->compressed_len + 3 + 7) & (ulg)~7L; + s->compressed_len += (stored_len + 4) << 3; +#endif + copy_block(s, buf, (unsigned)stored_len, 1); /* with header */ +} + +/* =========================================================================== + * Send one empty static block to give enough lookahead for inflate. + * This takes 10 bits, of which 7 may remain in the bit buffer. + * The current inflate code requires 9 bits of lookahead. If the + * last two codes for the previous block (real code plus EOB) were coded + * on 5 bits or less, inflate may have only 5+3 bits of lookahead to decode + * the last real code. In this case we send two empty static blocks instead + * of one. (There are no problems if the previous block is stored or fixed.) + * To simplify the code, we assume the worst case of last real code encoded + * on one bit only. + */ +void _tr_align(s) + deflate_state *s; +{ + send_bits(s, STATIC_TREES<<1, 3); + send_code(s, END_BLOCK, static_ltree); +#ifdef DEBUG + s->compressed_len += 10L; /* 3 for block type, 7 for EOB */ +#endif + bi_flush(s); + /* Of the 10 bits for the empty block, we have already sent + * (10 - bi_valid) bits. The lookahead for the last real code (before + * the EOB of the previous block) was thus at least one plus the length + * of the EOB plus what we have just sent of the empty static block. + */ + if (1 + s->last_eob_len + 10 - s->bi_valid < 9) { + send_bits(s, STATIC_TREES<<1, 3); + send_code(s, END_BLOCK, static_ltree); +#ifdef DEBUG + s->compressed_len += 10L; +#endif + bi_flush(s); + } + s->last_eob_len = 7; +} + +/* =========================================================================== + * Determine the best encoding for the current block: dynamic trees, static + * trees or store, and output the encoded block to the zip file. + */ +void _tr_flush_block(s, buf, stored_len, eof) + deflate_state *s; + charf *buf; /* input block, or NULL if too old */ + ulg stored_len; /* length of input block */ + int eof; /* true if this is the last block for a file */ +{ + ulg opt_lenb, static_lenb; /* opt_len and static_len in bytes */ + int max_blindex = 0; /* index of last bit length code of non zero freq */ + + /* Build the Huffman trees unless a stored block is forced */ + if (s->level > 0) { + + /* Check if the file is binary or text */ + if (stored_len > 0 && s->strm->data_type == Z_UNKNOWN) + set_data_type(s); + + /* Construct the literal and distance trees */ + build_tree(s, (tree_desc *)(&(s->l_desc))); + Tracev((stderr, "\nlit data: dyn %ld, stat %ld", s->opt_len, + s->static_len)); + + build_tree(s, (tree_desc *)(&(s->d_desc))); + Tracev((stderr, "\ndist data: dyn %ld, stat %ld", s->opt_len, + s->static_len)); + /* At this point, opt_len and static_len are the total bit lengths of + * the compressed block data, excluding the tree representations. + */ + + /* Build the bit length tree for the above two trees, and get the index + * in bl_order of the last bit length code to send. + */ + max_blindex = build_bl_tree(s); + + /* Determine the best encoding. Compute the block lengths in bytes. */ + opt_lenb = (s->opt_len+3+7)>>3; + static_lenb = (s->static_len+3+7)>>3; + + Tracev((stderr, "\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ", + opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len, + s->last_lit)); + + if (static_lenb <= opt_lenb) opt_lenb = static_lenb; + + } else { + Assert(buf != (char*)0, "lost buf"); + opt_lenb = static_lenb = stored_len + 5; /* force a stored block */ + } + +#ifdef FORCE_STORED + if (buf != (char*)0) { /* force stored block */ +#else + if (stored_len+4 <= opt_lenb && buf != (char*)0) { + /* 4: two words for the lengths */ +#endif + /* The test buf != NULL is only necessary if LIT_BUFSIZE > WSIZE. + * Otherwise we can't have processed more than WSIZE input bytes since + * the last block flush, because compression would have been + * successful. If LIT_BUFSIZE <= WSIZE, it is never too late to + * transform a block into a stored block. + */ + _tr_stored_block(s, buf, stored_len, eof); + +#ifdef FORCE_STATIC + } else if (static_lenb >= 0) { /* force static trees */ +#else + } else if (s->strategy == Z_FIXED || static_lenb == opt_lenb) { +#endif + send_bits(s, (STATIC_TREES<<1)+eof, 3); + compress_block(s, (ct_data *)static_ltree, (ct_data *)static_dtree); +#ifdef DEBUG + s->compressed_len += 3 + s->static_len; +#endif + } else { + send_bits(s, (DYN_TREES<<1)+eof, 3); + send_all_trees(s, s->l_desc.max_code+1, s->d_desc.max_code+1, + max_blindex+1); + compress_block(s, (ct_data *)s->dyn_ltree, (ct_data *)s->dyn_dtree); +#ifdef DEBUG + s->compressed_len += 3 + s->opt_len; +#endif + } + Assert (s->compressed_len == s->bits_sent, "bad compressed size"); + /* The above check is made mod 2^32, for files larger than 512 MB + * and uLong implemented on 32 bits. + */ + init_block(s); + + if (eof) { + bi_windup(s); +#ifdef DEBUG + s->compressed_len += 7; /* align on byte boundary */ +#endif + } + Tracev((stderr,"\ncomprlen %lu(%lu) ", s->compressed_len>>3, + s->compressed_len-7*eof)); +} + +/* =========================================================================== + * Save the match info and tally the frequency counts. Return true if + * the current block must be flushed. + */ +int _tr_tally (s, dist, lc) + deflate_state *s; + unsigned dist; /* distance of matched string */ + unsigned lc; /* match length-MIN_MATCH or unmatched char (if dist==0) */ +{ + s->d_buf[s->last_lit] = (ush)dist; + s->l_buf[s->last_lit++] = (uch)lc; + if (dist == 0) { + /* lc is the unmatched char */ + s->dyn_ltree[lc].Freq++; + } else { + s->matches++; + /* Here, lc is the match length - MIN_MATCH */ + dist--; /* dist = match distance - 1 */ + Assert((ush)dist < (ush)MAX_DIST(s) && + (ush)lc <= (ush)(MAX_MATCH-MIN_MATCH) && + (ush)d_code(dist) < (ush)D_CODES, "_tr_tally: bad match"); + + s->dyn_ltree[_length_code[lc]+LITERALS+1].Freq++; + s->dyn_dtree[d_code(dist)].Freq++; + } + +#ifdef TRUNCATE_BLOCK + /* Try to guess if it is profitable to stop the current block here */ + if ((s->last_lit & 0x1fff) == 0 && s->level > 2) { + /* Compute an upper bound for the compressed length */ + ulg out_length = (ulg)s->last_lit*8L; + ulg in_length = (ulg)((long)s->strstart - s->block_start); + int dcode; + for (dcode = 0; dcode < D_CODES; dcode++) { + out_length += (ulg)s->dyn_dtree[dcode].Freq * + (5L+extra_dbits[dcode]); + } + out_length >>= 3; + Tracev((stderr,"\nlast_lit %u, in %ld, out ~%ld(%ld%%) ", + s->last_lit, in_length, out_length, + 100L - out_length*100L/in_length)); + if (s->matches < s->last_lit/2 && out_length < in_length/2) return 1; + } +#endif + return (s->last_lit == s->lit_bufsize-1); + /* We avoid equality with lit_bufsize because of wraparound at 64K + * on 16 bit machines and because stored blocks are restricted to + * 64K-1 bytes. + */ +} + +/* =========================================================================== + * Send the block data compressed using the given Huffman trees + */ +local void compress_block(s, ltree, dtree) + deflate_state *s; + ct_data *ltree; /* literal tree */ + ct_data *dtree; /* distance tree */ +{ + unsigned dist; /* distance of matched string */ + int lc; /* match length or unmatched char (if dist == 0) */ + unsigned lx = 0; /* running index in l_buf */ + unsigned code; /* the code to send */ + int extra; /* number of extra bits to send */ + + if (s->last_lit != 0) do { + dist = s->d_buf[lx]; + lc = s->l_buf[lx++]; + if (dist == 0) { + send_code(s, lc, ltree); /* send a literal byte */ + Tracecv(isgraph(lc), (stderr," '%c' ", lc)); + } else { + /* Here, lc is the match length - MIN_MATCH */ + code = _length_code[lc]; + send_code(s, code+LITERALS+1, ltree); /* send the length code */ + extra = extra_lbits[code]; + if (extra != 0) { + lc -= base_length[code]; + send_bits(s, lc, extra); /* send the extra length bits */ + } + dist--; /* dist is now the match distance - 1 */ + code = d_code(dist); + Assert (code < D_CODES, "bad d_code"); + + send_code(s, code, dtree); /* send the distance code */ + extra = extra_dbits[code]; + if (extra != 0) { + dist -= base_dist[code]; + send_bits(s, dist, extra); /* send the extra distance bits */ + } + } /* literal or match pair ? */ + + /* Check that the overlay between pending_buf and d_buf+l_buf is ok: */ + Assert((uInt)(s->pending) < s->lit_bufsize + 2*lx, + "pendingBuf overflow"); + + } while (lx < s->last_lit); + + send_code(s, END_BLOCK, ltree); + s->last_eob_len = ltree[END_BLOCK].Len; +} + +/* =========================================================================== + * Set the data type to BINARY or TEXT, using a crude approximation: + * set it to Z_TEXT if all symbols are either printable characters (33 to 255) + * or white spaces (9 to 13, or 32); or set it to Z_BINARY otherwise. + * IN assertion: the fields Freq of dyn_ltree are set. + */ +local void set_data_type(s) + deflate_state *s; +{ + int n; + + for (n = 0; n < 9; n++) + if (s->dyn_ltree[n].Freq != 0) + break; + if (n == 9) + for (n = 14; n < 32; n++) + if (s->dyn_ltree[n].Freq != 0) + break; + s->strm->data_type = (n == 32) ? Z_TEXT : Z_BINARY; +} + +/* =========================================================================== + * Reverse the first len bits of a code, using straightforward code (a faster + * method would use a table) + * IN assertion: 1 <= len <= 15 + */ +local unsigned bi_reverse(code, len) + unsigned code; /* the value to invert */ + int len; /* its bit length */ +{ + register unsigned res = 0; + do { + res |= code & 1; + code >>= 1, res <<= 1; + } while (--len > 0); + return res >> 1; +} + +/* =========================================================================== + * Flush the bit buffer, keeping at most 7 bits in it. + */ +local void bi_flush(s) + deflate_state *s; +{ + if (s->bi_valid == 16) { + put_short(s, s->bi_buf); + s->bi_buf = 0; + s->bi_valid = 0; + } else if (s->bi_valid >= 8) { + put_byte(s, (Byte)s->bi_buf); + s->bi_buf >>= 8; + s->bi_valid -= 8; + } +} + +/* =========================================================================== + * Flush the bit buffer and align the output on a byte boundary + */ +local void bi_windup(s) + deflate_state *s; +{ + if (s->bi_valid > 8) { + put_short(s, s->bi_buf); + } else if (s->bi_valid > 0) { + put_byte(s, (Byte)s->bi_buf); + } + s->bi_buf = 0; + s->bi_valid = 0; +#ifdef DEBUG + s->bits_sent = (s->bits_sent+7) & ~7; +#endif +} + +/* =========================================================================== + * Copy a stored block, storing first the length and its + * one's complement if requested. + */ +local void copy_block(s, buf, len, header) + deflate_state *s; + charf *buf; /* the input data */ + unsigned len; /* its length */ + int header; /* true if block header must be written */ +{ + bi_windup(s); /* align on byte boundary */ + s->last_eob_len = 8; /* enough lookahead for inflate */ + + if (header) { + put_short(s, (ush)len); + put_short(s, (ush)~len); +#ifdef DEBUG + s->bits_sent += 2*16; +#endif + } +#ifdef DEBUG + s->bits_sent += (ulg)len<<3; +#endif + while (len--) { + put_byte(s, *buf++); + } +} diff --git a/sys/contrib/opensolaris/uts/common/zmod/zconf.h b/sys/contrib/opensolaris/uts/common/zmod/zconf.h new file mode 100644 index 0000000..ccce7b2 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/zconf.h @@ -0,0 +1,117 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZCONF_H +#define _ZCONF_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * We don't want to turn on zlib's debugging. + */ +#undef DEBUG + +/* + * We define our own memory allocation and deallocation routines that use kmem. + */ +#define MY_ZCALLOC + +/* + * We don't define HAVE_MEMCPY here, but do in zutil.c, and implement our + * our versions of zmemcpy(), zmemzero(), and zmemcmp(). + */ + +/* + * We have a sufficiently capable compiler as to not need zlib's compiler hack. + */ +#define NO_DUMMY_DECL + +#define compressBound(len) (len + (len >> 12) + (len >> 14) + 11) + +#define z_off_t off_t +#define OF(p) p +#define ZEXTERN extern +#define ZEXPORT +#define ZEXPORTVA +#define FAR + +#define deflateInit_ z_deflateInit_ +#define deflate z_deflate +#define deflateEnd z_deflateEnd +#define inflateInit_ z_inflateInit_ +#define inflate z_inflate +#define inflateEnd z_inflateEnd +#define deflateInit2_ z_deflateInit2_ +#define deflateSetDictionary z_deflateSetDictionary +#define deflateCopy z_deflateCopy +#define deflateReset z_deflateReset +#define deflateParams z_deflateParams +#define deflateBound z_deflateBound +#define deflatePrime z_deflatePrime +#define inflateInit2_ z_inflateInit2_ +#define inflateSetDictionary z_inflateSetDictionary +#define inflateSync z_inflateSync +#define inflateSyncPoint z_inflateSyncPoint +#define inflateCopy z_inflateCopy +#define inflateReset z_inflateReset +#define inflateBack z_inflateBack +#define inflateBackEnd z_inflateBackEnd +#define compress zz_compress +#define compress2 zz_compress2 +#define uncompress zz_uncompress +#define adler32 z_adler32 +#define crc32 z_crc32 +#define get_crc_table z_get_crc_table +#define zError z_zError + +#define MAX_MEM_LEVEL 9 +#define MAX_WBITS 15 + +typedef unsigned char Byte; +typedef unsigned int uInt; +typedef unsigned long uLong; +typedef Byte Bytef; +typedef char charf; +typedef int intf; +typedef uInt uIntf; +typedef uLong uLongf; +typedef void *voidpc; +typedef void *voidpf; +typedef void *voidp; + +#ifdef __cplusplus +} +#endif + +#endif /* _ZCONF_H */ diff --git a/sys/contrib/opensolaris/uts/common/zmod/zlib.h b/sys/contrib/opensolaris/uts/common/zmod/zlib.h new file mode 100644 index 0000000..9b971a0 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/zlib.h @@ -0,0 +1,1359 @@ +/* zlib.h -- interface of the 'zlib' general purpose compression library + version 1.2.3, July 18th, 2005 + + Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + + + The data format used by the zlib library is described by RFCs (Request for + Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt + (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). +*/ + +#ifndef _ZLIB_H +#define _ZLIB_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "zconf.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZLIB_VERSION "1.2.3" +#define ZLIB_VERNUM 0x1230 + +/* + The 'zlib' compression library provides in-memory compression and + decompression functions, including integrity checks of the uncompressed + data. This version of the library supports only one compression method + (deflation) but other algorithms will be added later and will have the same + stream interface. + + Compression can be done in a single step if the buffers are large + enough (for example if an input file is mmap'ed), or can be done by + repeated calls of the compression function. In the latter case, the + application must provide more input and/or consume the output + (providing more output space) before each call. + + The compressed data format used by default by the in-memory functions is + the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped + around a deflate stream, which is itself documented in RFC 1951. + + The library also supports reading and writing files in gzip (.gz) format + with an interface similar to that of stdio using the functions that start + with "gz". The gzip format is different from the zlib format. gzip is a + gzip wrapper, documented in RFC 1952, wrapped around a deflate stream. + + This library can optionally read and write gzip streams in memory as well. + + The zlib format was designed to be compact and fast for use in memory + and on communications channels. The gzip format was designed for single- + file compression on file systems, has a larger header than zlib to maintain + directory information, and uses a different, slower check method than zlib. + + The library does not install any signal handler. The decoder checks + the consistency of the compressed data, so the library should never + crash even in case of corrupted input. +*/ + +typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size)); +typedef void (*free_func) OF((voidpf opaque, voidpf address)); + +struct internal_state; + +typedef struct z_stream_s { + Bytef *next_in; /* next input byte */ + uInt avail_in; /* number of bytes available at next_in */ + uLong total_in; /* total nb of input bytes read so far */ + + Bytef *next_out; /* next output byte should be put there */ + uInt avail_out; /* remaining free space at next_out */ + uLong total_out; /* total nb of bytes output so far */ + + char *msg; /* last error message, NULL if no error */ + struct internal_state FAR *state; /* not visible by applications */ + + alloc_func zalloc; /* used to allocate the internal state */ + free_func zfree; /* used to free the internal state */ + voidpf opaque; /* private data object passed to zalloc and zfree */ + + int data_type; /* best guess about the data type: binary or text */ + uLong adler; /* adler32 value of the uncompressed data */ + uLong reserved; /* reserved for future use */ +} z_stream; + +typedef z_stream FAR *z_streamp; + +/* + gzip header information passed to and from zlib routines. See RFC 1952 + for more details on the meanings of these fields. +*/ +typedef struct gz_header_s { + int text; /* true if compressed data believed to be text */ + uLong time; /* modification time */ + int xflags; /* extra flags (not used when writing a gzip file) */ + int os; /* operating system */ + Bytef *extra; /* pointer to extra field or Z_NULL if none */ + uInt extra_len; /* extra field length (valid if extra != Z_NULL) */ + uInt extra_max; /* space at extra (only when reading header) */ + Bytef *name; /* pointer to zero-terminated file name or Z_NULL */ + uInt name_max; /* space at name (only when reading header) */ + Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */ + uInt comm_max; /* space at comment (only when reading header) */ + int hcrc; /* true if there was or will be a header crc */ + int done; /* true when done reading gzip header (not used + when writing a gzip file) */ +} gz_header; + +typedef gz_header FAR *gz_headerp; + +/* + The application must update next_in and avail_in when avail_in has + dropped to zero. It must update next_out and avail_out when avail_out + has dropped to zero. The application must initialize zalloc, zfree and + opaque before calling the init function. All other fields are set by the + compression library and must not be updated by the application. + + The opaque value provided by the application will be passed as the first + parameter for calls of zalloc and zfree. This can be useful for custom + memory management. The compression library attaches no meaning to the + opaque value. + + zalloc must return Z_NULL if there is not enough memory for the object. + If zlib is used in a multi-threaded application, zalloc and zfree must be + thread safe. + + On 16-bit systems, the functions zalloc and zfree must be able to allocate + exactly 65536 bytes, but will not be required to allocate more than this + if the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS, + pointers returned by zalloc for objects of exactly 65536 bytes *must* + have their offset normalized to zero. The default allocation function + provided by this library ensures this (see zutil.c). To reduce memory + requirements and avoid any allocation of 64K objects, at the expense of + compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h). + + The fields total_in and total_out can be used for statistics or + progress reports. After compression, total_in holds the total size of + the uncompressed data and may be saved for use in the decompressor + (particularly if the decompressor wants to decompress everything in + a single step). +*/ + + /* constants */ + +#define Z_NO_FLUSH 0 +#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */ +#define Z_SYNC_FLUSH 2 +#define Z_FULL_FLUSH 3 +#define Z_FINISH 4 +#define Z_BLOCK 5 +/* Allowed flush values; see deflate() and inflate() below for details */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) +/* Return codes for the compression/decompression functions. Negative + * values are errors, positive values are used for special but normal events. + */ + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) +/* compression levels */ + +#define Z_FILTERED 1 +#define Z_HUFFMAN_ONLY 2 +#define Z_RLE 3 +#define Z_FIXED 4 +#define Z_DEFAULT_STRATEGY 0 +/* compression strategy; see deflateInit2() below for details */ + +#define Z_BINARY 0 +#define Z_TEXT 1 +#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */ +#define Z_UNKNOWN 2 +/* Possible values of the data_type field (though see inflate()) */ + +#define Z_DEFLATED 8 +/* The deflate compression method (the only one supported in this version) */ + +#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */ + +#define zlib_version zlibVersion() +/* for compatibility with versions < 1.0.2 */ + + /* basic functions */ + +ZEXTERN const char * ZEXPORT zlibVersion OF((void)); +/* The application can compare zlibVersion and ZLIB_VERSION for consistency. + If the first character differs, the library code actually used is + not compatible with the zlib.h header file used by the application. + This check is automatically made by deflateInit and inflateInit. + */ + +/* +ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level)); + + Initializes the internal stream state for compression. The fields + zalloc, zfree and opaque must be initialized before by the caller. + If zalloc and zfree are set to Z_NULL, deflateInit updates them to + use default allocation functions. + + The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9: + 1 gives best speed, 9 gives best compression, 0 gives no compression at + all (the input data is simply copied a block at a time). + Z_DEFAULT_COMPRESSION requests a default compromise between speed and + compression (currently equivalent to level 6). + + deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if level is not a valid compression level, + Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible + with the version assumed by the caller (ZLIB_VERSION). + msg is set to null if there is no error message. deflateInit does not + perform any compression: this will be done by deflate(). +*/ + + +ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush)); +/* + deflate compresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce some + output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. deflate performs one or both of the + following actions: + + - Compress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in and avail_in are updated and + processing will resume at this point for the next call of deflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. This action is forced if the parameter flush is non zero. + Forcing flush frequently degrades the compression ratio, so this parameter + should be set only when necessary (in interactive applications). + Some output may be provided even if flush is not set. + + Before the call of deflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming + more output, and updating avail_in or avail_out accordingly; avail_out + should never be zero before the call. The application can consume the + compressed output when it wants, for example when the output buffer is full + (avail_out == 0), or after each call of deflate(). If deflate returns Z_OK + and with zero avail_out, it must be called again after making room in the + output buffer because there might be more output pending. + + Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to + decide how much data to accumualte before producing output, in order to + maximize compression. + + If the parameter flush is set to Z_SYNC_FLUSH, all pending output is + flushed to the output buffer and the output is aligned on a byte boundary, so + that the decompressor can get all input data available so far. (In particular + avail_in is zero after the call if enough output space has been provided + before the call.) Flushing may degrade compression for some compression + algorithms and so it should be used only when necessary. + + If flush is set to Z_FULL_FLUSH, all output is flushed as with + Z_SYNC_FLUSH, and the compression state is reset so that decompression can + restart from this point if previous compressed data has been damaged or if + random access is desired. Using Z_FULL_FLUSH too often can seriously degrade + compression. + + If deflate returns with avail_out == 0, this function must be called again + with the same value of the flush parameter and more output space (updated + avail_out), until the flush is complete (deflate returns with non-zero + avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that + avail_out is greater than six to avoid repeated flush markers due to + avail_out == 0 on return. + + If the parameter flush is set to Z_FINISH, pending input is processed, + pending output is flushed and deflate returns with Z_STREAM_END if there + was enough output space; if deflate returns with Z_OK, this function must be + called again with Z_FINISH and more output space (updated avail_out) but no + more input data, until it returns with Z_STREAM_END or an error. After + deflate has returned Z_STREAM_END, the only possible operations on the + stream are deflateReset or deflateEnd. + + Z_FINISH can be used immediately after deflateInit if all the compression + is to be done in a single step. In this case, avail_out must be at least + the value returned by deflateBound (see below). If deflate does not return + Z_STREAM_END, then it must be called again as described above. + + deflate() sets strm->adler to the adler32 checksum of all input read + so far (that is, total_in bytes). + + deflate() may update strm->data_type if it can make a good guess about + the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered + binary. This field is only for information purposes and does not affect + the compression algorithm in any manner. + + deflate() returns Z_OK if some progress has been made (more input + processed or more output produced), Z_STREAM_END if all input has been + consumed and all output has been produced (only when flush is set to + Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example + if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible + (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not + fatal, and deflate() can be called again with more input and more output + space to continue compressing. +*/ + + +ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any + pending output. + + deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the + stream state was inconsistent, Z_DATA_ERROR if the stream was freed + prematurely (some input or output was discarded). In the error case, + msg may be set but then points to a static string (which must not be + deallocated). +*/ + + +/* +ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm)); + + Initializes the internal stream state for decompression. The fields + next_in, avail_in, zalloc, zfree and opaque must be initialized before by + the caller. If next_in is not Z_NULL and avail_in is large enough (the exact + value depends on the compression method), inflateInit determines the + compression method from the zlib header and allocates all data structures + accordingly; otherwise the allocation will be deferred to the first call of + inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to + use default allocation functions. + + inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_VERSION_ERROR if the zlib library version is incompatible with the + version assumed by the caller. msg is set to null if there is no error + message. inflateInit does not perform any decompression apart from reading + the zlib header if present: this will be done by inflate(). (So next_in and + avail_in may be modified, but next_out and avail_out are unchanged.) +*/ + + +ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush)); +/* + inflate decompresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce + some output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. inflate performs one or both of the + following actions: + + - Decompress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in is updated and processing + will resume at this point for the next call of inflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. inflate() provides as much output as possible, until there + is no more input data or no more space in the output buffer (see below + about the flush parameter). + + Before the call of inflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming + more output, and updating the next_* and avail_* values accordingly. + The application can consume the uncompressed output when it wants, for + example when the output buffer is full (avail_out == 0), or after each + call of inflate(). If inflate returns Z_OK and with zero avail_out, it + must be called again after making room in the output buffer because there + might be more output pending. + + The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, + Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much + output as possible to the output buffer. Z_BLOCK requests that inflate() stop + if and when it gets to the next deflate block boundary. When decoding the + zlib or gzip format, this will cause inflate() to return immediately after + the header and before the first block. When doing a raw inflate, inflate() + will go ahead and process the first block, and will return when it gets to + the end of that block, or when it runs out of data. + + The Z_BLOCK option assists in appending to or combining deflate streams. + Also to assist in this, on return inflate() will set strm->data_type to the + number of unused bits in the last byte taken from strm->next_in, plus 64 + if inflate() is currently decoding the last block in the deflate stream, + plus 128 if inflate() returned immediately after decoding an end-of-block + code or decoding the complete header up to just before the first byte of the + deflate stream. The end-of-block will not be indicated until all of the + uncompressed data from that block has been written to strm->next_out. The + number of unused bits may in general be greater than seven, except when + bit 7 of data_type is set, in which case the number of unused bits will be + less than eight. + + inflate() should normally be called until it returns Z_STREAM_END or an + error. However if all decompression is to be performed in a single step + (a single call of inflate), the parameter flush should be set to + Z_FINISH. In this case all pending input is processed and all pending + output is flushed; avail_out must be large enough to hold all the + uncompressed data. (The size of the uncompressed data may have been saved + by the compressor for this purpose.) The next operation on this stream must + be inflateEnd to deallocate the decompression state. The use of Z_FINISH + is never required, but can be used to inform inflate that a faster approach + may be used for the single inflate() call. + + In this implementation, inflate() always flushes as much output as + possible to the output buffer, and always uses the faster approach on the + first call. So the only effect of the flush parameter in this implementation + is on the return value of inflate(), as noted below, or when it returns early + because Z_BLOCK is used. + + If a preset dictionary is needed after this call (see inflateSetDictionary + below), inflate sets strm->adler to the adler32 checksum of the dictionary + chosen by the compressor and returns Z_NEED_DICT; otherwise it sets + strm->adler to the adler32 checksum of all output produced so far (that is, + total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described + below. At the end of the stream, inflate() checks that its computed adler32 + checksum is equal to that saved by the compressor and returns Z_STREAM_END + only if the checksum is correct. + + inflate() will decompress and check either zlib-wrapped or gzip-wrapped + deflate data. The header type is detected automatically. Any information + contained in the gzip header is not retained, so applications that need that + information should instead use raw inflate, see inflateInit2() below, or + inflateBack() and perform their own processing of the gzip header and + trailer. + + inflate() returns Z_OK if some progress has been made (more input processed + or more output produced), Z_STREAM_END if the end of the compressed data has + been reached and all uncompressed output has been produced, Z_NEED_DICT if a + preset dictionary is needed at this point, Z_DATA_ERROR if the input data was + corrupted (input stream not conforming to the zlib format or incorrect check + value), Z_STREAM_ERROR if the stream structure was inconsistent (for example + if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory, + Z_BUF_ERROR if no progress is possible or if there was not enough room in the + output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and + inflate() can be called again with more input and more output space to + continue decompressing. If Z_DATA_ERROR is returned, the application may then + call inflateSync() to look for a good compression block if a partial recovery + of the data is desired. +*/ + + +ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any + pending output. + + inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state + was inconsistent. In the error case, msg may be set but then points to a + static string (which must not be deallocated). +*/ + + /* Advanced functions */ + +/* + The following functions are needed only in some special applications. +*/ + +/* +ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm, + int level, + int method, + int windowBits, + int memLevel, + int strategy)); + + This is another version of deflateInit with more compression options. The + fields next_in, zalloc, zfree and opaque must be initialized before by + the caller. + + The method parameter is the compression method. It must be Z_DEFLATED in + this version of the library. + + The windowBits parameter is the base two logarithm of the window size + (the size of the history buffer). It should be in the range 8..15 for this + version of the library. Larger values of this parameter result in better + compression at the expense of memory usage. The default value is 15 if + deflateInit is used instead. + + windowBits can also be -8..-15 for raw deflate. In this case, -windowBits + determines the window size. deflate() will then generate raw deflate data + with no zlib header or trailer, and will not compute an adler32 check value. + + windowBits can also be greater than 15 for optional gzip encoding. Add + 16 to windowBits to write a simple gzip header and trailer around the + compressed data instead of a zlib wrapper. The gzip header will have no + file name, no extra data, no comment, no modification time (set to zero), + no header crc, and the operating system will be set to 255 (unknown). If a + gzip stream is being written, strm->adler is a crc32 instead of an adler32. + + The memLevel parameter specifies how much memory should be allocated + for the internal compression state. memLevel=1 uses minimum memory but + is slow and reduces compression ratio; memLevel=9 uses maximum memory + for optimal speed. The default value is 8. See zconf.h for total memory + usage as a function of windowBits and memLevel. + + The strategy parameter is used to tune the compression algorithm. Use the + value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a + filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no + string match), or Z_RLE to limit match distances to one (run-length + encoding). Filtered data consists mostly of small values with a somewhat + random distribution. In this case, the compression algorithm is tuned to + compress them better. The effect of Z_FILTERED is to force more Huffman + coding and less string matching; it is somewhat intermediate between + Z_DEFAULT and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as fast as + Z_HUFFMAN_ONLY, but give better compression for PNG image data. The strategy + parameter only affects the compression ratio but not the correctness of the + compressed output even if it is not set appropriately. Z_FIXED prevents the + use of dynamic Huffman codes, allowing for a simpler decoder for special + applications. + + deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid + method). msg is set to null if there is no error message. deflateInit2 does + not perform any compression: this will be done by deflate(). +*/ + +ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm, + const Bytef *dictionary, + uInt dictLength)); +/* + Initializes the compression dictionary from the given byte sequence + without producing any compressed output. This function must be called + immediately after deflateInit, deflateInit2 or deflateReset, before any + call of deflate. The compressor and decompressor must use exactly the same + dictionary (see inflateSetDictionary). + + The dictionary should consist of strings (byte sequences) that are likely + to be encountered later in the data to be compressed, with the most commonly + used strings preferably put towards the end of the dictionary. Using a + dictionary is most useful when the data to be compressed is short and can be + predicted with good accuracy; the data can then be compressed better than + with the default empty dictionary. + + Depending on the size of the compression data structures selected by + deflateInit or deflateInit2, a part of the dictionary may in effect be + discarded, for example if the dictionary is larger than the window size in + deflate or deflate2. Thus the strings most likely to be useful should be + put at the end of the dictionary, not at the front. In addition, the + current implementation of deflate will use at most the window size minus + 262 bytes of the provided dictionary. + + Upon return of this function, strm->adler is set to the adler32 value + of the dictionary; the decompressor may later use this value to determine + which dictionary has been used by the compressor. (The adler32 value + applies to the whole dictionary even if only a subset of the dictionary is + actually used by the compressor.) If a raw deflate was requested, then the + adler32 value is not computed and strm->adler is not set. + + deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a + parameter is invalid (such as NULL dictionary) or the stream state is + inconsistent (for example if deflate has already been called for this stream + or if the compression method is bsort). deflateSetDictionary does not + perform any compression: this will be done by deflate(). +*/ + +ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest, + z_streamp source)); +/* + Sets the destination stream as a complete copy of the source stream. + + This function can be useful when several compression strategies will be + tried, for example when there are several ways of pre-processing the input + data with a filter. The streams that will be discarded should then be freed + by calling deflateEnd. Note that deflateCopy duplicates the internal + compression state which can be quite large, so this strategy is slow and + can consume lots of memory. + + deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm)); +/* + This function is equivalent to deflateEnd followed by deflateInit, + but does not free and reallocate all the internal compression state. + The stream will keep the same compression level and any other attributes + that may have been set by deflateInit2. + + deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being NULL). +*/ + +ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm, + int level, + int strategy)); +/* + Dynamically update the compression level and compression strategy. The + interpretation of level and strategy is as in deflateInit2. This can be + used to switch between compression and straight copy of the input data, or + to switch to a different kind of input data requiring a different + strategy. If the compression level is changed, the input available so far + is compressed with the old level (and may be flushed); the new level will + take effect only at the next call of deflate(). + + Before the call of deflateParams, the stream state must be set as for + a call of deflate(), since the currently available input may have to + be compressed and flushed. In particular, strm->avail_out must be non-zero. + + deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source + stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR + if strm->avail_out was zero. +*/ + +ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm, + int good_length, + int max_lazy, + int nice_length, + int max_chain)); +/* + Fine tune deflate's internal compression parameters. This should only be + used by someone who understands the algorithm used by zlib's deflate for + searching for the best matching string, and even then only by the most + fanatic optimizer trying to squeeze out the last compressed bit for their + specific input data. Read the deflate.c source code for the meaning of the + max_lazy, good_length, nice_length, and max_chain parameters. + + deflateTune() can be called after deflateInit() or deflateInit2(), and + returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream. + */ + +ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm, + uLong sourceLen)); +/* + deflateBound() returns an upper bound on the compressed size after + deflation of sourceLen bytes. It must be called after deflateInit() + or deflateInit2(). This would be used to allocate an output buffer + for deflation in a single pass, and so would be called before deflate(). +*/ + +ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm, + int bits, + int value)); +/* + deflatePrime() inserts bits in the deflate output stream. The intent + is that this function is used to start off the deflate output with the + bits leftover from a previous deflate stream when appending to it. As such, + this function can only be used for raw deflate, and must be used before the + first deflate() call after a deflateInit2() or deflateReset(). bits must be + less than or equal to 16, and that many of the least significant bits of + value will be inserted in the output. + + deflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm, + gz_headerp head)); +/* + deflateSetHeader() provides gzip header information for when a gzip + stream is requested by deflateInit2(). deflateSetHeader() may be called + after deflateInit2() or deflateReset() and before the first call of + deflate(). The text, time, os, extra field, name, and comment information + in the provided gz_header structure are written to the gzip header (xflag is + ignored -- the extra flags are set according to the compression level). The + caller must assure that, if not Z_NULL, name and comment are terminated with + a zero byte, and that if extra is not Z_NULL, that extra_len bytes are + available there. If hcrc is true, a gzip header crc is included. Note that + the current versions of the command-line version of gzip (up through version + 1.3.x) do not support header crc's, and will report that it is a "multi-part + gzip file" and give up. + + If deflateSetHeader is not used, the default gzip header has text false, + the time set to zero, and os set to 255, with no extra, name, or comment + fields. The gzip header is returned to the default state by deflateReset(). + + deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm, + int windowBits)); + + This is another version of inflateInit with an extra parameter. The + fields next_in, avail_in, zalloc, zfree and opaque must be initialized + before by the caller. + + The windowBits parameter is the base two logarithm of the maximum window + size (the size of the history buffer). It should be in the range 8..15 for + this version of the library. The default value is 15 if inflateInit is used + instead. windowBits must be greater than or equal to the windowBits value + provided to deflateInit2() while compressing, or it must be equal to 15 if + deflateInit2() was not used. If a compressed stream with a larger window + size is given as input, inflate() will return with the error code + Z_DATA_ERROR instead of trying to allocate a larger window. + + windowBits can also be -8..-15 for raw inflate. In this case, -windowBits + determines the window size. inflate() will then process raw deflate data, + not looking for a zlib or gzip header, not generating a check value, and not + looking for any check values for comparison at the end of the stream. This + is for use with other formats that use the deflate compressed data format + such as zip. Those formats provide their own check values. If a custom + format is developed using the raw deflate format for compressed data, it is + recommended that a check value such as an adler32 or a crc32 be applied to + the uncompressed data as is done in the zlib, gzip, and zip formats. For + most applications, the zlib format should be used as is. Note that comments + above on the use in deflateInit2() applies to the magnitude of windowBits. + + windowBits can also be greater than 15 for optional gzip decoding. Add + 32 to windowBits to enable zlib and gzip decoding with automatic header + detection, or add 16 to decode only the gzip format (the zlib format will + return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is + a crc32 instead of an adler32. + + inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg + is set to null if there is no error message. inflateInit2 does not perform + any decompression apart from reading the zlib header if present: this will + be done by inflate(). (So next_in and avail_in may be modified, but next_out + and avail_out are unchanged.) +*/ + +ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm, + const Bytef *dictionary, + uInt dictLength)); +/* + Initializes the decompression dictionary from the given uncompressed byte + sequence. This function must be called immediately after a call of inflate, + if that call returned Z_NEED_DICT. The dictionary chosen by the compressor + can be determined from the adler32 value returned by that call of inflate. + The compressor and decompressor must use exactly the same dictionary (see + deflateSetDictionary). For raw inflate, this function can be called + immediately after inflateInit2() or inflateReset() and before any call of + inflate() to set the dictionary. The application must insure that the + dictionary that was used for compression is provided. + + inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a + parameter is invalid (such as NULL dictionary) or the stream state is + inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the + expected one (incorrect adler32 value). inflateSetDictionary does not + perform any decompression: this will be done by subsequent calls of + inflate(). +*/ + +ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm)); +/* + Skips invalid compressed data until a full flush point (see above the + description of deflate with Z_FULL_FLUSH) can be found, or until all + available input is skipped. No output is provided. + + inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR + if no more input was provided, Z_DATA_ERROR if no flush point has been found, + or Z_STREAM_ERROR if the stream structure was inconsistent. In the success + case, the application may save the current current value of total_in which + indicates where valid compressed data was found. In the error case, the + application may repeatedly call inflateSync, providing more input each time, + until success or end of the input data. +*/ + +ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest, + z_streamp source)); +/* + Sets the destination stream as a complete copy of the source stream. + + This function can be useful when randomly accessing a large stream. The + first pass through the stream can periodically record the inflate state, + allowing restarting inflate at those points when randomly accessing the + stream. + + inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm)); +/* + This function is equivalent to inflateEnd followed by inflateInit, + but does not free and reallocate all the internal decompression state. + The stream will keep attributes that may have been set by inflateInit2. + + inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being NULL). +*/ + +ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm, + int bits, + int value)); +/* + This function inserts bits in the inflate input stream. The intent is + that this function is used to start inflating at a bit position in the + middle of a byte. The provided bits will be used before any bytes are used + from next_in. This function should only be used with raw inflate, and + should be used before the first inflate() call after inflateInit2() or + inflateReset(). bits must be less than or equal to 16, and that many of the + least significant bits of value will be inserted in the input. + + inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm, + gz_headerp head)); +/* + inflateGetHeader() requests that gzip header information be stored in the + provided gz_header structure. inflateGetHeader() may be called after + inflateInit2() or inflateReset(), and before the first call of inflate(). + As inflate() processes the gzip stream, head->done is zero until the header + is completed, at which time head->done is set to one. If a zlib stream is + being decoded, then head->done is set to -1 to indicate that there will be + no gzip header information forthcoming. Note that Z_BLOCK can be used to + force inflate() to return immediately after header processing is complete + and before any actual data is decompressed. + + The text, time, xflags, and os fields are filled in with the gzip header + contents. hcrc is set to true if there is a header CRC. (The header CRC + was valid if done is set to one.) If extra is not Z_NULL, then extra_max + contains the maximum number of bytes to write to extra. Once done is true, + extra_len contains the actual extra field length, and extra contains the + extra field, or that field truncated if extra_max is less than extra_len. + If name is not Z_NULL, then up to name_max characters are written there, + terminated with a zero unless the length is greater than name_max. If + comment is not Z_NULL, then up to comm_max characters are written there, + terminated with a zero unless the length is greater than comm_max. When + any of extra, name, or comment are not Z_NULL and the respective field is + not present in the header, then that field is set to Z_NULL to signal its + absence. This allows the use of deflateSetHeader() with the returned + structure to duplicate the header. However if those fields are set to + allocated memory, then the application will need to save those pointers + elsewhere so that they can be eventually freed. + + If inflateGetHeader is not used, then the header information is simply + discarded. The header is always checked for validity, including the header + CRC if present. inflateReset() will reset the process to discard the header + information. The application would need to call inflateGetHeader() again to + retrieve the header from the next gzip stream. + + inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits, + unsigned char FAR *window)); + + Initialize the internal stream state for decompression using inflateBack() + calls. The fields zalloc, zfree and opaque in strm must be initialized + before the call. If zalloc and zfree are Z_NULL, then the default library- + derived memory allocation routines are used. windowBits is the base two + logarithm of the window size, in the range 8..15. window is a caller + supplied buffer of that size. Except for special applications where it is + assured that deflate was used with small window sizes, windowBits must be 15 + and a 32K byte window must be supplied to be able to decompress general + deflate streams. + + See inflateBack() for the usage of these routines. + + inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of + the paramaters are invalid, Z_MEM_ERROR if the internal state could not + be allocated, or Z_VERSION_ERROR if the version of the library does not + match the version of the header file. +*/ + +typedef unsigned (*in_func) OF((void FAR *, unsigned char FAR * FAR *)); +typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned)); + +ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm, + in_func in, void FAR *in_desc, + out_func out, void FAR *out_desc)); +/* + inflateBack() does a raw inflate with a single call using a call-back + interface for input and output. This is more efficient than inflate() for + file i/o applications in that it avoids copying between the output and the + sliding window by simply making the window itself the output buffer. This + function trusts the application to not change the output buffer passed by + the output function, at least until inflateBack() returns. + + inflateBackInit() must be called first to allocate the internal state + and to initialize the state with the user-provided window buffer. + inflateBack() may then be used multiple times to inflate a complete, raw + deflate stream with each call. inflateBackEnd() is then called to free + the allocated state. + + A raw deflate stream is one with no zlib or gzip header or trailer. + This routine would normally be used in a utility that reads zip or gzip + files and writes out uncompressed files. The utility would decode the + header and process the trailer on its own, hence this routine expects + only the raw deflate stream to decompress. This is different from the + normal behavior of inflate(), which expects either a zlib or gzip header and + trailer around the deflate stream. + + inflateBack() uses two subroutines supplied by the caller that are then + called by inflateBack() for input and output. inflateBack() calls those + routines until it reads a complete deflate stream and writes out all of the + uncompressed data, or until it encounters an error. The function's + parameters and return types are defined above in the in_func and out_func + typedefs. inflateBack() will call in(in_desc, &buf) which should return the + number of bytes of provided input, and a pointer to that input in buf. If + there is no input available, in() must return zero--buf is ignored in that + case--and inflateBack() will return a buffer error. inflateBack() will call + out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. out() + should return zero on success, or non-zero on failure. If out() returns + non-zero, inflateBack() will return with an error. Neither in() nor out() + are permitted to change the contents of the window provided to + inflateBackInit(), which is also the buffer that out() uses to write from. + The length written by out() will be at most the window size. Any non-zero + amount of input may be provided by in(). + + For convenience, inflateBack() can be provided input on the first call by + setting strm->next_in and strm->avail_in. If that input is exhausted, then + in() will be called. Therefore strm->next_in must be initialized before + calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called + immediately for input. If strm->next_in is not Z_NULL, then strm->avail_in + must also be initialized, and then if strm->avail_in is not zero, input will + initially be taken from strm->next_in[0 .. strm->avail_in - 1]. + + The in_desc and out_desc parameters of inflateBack() is passed as the + first parameter of in() and out() respectively when they are called. These + descriptors can be optionally used to pass any information that the caller- + supplied in() and out() functions need to do their job. + + On return, inflateBack() will set strm->next_in and strm->avail_in to + pass back any unused input that was provided by the last in() call. The + return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR + if in() or out() returned an error, Z_DATA_ERROR if there was a format + error in the deflate stream (in which case strm->msg is set to indicate the + nature of the error), or Z_STREAM_ERROR if the stream was not properly + initialized. In the case of Z_BUF_ERROR, an input or output error can be + distinguished using strm->next_in which will be Z_NULL only if in() returned + an error. If strm->next is not Z_NULL, then the Z_BUF_ERROR was due to + out() returning non-zero. (in() will always be called before out(), so + strm->next_in is assured to be defined if out() returns non-zero.) Note + that inflateBack() cannot return Z_OK. +*/ + +ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm)); +/* + All memory allocated by inflateBackInit() is freed. + + inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream + state was inconsistent. +*/ + +ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void)); +/* Return flags indicating compile-time options. + + Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other: + 1.0: size of uInt + 3.2: size of uLong + 5.4: size of voidpf (pointer) + 7.6: size of z_off_t + + Compiler, assembler, and debug options: + 8: DEBUG + 9: ASMV or ASMINF -- use ASM code + 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention + 11: 0 (reserved) + + One-time table building (smaller code, but not thread-safe if true): + 12: BUILDFIXED -- build static block decoding tables when needed + 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed + 14,15: 0 (reserved) + + Library content (indicates missing functionality): + 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking + deflate code when not needed) + 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect + and decode gzip streams (to avoid linking crc code) + 18-19: 0 (reserved) + + Operation variations (changes in library functionality): + 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate + 21: FASTEST -- deflate algorithm with only one, lowest compression level + 22,23: 0 (reserved) + + The sprintf variant used by gzprintf (zero is best): + 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format + 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure! + 26: 0 = returns value, 1 = void -- 1 means inferred string length returned + + Remainder: + 27-31: 0 (reserved) + */ + + + /* utility functions */ + +/* + The following utility functions are implemented on top of the + basic stream-oriented functions. To simplify the interface, some + default options are assumed (compression level and memory usage, + standard memory allocation functions). The source code of these + utility functions can easily be modified if you need special options. +*/ + +ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); +/* + Compresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total + size of the destination buffer, which must be at least the value returned + by compressBound(sourceLen). Upon exit, destLen is the actual size of the + compressed buffer. + This function can be used to compress a whole file at once if the + input file is mmap'ed. + compress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer. +*/ + +ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen, + int level)); +/* + Compresses the source buffer into the destination buffer. The level + parameter has the same meaning as in deflateInit. sourceLen is the byte + length of the source buffer. Upon entry, destLen is the total size of the + destination buffer, which must be at least the value returned by + compressBound(sourceLen). Upon exit, destLen is the actual size of the + compressed buffer. + + compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_BUF_ERROR if there was not enough room in the output buffer, + Z_STREAM_ERROR if the level parameter is invalid. +*/ + +ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen)); +/* + compressBound() returns an upper bound on the compressed size after + compress() or compress2() on sourceLen bytes. It would be used before + a compress() or compress2() call to allocate the destination buffer. +*/ + +ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); +/* + Decompresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total + size of the destination buffer, which must be large enough to hold the + entire uncompressed data. (The size of the uncompressed data must have + been saved previously by the compressor and transmitted to the decompressor + by some mechanism outside the scope of this compression library.) + Upon exit, destLen is the actual size of the compressed buffer. + This function can be used to decompress a whole file at once if the + input file is mmap'ed. + + uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete. +*/ + + +typedef voidp gzFile; + +ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode)); +/* + Opens a gzip (.gz) file for reading or writing. The mode parameter + is as in fopen ("rb" or "wb") but can also include a compression level + ("wb9") or a strategy: 'f' for filtered data as in "wb6f", 'h' for + Huffman only compression as in "wb1h", or 'R' for run-length encoding + as in "wb1R". (See the description of deflateInit2 for more information + about the strategy parameter.) + + gzopen can be used to read a file which is not in gzip format; in this + case gzread will directly read from the file without decompression. + + gzopen returns NULL if the file could not be opened or if there was + insufficient memory to allocate the (de)compression state; errno + can be checked to distinguish the two cases (if errno is zero, the + zlib error is Z_MEM_ERROR). */ + +ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode)); +/* + gzdopen() associates a gzFile with the file descriptor fd. File + descriptors are obtained from calls like open, dup, creat, pipe or + fileno (in the file has been previously opened with fopen). + The mode parameter is as in gzopen. + The next call of gzclose on the returned gzFile will also close the + file descriptor fd, just like fclose(fdopen(fd), mode) closes the file + descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode). + gzdopen returns NULL if there was insufficient memory to allocate + the (de)compression state. +*/ + +ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy)); +/* + Dynamically update the compression level or strategy. See the description + of deflateInit2 for the meaning of these parameters. + gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not + opened for writing. +*/ + +ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len)); +/* + Reads the given number of uncompressed bytes from the compressed file. + If the input file was not in gzip format, gzread copies the given number + of bytes into the buffer. + gzread returns the number of uncompressed bytes actually read (0 for + end of file, -1 for error). */ + +ZEXTERN int ZEXPORT gzwrite OF((gzFile file, + voidpc buf, unsigned len)); +/* + Writes the given number of uncompressed bytes into the compressed file. + gzwrite returns the number of uncompressed bytes actually written + (0 in case of error). +*/ + +ZEXTERN int ZEXPORTVA gzprintf OF((gzFile file, const char *format, ...)); +/* + Converts, formats, and writes the args to the compressed file under + control of the format string, as in fprintf. gzprintf returns the number of + uncompressed bytes actually written (0 in case of error). The number of + uncompressed bytes written is limited to 4095. The caller should assure that + this limit is not exceeded. If it is exceeded, then gzprintf() will return + return an error (0) with nothing written. In this case, there may also be a + buffer overflow with unpredictable consequences, which is possible only if + zlib was compiled with the insecure functions sprintf() or vsprintf() + because the secure snprintf() or vsnprintf() functions were not available. +*/ + +ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s)); +/* + Writes the given null-terminated string to the compressed file, excluding + the terminating null character. + gzputs returns the number of characters written, or -1 in case of error. +*/ + +ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len)); +/* + Reads bytes from the compressed file until len-1 characters are read, or + a newline character is read and transferred to buf, or an end-of-file + condition is encountered. The string is then terminated with a null + character. + gzgets returns buf, or Z_NULL in case of error. +*/ + +ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c)); +/* + Writes c, converted to an unsigned char, into the compressed file. + gzputc returns the value that was written, or -1 in case of error. +*/ + +ZEXTERN int ZEXPORT gzgetc OF((gzFile file)); +/* + Reads one byte from the compressed file. gzgetc returns this byte + or -1 in case of end of file or error. +*/ + +ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file)); +/* + Push one character back onto the stream to be read again later. + Only one character of push-back is allowed. gzungetc() returns the + character pushed, or -1 on failure. gzungetc() will fail if a + character has been pushed but not read yet, or if c is -1. The pushed + character will be discarded if the stream is repositioned with gzseek() + or gzrewind(). +*/ + +ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush)); +/* + Flushes all pending output into the compressed file. The parameter + flush is as in the deflate() function. The return value is the zlib + error number (see function gzerror below). gzflush returns Z_OK if + the flush parameter is Z_FINISH and all output could be flushed. + gzflush should be called only when strictly necessary because it can + degrade compression. +*/ + +ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file, + z_off_t offset, int whence)); +/* + Sets the starting position for the next gzread or gzwrite on the + given compressed file. The offset represents a number of bytes in the + uncompressed data stream. The whence parameter is defined as in lseek(2); + the value SEEK_END is not supported. + If the file is opened for reading, this function is emulated but can be + extremely slow. If the file is opened for writing, only forward seeks are + supported; gzseek then compresses a sequence of zeroes up to the new + starting position. + + gzseek returns the resulting offset location as measured in bytes from + the beginning of the uncompressed stream, or -1 in case of error, in + particular if the file is opened for writing and the new starting position + would be before the current position. +*/ + +ZEXTERN int ZEXPORT gzrewind OF((gzFile file)); +/* + Rewinds the given file. This function is supported only for reading. + + gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET) +*/ + +ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file)); +/* + Returns the starting position for the next gzread or gzwrite on the + given compressed file. This position represents a number of bytes in the + uncompressed data stream. + + gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR) +*/ + +ZEXTERN int ZEXPORT gzeof OF((gzFile file)); +/* + Returns 1 when EOF has previously been detected reading the given + input stream, otherwise zero. +*/ + +ZEXTERN int ZEXPORT gzdirect OF((gzFile file)); +/* + Returns 1 if file is being read directly without decompression, otherwise + zero. +*/ + +ZEXTERN int ZEXPORT gzclose OF((gzFile file)); +/* + Flushes all pending output if necessary, closes the compressed file + and deallocates all the (de)compression state. The return value is the zlib + error number (see function gzerror below). +*/ + +ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum)); +/* + Returns the error message for the last error which occurred on the + given compressed file. errnum is set to zlib error number. If an + error occurred in the file system and not in the compression library, + errnum is set to Z_ERRNO and the application may consult errno + to get the exact error code. +*/ + +ZEXTERN void ZEXPORT gzclearerr OF((gzFile file)); +/* + Clears the error and end-of-file flags for file. This is analogous to the + clearerr() function in stdio. This is useful for continuing to read a gzip + file that is being written concurrently. +*/ + + /* checksum functions */ + +/* + These functions are not related to compression but are exported + anyway because they might be useful in applications using the + compression library. +*/ + +ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len)); +/* + Update a running Adler-32 checksum with the bytes buf[0..len-1] and + return the updated checksum. If buf is NULL, this function returns + the required initial value for the checksum. + An Adler-32 checksum is almost as reliable as a CRC32 but can be computed + much faster. Usage example: + + uLong adler = adler32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + adler = adler32(adler, buffer, length); + } + if (adler != original_adler) error(); +*/ + +ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2, + z_off_t len2)); +/* + Combine two Adler-32 checksums into one. For two sequences of bytes, seq1 + and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for + each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of + seq1 and seq2 concatenated, requiring only adler1, adler2, and len2. +*/ + +ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len)); +/* + Update a running CRC-32 with the bytes buf[0..len-1] and return the + updated CRC-32. If buf is NULL, this function returns the required initial + value for the for the crc. Pre- and post-conditioning (one's complement) is + performed within this function so it shouldn't be done by the application. + Usage example: + + uLong crc = crc32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + crc = crc32(crc, buffer, length); + } + if (crc != original_crc) error(); +*/ + +ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2)); + +/* + Combine two CRC-32 check values into one. For two sequences of bytes, + seq1 and seq2 with lengths len1 and len2, CRC-32 check values were + calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32 + check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and + len2. +*/ + + + /* various hacks, don't look :) */ + +/* deflateInit and inflateInit are macros to allow checking the zlib version + * and the compiler's view of z_stream: + */ +ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method, + int windowBits, int memLevel, + int strategy, const char *version, + int stream_size)); +ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits, + unsigned char FAR *window, + const char *version, + int stream_size)); +#define deflateInit(strm, level) \ + deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream)) +#define inflateInit(strm) \ + inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream)) +#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \ + deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\ + (strategy), ZLIB_VERSION, sizeof(z_stream)) +#define inflateInit2(strm, windowBits) \ + inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream)) +#define inflateBackInit(strm, windowBits, window) \ + inflateBackInit_((strm), (windowBits), (window), \ + ZLIB_VERSION, sizeof(z_stream)) + + +#if !defined(_ZUTIL_H) && !defined(NO_DUMMY_DECL) + struct internal_state {int dummy;}; /* hack for buggy compilers */ +#endif + +ZEXTERN const char * ZEXPORT zError OF((int)); +ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp z)); +ZEXTERN const uLongf * ZEXPORT get_crc_table OF((void)); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZLIB_H */ diff --git a/sys/contrib/opensolaris/uts/common/zmod/zmod.c b/sys/contrib/opensolaris/uts/common/zmod/zmod.c new file mode 100644 index 0000000..2627239 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/zmod.c @@ -0,0 +1,109 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/zmod.h> + +#include "zlib.h" + +/* + * Uncompress the buffer 'src' into the buffer 'dst'. The caller must store + * the expected decompressed data size externally so it can be passed in. + * The resulting decompressed size is then returned through dstlen. This + * function return Z_OK on success, or another error code on failure. + */ +int +z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen) +{ + z_stream zs; + int err; + + bzero(&zs, sizeof (zs)); + zs.next_in = (uchar_t *)src; + zs.avail_in = srclen; + zs.next_out = dst; + zs.avail_out = *dstlen; + + if ((err = inflateInit(&zs)) != Z_OK) + return (err); + + if ((err = inflate(&zs, Z_FINISH)) != Z_STREAM_END) { + (void) inflateEnd(&zs); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + + *dstlen = zs.total_out; + return (inflateEnd(&zs)); +} + +int +z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen, + int level) +{ + + z_stream zs; + int err; + + bzero(&zs, sizeof (zs)); + zs.next_in = (uchar_t *)src; + zs.avail_in = srclen; + zs.next_out = dst; + zs.avail_out = *dstlen; + + if ((err = deflateInit(&zs, level)) != Z_OK) + return (err); + + if ((err = deflate(&zs, Z_FINISH)) != Z_STREAM_END) { + (void) deflateEnd(&zs); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + + *dstlen = zs.total_out; + return (deflateEnd(&zs)); +} + +int +z_compress(void *dst, size_t *dstlen, const void *src, size_t srclen) +{ + return (z_compress_level(dst, dstlen, src, srclen, + Z_DEFAULT_COMPRESSION)); +} + +/* + * Convert a zlib error code into a string error message. + */ +const char * +z_strerror(int err) +{ + int i = Z_NEED_DICT - err; + + if (i < 0 || i > Z_NEED_DICT - Z_VERSION_ERROR) + return ("unknown error"); + + return (zError(err)); +} diff --git a/sys/contrib/opensolaris/uts/common/zmod/zmod_subr.c b/sys/contrib/opensolaris/uts/common/zmod/zmod_subr.c new file mode 100644 index 0000000..0542712 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/zmod_subr.c @@ -0,0 +1,84 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/systm.h> +#include <sys/cmn_err.h> +#include <sys/kobj.h> + +struct zchdr { + uint_t zch_magic; + uint_t zch_size; +}; + +#define ZCH_MAGIC 0x3cc13cc1 + +/*ARGSUSED*/ +void * +zcalloc(void *opaque, uint_t items, uint_t size) +{ + size_t nbytes = sizeof (struct zchdr) + items * size; + struct zchdr *z = kobj_zalloc(nbytes, KM_NOWAIT|KM_TMP); + + if (z == NULL) + return (NULL); + + z->zch_magic = ZCH_MAGIC; + z->zch_size = nbytes; + + return (z + 1); +} + +/*ARGSUSED*/ +void +zcfree(void *opaque, void *ptr) +{ + struct zchdr *z = ((struct zchdr *)ptr) - 1; + + if (z->zch_magic != ZCH_MAGIC) + panic("zcfree region corrupt: hdr=%p ptr=%p", (void *)z, ptr); + + kobj_free(z, z->zch_size); +} + +void +zmemcpy(void *dest, const void *source, uint_t len) +{ + bcopy(source, dest, len); +} + +int +zmemcmp(const void *s1, const void *s2, uint_t len) +{ + return (bcmp(s1, s2, len)); +} + +void +zmemzero(void *dest, uint_t len) +{ + bzero(dest, len); +} diff --git a/sys/contrib/opensolaris/uts/common/zmod/zutil.c b/sys/contrib/opensolaris/uts/common/zmod/zutil.c new file mode 100644 index 0000000..7d46e30 --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/zutil.c @@ -0,0 +1,324 @@ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* zutil.c -- target dependent utility functions for the compression library + * Copyright (C) 1995-2005 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "zutil.h" + +#ifndef NO_DUMMY_DECL +struct internal_state {int dummy;}; /* for buggy compilers */ +#endif + +const char * const z_errmsg[10] = { +"need dictionary", /* Z_NEED_DICT 2 */ +"stream end", /* Z_STREAM_END 1 */ +"", /* Z_OK 0 */ +"file error", /* Z_ERRNO (-1) */ +"stream error", /* Z_STREAM_ERROR (-2) */ +"data error", /* Z_DATA_ERROR (-3) */ +"insufficient memory", /* Z_MEM_ERROR (-4) */ +"buffer error", /* Z_BUF_ERROR (-5) */ +"incompatible version",/* Z_VERSION_ERROR (-6) */ +""}; + + +const char * ZEXPORT zlibVersion() +{ + return ZLIB_VERSION; +} + +uLong ZEXPORT zlibCompileFlags() +{ + uLong flags; + + flags = 0; + switch (sizeof(uInt)) { + case 2: break; + case 4: flags += 1; break; + case 8: flags += 2; break; + default: flags += 3; + } + switch (sizeof(uLong)) { + case 2: break; + case 4: flags += 1 << 2; break; + case 8: flags += 2 << 2; break; + default: flags += 3 << 2; + } + switch (sizeof(voidpf)) { + case 2: break; + case 4: flags += 1 << 4; break; + case 8: flags += 2 << 4; break; + default: flags += 3 << 4; + } + switch (sizeof(z_off_t)) { + case 2: break; + case 4: flags += 1 << 6; break; + case 8: flags += 2 << 6; break; + default: flags += 3 << 6; + } +#ifdef DEBUG + flags += 1 << 8; +#endif +#if defined(ASMV) || defined(ASMINF) + flags += 1 << 9; +#endif +#ifdef ZLIB_WINAPI + flags += 1 << 10; +#endif +#ifdef BUILDFIXED + flags += 1 << 12; +#endif +#ifdef DYNAMIC_CRC_TABLE + flags += 1 << 13; +#endif +#ifdef NO_GZCOMPRESS + flags += 1L << 16; +#endif +#ifdef NO_GZIP + flags += 1L << 17; +#endif +#ifdef PKZIP_BUG_WORKAROUND + flags += 1L << 20; +#endif +#ifdef FASTEST + flags += 1L << 21; +#endif +#ifdef STDC +# ifdef NO_vsnprintf + flags += 1L << 25; +# ifdef HAS_vsprintf_void + flags += 1L << 26; +# endif +# else +# ifdef HAS_vsnprintf_void + flags += 1L << 26; +# endif +# endif +#else + flags += 1L << 24; +# ifdef NO_snprintf + flags += 1L << 25; +# ifdef HAS_sprintf_void + flags += 1L << 26; +# endif +# else +# ifdef HAS_snprintf_void + flags += 1L << 26; +# endif +# endif +#endif + return flags; +} + +#ifdef DEBUG + +# ifndef verbose +# define verbose 0 +# endif +int z_verbose = verbose; + +void z_error (m) + char *m; +{ + fprintf(stderr, "%s\n", m); + exit(1); +} +#endif + +/* exported to allow conversion of error code to string for compress() and + * uncompress() + */ +const char * ZEXPORT zError(err) + int err; +{ + return ERR_MSG(err); +} + +#if defined(_WIN32_WCE) + /* The Microsoft C Run-Time Library for Windows CE doesn't have + * errno. We define it as a global variable to simplify porting. + * Its value is always 0 and should not be used. + */ + int errno = 0; +#endif + +#define HAVE_MEMCPY +#ifndef HAVE_MEMCPY + +void zmemcpy(dest, source, len) + Bytef* dest; + const Bytef* source; + uInt len; +{ + if (len == 0) return; + do { + *dest++ = *source++; /* ??? to be unrolled */ + } while (--len != 0); +} + +int zmemcmp(s1, s2, len) + const Bytef* s1; + const Bytef* s2; + uInt len; +{ + uInt j; + + for (j = 0; j < len; j++) { + if (s1[j] != s2[j]) return 2*(s1[j] > s2[j])-1; + } + return 0; +} + +void zmemzero(dest, len) + Bytef* dest; + uInt len; +{ + if (len == 0) return; + do { + *dest++ = 0; /* ??? to be unrolled */ + } while (--len != 0); +} +#endif + + +#ifdef SYS16BIT + +#ifdef __TURBOC__ +/* Turbo C in 16-bit mode */ + +# define MY_ZCALLOC + +/* Turbo C malloc() does not allow dynamic allocation of 64K bytes + * and farmalloc(64K) returns a pointer with an offset of 8, so we + * must fix the pointer. Warning: the pointer must be put back to its + * original form in order to free it, use zcfree(). + */ + +#define MAX_PTR 10 +/* 10*64K = 640K */ + +local int next_ptr = 0; + +typedef struct ptr_table_s { + voidpf org_ptr; + voidpf new_ptr; +} ptr_table; + +local ptr_table table[MAX_PTR]; +/* This table is used to remember the original form of pointers + * to large buffers (64K). Such pointers are normalized with a zero offset. + * Since MSDOS is not a preemptive multitasking OS, this table is not + * protected from concurrent access. This hack doesn't work anyway on + * a protected system like OS/2. Use Microsoft C instead. + */ + +voidpf zcalloc (voidpf opaque, unsigned items, unsigned size) +{ + voidpf buf = opaque; /* just to make some compilers happy */ + ulg bsize = (ulg)items*size; + + /* If we allocate less than 65520 bytes, we assume that farmalloc + * will return a usable pointer which doesn't have to be normalized. + */ + if (bsize < 65520L) { + buf = farmalloc(bsize); + if (*(ush*)&buf != 0) return buf; + } else { + buf = farmalloc(bsize + 16L); + } + if (buf == NULL || next_ptr >= MAX_PTR) return NULL; + table[next_ptr].org_ptr = buf; + + /* Normalize the pointer to seg:0 */ + *((ush*)&buf+1) += ((ush)((uch*)buf-0) + 15) >> 4; + *(ush*)&buf = 0; + table[next_ptr++].new_ptr = buf; + return buf; +} + +void zcfree (voidpf opaque, voidpf ptr) +{ + int n; + if (*(ush*)&ptr != 0) { /* object < 64K */ + farfree(ptr); + return; + } + /* Find the original pointer */ + for (n = 0; n < next_ptr; n++) { + if (ptr != table[n].new_ptr) continue; + + farfree(table[n].org_ptr); + while (++n < next_ptr) { + table[n-1] = table[n]; + } + next_ptr--; + return; + } + ptr = opaque; /* just to make some compilers happy */ + Assert(0, "zcfree: ptr not found"); +} + +#endif /* __TURBOC__ */ + + +#ifdef M_I86 +/* Microsoft C in 16-bit mode */ + +# define MY_ZCALLOC + +#if (!defined(_MSC_VER) || (_MSC_VER <= 600)) +# define _halloc halloc +# define _hfree hfree +#endif + +voidpf zcalloc (voidpf opaque, unsigned items, unsigned size) +{ + if (opaque) opaque = 0; /* to make compiler happy */ + return _halloc((long)items, size); +} + +void zcfree (voidpf opaque, voidpf ptr) +{ + if (opaque) opaque = 0; /* to make compiler happy */ + _hfree(ptr); +} + +#endif /* M_I86 */ + +#endif /* SYS16BIT */ + + +#ifndef MY_ZCALLOC /* Any system without a special alloc function */ + +#ifndef STDC +extern voidp malloc OF((uInt size)); +extern voidp calloc OF((uInt items, uInt size)); +extern void free OF((voidpf ptr)); +#endif + +voidpf zcalloc (opaque, items, size) + voidpf opaque; + unsigned items; + unsigned size; +{ + if (opaque) items += size - size; /* make compiler happy */ + return sizeof(uInt) > 2 ? (voidpf)malloc(items * size) : + (voidpf)calloc(items, size); +} + +void zcfree (opaque, ptr) + voidpf opaque; + voidpf ptr; +{ + free(ptr); + if (opaque) return; /* make compiler happy */ +} + +#endif /* MY_ZCALLOC */ diff --git a/sys/contrib/opensolaris/uts/common/zmod/zutil.h b/sys/contrib/opensolaris/uts/common/zmod/zutil.h new file mode 100644 index 0000000..1d02c1d --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/zmod/zutil.h @@ -0,0 +1,274 @@ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* zutil.h -- internal interface and configuration of the compression library + * Copyright (C) 1995-2005 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. + */ + +#ifndef _ZUTIL_H +#define _ZUTIL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#define ZLIB_INTERNAL +#include "zlib.h" + +#ifdef STDC +# ifndef _WIN32_WCE +# include <stddef.h> +# endif +# include <string.h> +# include <stdlib.h> +#endif +#ifdef NO_ERRNO_H +# ifdef _WIN32_WCE + /* The Microsoft C Run-Time Library for Windows CE doesn't have + * errno. We define it as a global variable to simplify porting. + * Its value is always 0 and should not be used. We rename it to + * avoid conflict with other libraries that use the same workaround. + */ +# define errno z_errno +# endif + extern int errno; +#else +# ifndef _WIN32_WCE +# include <sys/errno.h> +# endif +#endif + +#ifndef local +# define local static +#endif +/* compile with -Dlocal if your debugger can't find static symbols */ + +typedef unsigned char uch; +typedef uch FAR uchf; +typedef unsigned short ush; +typedef ush FAR ushf; +typedef unsigned long ulg; + +extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */ +/* (size given to avoid silly warnings with Visual C++) */ + +#define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)] + +#define ERR_RETURN(strm,err) \ + return (strm->msg = (char*)ERR_MSG(err), (err)) +/* To be used only when the state is known to be valid */ + + /* common constants */ + +#ifndef DEF_WBITS +# define DEF_WBITS MAX_WBITS +#endif +/* default windowBits for decompression. MAX_WBITS is for compression only */ + +#if MAX_MEM_LEVEL >= 8 +# define DEF_MEM_LEVEL 8 +#else +# define DEF_MEM_LEVEL MAX_MEM_LEVEL +#endif +/* default memLevel */ + +#define STORED_BLOCK 0 +#define STATIC_TREES 1 +#define DYN_TREES 2 +/* The three kinds of block type */ + +#define MIN_MATCH 3 +#define MAX_MATCH 258 +/* The minimum and maximum match lengths */ + +#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */ + + /* target dependencies */ + +#if defined(MSDOS) || (defined(WINDOWS) && !defined(WIN32)) +# define OS_CODE 0x00 +# if defined(__TURBOC__) || defined(__BORLANDC__) +# if(__STDC__ == 1) && (defined(__LARGE__) || defined(__COMPACT__)) + /* Allow compilation with ANSI keywords only enabled */ + void _Cdecl farfree( void *block ); + void *_Cdecl farmalloc( unsigned long nbytes ); +# else +# include <alloc.h> +# endif +# else /* MSC or DJGPP */ +# include <malloc.h> +# endif +#endif + +#ifdef AMIGA +# define OS_CODE 0x01 +#endif + +#if defined(VAXC) || defined(VMS) +# define OS_CODE 0x02 +# define F_OPEN(name, mode) \ + fopen((name), (mode), "mbc=60", "ctx=stm", "rfm=fix", "mrs=512") +#endif + +#if defined(ATARI) || defined(atarist) +# define OS_CODE 0x05 +#endif + +#ifdef OS2 +# define OS_CODE 0x06 +# ifdef M_I86 + #include <malloc.h> +# endif +#endif + +#if defined(MACOS) || defined(TARGET_OS_MAC) +# define OS_CODE 0x07 +# if defined(__MWERKS__) && __dest_os != __be_os && __dest_os != __win32_os +# include <unix.h> /* for fdopen */ +# else +# ifndef fdopen +# define fdopen(fd,mode) NULL /* No fdopen() */ +# endif +# endif +#endif + +#ifdef TOPS20 +# define OS_CODE 0x0a +#endif + +#ifdef WIN32 +# ifndef __CYGWIN__ /* Cygwin is Unix, not Win32 */ +# define OS_CODE 0x0b +# endif +#endif + +#ifdef __50SERIES /* Prime/PRIMOS */ +# define OS_CODE 0x0f +#endif + +#if defined(_BEOS_) || defined(RISCOS) +# define fdopen(fd,mode) NULL /* No fdopen() */ +#endif + +#if (defined(_MSC_VER) && (_MSC_VER > 600)) +# if defined(_WIN32_WCE) +# define fdopen(fd,mode) NULL /* No fdopen() */ +# ifndef _PTRDIFF_T_DEFINED + typedef int ptrdiff_t; +# define _PTRDIFF_T_DEFINED +# endif +# else +# define fdopen(fd,type) _fdopen(fd,type) +# endif +#endif + + /* common defaults */ + +#ifndef OS_CODE +# define OS_CODE 0x03 /* assume Unix */ +#endif + +#ifndef F_OPEN +# define F_OPEN(name, mode) fopen((name), (mode)) +#endif + + /* functions */ + +#if defined(STDC99) || (defined(__TURBOC__) && __TURBOC__ >= 0x550) +# ifndef HAVE_VSNPRINTF +# define HAVE_VSNPRINTF +# endif +#endif +#if defined(__CYGWIN__) +# ifndef HAVE_VSNPRINTF +# define HAVE_VSNPRINTF +# endif +#endif +#ifndef HAVE_VSNPRINTF +# ifdef MSDOS + /* vsnprintf may exist on some MS-DOS compilers (DJGPP?), + but for now we just assume it doesn't. */ +# define NO_vsnprintf +# endif +# ifdef __TURBOC__ +# define NO_vsnprintf +# endif +# ifdef WIN32 + /* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */ +# if !defined(vsnprintf) && !defined(NO_vsnprintf) +# define vsnprintf _vsnprintf +# endif +# endif +# ifdef __SASC +# define NO_vsnprintf +# endif +#endif +#ifdef VMS +# define NO_vsnprintf +#endif + +#if defined(pyr) +# define NO_MEMCPY +#endif +#if defined(SMALL_MEDIUM) && !defined(_MSC_VER) && !defined(__SC__) + /* Use our own functions for small and medium model with MSC <= 5.0. + * You may have to use the same strategy for Borland C (untested). + * The __SC__ check is for Symantec. + */ +# define NO_MEMCPY +#endif +#if defined(STDC) && !defined(HAVE_MEMCPY) && !defined(NO_MEMCPY) +# define HAVE_MEMCPY +#endif +#ifdef HAVE_MEMCPY +# ifdef SMALL_MEDIUM /* MSDOS small or medium model */ +# define zmemcpy _fmemcpy +# define zmemcmp _fmemcmp +# define zmemzero(dest, len) _fmemset(dest, 0, len) +# else +# define zmemcpy memcpy +# define zmemcmp memcmp +# define zmemzero(dest, len) memset(dest, 0, len) +# endif +#else + extern void zmemcpy OF((void* dest, const void* source, uInt len)); + extern int zmemcmp OF((const void* s1, const void* s2, uInt len)); + extern void zmemzero OF((void* dest, uInt len)); +#endif + +/* Diagnostic functions */ +#ifdef DEBUG +# include <stdio.h> + extern int z_verbose; + extern void z_error OF((char *m)); +# define Assert(cond,msg) {if(!(cond)) z_error(msg);} +# define Trace(x) {if (z_verbose>=0) fprintf x ;} +# define Tracev(x) {if (z_verbose>0) fprintf x ;} +# define Tracevv(x) {if (z_verbose>1) fprintf x ;} +# define Tracec(c,x) {if (z_verbose>0 && (c)) fprintf x ;} +# define Tracecv(c,x) {if (z_verbose>1 && (c)) fprintf x ;} +#else +# define Assert(cond,msg) +# define Trace(x) +# define Tracev(x) +# define Tracevv(x) +# define Tracec(c,x) +# define Tracecv(c,x) +#endif + + +voidpf zcalloc OF((voidpf opaque, unsigned items, unsigned size)); +void zcfree OF((voidpf opaque, voidpf ptr)); + +#define ZALLOC(strm, items, size) \ + (*((strm)->zalloc))((strm)->opaque, (items), (size)) +#define ZFREE(strm, addr) (*((strm)->zfree))((strm)->opaque, (voidpf)(addr)) +#define TRY_FREE(s, p) {if (p) ZFREE(s, p);} + +#endif /* _ZUTIL_H */ |