Diffstat (limited to 'sys/contrib/opensolaris/uts/common/fs')
115 files changed, 0 insertions, 61758 deletions
diff --git a/sys/contrib/opensolaris/uts/common/fs/gfs.c b/sys/contrib/opensolaris/uts/common/fs/gfs.c deleted file mode 100644 index 738c9d4..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/gfs.c +++ /dev/null @@ -1,884 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* Portions Copyright 2007 Shivakumar GN */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <sys/cmn_err.h> -#include <sys/debug.h> -#include <sys/dirent.h> -#include <sys/kmem.h> -#include <sys/mman.h> -#include <sys/mutex.h> -#include <sys/sysmacros.h> -#include <sys/systm.h> -#include <sys/uio.h> -#include <sys/vfs.h> -#include <sys/vnode.h> -#include <sys/cred.h> -#include <sys/kdb.h> - -#include <sys/gfs.h> - -/* - * Generic pseudo-filesystem routines. - * - * There are significant similarities between the implementation of certain file - * system entry points across different filesystems. While one could attempt to - * "choke up on the bat" and incorporate common functionality into a VOP - * preamble or postamble, such an approach is limited in the benefit it can - * provide. In this file we instead define a toolkit of routines which can be - * called from a filesystem (with in-kernel pseudo-filesystems being the focus - * of the exercise) in a more component-like fashion. - * - * There are three basic classes of routines: - * - * 1) Lowlevel support routines - * - * These routines are designed to play a support role for existing - * pseudo-filesystems (such as procfs). They simplify common tasks, - * without enforcing the filesystem to hand over management to GFS. The - * routines covered are: - * - * gfs_readdir_init() - * gfs_readdir_emit() - * gfs_readdir_emitn() - * gfs_readdir_pred() - * gfs_readdir_fini() - * gfs_lookup_dot() - * - * 2) Complete GFS management - * - * These routines take a more active role in management of the - * pseudo-filesystem. They handle the relationship between vnode private - * data and VFS data, as well as the relationship between vnodes in the - * directory hierarchy. - * - * In order to use these interfaces, the first member of every private - * v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control - * to GFS. - * - * gfs_file_create() - * gfs_dir_create() - * gfs_root_create() - * - * gfs_file_inactive() - * gfs_dir_inactive() - * gfs_dir_lookup() - * gfs_dir_readdir() - * - * gfs_vop_inactive() - * gfs_vop_lookup() - * gfs_vop_readdir() - * gfs_vop_map() - * - * 3) Single File pseudo-filesystems - * - * This routine creates a rooted file to be overlayed ontop of another - * file in the physical filespace. 
- * - * Note that the parent is NULL (actually the vfs), but there is nothing - * technically keeping such a file from utilizing the "Complete GFS - * management" set of routines. - * - * gfs_root_create_file() - */ - -/* - * Low level directory routines - * - * These routines provide some simple abstractions for reading directories. - * They are designed to be used by existing pseudo filesystems (namely procfs) - * that already have a complicated management infrastructure. - */ - -/* - * gfs_readdir_init: initiate a generic readdir - * st - a pointer to an uninitialized gfs_readdir_state_t structure - * name_max - the directory's maximum file name length - * ureclen - the exported file-space record length (1 for non-legacy FSs) - * uiop - the uiop passed to readdir - * parent - the parent directory's inode - * self - this directory's inode - * - * Returns 0 or a non-zero errno. - * - * Typical VOP_READDIR usage of gfs_readdir_*: - * - * if ((error = gfs_readdir_init(...)) != 0) - * return (error); - * eof = 0; - * while ((error = gfs_readdir_pred(..., &voffset)) != 0) { - * if (!consumer_entry_at(voffset)) - * voffset = consumer_next_entry(voffset); - * if (consumer_eof(voffset)) { - * eof = 1 - * break; - * } - * if ((error = gfs_readdir_emit(..., voffset, - * consumer_ino(voffset), consumer_name(voffset))) != 0) - * break; - * } - * return (gfs_readdir_fini(..., error, eofp, eof)); - * - * As you can see, a zero result from gfs_readdir_pred() or - * gfs_readdir_emit() indicates that processing should continue, - * whereas a non-zero result indicates that the loop should terminate. - * Most consumers need do nothing more than let gfs_readdir_fini() - * determine what the cause of failure was and return the appropriate - * value. - */ -int -gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, - uio_t *uiop, ino64_t parent, ino64_t self) -{ - if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 || - (uiop->uio_loffset % ureclen) != 0) - return (EINVAL); - - st->grd_ureclen = ureclen; - st->grd_oresid = uiop->uio_resid; - st->grd_namlen = name_max; - st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP); - st->grd_parent = parent; - st->grd_self = self; - - return (0); -} - -/* - * gfs_readdir_emit_int: internal routine to emit directory entry - * - * st - the current readdir state, which must have d_ino and d_name - * set - * uiop - caller-supplied uio pointer - * next - the offset of the next entry - */ -static int -gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, - int *ncookies, u_long **cookies) -{ - int reclen, namlen; - - namlen = strlen(st->grd_dirent->d_name); - reclen = DIRENT64_RECLEN(namlen); - - if (reclen > uiop->uio_resid) { - /* - * Error if no entries were returned yet - */ - if (uiop->uio_resid == st->grd_oresid) - return (EINVAL); - return (-1); - } - - /* XXX: This can change in the future. 
*/ - st->grd_dirent->d_type = DT_DIR; - st->grd_dirent->d_reclen = (ushort_t)reclen; - st->grd_dirent->d_namlen = namlen; - - if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop)) - return (EFAULT); - - uiop->uio_loffset = next; - if (*cookies != NULL) { - **cookies = next; - (*cookies)++; - (*ncookies)--; - KASSERT(*ncookies >= 0, ("ncookies=%d", *ncookies)); - } - - return (0); -} - -/* - * gfs_readdir_emit: emit a directory entry - * voff - the virtual offset (obtained from gfs_readdir_pred) - * ino - the entry's inode - * name - the entry's name - * - * Returns a 0 on success, a non-zero errno on failure, or -1 if the - * readdir loop should terminate. A non-zero result (either errno or - * -1) from this function is typically passed directly to - * gfs_readdir_fini(). - */ -int -gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, - ino64_t ino, const char *name, int *ncookies, u_long **cookies) -{ - offset_t off = (voff + 2) * st->grd_ureclen; - - st->grd_dirent->d_ino = ino; - (void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen); - - /* - * Inter-entry offsets are invalid, so we assume a record size of - * grd_ureclen and explicitly set the offset appropriately. - */ - return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen, ncookies, - cookies)); -} - -/* - * gfs_readdir_pred: readdir loop predicate - * voffp - a pointer in which the next virtual offset should be stored - * - * Returns a 0 on success, a non-zero errno on failure, or -1 if the - * readdir loop should terminate. A non-zero result (either errno or - * -1) from this function is typically passed directly to - * gfs_readdir_fini(). - */ -int -gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp, - int *ncookies, u_long **cookies) -{ - offset_t off, voff; - int error; - -top: - if (uiop->uio_resid <= 0) - return (-1); - - off = uiop->uio_loffset / st->grd_ureclen; - voff = off - 2; - if (off == 0) { - if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self, - ".", ncookies, cookies)) == 0) - goto top; - } else if (off == 1) { - if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent, - "..", ncookies, cookies)) == 0) - goto top; - } else { - *voffp = voff; - return (0); - } - - return (error); -} - -/* - * gfs_readdir_fini: generic readdir cleanup - * error - if positive, an error to return - * eofp - the eofp passed to readdir - * eof - the eof value - * - * Returns a 0 on success, a non-zero errno on failure. This result - * should be returned from readdir. - */ -int -gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof) -{ - kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen)); - if (error > 0) - return (error); - if (eofp) - *eofp = eof; - return (0); -} - -/* - * gfs_lookup_dot - * - * Performs a basic check for "." and ".." directory entries. 
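Illustrative sketch only: a hypothetical flat, fixed-size directory built on the low-level gfs_readdir_*() helpers above, following the "Typical VOP_READDIR usage" pattern quoted in the gfs_readdir_init() comment. EXAMPLE_NENTRIES, example_ino() and example_name() are assumed placeholders, not part of this file.

static int
example_readdir(uio_t *uiop, int *eofp, int *ncookies, u_long **cookies,
    ino64_t parent, ino64_t self)
{
	gfs_readdir_state_t gst;
	offset_t voff;
	int error, eof = 0;

	if ((error = gfs_readdir_init(&gst, MAXNAMELEN, 1, uiop,
	    parent, self)) != 0)
		return (error);

	while ((error = gfs_readdir_pred(&gst, uiop, &voff,
	    ncookies, cookies)) == 0) {
		if (voff >= EXAMPLE_NENTRIES) {
			/* Past the last entry: report EOF via gfs_readdir_fini(). */
			eof = 1;
			break;
		}
		if ((error = gfs_readdir_emit(&gst, uiop, voff,
		    example_ino(voff), example_name(voff),
		    ncookies, cookies)) != 0)
			break;
	}
	return (gfs_readdir_fini(&gst, error, eofp, eof));
}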
- */ -int -gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm) -{ - if (*nm == '\0' || strcmp(nm, ".") == 0) { - VN_HOLD(dvp); - *vpp = dvp; - return (0); - } else if (strcmp(nm, "..") == 0) { - if (pvp == NULL) { - ASSERT(dvp->v_flag & VROOT); - VN_HOLD(dvp); - *vpp = dvp; - } else { - VN_HOLD(pvp); - *vpp = pvp; - } - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - return (0); - } - - return (-1); -} - -/* - * gfs_file_create(): create a new GFS file - * - * size - size of private data structure (v_data) - * pvp - parent vnode (GFS directory) - * ops - vnode operations vector - * - * In order to use this interface, the parent vnode must have been created by - * gfs_dir_create(), and the private data stored in v_data must have a - * 'gfs_file_t' as its first field. - * - * Given these constraints, this routine will automatically: - * - * - Allocate v_data for the vnode - * - Initialize necessary fields in the vnode - * - Hold the parent - */ -vnode_t * -gfs_file_create(size_t size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops) -{ - gfs_file_t *fp; - vnode_t *vp; - int error; - - /* - * Allocate vnode and internal data structure - */ - fp = kmem_zalloc(size, KM_SLEEP); - error = getnewvnode("zfs", vfsp, ops, &vp); - ASSERT(error == 0); - vp->v_data = (caddr_t)fp; - - /* - * Set up various pointers - */ - fp->gfs_vnode = vp; - fp->gfs_parent = pvp; - fp->gfs_size = size; - fp->gfs_type = GFS_FILE; - - error = insmntque(vp, vfsp); - KASSERT(error == 0, ("insmntque() failed: error %d", error)); - - /* - * Initialize vnode and hold parent. - */ - if (pvp) - VN_HOLD(pvp); - - return (vp); -} - -/* - * gfs_dir_create: creates a new directory in the parent - * - * size - size of private data structure (v_data) - * pvp - parent vnode (GFS directory) - * ops - vnode operations vector - * entries - NULL-terminated list of static entries (if any) - * maxlen - maximum length of a directory entry - * readdir_cb - readdir callback (see gfs_dir_readdir) - * inode_cb - inode callback (see gfs_dir_readdir) - * lookup_cb - lookup callback (see gfs_dir_lookup) - * - * In order to use this function, the first member of the private vnode - * structure (v_data) must be a gfs_dir_t. For each directory, there are - * static entries, defined when the structure is initialized, and dynamic - * entries, retrieved through callbacks. - * - * If a directory has static entries, then it must supply a inode callback, - * which will compute the inode number based on the parent and the index. - * For a directory with dynamic entries, the caller must supply a readdir - * callback and a lookup callback. If a static lookup fails, we fall back to - * the supplied lookup callback, if any. - * - * This function also performs the same initialization as gfs_file_create(). 
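Illustrative sketch only (none of the "example_*" names exist in this file): creating a GFS directory with one statically defined, cached entry and no dynamic entries, per the gfs_dir_create() contract described above. The constructor and inode callback are assumed to be defined elsewhere.

static vnode_t *example_status_ctor(vnode_t *dvp);	/* hypothetical constructor */
static ino64_t example_inode_cb(vnode_t *dvp, int index);	/* hypothetical inode callback */

static gfs_dirent_t example_entries[] = {
	{ .gfse_name = "status", .gfse_ctor = example_status_ctor,
	    .gfse_flags = GFS_CACHE_VNODE },
	{ .gfse_name = NULL }
};

static vnode_t *
example_mkdir(vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops)
{
	/* Static entries only, so only the inode callback is required. */
	return (gfs_dir_create(sizeof (gfs_dir_t), pvp, vfsp, ops,
	    example_entries, example_inode_cb, MAXNAMELEN, NULL, NULL));
}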
- */ -vnode_t * -gfs_dir_create(size_t struct_size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops, - gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, - gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) -{ - vnode_t *vp; - gfs_dir_t *dp; - gfs_dirent_t *de; - - vp = gfs_file_create(struct_size, pvp, vfsp, ops); - vp->v_type = VDIR; - - dp = vp->v_data; - dp->gfsd_file.gfs_type = GFS_DIR; - dp->gfsd_maxlen = maxlen; - - if (entries != NULL) { - for (de = entries; de->gfse_name != NULL; de++) - dp->gfsd_nstatic++; - - dp->gfsd_static = kmem_alloc( - dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP); - bcopy(entries, dp->gfsd_static, - dp->gfsd_nstatic * sizeof (gfs_dirent_t)); - } - - dp->gfsd_readdir = readdir_cb; - dp->gfsd_lookup = lookup_cb; - dp->gfsd_inode = inode_cb; - - mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL); - - return (vp); -} - -/* - * gfs_root_create(): create a root vnode for a GFS filesystem - * - * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The - * only difference is that it takes a vfs_t instead of a vnode_t as its parent. - */ -vnode_t * -gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino, - gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, - gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) -{ - vnode_t *vp; - - VFS_HOLD(vfsp); - vp = gfs_dir_create(size, NULL, vfsp, ops, entries, inode_cb, - maxlen, readdir_cb, lookup_cb); - /* Manually set the inode */ - ((gfs_file_t *)vp->v_data)->gfs_ino = ino; - vp->v_flag |= VROOT; - - return (vp); -} - -/* - * gfs_file_inactive() - * - * Called from the VOP_INACTIVE() routine. If necessary, this routine will - * remove the given vnode from the parent directory and clean up any references - * in the VFS layer. - * - * If the vnode was not removed (due to a race with vget), then NULL is - * returned. Otherwise, a pointer to the private data is returned. - */ -void * -gfs_file_inactive(vnode_t *vp) -{ - int i; - gfs_dirent_t *ge = NULL; - gfs_file_t *fp = vp->v_data; - gfs_dir_t *dp = NULL; - void *data; - - if (fp->gfs_parent == NULL) - goto found; - - dp = fp->gfs_parent->v_data; - - /* - * First, see if this vnode is cached in the parent. - */ - gfs_dir_lock(dp); - - /* - * Find it in the set of static entries. - */ - for (i = 0; i < dp->gfsd_nstatic; i++) { - ge = &dp->gfsd_static[i]; - - if (ge->gfse_vnode == vp) - goto found; - } - - /* - * If 'ge' is NULL, then it is a dynamic entry. - */ - ge = NULL; - -found: - VI_LOCK(vp); - ASSERT(vp->v_count < 2); - /* - * Really remove this vnode - */ - data = vp->v_data; - if (ge != NULL) { - /* - * If this was a statically cached entry, simply set the - * cached vnode to NULL. - */ - ge->gfse_vnode = NULL; - } - if (vp->v_count == 1) { - vp->v_usecount--; - vdropl(vp); - } else { - VI_UNLOCK(vp); - } - - /* - * Free vnode and release parent - */ - if (fp->gfs_parent) { - gfs_dir_unlock(dp); - VI_LOCK(fp->gfs_parent); - fp->gfs_parent->v_usecount--; - VI_UNLOCK(fp->gfs_parent); - } else { - ASSERT(vp->v_vfsp != NULL); - VFS_RELE(vp->v_vfsp); - } - - return (data); -} - -/* - * gfs_dir_inactive() - * - * Same as above, but for directories. 
- */ -void * -gfs_dir_inactive(vnode_t *vp) -{ - gfs_dir_t *dp; - - ASSERT(vp->v_type == VDIR); - - if ((dp = gfs_file_inactive(vp)) != NULL) { - mutex_destroy(&dp->gfsd_lock); - if (dp->gfsd_nstatic) - kmem_free(dp->gfsd_static, - dp->gfsd_nstatic * sizeof (gfs_dirent_t)); - } - - return (dp); -} - -/* - * gfs_dir_lookup() - * - * Looks up the given name in the directory and returns the corresponding vnode, - * if found. - * - * First, we search statically defined entries, if any. If a match is found, - * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the - * existing vnode. Otherwise, we call the static entry's callback routine, - * caching the result if necessary. - * - * If no static entry is found, we invoke the lookup callback, if any. The - * arguments to this callback are: - * - * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp); - * - * pvp - parent vnode - * nm - name of entry - * vpp - pointer to resulting vnode - * - * Returns 0 on success, non-zero on error. - */ -int -gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) -{ - int i; - gfs_dirent_t *ge; - vnode_t *vp; - gfs_dir_t *dp = dvp->v_data; - int ret = 0; - - ASSERT(dvp->v_type == VDIR); - - if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) - return (0); - - gfs_dir_lock(dp); - - /* - * Search static entries. - */ - for (i = 0; i < dp->gfsd_nstatic; i++) { - ge = &dp->gfsd_static[i]; - - if (strcmp(ge->gfse_name, nm) == 0) { - if (ge->gfse_vnode) { - ASSERT(ge->gfse_flags & GFS_CACHE_VNODE); - vp = ge->gfse_vnode; - VN_HOLD(vp); - goto out; - } - - /* - * We drop the directory lock, as the constructor will - * need to do KM_SLEEP allocations. If we return from - * the constructor only to find that a parallel - * operation has completed, and GFS_CACHE_VNODE is set - * for this entry, we discard the result in favor of the - * cached vnode. - */ - gfs_dir_unlock(dp); - vp = ge->gfse_ctor(dvp); - gfs_dir_lock(dp); - - ((gfs_file_t *)vp->v_data)->gfs_index = i; - - /* Set the inode according to the callback. */ - ((gfs_file_t *)vp->v_data)->gfs_ino = - dp->gfsd_inode(dvp, i); - - if (ge->gfse_flags & GFS_CACHE_VNODE) { - if (ge->gfse_vnode == NULL) { - ge->gfse_vnode = vp; - } else { - /* - * A parallel constructor beat us to it; - * return existing vnode. We have to be - * careful because we can't release the - * current vnode while holding the - * directory lock; its inactive routine - * will try to lock this directory. - */ - vnode_t *oldvp = vp; - vp = ge->gfse_vnode; - VN_HOLD(vp); - - gfs_dir_unlock(dp); - VN_RELE(oldvp); - gfs_dir_lock(dp); - } - } - - goto out; - } - } - - /* - * See if there is a dynamic constructor. - */ - if (dp->gfsd_lookup) { - ino64_t ino; - gfs_file_t *fp; - - /* - * Once again, drop the directory lock, as the lookup routine - * will need to allocate memory, or otherwise deadlock on this - * directory. - */ - gfs_dir_unlock(dp); - ret = dp->gfsd_lookup(dvp, nm, &vp, &ino); - gfs_dir_lock(dp); - if (ret != 0) - goto out; - - fp = (gfs_file_t *)vp->v_data; - fp->gfs_index = -1; - fp->gfs_ino = ino; - } else { - /* - * No static entry found, and there is no lookup callback, so - * return ENOENT. 
- */ - ret = ENOENT; - } - -out: - gfs_dir_unlock(dp); - - if (ret == 0) - *vpp = vp; - else - *vpp = NULL; - - return (ret); -} - -/* - * gfs_dir_readdir: does a readdir() on the given directory - * - * dvp - directory vnode - * uiop - uio structure - * eofp - eof pointer - * data - arbitrary data passed to readdir callback - * - * This routine does all the readdir() dirty work. Even so, the caller must - * supply two callbacks in order to get full compatibility. - * - * If the directory contains static entries, an inode callback must be - * specified. This avoids having to create every vnode and call VOP_GETATTR() - * when reading the directory. This function has the following arguments: - * - * ino_t gfs_inode_cb(vnode_t *vp, int index); - * - * vp - vnode for the directory - * index - index in original gfs_dirent_t array - * - * Returns the inode number for the given entry. - * - * For directories with dynamic entries, a readdir callback must be provided. - * This is significantly more complex, thanks to the particulars of - * VOP_READDIR(). - * - * int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, - * offset_t *off, offset_t *nextoff, void *data) - * - * vp - directory vnode - * dp - directory entry, sized according to maxlen given to - * gfs_dir_create(). callback must fill in d_name and - * d_ino. - * eofp - callback must set to 1 when EOF has been reached - * off - on entry, the last offset read from the directory. Callback - * must set to the offset of the current entry, typically left - * untouched. - * nextoff - callback must set to offset of next entry. Typically - * (off + 1) - * data - caller-supplied data - * - * Return 0 on success, or error on failure. - */ -int -gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, - u_long **cookies, void *data) -{ - gfs_readdir_state_t gstate; - int error, eof = 0; - ino64_t ino, pino; - offset_t off, next; - gfs_dir_t *dp = dvp->v_data; - - ino = dp->gfsd_file.gfs_ino; - - if (dp->gfsd_file.gfs_parent == NULL) - pino = ino; /* root of filesystem */ - else - pino = ((gfs_file_t *) - (dp->gfsd_file.gfs_parent->v_data))->gfs_ino; - - if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop, - pino, ino)) != 0) - return (error); - - while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies, - cookies)) == 0 && !eof) { - - if (off >= 0 && off < dp->gfsd_nstatic) { - ino = dp->gfsd_inode(dvp, off); - - if ((error = gfs_readdir_emit(&gstate, uiop, - off, ino, dp->gfsd_static[off].gfse_name, ncookies, - cookies)) != 0) - break; - - } else if (dp->gfsd_readdir) { - off -= dp->gfsd_nstatic; - - if ((error = dp->gfsd_readdir(dvp, - gstate.grd_dirent, &eof, &off, &next, - data)) != 0 || eof) - break; - - off += dp->gfsd_nstatic + 2; - next += dp->gfsd_nstatic + 2; - - if ((error = gfs_readdir_emit_int(&gstate, uiop, - next, ncookies, cookies)) != 0) - break; - } else { - /* - * Offset is beyond the end of the static entries, and - * we have no dynamic entries. Set EOF. - */ - eof = 1; - } - } - - return (gfs_readdir_fini(&gstate, error, eofp, eof)); -} - -/* - * gfs_vop_readdir: VOP_READDIR() entry point - * - * For use directly in vnode ops table. Given a GFS directory, calls - * gfs_dir_readdir() as necessary. 
- */ -/* ARGSUSED */ -int -gfs_vop_readdir(ap) - struct vop_readdir_args /* { - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; - int *a_eofflag; - int *ncookies; - u_long **a_cookies; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - uio_t *uiop = ap->a_uio; - int *eofp = ap->a_eofflag; - int ncookies = 0; - u_long *cookies = NULL; - int error; - - if (ap->a_ncookies) { - /* - * Minimum entry size is dirent size and 1 byte for a file name. - */ - ncookies = uiop->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); - cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK); - *ap->a_cookies = cookies; - *ap->a_ncookies = ncookies; - } - - error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL); - - if (error == 0) { - /* Subtract unused cookies */ - if (ap->a_ncookies) - *ap->a_ncookies -= ncookies; - } else if (ap->a_ncookies) { - free(*ap->a_cookies, M_TEMP); - *ap->a_cookies = NULL; - *ap->a_ncookies = 0; - } - - return (error); -} - -/* - * gfs_vop_inactive: VOP_INACTIVE() entry point - * - * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or - * gfs_dir_inactive() as necessary, and kmem_free()s associated private data. - */ -/* ARGSUSED */ -int -gfs_vop_inactive(ap) - struct vop_inactive_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - gfs_file_t *fp = vp->v_data; - void *data; - - if (fp->gfs_type == GFS_DIR) - data = gfs_dir_inactive(vp); - else - data = gfs_file_inactive(vp); - - if (data != NULL) - kmem_free(data, fp->gfs_size); - vp->v_data = NULL; - return (0); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c deleted file mode 100644 index 420f802..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ /dev/null @@ -1,2859 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * DVA-based Adjustable Replacement Cache - * - * While much of the theory of operation used here is - * based on the self-tuning, low overhead replacement cache - * presented by Megiddo and Modha at FAST 2003, there are some - * significant differences: - * - * 1. The Megiddo and Modha model assumes any page is evictable. - * Pages in its cache cannot be "locked" into memory. This makes - * the eviction algorithm simple: evict the last page in the list. - * This also make the performance characteristics easy to reason - * about. Our cache is not so simple. 
At any given moment, some - * subset of the blocks in the cache are un-evictable because we - * have handed out a reference to them. Blocks are only evictable - * when there are no external references active. This makes - * eviction far more problematic: we choose to evict the evictable - * blocks that are the "lowest" in the list. - * - * There are times when it is not possible to evict the requested - * space. In these circumstances we are unable to adjust the cache - * size. To prevent the cache growing unbounded at these times we - * implement a "cache throttle" that slowes the flow of new data - * into the cache until we can make space avaiable. - * - * 2. The Megiddo and Modha model assumes a fixed cache size. - * Pages are evicted when the cache is full and there is a cache - * miss. Our model has a variable sized cache. It grows with - * high use, but also tries to react to memory preasure from the - * operating system: decreasing its size when system memory is - * tight. - * - * 3. The Megiddo and Modha model assumes a fixed page size. All - * elements of the cache are therefor exactly the same size. So - * when adjusting the cache size following a cache miss, its simply - * a matter of choosing a single page to evict. In our model, we - * have variable sized cache blocks (rangeing from 512 bytes to - * 128K bytes). We therefor choose a set of blocks to evict to make - * space for a cache miss that approximates as closely as possible - * the space used by the new block. - * - * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" - * by N. Megiddo & D. Modha, FAST 2003 - */ - -/* - * The locking model: - * - * A new reference to a cache buffer can be obtained in two - * ways: 1) via a hash table lookup using the DVA as a key, - * or 2) via one of the ARC lists. The arc_read() inerface - * uses method 1, while the internal arc algorithms for - * adjusting the cache use method 2. We therefor provide two - * types of locks: 1) the hash table lock array, and 2) the - * arc list locks. - * - * Buffers do not have their own mutexs, rather they rely on the - * hash table mutexs for the bulk of their protection (i.e. most - * fields in the arc_buf_hdr_t are protected by these mutexs). - * - * buf_hash_find() returns the appropriate mutex (held) when it - * locates the requested buffer in the hash table. It returns - * NULL for the mutex if the buffer was not in the table. - * - * buf_hash_remove() expects the appropriate hash mutex to be - * already held before it is invoked. - * - * Each arc state also has a mutex which is used to protect the - * buffer list associated with the state. When attempting to - * obtain a hash table lock while holding an arc list lock you - * must use: mutex_tryenter() to avoid deadlock. Also note that - * the active state mutex must be held before the ghost state mutex. - * - * Arc buffers may have an associated eviction callback function. - * This function will be invoked prior to removing the buffer (e.g. - * in arc_do_user_evicts()). Note however that the data associated - * with the buffer may be evicted prior to the callback. The callback - * must be made with *no locks held* (to prevent deadlock). Additionally, - * the users of callbacks must ensure that their private data is - * protected from simultaneous callbacks from arc_buf_evict() - * and arc_do_user_evicts(). - * - * Note that the majority of the performance stats are manipulated - * with atomic operations. 
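A minimal sketch (not from the original file) of the lock-ordering rule described above, modeled on what arc_evict() does further down: while an ARC list lock (arcs_mtx) is held, hash locks may only be taken with mutex_tryenter(), and contended buffers are skipped.

static void
example_walk_list(arc_state_t *state)	/* hypothetical helper, for illustration */
{
	arc_buf_hdr_t *ab;
	kmutex_t *hash_lock;

	mutex_enter(&state->arcs_mtx);
	for (ab = list_tail(&state->arcs_list); ab != NULL;
	    ab = list_prev(&state->arcs_list, ab)) {
		hash_lock = HDR_LOCK(ab);
		if (!mutex_tryenter(hash_lock))
			continue;	/* never block here: deadlock risk */
		/* ... the buffer may now be examined or moved ... */
		mutex_exit(hash_lock);
	}
	mutex_exit(&state->arcs_mtx);
}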
- */ - -#include <sys/spa.h> -#include <sys/zio.h> -#include <sys/zio_checksum.h> -#include <sys/zfs_context.h> -#include <sys/arc.h> -#include <sys/refcount.h> -#ifdef _KERNEL -#include <sys/dnlc.h> -#endif -#include <sys/callb.h> -#include <sys/kstat.h> -#include <sys/sdt.h> - -static kmutex_t arc_reclaim_thr_lock; -static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ -static uint8_t arc_thread_exit; - -#define ARC_REDUCE_DNLC_PERCENT 3 -uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; - -typedef enum arc_reclaim_strategy { - ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ - ARC_RECLAIM_CONS /* Conservative reclaim strategy */ -} arc_reclaim_strategy_t; - -/* number of seconds before growing cache again */ -static int arc_grow_retry = 60; - -/* - * minimum lifespan of a prefetch block in clock ticks - * (initialized in arc_init()) - */ -static int arc_min_prefetch_lifespan; - -static int arc_dead; - -/* - * These tunables are for performance analysis. - */ -u_long zfs_arc_max; -u_long zfs_arc_min; -TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max); -TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min); -SYSCTL_DECL(_vfs_zfs); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, - "Maximum ARC size"); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, - "Minimum ARC size"); - -/* - * Note that buffers can be on one of 5 states: - * ARC_anon - anonymous (discussed below) - * ARC_mru - recently used, currently cached - * ARC_mru_ghost - recentely used, no longer in cache - * ARC_mfu - frequently used, currently cached - * ARC_mfu_ghost - frequently used, no longer in cache - * When there are no active references to the buffer, they - * are linked onto one of the lists in arc. These are the - * only buffers that can be evicted or deleted. - * - * Anonymous buffers are buffers that are not associated with - * a DVA. These are buffers that hold dirty block copies - * before they are written to stable storage. By definition, - * they are "ref'd" and are considered part of arc_mru - * that cannot be freed. Generally, they will aquire a DVA - * as they are written and migrate onto the arc_mru list. 
- */ - -typedef struct arc_state { - list_t arcs_list; /* linked list of evictable buffer in state */ - uint64_t arcs_lsize; /* total size of buffers in the linked list */ - uint64_t arcs_size; /* total size of all buffers in this state */ - kmutex_t arcs_mtx; -} arc_state_t; - -/* The 5 states: */ -static arc_state_t ARC_anon; -static arc_state_t ARC_mru; -static arc_state_t ARC_mru_ghost; -static arc_state_t ARC_mfu; -static arc_state_t ARC_mfu_ghost; - -typedef struct arc_stats { - kstat_named_t arcstat_hits; - kstat_named_t arcstat_misses; - kstat_named_t arcstat_demand_data_hits; - kstat_named_t arcstat_demand_data_misses; - kstat_named_t arcstat_demand_metadata_hits; - kstat_named_t arcstat_demand_metadata_misses; - kstat_named_t arcstat_prefetch_data_hits; - kstat_named_t arcstat_prefetch_data_misses; - kstat_named_t arcstat_prefetch_metadata_hits; - kstat_named_t arcstat_prefetch_metadata_misses; - kstat_named_t arcstat_mru_hits; - kstat_named_t arcstat_mru_ghost_hits; - kstat_named_t arcstat_mfu_hits; - kstat_named_t arcstat_mfu_ghost_hits; - kstat_named_t arcstat_deleted; - kstat_named_t arcstat_recycle_miss; - kstat_named_t arcstat_mutex_miss; - kstat_named_t arcstat_evict_skip; - kstat_named_t arcstat_hash_elements; - kstat_named_t arcstat_hash_elements_max; - kstat_named_t arcstat_hash_collisions; - kstat_named_t arcstat_hash_chains; - kstat_named_t arcstat_hash_chain_max; - kstat_named_t arcstat_p; - kstat_named_t arcstat_c; - kstat_named_t arcstat_c_min; - kstat_named_t arcstat_c_max; - kstat_named_t arcstat_size; -} arc_stats_t; - -static arc_stats_t arc_stats = { - { "hits", KSTAT_DATA_UINT64 }, - { "misses", KSTAT_DATA_UINT64 }, - { "demand_data_hits", KSTAT_DATA_UINT64 }, - { "demand_data_misses", KSTAT_DATA_UINT64 }, - { "demand_metadata_hits", KSTAT_DATA_UINT64 }, - { "demand_metadata_misses", KSTAT_DATA_UINT64 }, - { "prefetch_data_hits", KSTAT_DATA_UINT64 }, - { "prefetch_data_misses", KSTAT_DATA_UINT64 }, - { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, - { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, - { "mru_hits", KSTAT_DATA_UINT64 }, - { "mru_ghost_hits", KSTAT_DATA_UINT64 }, - { "mfu_hits", KSTAT_DATA_UINT64 }, - { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, - { "deleted", KSTAT_DATA_UINT64 }, - { "recycle_miss", KSTAT_DATA_UINT64 }, - { "mutex_miss", KSTAT_DATA_UINT64 }, - { "evict_skip", KSTAT_DATA_UINT64 }, - { "hash_elements", KSTAT_DATA_UINT64 }, - { "hash_elements_max", KSTAT_DATA_UINT64 }, - { "hash_collisions", KSTAT_DATA_UINT64 }, - { "hash_chains", KSTAT_DATA_UINT64 }, - { "hash_chain_max", KSTAT_DATA_UINT64 }, - { "p", KSTAT_DATA_UINT64 }, - { "c", KSTAT_DATA_UINT64 }, - { "c_min", KSTAT_DATA_UINT64 }, - { "c_max", KSTAT_DATA_UINT64 }, - { "size", KSTAT_DATA_UINT64 } -}; - -#define ARCSTAT(stat) (arc_stats.stat.value.ui64) - -#define ARCSTAT_INCR(stat, val) \ - atomic_add_64(&arc_stats.stat.value.ui64, (val)); - -#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) -#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) - -#define ARCSTAT_MAX(stat, val) { \ - uint64_t m; \ - while ((val) > (m = arc_stats.stat.value.ui64) && \ - (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ - continue; \ -} - -#define ARCSTAT_MAXSTAT(stat) \ - ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) - -/* - * We define a macro to allow ARC hits/misses to be easily broken down by - * two separate conditions, giving a total of four different subtypes for - * each of hits and misses (so eight statistics total). 
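As a concrete illustration of the macro defined just below: the hit-accounting call made later in arc_buf_add_ref(), ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits), pastes the condition names together and expands to:

	if (!(hdr->b_flags & ARC_PREFETCH)) {
		if (hdr->b_type != ARC_BUFC_METADATA)
			ARCSTAT_BUMP(arcstat_demand_data_hits);
		else
			ARCSTAT_BUMP(arcstat_demand_metadata_hits);
	} else {
		if (hdr->b_type != ARC_BUFC_METADATA)
			ARCSTAT_BUMP(arcstat_prefetch_data_hits);
		else
			ARCSTAT_BUMP(arcstat_prefetch_metadata_hits);
	}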
- */ -#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ - if (cond1) { \ - if (cond2) { \ - ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ - } else { \ - ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ - } \ - } else { \ - if (cond2) { \ - ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ - } else { \ - ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ - } \ - } - -kstat_t *arc_ksp; -static arc_state_t *arc_anon; -static arc_state_t *arc_mru; -static arc_state_t *arc_mru_ghost; -static arc_state_t *arc_mfu; -static arc_state_t *arc_mfu_ghost; - -/* - * There are several ARC variables that are critical to export as kstats -- - * but we don't want to have to grovel around in the kstat whenever we wish to - * manipulate them. For these variables, we therefore define them to be in - * terms of the statistic variable. This assures that we are not introducing - * the possibility of inconsistency by having shadow copies of the variables, - * while still allowing the code to be readable. - */ -#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ -#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ -#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ -#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ -#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ - -static int arc_no_grow; /* Don't try to grow cache size */ -static uint64_t arc_tempreserve; - -typedef struct arc_callback arc_callback_t; - -struct arc_callback { - void *acb_private; - arc_done_func_t *acb_done; - arc_byteswap_func_t *acb_byteswap; - arc_buf_t *acb_buf; - zio_t *acb_zio_dummy; - arc_callback_t *acb_next; -}; - -typedef struct arc_write_callback arc_write_callback_t; - -struct arc_write_callback { - void *awcb_private; - arc_done_func_t *awcb_ready; - arc_done_func_t *awcb_done; - arc_buf_t *awcb_buf; -}; - -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - uint64_t b_cksum0; - - kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; - - arc_buf_hdr_t *b_hash_next; - arc_buf_t *b_buf; - uint32_t b_flags; - uint32_t b_datacnt; - - arc_callback_t *b_acb; - kcondvar_t b_cv; - - /* immutable */ - arc_buf_contents_t b_type; - uint64_t b_size; - spa_t *b_spa; - - /* protected by arc state mutex */ - arc_state_t *b_state; - list_node_t b_arc_node; - - /* updated atomically */ - clock_t b_arc_access; - - /* self protecting */ - refcount_t b_refcnt; -}; - -static arc_buf_t *arc_eviction_list; -static kmutex_t arc_eviction_mtx; -static arc_buf_hdr_t arc_eviction_hdr; -static void arc_get_data_buf(arc_buf_t *buf); -static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); - -#define GHOST_STATE(state) \ - ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) - -/* - * Private ARC flags. These flags are private ARC only flags that will show up - * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can - * be passed in as arc_flags in things like arc_read. However, these flags - * should never be passed and should only be set by ARC code. When adding new - * public flags, make sure not to smash the private ones. 
- */ - -#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ -#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ -#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ -#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ -#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ -#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ - -#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) -#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) -#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) -#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) -#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) - -/* - * Hash table routines - */ - -#define HT_LOCK_PAD 128 - -struct ht_lock { - kmutex_t ht_lock; -#ifdef _KERNEL - unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; -#endif -}; - -#define BUF_LOCKS 256 -typedef struct buf_hash_table { - uint64_t ht_mask; - arc_buf_hdr_t **ht_table; - struct ht_lock ht_locks[BUF_LOCKS]; -} buf_hash_table_t; - -static buf_hash_table_t buf_hash_table; - -#define BUF_HASH_INDEX(spa, dva, birth) \ - (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) -#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) -#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) -#define HDR_LOCK(buf) \ - (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) - -uint64_t zfs_crc64_table[256]; - -static uint64_t -buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) -{ - uintptr_t spav = (uintptr_t)spa; - uint8_t *vdva = (uint8_t *)dva; - uint64_t crc = -1ULL; - int i; - - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - - for (i = 0; i < sizeof (dva_t); i++) - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; - - crc ^= (spav>>8) ^ birth; - - return (crc); -} - -#define BUF_EMPTY(buf) \ - ((buf)->b_dva.dva_word[0] == 0 && \ - (buf)->b_dva.dva_word[1] == 0 && \ - (buf)->b_birth == 0) - -#define BUF_EQUAL(spa, dva, birth, buf) \ - ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ - ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ - ((buf)->b_birth == birth) && ((buf)->b_spa == spa) - -static arc_buf_hdr_t * -buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) -{ - uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); - kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *buf; - - mutex_enter(hash_lock); - for (buf = buf_hash_table.ht_table[idx]; buf != NULL; - buf = buf->b_hash_next) { - if (BUF_EQUAL(spa, dva, birth, buf)) { - *lockp = hash_lock; - return (buf); - } - } - mutex_exit(hash_lock); - *lockp = NULL; - return (NULL); -} - -/* - * Insert an entry into the hash table. If there is already an element - * equal to elem in the hash table, then the already existing element - * will be returned and the new element will not be inserted. - * Otherwise returns NULL. 
- */ -static arc_buf_hdr_t * -buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) -{ - uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); - kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *fbuf; - uint32_t i; - - ASSERT(!HDR_IN_HASH_TABLE(buf)); - *lockp = hash_lock; - mutex_enter(hash_lock); - for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; - fbuf = fbuf->b_hash_next, i++) { - if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) - return (fbuf); - } - - buf->b_hash_next = buf_hash_table.ht_table[idx]; - buf_hash_table.ht_table[idx] = buf; - buf->b_flags |= ARC_IN_HASH_TABLE; - - /* collect some hash table performance data */ - if (i > 0) { - ARCSTAT_BUMP(arcstat_hash_collisions); - if (i == 1) - ARCSTAT_BUMP(arcstat_hash_chains); - - ARCSTAT_MAX(arcstat_hash_chain_max, i); - } - - ARCSTAT_BUMP(arcstat_hash_elements); - ARCSTAT_MAXSTAT(arcstat_hash_elements); - - return (NULL); -} - -static void -buf_hash_remove(arc_buf_hdr_t *buf) -{ - arc_buf_hdr_t *fbuf, **bufp; - uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); - - ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); - ASSERT(HDR_IN_HASH_TABLE(buf)); - - bufp = &buf_hash_table.ht_table[idx]; - while ((fbuf = *bufp) != buf) { - ASSERT(fbuf != NULL); - bufp = &fbuf->b_hash_next; - } - *bufp = buf->b_hash_next; - buf->b_hash_next = NULL; - buf->b_flags &= ~ARC_IN_HASH_TABLE; - - /* collect some hash table performance data */ - ARCSTAT_BUMPDOWN(arcstat_hash_elements); - - if (buf_hash_table.ht_table[idx] && - buf_hash_table.ht_table[idx]->b_hash_next == NULL) - ARCSTAT_BUMPDOWN(arcstat_hash_chains); -} - -/* - * Global data structures and functions for the buf kmem cache. - */ -static kmem_cache_t *hdr_cache; -static kmem_cache_t *buf_cache; - -static void -buf_fini(void) -{ - int i; - - kmem_free(buf_hash_table.ht_table, - (buf_hash_table.ht_mask + 1) * sizeof (void *)); - for (i = 0; i < BUF_LOCKS; i++) - mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); - kmem_cache_destroy(hdr_cache); - kmem_cache_destroy(buf_cache); -} - -/* - * Constructor callback - called when the cache is empty - * and a new buf is requested. - */ -/* ARGSUSED */ -static int -hdr_cons(void *vbuf, void *unused, int kmflag) -{ - arc_buf_hdr_t *buf = vbuf; - - bzero(buf, sizeof (arc_buf_hdr_t)); - refcount_create(&buf->b_refcnt); - cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); - return (0); -} - -/* - * Destructor callback - called when a cached buf is - * no longer required. - */ -/* ARGSUSED */ -static void -hdr_dest(void *vbuf, void *unused) -{ - arc_buf_hdr_t *buf = vbuf; - - refcount_destroy(&buf->b_refcnt); - cv_destroy(&buf->b_cv); -} - -/* - * Reclaim callback -- invoked when memory is low. - */ -/* ARGSUSED */ -static void -hdr_recl(void *unused) -{ - dprintf("hdr_recl called\n"); - /* - * umem calls the reclaim func when we destroy the buf cache, - * which is after we do arc_fini(). - */ - if (!arc_dead) - cv_signal(&arc_reclaim_thr_cv); -} - -static void -buf_init(void) -{ - uint64_t *ct; - uint64_t hsize = 1ULL << 12; - int i, j; - - /* - * The hash table is big enough to fill all of physical memory - * with an average 64K block size. The table will take up - * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 
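A standalone arithmetic check (illustrative only, not part of the original file) of the sizing rule above, assuming 4 GiB of physical memory and 8-byte pointers: it prints 65536 buckets and 512 KiB of bucket pointers, matching the quoted 128KB/GB estimate.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t physbytes = 4ULL << 30;	/* assume 4 GiB of RAM */
	uint64_t hsize = 1ULL << 12;

	while (hsize * 65536 < physbytes)	/* same loop shape as buf_init() */
		hsize <<= 1;

	printf("%llu buckets, %llu KiB of bucket pointers\n",
	    (unsigned long long)hsize,
	    (unsigned long long)(hsize * sizeof (void *) / 1024));
	return (0);
}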
- */ - while (hsize * 65536 < (uint64_t)physmem * PAGESIZE) - hsize <<= 1; -retry: - buf_hash_table.ht_mask = hsize - 1; - buf_hash_table.ht_table = - kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); - if (buf_hash_table.ht_table == NULL) { - ASSERT(hsize > (1ULL << 8)); - hsize >>= 1; - goto retry; - } - - hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), - 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); - buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); - - for (i = 0; i < 256; i++) - for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) - *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); - - for (i = 0; i < BUF_LOCKS; i++) { - mutex_init(&buf_hash_table.ht_locks[i].ht_lock, - NULL, MUTEX_DEFAULT, NULL); - } -} - -#define ARC_MINTIME (hz>>4) /* 62 ms */ - -static void -arc_cksum_verify(arc_buf_t *buf) -{ - zio_cksum_t zc; - - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; - - mutex_enter(&buf->b_hdr->b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum == NULL || - (buf->b_hdr->b_flags & ARC_IO_ERROR)) { - mutex_exit(&buf->b_hdr->b_freeze_lock); - return; - } - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); - if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) - panic("buffer modified while frozen!"); - mutex_exit(&buf->b_hdr->b_freeze_lock); -} - -static void -arc_cksum_compute(arc_buf_t *buf) -{ - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; - - mutex_enter(&buf->b_hdr->b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - mutex_exit(&buf->b_hdr->b_freeze_lock); - return; - } - buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, - buf->b_hdr->b_freeze_cksum); - mutex_exit(&buf->b_hdr->b_freeze_lock); -} - -void -arc_buf_thaw(arc_buf_t *buf) -{ - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; - - if (buf->b_hdr->b_state != arc_anon) - panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) - panic("modifying buffer while i/o in progress!"); - arc_cksum_verify(buf); - mutex_enter(&buf->b_hdr->b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - buf->b_hdr->b_freeze_cksum = NULL; - } - mutex_exit(&buf->b_hdr->b_freeze_lock); -} - -void -arc_buf_freeze(arc_buf_t *buf) -{ - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; - - ASSERT(buf->b_hdr->b_freeze_cksum != NULL || - buf->b_hdr->b_state == arc_anon); - arc_cksum_compute(buf); -} - -static void -add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) -{ - ASSERT(MUTEX_HELD(hash_lock)); - - if ((refcount_add(&ab->b_refcnt, tag) == 1) && - (ab->b_state != arc_anon)) { - uint64_t delta = ab->b_size * ab->b_datacnt; - - ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); - mutex_enter(&ab->b_state->arcs_mtx); - ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&ab->b_state->arcs_list, ab); - if (GHOST_STATE(ab->b_state)) { - ASSERT3U(ab->b_datacnt, ==, 0); - ASSERT3P(ab->b_buf, ==, NULL); - delta = ab->b_size; - } - ASSERT(delta > 0); - ASSERT3U(ab->b_state->arcs_lsize, >=, delta); - atomic_add_64(&ab->b_state->arcs_lsize, -delta); - mutex_exit(&ab->b_state->arcs_mtx); - /* remove the prefetch flag is we get a reference */ - if (ab->b_flags & ARC_PREFETCH) - ab->b_flags &= ~ARC_PREFETCH; - } -} - -static int -remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) -{ - int cnt; - arc_state_t *state = ab->b_state; - - ASSERT(state == arc_anon || 
MUTEX_HELD(hash_lock)); - ASSERT(!GHOST_STATE(state)); - - if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && - (state != arc_anon)) { - ASSERT(!MUTEX_HELD(&state->arcs_mtx)); - mutex_enter(&state->arcs_mtx); - ASSERT(!list_link_active(&ab->b_arc_node)); - list_insert_head(&state->arcs_list, ab); - ASSERT(ab->b_datacnt > 0); - atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt); - ASSERT3U(state->arcs_size, >=, state->arcs_lsize); - mutex_exit(&state->arcs_mtx); - } - return (cnt); -} - -/* - * Move the supplied buffer to the indicated state. The mutex - * for the buffer must be held by the caller. - */ -static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) -{ - arc_state_t *old_state = ab->b_state; - int64_t refcnt = refcount_count(&ab->b_refcnt); - uint64_t from_delta, to_delta; - - ASSERT(MUTEX_HELD(hash_lock)); - ASSERT(new_state != old_state); - ASSERT(refcnt == 0 || ab->b_datacnt > 0); - ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); - - from_delta = to_delta = ab->b_datacnt * ab->b_size; - - /* - * If this buffer is evictable, transfer it from the - * old state list to the new state list. - */ - if (refcnt == 0) { - if (old_state != arc_anon) { - int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); - - if (use_mutex) - mutex_enter(&old_state->arcs_mtx); - - ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&old_state->arcs_list, ab); - - /* - * If prefetching out of the ghost cache, - * we will have a non-null datacnt. - */ - if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { - /* ghost elements have a ghost size */ - ASSERT(ab->b_buf == NULL); - from_delta = ab->b_size; - } - ASSERT3U(old_state->arcs_lsize, >=, from_delta); - atomic_add_64(&old_state->arcs_lsize, -from_delta); - - if (use_mutex) - mutex_exit(&old_state->arcs_mtx); - } - if (new_state != arc_anon) { - int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); - - if (use_mutex) - mutex_enter(&new_state->arcs_mtx); - - list_insert_head(&new_state->arcs_list, ab); - - /* ghost elements have a ghost size */ - if (GHOST_STATE(new_state)) { - ASSERT(ab->b_datacnt == 0); - ASSERT(ab->b_buf == NULL); - to_delta = ab->b_size; - } - atomic_add_64(&new_state->arcs_lsize, to_delta); - ASSERT3U(new_state->arcs_size + to_delta, >=, - new_state->arcs_lsize); - - if (use_mutex) - mutex_exit(&new_state->arcs_mtx); - } - } - - ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon && old_state != arc_anon) { - buf_hash_remove(ab); - } - - /* adjust state sizes */ - if (to_delta) - atomic_add_64(&new_state->arcs_size, to_delta); - if (from_delta) { - ASSERT3U(old_state->arcs_size, >=, from_delta); - atomic_add_64(&old_state->arcs_size, -from_delta); - } - ab->b_state = new_state; -} - -arc_buf_t * -arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) -{ - arc_buf_hdr_t *hdr; - arc_buf_t *buf; - - ASSERT3U(size, >, 0); - hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); - ASSERT(BUF_EMPTY(hdr)); - hdr->b_size = size; - hdr->b_type = type; - hdr->b_spa = spa; - hdr->b_state = arc_anon; - hdr->b_arc_access = 0; - mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = NULL; - hdr->b_buf = buf; - arc_get_data_buf(buf); - hdr->b_datacnt = 1; - hdr->b_flags = 0; - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - (void) refcount_add(&hdr->b_refcnt, tag); - - return (buf); -} - -static arc_buf_t * 
-arc_buf_clone(arc_buf_t *from) -{ - arc_buf_t *buf; - arc_buf_hdr_t *hdr = from->b_hdr; - uint64_t size = hdr->b_size; - - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = hdr->b_buf; - hdr->b_buf = buf; - arc_get_data_buf(buf); - bcopy(from->b_data, buf->b_data, size); - hdr->b_datacnt += 1; - return (buf); -} - -void -arc_buf_add_ref(arc_buf_t *buf, void* tag) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - - /* - * Check to see if this buffer is currently being evicted via - * arc_do_user_evicts(). - */ - mutex_enter(&arc_eviction_mtx); - hdr = buf->b_hdr; - if (hdr == NULL) { - mutex_exit(&arc_eviction_mtx); - return; - } - hash_lock = HDR_LOCK(hdr); - mutex_exit(&arc_eviction_mtx); - - mutex_enter(hash_lock); - if (buf->b_data == NULL) { - /* - * This buffer is evicted. - */ - mutex_exit(hash_lock); - return; - } - - ASSERT(buf->b_hdr == hdr); - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); - add_reference(hdr, hash_lock, tag); - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, - data, metadata, hits); -} - -static void -arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) -{ - arc_buf_t **bufp; - - /* free up data associated with the buf */ - if (buf->b_data) { - arc_state_t *state = buf->b_hdr->b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = buf->b_hdr->b_type; - - arc_cksum_verify(buf); - if (!recycle) { - if (type == ARC_BUFC_METADATA) { - zio_buf_free(buf->b_data, size); - } else { - ASSERT(type == ARC_BUFC_DATA); - zio_data_buf_free(buf->b_data, size); - } - atomic_add_64(&arc_size, -size); - } - if (list_link_active(&buf->b_hdr->b_arc_node)) { - ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); - ASSERT(state != arc_anon); - ASSERT3U(state->arcs_lsize, >=, size); - atomic_add_64(&state->arcs_lsize, -size); - } - ASSERT3U(state->arcs_size, >=, size); - atomic_add_64(&state->arcs_size, -size); - buf->b_data = NULL; - ASSERT(buf->b_hdr->b_datacnt > 0); - buf->b_hdr->b_datacnt -= 1; - } - - /* only remove the buf if requested */ - if (!all) - return; - - /* remove the buf from the hdr list */ - for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) - continue; - *bufp = buf->b_next; - - ASSERT(buf->b_efunc == NULL); - - /* clean up the buf */ - buf->b_hdr = NULL; - kmem_cache_free(buf_cache, buf); -} - -static void -arc_hdr_destroy(arc_buf_hdr_t *hdr) -{ - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - ASSERT3P(hdr->b_state, ==, arc_anon); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - - if (!BUF_EMPTY(hdr)) { - ASSERT(!HDR_IN_HASH_TABLE(hdr)); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; - } - while (hdr->b_buf) { - arc_buf_t *buf = hdr->b_buf; - - if (buf->b_efunc) { - mutex_enter(&arc_eviction_mtx); - ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_buf, FALSE, FALSE); - hdr->b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&arc_eviction_mtx); - } else { - arc_buf_destroy(hdr->b_buf, FALSE, TRUE); - } - } - if (hdr->b_freeze_cksum != NULL) { - kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_freeze_cksum = NULL; - } - mutex_destroy(&hdr->b_freeze_lock); - - ASSERT(!list_link_active(&hdr->b_arc_node)); - ASSERT3P(hdr->b_hash_next, ==, NULL); - 
ASSERT3P(hdr->b_acb, ==, NULL); - kmem_cache_free(hdr_cache, hdr); -} - -void -arc_buf_free(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - int hashed = hdr->b_state != arc_anon; - - ASSERT(buf->b_efunc == NULL); - ASSERT(buf->b_data != NULL); - - if (hashed) { - kmutex_t *hash_lock = HDR_LOCK(hdr); - - mutex_enter(hash_lock); - (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_datacnt > 1) - arc_buf_destroy(buf, FALSE, TRUE); - else - hdr->b_flags |= ARC_BUF_AVAILABLE; - mutex_exit(hash_lock); - } else if (HDR_IO_IN_PROGRESS(hdr)) { - int destroy_hdr; - /* - * We are in the middle of an async write. Don't destroy - * this buffer unless the write completes before we finish - * decrementing the reference count. - */ - mutex_enter(&arc_eviction_mtx); - (void) remove_reference(hdr, NULL, tag); - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); - mutex_exit(&arc_eviction_mtx); - if (destroy_hdr) - arc_hdr_destroy(hdr); - } else { - if (remove_reference(hdr, NULL, tag) > 0) { - ASSERT(HDR_IO_ERROR(hdr)); - arc_buf_destroy(buf, FALSE, TRUE); - } else { - arc_hdr_destroy(hdr); - } - } -} - -int -arc_buf_remove_ref(arc_buf_t *buf, void* tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock = HDR_LOCK(hdr); - int no_callback = (buf->b_efunc == NULL); - - if (hdr->b_state == arc_anon) { - arc_buf_free(buf, tag); - return (no_callback); - } - - mutex_enter(hash_lock); - ASSERT(hdr->b_state != arc_anon); - ASSERT(buf->b_data != NULL); - - (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_datacnt > 1) { - if (no_callback) - arc_buf_destroy(buf, FALSE, TRUE); - } else if (no_callback) { - ASSERT(hdr->b_buf == buf && buf->b_next == NULL); - hdr->b_flags |= ARC_BUF_AVAILABLE; - } - ASSERT(no_callback || hdr->b_datacnt > 1 || - refcount_is_zero(&hdr->b_refcnt)); - mutex_exit(hash_lock); - return (no_callback); -} - -int -arc_buf_size(arc_buf_t *buf) -{ - return (buf->b_hdr->b_size); -} - -/* - * Evict buffers from list until we've removed the specified number of - * bytes. Move the removed buffers to the appropriate evict state. - * If the recycle flag is set, then attempt to "recycle" a buffer: - * - look for a buffer to evict that is `bytes' long. - * - return the data block from this buffer rather than freeing it. - * This flag is used by callers that are trying to make space for a - * new buffer in a full arc cache. - */ -static void * -arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, - arc_buf_contents_t type) -{ - arc_state_t *evicted_state; - uint64_t bytes_evicted = 0, skipped = 0, missed = 0; - arc_buf_hdr_t *ab, *ab_prev = NULL; - kmutex_t *hash_lock; - boolean_t have_lock; - void *stolen = NULL; - - ASSERT(state == arc_mru || state == arc_mfu); - - evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; - - mutex_enter(&state->arcs_mtx); - mutex_enter(&evicted_state->arcs_mtx); - - for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { - ab_prev = list_prev(&state->arcs_list, ab); - /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(ab) || - (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && - LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) { - skipped++; - continue; - } - /* "lookahead" for better eviction candidate */ - if (recycle && ab->b_size != bytes && - ab_prev && ab_prev->b_size == bytes) - continue; - hash_lock = HDR_LOCK(ab); - have_lock = MUTEX_HELD(hash_lock); - if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); - ASSERT(ab->b_datacnt > 0); - while (ab->b_buf) { - arc_buf_t *buf = ab->b_buf; - if (buf->b_data) { - bytes_evicted += ab->b_size; - if (recycle && ab->b_type == type && - ab->b_size == bytes) { - stolen = buf->b_data; - recycle = FALSE; - } - } - if (buf->b_efunc) { - mutex_enter(&arc_eviction_mtx); - arc_buf_destroy(buf, - buf->b_data == stolen, FALSE); - ab->b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&arc_eviction_mtx); - } else { - arc_buf_destroy(buf, - buf->b_data == stolen, TRUE); - } - } - ASSERT(ab->b_datacnt == 0); - arc_change_state(evicted_state, ab, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(ab)); - ab->b_flags = ARC_IN_HASH_TABLE; - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); - if (!have_lock) - mutex_exit(hash_lock); - if (bytes >= 0 && bytes_evicted >= bytes) - break; - } else { - missed += 1; - } - } - - mutex_exit(&evicted_state->arcs_mtx); - mutex_exit(&state->arcs_mtx); - - if (bytes_evicted < bytes) - dprintf("only evicted %lld bytes from %x", - (longlong_t)bytes_evicted, state); - - if (skipped) - ARCSTAT_INCR(arcstat_evict_skip, skipped); - - if (missed) - ARCSTAT_INCR(arcstat_mutex_miss, missed); - - return (stolen); -} - -/* - * Remove buffers from list until we've removed the specified number of - * bytes. Destroy the buffers that are removed. 
- */ -static void -arc_evict_ghost(arc_state_t *state, int64_t bytes) -{ - arc_buf_hdr_t *ab, *ab_prev; - kmutex_t *hash_lock; - uint64_t bytes_deleted = 0; - uint64_t bufs_skipped = 0; - - ASSERT(GHOST_STATE(state)); -top: - mutex_enter(&state->arcs_mtx); - for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { - ab_prev = list_prev(&state->arcs_list, ab); - hash_lock = HDR_LOCK(ab); - if (mutex_tryenter(hash_lock)) { - ASSERT(!HDR_IO_IN_PROGRESS(ab)); - ASSERT(ab->b_buf == NULL); - arc_change_state(arc_anon, ab, hash_lock); - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_deleted); - bytes_deleted += ab->b_size; - arc_hdr_destroy(ab); - DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); - if (bytes >= 0 && bytes_deleted >= bytes) - break; - } else { - if (bytes < 0) { - mutex_exit(&state->arcs_mtx); - mutex_enter(hash_lock); - mutex_exit(hash_lock); - goto top; - } - bufs_skipped += 1; - } - } - mutex_exit(&state->arcs_mtx); - - if (bufs_skipped) { - ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); - ASSERT(bytes >= 0); - } - - if (bytes_deleted < bytes) - dprintf("only deleted %lld bytes from %p", - (longlong_t)bytes_deleted, state); -} - -static void -arc_adjust(void) -{ - int64_t top_sz, mru_over, arc_over, todelete; - - top_sz = arc_anon->arcs_size + arc_mru->arcs_size; - - if (top_sz > arc_p && arc_mru->arcs_lsize > 0) { - int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p); - (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF); - top_sz = arc_anon->arcs_size + arc_mru->arcs_size; - } - - mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; - - if (mru_over > 0) { - if (arc_mru_ghost->arcs_lsize > 0) { - todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over); - arc_evict_ghost(arc_mru_ghost, todelete); - } - } - - if ((arc_over = arc_size - arc_c) > 0) { - int64_t tbl_over; - - if (arc_mfu->arcs_lsize > 0) { - int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over); - (void) arc_evict(arc_mfu, toevict, FALSE, - ARC_BUFC_UNDEF); - } - - tbl_over = arc_size + arc_mru_ghost->arcs_lsize + - arc_mfu_ghost->arcs_lsize - arc_c*2; - - if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) { - todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over); - arc_evict_ghost(arc_mfu_ghost, todelete); - } - } -} - -static void -arc_do_user_evicts(void) -{ - mutex_enter(&arc_eviction_mtx); - while (arc_eviction_list != NULL) { - arc_buf_t *buf = arc_eviction_list; - arc_eviction_list = buf->b_next; - buf->b_hdr = NULL; - mutex_exit(&arc_eviction_mtx); - - if (buf->b_efunc != NULL) - VERIFY(buf->b_efunc(buf) == 0); - - buf->b_efunc = NULL; - buf->b_private = NULL; - kmem_cache_free(buf_cache, buf); - mutex_enter(&arc_eviction_mtx); - } - mutex_exit(&arc_eviction_mtx); -} - -/* - * Flush all *evictable* data from the cache. - * NOTE: this will not touch "active" (i.e. referenced) data. 
- */ -void -arc_flush(void) -{ - while (list_head(&arc_mru->arcs_list)) - (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF); - while (list_head(&arc_mfu->arcs_list)) - (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF); - - arc_evict_ghost(arc_mru_ghost, -1); - arc_evict_ghost(arc_mfu_ghost, -1); - - mutex_enter(&arc_reclaim_thr_lock); - arc_do_user_evicts(); - mutex_exit(&arc_reclaim_thr_lock); - ASSERT(arc_eviction_list == NULL); -} - -int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ - -void -arc_shrink(void) -{ - if (arc_c > arc_c_min) { - uint64_t to_free; - -#ifdef _KERNEL - to_free = arc_c >> arc_shrink_shift; -#else - to_free = arc_c >> arc_shrink_shift; -#endif - if (arc_c > arc_c_min + to_free) - atomic_add_64(&arc_c, -to_free); - else - arc_c = arc_c_min; - - atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); - if (arc_c > arc_size) - arc_c = MAX(arc_size, arc_c_min); - if (arc_p > arc_c) - arc_p = (arc_c >> 1); - ASSERT(arc_c >= arc_c_min); - ASSERT((int64_t)arc_p >= 0); - } - - if (arc_size > arc_c) - arc_adjust(); -} - -static int zfs_needfree = 0; - -static int -arc_reclaim_needed(void) -{ -#if 0 - uint64_t extra; -#endif - -#ifdef _KERNEL - - if (zfs_needfree) - return (1); - -#if 0 - /* - * check to make sure that swapfs has enough space so that anon - * reservations can still succeeed. anon_resvmem() checks that the - * availrmem is greater than swapfs_minfree, and the number of reserved - * swap pages. We also add a bit of extra here just to prevent - * circumstances from getting really dire. - */ - if (availrmem < swapfs_minfree + swapfs_reserve + extra) - return (1); - - /* - * If zio data pages are being allocated out of a separate heap segment, - * then check that the size of available vmem for this area remains - * above 1/4th free. This needs to be done when the size of the - * non-default segment is smaller than physical memory, so we could - * conceivably run out of VA in that segment before running out of - * physical memory. - */ - if (zio_arena != NULL) { - size_t arc_ziosize = - btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC)); - - if ((physmem > arc_ziosize) && - (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2)) - return (1); - } - -#if defined(__i386) - /* - * If we're on an i386 platform, it's possible that we'll exhaust the - * kernel heap space before we ever run out of available physical - * memory. Most checks of the size of the heap_area compare against - * tune.t_minarmem, which is the minimum available real memory that we - * can have in the system. However, this is generally fixed at 25 pages - * which is so low that it's useless. In this comparison, we seek to - * calculate the total heap-size, and reclaim if more than 3/4ths of the - * heap is allocated. (Or, in the caclulation, if less than 1/4th is - * free) - */ - if (btop(vmem_size(heap_arena, VMEM_FREE)) < - (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) - return (1); -#endif -#else - if (kmem_used() > (kmem_size() * 3) / 4) - return (1); -#endif - -#else - if (spa_get_random(100) == 0) - return (1); -#endif - return (0); -} - -static void -arc_kmem_reap_now(arc_reclaim_strategy_t strat) -{ -#ifdef ZIO_USE_UMA - size_t i; - kmem_cache_t *prev_cache = NULL; - kmem_cache_t *prev_data_cache = NULL; - extern kmem_cache_t *zio_buf_cache[]; - extern kmem_cache_t *zio_data_buf_cache[]; -#endif - -#ifdef _KERNEL - /* - * First purge some DNLC entries, in case the DNLC is using - * up too much memory. 
- */ - dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); - -#if defined(__i386) - /* - * Reclaim unused memory from all kmem caches. - */ - kmem_reap(); -#endif -#endif - - /* - * An agressive reclamation will shrink the cache size as well as - * reap free buffers from the arc kmem caches. - */ - if (strat == ARC_RECLAIM_AGGR) - arc_shrink(); - -#ifdef ZIO_USE_UMA - for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { - if (zio_buf_cache[i] != prev_cache) { - prev_cache = zio_buf_cache[i]; - kmem_cache_reap_now(zio_buf_cache[i]); - } - if (zio_data_buf_cache[i] != prev_data_cache) { - prev_data_cache = zio_data_buf_cache[i]; - kmem_cache_reap_now(zio_data_buf_cache[i]); - } - } -#endif - kmem_cache_reap_now(buf_cache); - kmem_cache_reap_now(hdr_cache); -} - -static void -arc_reclaim_thread(void *dummy __unused) -{ - clock_t growtime = 0; - arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; - callb_cpr_t cpr; - - CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); - - mutex_enter(&arc_reclaim_thr_lock); - while (arc_thread_exit == 0) { - if (arc_reclaim_needed()) { - - if (arc_no_grow) { - if (last_reclaim == ARC_RECLAIM_CONS) { - last_reclaim = ARC_RECLAIM_AGGR; - } else { - last_reclaim = ARC_RECLAIM_CONS; - } - } else { - arc_no_grow = TRUE; - last_reclaim = ARC_RECLAIM_AGGR; - membar_producer(); - } - - /* reset the growth delay for every reclaim */ - growtime = LBOLT + (arc_grow_retry * hz); - ASSERT(growtime > 0); - - if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) { - /* - * If zfs_needfree is TRUE our vm_lowmem hook - * was called and in that case we must free some - * memory, so switch to aggressive mode. - */ - arc_no_grow = TRUE; - last_reclaim = ARC_RECLAIM_AGGR; - } - arc_kmem_reap_now(last_reclaim); - } else if ((growtime > 0) && ((growtime - LBOLT) <= 0)) { - arc_no_grow = FALSE; - } - - if (zfs_needfree || - (2 * arc_c < arc_size + - arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)) - arc_adjust(); - - if (arc_eviction_list != NULL) - arc_do_user_evicts(); - - if (arc_reclaim_needed()) { - zfs_needfree = 0; -#ifdef _KERNEL - wakeup(&zfs_needfree); -#endif - } - - /* block until needed, or one second, whichever is shorter */ - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&arc_reclaim_thr_cv, - &arc_reclaim_thr_lock, hz); - CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); - } - - arc_thread_exit = 0; - cv_broadcast(&arc_reclaim_thr_cv); - CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ - thread_exit(); -} - -/* - * Adapt arc info given the number of bytes we are trying to add and - * the state that we are comming from. This function is only called - * when we are adding new content to the cache. - */ -static void -arc_adapt(int bytes, arc_state_t *state) -{ - int mult; - - ASSERT(bytes > 0); - /* - * Adapt the target size of the MRU list: - * - if we just hit in the MRU ghost list, then increase - * the target size of the MRU list. - * - if we just hit in the MFU ghost list, then increase - * the target size of the MFU list by decreasing the - * target size of the MRU list. - */ - if (state == arc_mru_ghost) { - mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? - 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); - - arc_p = MIN(arc_c, arc_p + bytes * mult); - } else if (state == arc_mfu_ghost) { - mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 
- 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); - - arc_p = MAX(0, (int64_t)arc_p - bytes * mult); - } - ASSERT((int64_t)arc_p >= 0); - - if (arc_reclaim_needed()) { - cv_signal(&arc_reclaim_thr_cv); - return; - } - - if (arc_no_grow) - return; - - if (arc_c >= arc_c_max) - return; - - /* - * If we're within (2 * maxblocksize) bytes of the target - * cache size, increment the target cache size - */ - if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { - atomic_add_64(&arc_c, (int64_t)bytes); - if (arc_c > arc_c_max) - arc_c = arc_c_max; - else if (state == arc_anon) - atomic_add_64(&arc_p, (int64_t)bytes); - if (arc_p > arc_c) - arc_p = arc_c; - } - ASSERT((int64_t)arc_p >= 0); -} - -/* - * Check if the cache has reached its limits and eviction is required - * prior to insert. - */ -static int -arc_evict_needed() -{ - if (arc_reclaim_needed()) - return (1); - - return (arc_size > arc_c); -} - -/* - * The buffer, supplied as the first argument, needs a data block. - * So, if we are at cache max, determine which cache should be victimized. - * We have the following cases: - * - * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> - * In this situation if we're out of space, but the resident size of the MFU is - * under the limit, victimize the MFU cache to satisfy this insertion request. - * - * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> - * Here, we've used up all of the available space for the MRU, so we need to - * evict from our own cache instead. Evict from the set of resident MRU - * entries. - * - * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> - * c minus p represents the MFU space in the cache, since p is the size of the - * cache that is dedicated to the MRU. In this situation there's still space on - * the MFU side, so the MRU side needs to be victimized. - * - * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> - * MFU's resident set is consuming more space than it has been allotted. In - * this situation, we must victimize our own cache, the MFU, for this insertion. - */ -static void -arc_get_data_buf(arc_buf_t *buf) -{ - arc_state_t *state = buf->b_hdr->b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = buf->b_hdr->b_type; - - arc_adapt(size, state); - - /* - * We have not yet reached cache maximum size, - * just allocate a new buffer. - */ - if (!arc_evict_needed()) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); - } else { - ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); - } - atomic_add_64(&arc_size, size); - goto out; - } - - /* - * If we are prefetching from the mfu ghost list, this buffer - * will end up on the mru list; so steal space from there. - */ - if (state == arc_mfu_ghost) - state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; - else if (state == arc_mru_ghost) - state = arc_mru; - - if (state == arc_mru || state == arc_anon) { - uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; - state = (arc_p > mru_used) ? arc_mfu : arc_mru; - } else { - /* MFU cases */ - uint64_t mfu_space = arc_c - arc_p; - state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; - } - if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); - } else { - ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); - } - atomic_add_64(&arc_size, size); - ARCSTAT_BUMP(arcstat_recycle_miss); - } - ASSERT(buf->b_data != NULL); -out: - /* - * Update the state size. 
Note that ghost states have a - * "ghost size" and so don't need to be updated. - */ - if (!GHOST_STATE(buf->b_hdr->b_state)) { - arc_buf_hdr_t *hdr = buf->b_hdr; - - atomic_add_64(&hdr->b_state->arcs_size, size); - if (list_link_active(&hdr->b_arc_node)) { - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - atomic_add_64(&hdr->b_state->arcs_lsize, size); - } - /* - * If we are growing the cache, and we are adding anonymous - * data, and we have outgrown arc_p, update arc_p - */ - if (arc_size < arc_c && hdr->b_state == arc_anon && - arc_anon->arcs_size + arc_mru->arcs_size > arc_p) - arc_p = MIN(arc_c, arc_p + size); - } -} - -/* - * This routine is called whenever a buffer is accessed. - * NOTE: the hash lock is dropped in this function. - */ -static void -arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) -{ - ASSERT(MUTEX_HELD(hash_lock)); - - if (buf->b_state == arc_anon) { - /* - * This buffer is not in the cache, and does not - * appear in our "ghost" list. Add the new buffer - * to the MRU state. - */ - - ASSERT(buf->b_arc_access == 0); - buf->b_arc_access = LBOLT; - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); - arc_change_state(arc_mru, buf, hash_lock); - - } else if (buf->b_state == arc_mru) { - /* - * If this buffer is here because of a prefetch, then either: - * - clear the flag if this is a "referencing" read - * (any subsequent access will bump this into the MFU state). - * or - * - move the buffer to the head of the list if this is - * another prefetch (to make it less likely to be evicted). - */ - if ((buf->b_flags & ARC_PREFETCH) != 0) { - if (refcount_count(&buf->b_refcnt) == 0) { - ASSERT(list_link_active(&buf->b_arc_node)); - mutex_enter(&arc_mru->arcs_mtx); - list_remove(&arc_mru->arcs_list, buf); - list_insert_head(&arc_mru->arcs_list, buf); - mutex_exit(&arc_mru->arcs_mtx); - } else { - buf->b_flags &= ~ARC_PREFETCH; - ARCSTAT_BUMP(arcstat_mru_hits); - } - buf->b_arc_access = LBOLT; - return; - } - - /* - * This buffer has been "accessed" only once so far, - * but it is still in the cache. Move it to the MFU - * state. - */ - if (LBOLT > buf->b_arc_access + ARC_MINTIME) { - /* - * More than 125ms have passed since we - * instantiated this buffer. Move it to the - * most frequently used state. - */ - buf->b_arc_access = LBOLT; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(arc_mfu, buf, hash_lock); - } - ARCSTAT_BUMP(arcstat_mru_hits); - } else if (buf->b_state == arc_mru_ghost) { - arc_state_t *new_state; - /* - * This buffer has been "accessed" recently, but - * was evicted from the cache. Move it to the - * MFU state. - */ - - if (buf->b_flags & ARC_PREFETCH) { - new_state = arc_mru; - if (refcount_count(&buf->b_refcnt) > 0) - buf->b_flags &= ~ARC_PREFETCH; - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); - } else { - new_state = arc_mfu; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - } - - buf->b_arc_access = LBOLT; - arc_change_state(new_state, buf, hash_lock); - - ARCSTAT_BUMP(arcstat_mru_ghost_hits); - } else if (buf->b_state == arc_mfu) { - /* - * This buffer has been accessed more than once and is - * still in the cache. Keep it in the MFU state. - * - * NOTE: an add_reference() that occurred when we did - * the arc_read() will have kicked this off the list. - * If it was a prefetch, we will explicitly move it to - * the head of the list now. 
- */ - if ((buf->b_flags & ARC_PREFETCH) != 0) { - ASSERT(refcount_count(&buf->b_refcnt) == 0); - ASSERT(list_link_active(&buf->b_arc_node)); - mutex_enter(&arc_mfu->arcs_mtx); - list_remove(&arc_mfu->arcs_list, buf); - list_insert_head(&arc_mfu->arcs_list, buf); - mutex_exit(&arc_mfu->arcs_mtx); - } - ARCSTAT_BUMP(arcstat_mfu_hits); - buf->b_arc_access = LBOLT; - } else if (buf->b_state == arc_mfu_ghost) { - arc_state_t *new_state = arc_mfu; - /* - * This buffer has been accessed more than once but has - * been evicted from the cache. Move it back to the - * MFU state. - */ - - if (buf->b_flags & ARC_PREFETCH) { - /* - * This is a prefetch access... - * move this block back to the MRU state. - */ - ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); - new_state = arc_mru; - } - - buf->b_arc_access = LBOLT; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(new_state, buf, hash_lock); - - ARCSTAT_BUMP(arcstat_mfu_ghost_hits); - } else { - ASSERT(!"invalid arc state"); - } -} - -/* a generic arc_done_func_t which you can use */ -/* ARGSUSED */ -void -arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) -{ - bcopy(buf->b_data, arg, buf->b_hdr->b_size); - VERIFY(arc_buf_remove_ref(buf, arg) == 1); -} - -/* a generic arc_done_func_t which you can use */ -void -arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) -{ - arc_buf_t **bufp = arg; - if (zio && zio->io_error) { - VERIFY(arc_buf_remove_ref(buf, arg) == 1); - *bufp = NULL; - } else { - *bufp = buf; - } -} - -static void -arc_read_done(zio_t *zio) -{ - arc_buf_hdr_t *hdr, *found; - arc_buf_t *buf; - arc_buf_t *abuf; /* buffer we're assigning to callback */ - kmutex_t *hash_lock; - arc_callback_t *callback_list, *acb; - int freeable = FALSE; - - buf = zio->io_private; - hdr = buf->b_hdr; - - /* - * The hdr was inserted into hash-table and removed from lists - * prior to starting I/O. We should find this header, since - * it's in the hash table, and it should be legit since it's - * not possible to evict it during the I/O. The only possible - * reason for it not to be found is if we were freed during the - * read. 
- */ - found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, - &hash_lock); - - ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || - (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); - - /* byteswap if necessary */ - callback_list = hdr->b_acb; - ASSERT(callback_list != NULL); - if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) - callback_list->acb_byteswap(buf->b_data, hdr->b_size); - - arc_cksum_compute(buf); - - /* create copies of the data buffer for the callers */ - abuf = buf; - for (acb = callback_list; acb; acb = acb->acb_next) { - if (acb->acb_done) { - if (abuf == NULL) - abuf = arc_buf_clone(buf); - acb->acb_buf = abuf; - abuf = NULL; - } - } - hdr->b_acb = NULL; - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; - ASSERT(!HDR_BUF_AVAILABLE(hdr)); - if (abuf == buf) - hdr->b_flags |= ARC_BUF_AVAILABLE; - - ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); - - if (zio->io_error != 0) { - hdr->b_flags |= ARC_IO_ERROR; - if (hdr->b_state != arc_anon) - arc_change_state(arc_anon, hdr, hash_lock); - if (HDR_IN_HASH_TABLE(hdr)) - buf_hash_remove(hdr); - freeable = refcount_is_zero(&hdr->b_refcnt); - /* convert checksum errors into IO errors */ - if (zio->io_error == ECKSUM) - zio->io_error = EIO; - } - - /* - * Broadcast before we drop the hash_lock to avoid the possibility - * that the hdr (and hence the cv) might be freed before we get to - * the cv_broadcast(). - */ - cv_broadcast(&hdr->b_cv); - - if (hash_lock) { - /* - * Only call arc_access on anonymous buffers. This is because - * if we've issued an I/O for an evicted buffer, we've already - * called arc_access (to prevent any simultaneous readers from - * getting confused). - */ - if (zio->io_error == 0 && hdr->b_state == arc_anon) - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - } else { - /* - * This block was freed while we waited for the read to - * complete. It has been removed from the hash table and - * moved to the anonymous state (so that it won't show up - * in the cache). - */ - ASSERT3P(hdr->b_state, ==, arc_anon); - freeable = refcount_is_zero(&hdr->b_refcnt); - } - - /* execute each callback and free its structure */ - while ((acb = callback_list) != NULL) { - if (acb->acb_done) - acb->acb_done(zio, acb->acb_buf, acb->acb_private); - - if (acb->acb_zio_dummy != NULL) { - acb->acb_zio_dummy->io_error = zio->io_error; - zio_nowait(acb->acb_zio_dummy); - } - - callback_list = acb->acb_next; - kmem_free(acb, sizeof (arc_callback_t)); - } - - if (freeable) - arc_hdr_destroy(hdr); -} - -/* - * "Read" the block block at the specified DVA (in bp) via the - * cache. If the block is found in the cache, invoke the provided - * callback immediately and return. Note that the `zio' parameter - * in the callback will be NULL in this case, since no IO was - * required. If the block is not in the cache pass the read request - * on to the spa with a substitute callback function, so that the - * requested block will be added to the cache. - * - * If a read request arrives for a block that has a read in-progress, - * either wait for the in-progress read to complete (and return the - * results); or, if this is a read with a "done" func, add a record - * to the read to invoke the "done" func when the read completes, - * and return; or just return. - * - * arc_read_done() will invoke all the requested "done" functions - * for readers of this block. 
- */ -int -arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, - arc_done_func_t *done, void *private, int priority, int flags, - uint32_t *arc_flags, zbookmark_t *zb) -{ - arc_buf_hdr_t *hdr; - arc_buf_t *buf; - kmutex_t *hash_lock; - zio_t *rzio; - -top: - hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); - if (hdr && hdr->b_datacnt > 0) { - - *arc_flags |= ARC_CACHED; - - if (HDR_IO_IN_PROGRESS(hdr)) { - - if (*arc_flags & ARC_WAIT) { - cv_wait(&hdr->b_cv, hash_lock); - mutex_exit(hash_lock); - goto top; - } - ASSERT(*arc_flags & ARC_NOWAIT); - - if (done) { - arc_callback_t *acb = NULL; - - acb = kmem_zalloc(sizeof (arc_callback_t), - KM_SLEEP); - acb->acb_done = done; - acb->acb_private = private; - acb->acb_byteswap = swap; - if (pio != NULL) - acb->acb_zio_dummy = zio_null(pio, - spa, NULL, NULL, flags); - - ASSERT(acb->acb_done != NULL); - acb->acb_next = hdr->b_acb; - hdr->b_acb = acb; - add_reference(hdr, hash_lock, private); - mutex_exit(hash_lock); - return (0); - } - mutex_exit(hash_lock); - return (0); - } - - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); - - if (done) { - add_reference(hdr, hash_lock, private); - /* - * If this block is already in use, create a new - * copy of the data so that we will be guaranteed - * that arc_release() will always succeed. - */ - buf = hdr->b_buf; - ASSERT(buf); - ASSERT(buf->b_data); - if (HDR_BUF_AVAILABLE(hdr)) { - ASSERT(buf->b_efunc == NULL); - hdr->b_flags &= ~ARC_BUF_AVAILABLE; - } else { - buf = arc_buf_clone(buf); - } - } else if (*arc_flags & ARC_PREFETCH && - refcount_count(&hdr->b_refcnt) == 0) { - hdr->b_flags |= ARC_PREFETCH; - } - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, - data, metadata, hits); - - if (done) - done(NULL, buf, private); - } else { - uint64_t size = BP_GET_LSIZE(bp); - arc_callback_t *acb; - - if (hdr == NULL) { - /* this block is not in the cache */ - arc_buf_hdr_t *exists; - arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); - buf = arc_buf_alloc(spa, size, private, type); - hdr = buf->b_hdr; - hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = bp->blk_birth; - hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; - exists = buf_hash_insert(hdr, &hash_lock); - if (exists) { - /* somebody beat us to the hash insert */ - mutex_exit(hash_lock); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; - (void) arc_buf_remove_ref(buf, private); - goto top; /* restart the IO request */ - } - /* if this is a prefetch, we don't have a reference */ - if (*arc_flags & ARC_PREFETCH) { - (void) remove_reference(hdr, hash_lock, - private); - hdr->b_flags |= ARC_PREFETCH; - } - if (BP_GET_LEVEL(bp) > 0) - hdr->b_flags |= ARC_INDIRECT; - } else { - /* this block is in the ghost cache */ - ASSERT(GHOST_STATE(hdr->b_state)); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); - ASSERT(hdr->b_buf == NULL); - - /* if this is a prefetch, we don't have a reference */ - if (*arc_flags & ARC_PREFETCH) - hdr->b_flags |= ARC_PREFETCH; - else - add_reference(hdr, hash_lock, private); - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = NULL; - hdr->b_buf = buf; - arc_get_data_buf(buf); - ASSERT(hdr->b_datacnt == 0); - hdr->b_datacnt = 1; - - } - - 
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); - acb->acb_done = done; - acb->acb_private = private; - acb->acb_byteswap = swap; - - ASSERT(hdr->b_acb == NULL); - hdr->b_acb = acb; - hdr->b_flags |= ARC_IO_IN_PROGRESS; - - /* - * If the buffer has been evicted, migrate it to a present state - * before issuing the I/O. Once we drop the hash-table lock, - * the header will be marked as I/O in progress and have an - * attached buffer. At this point, anybody who finds this - * buffer ought to notice that it's legit but has a pending I/O. - */ - - if (GHOST_STATE(hdr->b_state)) - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - - ASSERT3U(hdr->b_size, ==, size); - DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, - zbookmark_t *, zb); - ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, - data, metadata, misses); - - rzio = zio_read(pio, spa, bp, buf->b_data, size, - arc_read_done, buf, priority, flags, zb); - - if (*arc_flags & ARC_WAIT) - return (zio_wait(rzio)); - - ASSERT(*arc_flags & ARC_NOWAIT); - zio_nowait(rzio); - } - return (0); -} - -/* - * arc_read() variant to support pool traversal. If the block is already - * in the ARC, make a copy of it; otherwise, the caller will do the I/O. - * The idea is that we don't want pool traversal filling up memory, but - * if the ARC already has the data anyway, we shouldn't pay for the I/O. - */ -int -arc_tryread(spa_t *spa, blkptr_t *bp, void *data) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_mtx; - int rc = 0; - - hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); - - if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { - arc_buf_t *buf = hdr->b_buf; - - ASSERT(buf); - while (buf->b_data == NULL) { - buf = buf->b_next; - ASSERT(buf); - } - bcopy(buf->b_data, data, hdr->b_size); - } else { - rc = ENOENT; - } - - if (hash_mtx) - mutex_exit(hash_mtx); - - return (rc); -} - -void -arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) -{ - ASSERT(buf->b_hdr != NULL); - ASSERT(buf->b_hdr->b_state != arc_anon); - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); - buf->b_efunc = func; - buf->b_private = private; -} - -/* - * This is used by the DMU to let the ARC know that a buffer is - * being evicted, so the ARC should clean up. If this arc buf - * is not yet in the evicted state, it will be put there. - */ -int -arc_buf_evict(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - arc_buf_t **bufp; - - mutex_enter(&arc_eviction_mtx); - hdr = buf->b_hdr; - if (hdr == NULL) { - /* - * We are in arc_do_user_evicts(). - */ - ASSERT(buf->b_data == NULL); - mutex_exit(&arc_eviction_mtx); - return (0); - } - hash_lock = HDR_LOCK(hdr); - mutex_exit(&arc_eviction_mtx); - - mutex_enter(hash_lock); - - if (buf->b_data == NULL) { - /* - * We are on the eviction list. - */ - mutex_exit(hash_lock); - mutex_enter(&arc_eviction_mtx); - if (buf->b_hdr == NULL) { - /* - * We are already in arc_do_user_evicts(). - */ - mutex_exit(&arc_eviction_mtx); - return (0); - } else { - arc_buf_t copy = *buf; /* structure assignment */ - /* - * Process this buffer now - * but let arc_do_user_evicts() do the reaping. 
- */
- buf->b_efunc = NULL;
- mutex_exit(&arc_eviction_mtx);
- VERIFY(copy.b_efunc(&copy) == 0);
- return (1);
- }
- }
-
- ASSERT(buf->b_hdr == hdr);
- ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
-
- /*
- * Pull this buffer off of the hdr
- */
- bufp = &hdr->b_buf;
- while (*bufp != buf)
- bufp = &(*bufp)->b_next;
- *bufp = buf->b_next;
-
- ASSERT(buf->b_data != NULL);
- arc_buf_destroy(buf, FALSE, FALSE);
-
- if (hdr->b_datacnt == 0) {
- arc_state_t *old_state = hdr->b_state;
- arc_state_t *evicted_state;
-
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
-
- evicted_state =
- (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
-
- mutex_enter(&old_state->arcs_mtx);
- mutex_enter(&evicted_state->arcs_mtx);
-
- arc_change_state(evicted_state, hdr, hash_lock);
- ASSERT(HDR_IN_HASH_TABLE(hdr));
- hdr->b_flags = ARC_IN_HASH_TABLE;
-
- mutex_exit(&evicted_state->arcs_mtx);
- mutex_exit(&old_state->arcs_mtx);
- }
- mutex_exit(hash_lock);
-
- VERIFY(buf->b_efunc(buf) == 0);
- buf->b_efunc = NULL;
- buf->b_private = NULL;
- buf->b_hdr = NULL;
- kmem_cache_free(buf_cache, buf);
- return (1);
-}
-
-/*
- * Release this buffer from the cache. This must be done
- * after a read and prior to modifying the buffer contents.
- * If the buffer has more than one reference, we must make
- * make a new hdr for the buffer.
- */
-void
-arc_release(arc_buf_t *buf, void *tag)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
-
- /* this buffer is not on any list */
- ASSERT(refcount_count(&hdr->b_refcnt) > 0);
-
- if (hdr->b_state == arc_anon) {
- /* this buffer is already released */
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
- ASSERT(BUF_EMPTY(hdr));
- ASSERT(buf->b_efunc == NULL);
- arc_buf_thaw(buf);
- return;
- }
-
- mutex_enter(hash_lock);
-
- /*
- * Do we have more than one buf?
- */
- if (hdr->b_buf != buf || buf->b_next != NULL) {
- arc_buf_hdr_t *nhdr;
- arc_buf_t **bufp;
- uint64_t blksz = hdr->b_size;
- spa_t *spa = hdr->b_spa;
- arc_buf_contents_t type = hdr->b_type;
-
- ASSERT(hdr->b_datacnt > 1);
- /*
- * Pull the data off of this buf and attach it to
- * a new anonymous buf. 
- */ - (void) remove_reference(hdr, hash_lock, tag); - bufp = &hdr->b_buf; - while (*bufp != buf) - bufp = &(*bufp)->b_next; - *bufp = (*bufp)->b_next; - buf->b_next = NULL; - - ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); - atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); - if (refcount_is_zero(&hdr->b_refcnt)) { - ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size); - atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size); - } - hdr->b_datacnt -= 1; - arc_cksum_verify(buf); - - mutex_exit(hash_lock); - - nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); - nhdr->b_size = blksz; - nhdr->b_spa = spa; - nhdr->b_type = type; - nhdr->b_buf = buf; - nhdr->b_state = arc_anon; - nhdr->b_arc_access = 0; - nhdr->b_flags = 0; - nhdr->b_datacnt = 1; - nhdr->b_freeze_cksum = NULL; - mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - (void) refcount_add(&nhdr->b_refcnt, tag); - buf->b_hdr = nhdr; - atomic_add_64(&arc_anon->arcs_size, blksz); - - hdr = nhdr; - } else { - ASSERT(refcount_count(&hdr->b_refcnt) == 1); - ASSERT(!list_link_active(&hdr->b_arc_node)); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - arc_change_state(arc_anon, hdr, hash_lock); - hdr->b_arc_access = 0; - mutex_exit(hash_lock); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; - arc_buf_thaw(buf); - } - buf->b_efunc = NULL; - buf->b_private = NULL; -} - -int -arc_released(arc_buf_t *buf) -{ - return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); -} - -int -arc_has_callback(arc_buf_t *buf) -{ - return (buf->b_efunc != NULL); -} - -#ifdef ZFS_DEBUG -int -arc_referenced(arc_buf_t *buf) -{ - return (refcount_count(&buf->b_hdr->b_refcnt)); -} -#endif - -static void -arc_write_ready(zio_t *zio) -{ - arc_write_callback_t *callback = zio->io_private; - arc_buf_t *buf = callback->awcb_buf; - - if (callback->awcb_ready) { - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); - callback->awcb_ready(zio, buf, callback->awcb_private); - } - arc_cksum_compute(buf); -} - -static void -arc_write_done(zio_t *zio) -{ - arc_write_callback_t *callback = zio->io_private; - arc_buf_t *buf = callback->awcb_buf; - arc_buf_hdr_t *hdr = buf->b_hdr; - - hdr->b_acb = NULL; - - /* this buffer is on no lists and is not in the hash table */ - ASSERT3P(hdr->b_state, ==, arc_anon); - - hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = zio->io_bp->blk_birth; - hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; - /* - * If the block to be written was all-zero, we may have - * compressed it away. In this case no write was performed - * so there will be no dva/birth-date/checksum. The buffer - * must therefor remain anonymous (and uncached). - */ - if (!BUF_EMPTY(hdr)) { - arc_buf_hdr_t *exists; - kmutex_t *hash_lock; - - arc_cksum_verify(buf); - - exists = buf_hash_insert(hdr, &hash_lock); - if (exists) { - /* - * This can only happen if we overwrite for - * sync-to-convergence, because we remove - * buffers from the hash table when we arc_free(). 
- */ - ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), - BP_IDENTITY(zio->io_bp))); - ASSERT3U(zio->io_bp_orig.blk_birth, ==, - zio->io_bp->blk_birth); - - ASSERT(refcount_is_zero(&exists->b_refcnt)); - arc_change_state(arc_anon, exists, hash_lock); - mutex_exit(hash_lock); - arc_hdr_destroy(exists); - exists = buf_hash_insert(hdr, &hash_lock); - ASSERT3P(exists, ==, NULL); - } - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - } else if (callback->awcb_done == NULL) { - int destroy_hdr; - /* - * This is an anonymous buffer with no user callback, - * destroy it if there are no active references. - */ - mutex_enter(&arc_eviction_mtx); - destroy_hdr = refcount_is_zero(&hdr->b_refcnt); - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; - mutex_exit(&arc_eviction_mtx); - if (destroy_hdr) - arc_hdr_destroy(hdr); - } else { - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; - } - - if (callback->awcb_done) { - ASSERT(!refcount_is_zero(&hdr->b_refcnt)); - callback->awcb_done(zio, buf, callback->awcb_private); - } - - kmem_free(callback, sizeof (arc_write_callback_t)); -} - -zio_t * -arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, - uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - arc_write_callback_t *callback; - zio_t *zio; - - /* this is a private buffer - no locking required */ - ASSERT3P(hdr->b_state, ==, arc_anon); - ASSERT(BUF_EMPTY(hdr)); - ASSERT(!HDR_IO_ERROR(hdr)); - ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); - ASSERT(hdr->b_acb == 0); - callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); - callback->awcb_ready = ready; - callback->awcb_done = done; - callback->awcb_private = private; - callback->awcb_buf = buf; - hdr->b_flags |= ARC_IO_IN_PROGRESS; - zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, - buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, - priority, flags, zb); - - return (zio); -} - -int -arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, uint32_t arc_flags) -{ - arc_buf_hdr_t *ab; - kmutex_t *hash_lock; - zio_t *zio; - - /* - * If this buffer is in the cache, release it, so it - * can be re-used. - */ - ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); - if (ab != NULL) { - /* - * The checksum of blocks to free is not always - * preserved (eg. on the deadlist). However, if it is - * nonzero, it should match what we have in the cache. - */ - ASSERT(bp->blk_cksum.zc_word[0] == 0 || - ab->b_cksum0 == bp->blk_cksum.zc_word[0]); - if (ab->b_state != arc_anon) - arc_change_state(arc_anon, ab, hash_lock); - if (HDR_IO_IN_PROGRESS(ab)) { - /* - * This should only happen when we prefetch. - */ - ASSERT(ab->b_flags & ARC_PREFETCH); - ASSERT3U(ab->b_datacnt, ==, 1); - ab->b_flags |= ARC_FREED_IN_READ; - if (HDR_IN_HASH_TABLE(ab)) - buf_hash_remove(ab); - ab->b_arc_access = 0; - bzero(&ab->b_dva, sizeof (dva_t)); - ab->b_birth = 0; - ab->b_cksum0 = 0; - ab->b_buf->b_efunc = NULL; - ab->b_buf->b_private = NULL; - mutex_exit(hash_lock); - } else if (refcount_is_zero(&ab->b_refcnt)) { - mutex_exit(hash_lock); - arc_hdr_destroy(ab); - ARCSTAT_BUMP(arcstat_deleted); - } else { - /* - * We still have an active reference on this - * buffer. This can happen, e.g., from - * dbuf_unoverride(). 
- */ - ASSERT(!HDR_IN_HASH_TABLE(ab)); - ab->b_arc_access = 0; - bzero(&ab->b_dva, sizeof (dva_t)); - ab->b_birth = 0; - ab->b_cksum0 = 0; - ab->b_buf->b_efunc = NULL; - ab->b_buf->b_private = NULL; - mutex_exit(hash_lock); - } - } - - zio = zio_free(pio, spa, txg, bp, done, private); - - if (arc_flags & ARC_WAIT) - return (zio_wait(zio)); - - ASSERT(arc_flags & ARC_NOWAIT); - zio_nowait(zio); - - return (0); -} - -void -arc_tempreserve_clear(uint64_t tempreserve) -{ - atomic_add_64(&arc_tempreserve, -tempreserve); - ASSERT((int64_t)arc_tempreserve >= 0); -} - -int -arc_tempreserve_space(uint64_t tempreserve) -{ -#ifdef ZFS_DEBUG - /* - * Once in a while, fail for no reason. Everything should cope. - */ - if (spa_get_random(10000) == 0) { - dprintf("forcing random failure\n"); - return (ERESTART); - } -#endif - if (tempreserve > arc_c/4 && !arc_no_grow) - arc_c = MIN(arc_c_max, tempreserve * 4); - if (tempreserve > arc_c) - return (ENOMEM); - - /* - * Throttle writes when the amount of dirty data in the cache - * gets too large. We try to keep the cache less than half full - * of dirty blocks so that our sync times don't grow too large. - * Note: if two requests come in concurrently, we might let them - * both succeed, when one of them should fail. Not a huge deal. - * - * XXX The limit should be adjusted dynamically to keep the time - * to sync a dataset fixed (around 1-5 seconds?). - */ - - if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && - arc_tempreserve + arc_anon->arcs_size > arc_c / 4) { - dprintf("failing, arc_tempreserve=%lluK anon=%lluK " - "tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve>>10, arc_anon->arcs_lsize>>10, - tempreserve>>10, arc_c>>10); - return (ERESTART); - } - atomic_add_64(&arc_tempreserve, tempreserve); - return (0); -} - -static kmutex_t arc_lowmem_lock; -#ifdef _KERNEL -static eventhandler_tag arc_event_lowmem = NULL; - -static void -arc_lowmem(void *arg __unused, int howto __unused) -{ - - /* Serialize access via arc_lowmem_lock. */ - mutex_enter(&arc_lowmem_lock); - zfs_needfree = 1; - cv_signal(&arc_reclaim_thr_cv); - while (zfs_needfree) - tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5); - mutex_exit(&arc_lowmem_lock); -} -#endif - -void -arc_init(void) -{ - mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); - - /* Convert seconds to clock ticks */ - arc_min_prefetch_lifespan = 1 * hz; - - /* Start out with 1/8 of all memory */ - arc_c = kmem_size() / 8; -#if 0 -#ifdef _KERNEL - /* - * On architectures where the physical memory can be larger - * than the addressable space (intel in 32-bit mode), we may - * need to limit the cache to 1/8 of VM size. - */ - arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); -#endif -#endif - /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ - arc_c_min = MAX(arc_c / 4, 64<<18); - /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ - if (arc_c * 8 >= 1<<30) - arc_c_max = (arc_c * 8) - (1<<30); - else - arc_c_max = arc_c_min; - arc_c_max = MAX(arc_c * 5, arc_c_max); -#ifdef _KERNEL - /* - * Allow the tunables to override our calculations if they are - * reasonable (ie. 
over 16MB) - */ - if (zfs_arc_max >= 64<<18 && zfs_arc_max < kmem_size()) - arc_c_max = zfs_arc_max; - if (zfs_arc_min >= 64<<18 && zfs_arc_min <= arc_c_max) - arc_c_min = zfs_arc_min; -#endif - arc_c = arc_c_max; - arc_p = (arc_c >> 1); - - /* if kmem_flags are set, lets try to use less memory */ - if (kmem_debugging()) - arc_c = arc_c / 2; - if (arc_c < arc_c_min) - arc_c = arc_c_min; - - zfs_arc_min = arc_c_min; - zfs_arc_max = arc_c_max; - - arc_anon = &ARC_anon; - arc_mru = &ARC_mru; - arc_mru_ghost = &ARC_mru_ghost; - arc_mfu = &ARC_mfu; - arc_mfu_ghost = &ARC_mfu_ghost; - arc_size = 0; - - mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - - list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - - buf_init(); - - arc_thread_exit = 0; - arc_eviction_list = NULL; - mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); - bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); - - arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, - sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - - if (arc_ksp != NULL) { - arc_ksp->ks_data = &arc_stats; - kstat_install(arc_ksp); - } - - (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); - -#ifdef _KERNEL - arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, - EVENTHANDLER_PRI_FIRST); -#endif - - arc_dead = FALSE; - -#ifdef _KERNEL - /* Warn about ZFS memory and address space requirements. 
*/ - if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { - printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " - "expect unstable behavior.\n"); - } - if (kmem_size() < 512 * (1 << 20)) { - printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " - "expect unstable behavior.\n"); - printf(" Consider tuning vm.kmem_size and " - "vm.kmem_size_max\n"); - printf(" in /boot/loader.conf.\n"); - } -#endif -} - -void -arc_fini(void) -{ - mutex_enter(&arc_reclaim_thr_lock); - arc_thread_exit = 1; - cv_signal(&arc_reclaim_thr_cv); - while (arc_thread_exit != 0) - cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); - mutex_exit(&arc_reclaim_thr_lock); - - arc_flush(); - - arc_dead = TRUE; - - if (arc_ksp != NULL) { - kstat_delete(arc_ksp); - arc_ksp = NULL; - } - - mutex_destroy(&arc_eviction_mtx); - mutex_destroy(&arc_reclaim_thr_lock); - cv_destroy(&arc_reclaim_thr_cv); - - list_destroy(&arc_mru->arcs_list); - list_destroy(&arc_mru_ghost->arcs_list); - list_destroy(&arc_mfu->arcs_list); - list_destroy(&arc_mfu_ghost->arcs_list); - - mutex_destroy(&arc_anon->arcs_mtx); - mutex_destroy(&arc_mru->arcs_mtx); - mutex_destroy(&arc_mru_ghost->arcs_mtx); - mutex_destroy(&arc_mfu->arcs_mtx); - mutex_destroy(&arc_mfu_ghost->arcs_mtx); - - buf_fini(); - - mutex_destroy(&arc_lowmem_lock); -#ifdef _KERNEL - if (arc_event_lowmem != NULL) - EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); -#endif -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c deleted file mode 100644 index 4442b1f..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c +++ /dev/null @@ -1,312 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/bplist.h> -#include <sys/zfs_context.h> - -static int -bplist_hold(bplist_t *bpl) -{ - ASSERT(MUTEX_HELD(&bpl->bpl_lock)); - if (bpl->bpl_dbuf == NULL) { - int err = dmu_bonus_hold(bpl->bpl_mos, - bpl->bpl_object, bpl, &bpl->bpl_dbuf); - if (err) - return (err); - bpl->bpl_phys = bpl->bpl_dbuf->db_data; - } - return (0); -} - -uint64_t -bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx) -{ - int size; - - size = spa_version(dmu_objset_spa(mos)) < ZFS_VERSION_BPLIST_ACCOUNT ? 
- BPLIST_SIZE_V0 : sizeof (bplist_phys_t); - - return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize, - DMU_OT_BPLIST_HDR, size, tx)); -} - -void -bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx) -{ - VERIFY(dmu_object_free(mos, object, tx) == 0); -} - -int -bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object) -{ - dmu_object_info_t doi; - int err; - - err = dmu_object_info(mos, object, &doi); - if (err) - return (err); - - mutex_enter(&bpl->bpl_lock); - - ASSERT(bpl->bpl_dbuf == NULL); - ASSERT(bpl->bpl_phys == NULL); - ASSERT(bpl->bpl_cached_dbuf == NULL); - ASSERT(bpl->bpl_queue == NULL); - ASSERT(object != 0); - ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR); - - bpl->bpl_mos = mos; - bpl->bpl_object = object; - bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1); - bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT; - bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t)); - - mutex_exit(&bpl->bpl_lock); - return (0); -} - -void -bplist_close(bplist_t *bpl) -{ - mutex_enter(&bpl->bpl_lock); - - ASSERT(bpl->bpl_queue == NULL); - - if (bpl->bpl_cached_dbuf) { - dmu_buf_rele(bpl->bpl_cached_dbuf, bpl); - bpl->bpl_cached_dbuf = NULL; - } - if (bpl->bpl_dbuf) { - dmu_buf_rele(bpl->bpl_dbuf, bpl); - bpl->bpl_dbuf = NULL; - bpl->bpl_phys = NULL; - } - - mutex_exit(&bpl->bpl_lock); -} - -boolean_t -bplist_empty(bplist_t *bpl) -{ - boolean_t rv; - - if (bpl->bpl_object == 0) - return (B_TRUE); - - mutex_enter(&bpl->bpl_lock); - VERIFY(0 == bplist_hold(bpl)); /* XXX */ - rv = (bpl->bpl_phys->bpl_entries == 0); - mutex_exit(&bpl->bpl_lock); - - return (rv); -} - -static int -bplist_cache(bplist_t *bpl, uint64_t blkid) -{ - int err = 0; - - if (bpl->bpl_cached_dbuf == NULL || - bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) { - if (bpl->bpl_cached_dbuf != NULL) - dmu_buf_rele(bpl->bpl_cached_dbuf, bpl); - err = dmu_buf_hold(bpl->bpl_mos, - bpl->bpl_object, blkid << bpl->bpl_blockshift, - bpl, &bpl->bpl_cached_dbuf); - ASSERT(err || bpl->bpl_cached_dbuf->db_size == - 1ULL << bpl->bpl_blockshift); - } - return (err); -} - -int -bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp) -{ - uint64_t blk, off; - blkptr_t *bparray; - int err; - - mutex_enter(&bpl->bpl_lock); - - err = bplist_hold(bpl); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - if (*itorp >= bpl->bpl_phys->bpl_entries) { - mutex_exit(&bpl->bpl_lock); - return (ENOENT); - } - - blk = *itorp >> bpl->bpl_bpshift; - off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift); - - err = bplist_cache(bpl, blk); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - bparray = bpl->bpl_cached_dbuf->db_data; - *bp = bparray[off]; - (*itorp)++; - mutex_exit(&bpl->bpl_lock); - return (0); -} - -int -bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx) -{ - uint64_t blk, off; - blkptr_t *bparray; - int err; - - ASSERT(!BP_IS_HOLE(bp)); - mutex_enter(&bpl->bpl_lock); - err = bplist_hold(bpl); - if (err) - return (err); - - blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift; - off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift); - - err = bplist_cache(bpl, blk); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx); - bparray = bpl->bpl_cached_dbuf->db_data; - bparray[off] = *bp; - - /* We never need the fill count. 
*/ - bparray[off].blk_fill = 0; - - /* The bplist will compress better if we can leave off the checksum */ - bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum)); - - dmu_buf_will_dirty(bpl->bpl_dbuf, tx); - bpl->bpl_phys->bpl_entries++; - bpl->bpl_phys->bpl_bytes += - bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp); - if (bpl->bpl_havecomp) { - bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp); - bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp); - } - mutex_exit(&bpl->bpl_lock); - - return (0); -} - -/* - * Deferred entry; will be written later by bplist_sync(). - */ -void -bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp) -{ - bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP); - - ASSERT(!BP_IS_HOLE(bp)); - mutex_enter(&bpl->bpl_lock); - bpq->bpq_blk = *bp; - bpq->bpq_next = bpl->bpl_queue; - bpl->bpl_queue = bpq; - mutex_exit(&bpl->bpl_lock); -} - -void -bplist_sync(bplist_t *bpl, dmu_tx_t *tx) -{ - bplist_q_t *bpq; - - mutex_enter(&bpl->bpl_lock); - while ((bpq = bpl->bpl_queue) != NULL) { - bpl->bpl_queue = bpq->bpq_next; - mutex_exit(&bpl->bpl_lock); - VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx)); - kmem_free(bpq, sizeof (*bpq)); - mutex_enter(&bpl->bpl_lock); - } - mutex_exit(&bpl->bpl_lock); -} - -void -bplist_vacate(bplist_t *bpl, dmu_tx_t *tx) -{ - mutex_enter(&bpl->bpl_lock); - ASSERT3P(bpl->bpl_queue, ==, NULL); - VERIFY(0 == bplist_hold(bpl)); - dmu_buf_will_dirty(bpl->bpl_dbuf, tx); - VERIFY(0 == dmu_free_range(bpl->bpl_mos, - bpl->bpl_object, 0, -1ULL, tx)); - bpl->bpl_phys->bpl_entries = 0; - bpl->bpl_phys->bpl_bytes = 0; - if (bpl->bpl_havecomp) { - bpl->bpl_phys->bpl_comp = 0; - bpl->bpl_phys->bpl_uncomp = 0; - } - mutex_exit(&bpl->bpl_lock); -} - -int -bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - uint64_t itor = 0, comp = 0, uncomp = 0; - int err; - blkptr_t bp; - - mutex_enter(&bpl->bpl_lock); - - err = bplist_hold(bpl); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - *usedp = bpl->bpl_phys->bpl_bytes; - if (bpl->bpl_havecomp) { - *compp = bpl->bpl_phys->bpl_comp; - *uncompp = bpl->bpl_phys->bpl_uncomp; - } - mutex_exit(&bpl->bpl_lock); - - if (!bpl->bpl_havecomp) { - while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { - comp += BP_GET_PSIZE(&bp); - uncomp += BP_GET_UCSIZE(&bp); - } - if (err == ENOENT) - err = 0; - *compp = comp; - *uncompp = uncomp; - } - - return (err); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c deleted file mode 100644 index 94c6308..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ /dev/null @@ -1,2247 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. 
All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/dmu.h> -#include <sys/dmu_impl.h> -#include <sys/dbuf.h> -#include <sys/dmu_objset.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_dir.h> -#include <sys/dmu_tx.h> -#include <sys/spa.h> -#include <sys/zio.h> -#include <sys/dmu_zfetch.h> - -static void dbuf_destroy(dmu_buf_impl_t *db); -static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, - int compress, dmu_tx_t *tx); -static arc_done_func_t dbuf_write_ready; -static arc_done_func_t dbuf_write_done; - -int zfs_mdcomp_disable = 0; -SYSCTL_DECL(_vfs_zfs); -TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); -SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, - &zfs_mdcomp_disable, 0, "Disable metadata compression"); - -/* - * Global data structures and functions for the dbuf cache. - */ -static kmem_cache_t *dbuf_cache; - -/* ARGSUSED */ -static int -dbuf_cons(void *vdb, void *unused, int kmflag) -{ - dmu_buf_impl_t *db = vdb; - bzero(db, sizeof (dmu_buf_impl_t)); - - mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); - cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); - refcount_create(&db->db_holds); - return (0); -} - -/* ARGSUSED */ -static void -dbuf_dest(void *vdb, void *unused) -{ - dmu_buf_impl_t *db = vdb; - mutex_destroy(&db->db_mtx); - cv_destroy(&db->db_changed); - refcount_destroy(&db->db_holds); -} - -/* - * dbuf hash table routines - */ -static dbuf_hash_table_t dbuf_hash_table; - -static uint64_t dbuf_hash_count; - -static uint64_t -dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) -{ - uintptr_t osv = (uintptr_t)os; - uint64_t crc = -1ULL; - - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; - - crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); - - return (crc); -} - -#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); - -#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ - ((dbuf)->db.db_object == (obj) && \ - (dbuf)->db_objset == (os) && \ - (dbuf)->db_level == (level) && \ - (dbuf)->db_blkid == (blkid)) - -dmu_buf_impl_t * -dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - objset_impl_t *os = dn->dn_objset; - uint64_t obj = dn->dn_object; - uint64_t hv = DBUF_HASH(os, obj, level, blkid); - uint64_t idx = hv & h->hash_table_mask; - dmu_buf_impl_t *db; - - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { - if (DBUF_EQUAL(db, os, obj, level, blkid)) { - mutex_enter(&db->db_mtx); - if (db->db_state != DB_EVICTING) { - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - return (db); - } - mutex_exit(&db->db_mtx); - } - } - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - return (NULL); -} - -/* - * Insert an entry into the hash table. If there is already an element - * equal to elem in the hash table, then the already existing element - * will be returned and the new element will not be inserted. - * Otherwise returns NULL. 
- */ -static dmu_buf_impl_t * -dbuf_hash_insert(dmu_buf_impl_t *db) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - objset_impl_t *os = db->db_objset; - uint64_t obj = db->db.db_object; - int level = db->db_level; - uint64_t blkid = db->db_blkid; - uint64_t hv = DBUF_HASH(os, obj, level, blkid); - uint64_t idx = hv & h->hash_table_mask; - dmu_buf_impl_t *dbf; - - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { - if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { - mutex_enter(&dbf->db_mtx); - if (dbf->db_state != DB_EVICTING) { - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - return (dbf); - } - mutex_exit(&dbf->db_mtx); - } - } - - mutex_enter(&db->db_mtx); - db->db_hash_next = h->hash_table[idx]; - h->hash_table[idx] = db; - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_add_64(&dbuf_hash_count, 1); - - return (NULL); -} - -/* - * Remove an entry from the hash table. This operation will - * fail if there are any existing holds on the db. - */ -static void -dbuf_hash_remove(dmu_buf_impl_t *db) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, - db->db_level, db->db_blkid); - uint64_t idx = hv & h->hash_table_mask; - dmu_buf_impl_t *dbf, **dbp; - - /* - * We musn't hold db_mtx to maintin lock ordering: - * DBUF_HASH_MUTEX > db_mtx. - */ - ASSERT(refcount_is_zero(&db->db_holds)); - ASSERT(db->db_state == DB_EVICTING); - ASSERT(!MUTEX_HELD(&db->db_mtx)); - - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - dbp = &h->hash_table[idx]; - while ((dbf = *dbp) != db) { - dbp = &dbf->db_hash_next; - ASSERT(dbf != NULL); - } - *dbp = db->db_hash_next; - db->db_hash_next = NULL; - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_add_64(&dbuf_hash_count, -1); -} - -static arc_evict_func_t dbuf_do_evict; - -static void -dbuf_evict_user(dmu_buf_impl_t *db) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (db->db_level != 0 || db->db_evict_func == NULL) - return; - - if (db->db_user_data_ptr_ptr) - *db->db_user_data_ptr_ptr = db->db.db_data; - db->db_evict_func(&db->db, db->db_user_ptr); - db->db_user_ptr = NULL; - db->db_user_data_ptr_ptr = NULL; - db->db_evict_func = NULL; -} - -void -dbuf_evict(dmu_buf_impl_t *db) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_buf == NULL); - ASSERT(db->db_data_pending == NULL); - - dbuf_clear(db); - dbuf_destroy(db); -} - -void -dbuf_init(void) -{ - uint64_t hsize = 1ULL << 16; - dbuf_hash_table_t *h = &dbuf_hash_table; - int i; - - /* - * The hash table is big enough to fill all of physical memory - * with an average 4K block size. The table will take up - * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). - */ - while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) - hsize <<= 1; - -retry: - h->hash_table_mask = hsize - 1; - h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); - if (h->hash_table == NULL) { - /* XXX - we should really return an error instead of assert */ - ASSERT(hsize > (1ULL << 10)); - hsize >>= 1; - goto retry; - } - - dbuf_cache = kmem_cache_create("dmu_buf_impl_t", - sizeof (dmu_buf_impl_t), - 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); - - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); -} - -void -dbuf_fini(void) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - int i; - - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_destroy(&h->hash_mutexes[i]); - kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); - kmem_cache_destroy(dbuf_cache); -} - -/* - * Other stuff. 
- */ - -#ifdef ZFS_DEBUG -static void -dbuf_verify(dmu_buf_impl_t *db) -{ - dnode_t *dn = db->db_dnode; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) - return; - - ASSERT(db->db_objset != NULL); - if (dn == NULL) { - ASSERT(db->db_parent == NULL); - ASSERT(db->db_blkptr == NULL); - } else { - ASSERT3U(db->db.db_object, ==, dn->dn_object); - ASSERT3P(db->db_objset, ==, dn->dn_objset); - ASSERT3U(db->db_level, <, dn->dn_nlevels); - ASSERT(db->db_blkid == DB_BONUS_BLKID || - list_head(&dn->dn_dbufs)); - } - if (db->db_blkid == DB_BONUS_BLKID) { - ASSERT(dn != NULL); - ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); - ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); - } else { - ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); - } - - if (db->db_level == 0) { - /* we can be momentarily larger in dnode_set_blksz() */ - if (db->db_blkid != DB_BONUS_BLKID && dn) { - ASSERT3U(db->db.db_size, >=, dn->dn_datablksz); - } - if (db->db.db_object == DMU_META_DNODE_OBJECT) { - dbuf_dirty_record_t *dr = db->db_data_pending; - /* - * it should only be modified in syncing - * context, so make sure we only have - * one copy of the data. - */ - ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); - } - } - - /* verify db->db_blkptr */ - if (db->db_blkptr) { - if (db->db_parent == dn->dn_dbuf) { - /* db is pointed to by the dnode */ - /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ - if (db->db.db_object == DMU_META_DNODE_OBJECT) - ASSERT(db->db_parent == NULL); - else - ASSERT(db->db_parent != NULL); - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); - } else { - /* db is pointed to by an indirect block */ - int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; - ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); - ASSERT3U(db->db_parent->db.db_object, ==, - db->db.db_object); - /* - * dnode_grow_indblksz() can make this fail if we don't - * have the struct_rwlock. XXX indblksz no longer - * grows. safe to do this now? - */ - if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { - ASSERT3P(db->db_blkptr, ==, - ((blkptr_t *)db->db_parent->db.db_data + - db->db_blkid % epb)); - } - } - } - if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && - db->db.db_data && db->db_blkid != DB_BONUS_BLKID && - db->db_state != DB_FILL && !dn->dn_free_txg) { - /* - * If the blkptr isn't set but they have nonzero data, - * it had better be dirty, otherwise we'll lose that - * data when we evict this buffer. 
- */ - if (db->db_dirtycnt == 0) { - uint64_t *buf = db->db.db_data; - int i; - - for (i = 0; i < db->db.db_size >> 3; i++) { - ASSERT(buf[i] == 0); - } - } - } -} -#endif - -static void -dbuf_update_data(dmu_buf_impl_t *db) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level == 0 && db->db_user_data_ptr_ptr) { - ASSERT(!refcount_is_zero(&db->db_holds)); - *db->db_user_data_ptr_ptr = db->db.db_data; - } -} - -static void -dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); - db->db_buf = buf; - if (buf != NULL) { - ASSERT(buf->b_data != NULL); - db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); - dbuf_update_data(db); - } else { - dbuf_evict_user(db); - db->db.db_data = NULL; - db->db_state = DB_UNCACHED; - } -} - -uint64_t -dbuf_whichblock(dnode_t *dn, uint64_t offset) -{ - if (dn->dn_datablkshift) { - return (offset >> dn->dn_datablkshift); - } else { - ASSERT3U(offset, <, dn->dn_datablksz); - return (0); - } -} - -static void -dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - - mutex_enter(&db->db_mtx); - ASSERT3U(db->db_state, ==, DB_READ); - /* - * All reads are synchronous, so we must have a hold on the dbuf - */ - ASSERT(refcount_count(&db->db_holds) > 0); - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - if (db->db_level == 0 && db->db_freed_in_flight) { - /* we were freed in flight; disregard any error */ - arc_release(buf, db); - bzero(buf->b_data, db->db.db_size); - arc_buf_freeze(buf); - db->db_freed_in_flight = FALSE; - dbuf_set_data(db, buf); - db->db_state = DB_CACHED; - } else if (zio == NULL || zio->io_error == 0) { - dbuf_set_data(db, buf); - db->db_state = DB_CACHED; - } else { - ASSERT(db->db_blkid != DB_BONUS_BLKID); - ASSERT3P(db->db_buf, ==, NULL); - VERIFY(arc_buf_remove_ref(buf, db) == 1); - db->db_state = DB_UNCACHED; - } - cv_broadcast(&db->db_changed); - mutex_exit(&db->db_mtx); - dbuf_rele(db, NULL); -} - -static void -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) -{ - blkptr_t *bp; - zbookmark_t zb; - uint32_t aflags = ARC_NOWAIT; - - ASSERT(!refcount_is_zero(&db->db_holds)); - /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ - ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_state == DB_UNCACHED); - ASSERT(db->db_buf == NULL); - - if (db->db_blkid == DB_BONUS_BLKID) { - ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); - db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); - if (db->db.db_size < DN_MAX_BONUSLEN) - bzero(db->db.db_data, DN_MAX_BONUSLEN); - bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data, - db->db.db_size); - dbuf_update_data(db); - db->db_state = DB_CACHED; - mutex_exit(&db->db_mtx); - return; - } - - if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) - bp = NULL; - else - bp = db->db_blkptr; - - if (bp == NULL) - dprintf_dbuf(db, "blkptr: %s\n", "NULL"); - else - dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); - - if (bp == NULL || BP_IS_HOLE(bp)) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - - ASSERT(bp == NULL || BP_IS_HOLE(bp)); - dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, - db->db.db_size, db, type)); - bzero(db->db.db_data, db->db.db_size); - db->db_state = DB_CACHED; - *flags |= DB_RF_CACHED; - mutex_exit(&db->db_mtx); - return; - } - - db->db_state = DB_READ; - mutex_exit(&db->db_mtx); - - zb.zb_objset = db->db_objset->os_dsl_dataset ? - db->db_objset->os_dsl_dataset->ds_object : 0; - zb.zb_object = db->db.db_object; - zb.zb_level = db->db_level; - zb.zb_blkid = db->db_blkid; - - dbuf_add_ref(db, NULL); - /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ - ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES); - (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, - db->db_level > 0 ? byteswap_uint64_array : - dmu_ot[db->db_dnode->dn_type].ot_byteswap, - dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, - (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, - &aflags, &zb); - if (aflags & ARC_CACHED) - *flags |= DB_RF_CACHED; -} - -int -dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) -{ - int err = 0; - int havepzio = (zio != NULL); - int prefetch; - - /* - * We don't have to hold the mutex to check db_state because it - * can't be freed while we have a hold on the buffer. 
- */ - ASSERT(!refcount_is_zero(&db->db_holds)); - - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); - - prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL; - - mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED) { - mutex_exit(&db->db_mtx); - if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, - db->db.db_size, TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); - } else if (db->db_state == DB_UNCACHED) { - if (zio == NULL) { - zio = zio_root(db->db_dnode->dn_objset->os_spa, - NULL, NULL, ZIO_FLAG_CANFAIL); - } - dbuf_read_impl(db, zio, &flags); - - /* dbuf_read_impl has dropped db_mtx for us */ - - if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, - db->db.db_size, flags & DB_RF_CACHED); - - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); - - if (!havepzio) - err = zio_wait(zio); - } else { - mutex_exit(&db->db_mtx); - if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, - db->db.db_size, TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); - - mutex_enter(&db->db_mtx); - if ((flags & DB_RF_NEVERWAIT) == 0) { - while (db->db_state == DB_READ || - db->db_state == DB_FILL) { - ASSERT(db->db_state == DB_READ || - (flags & DB_RF_HAVESTRUCT) == 0); - cv_wait(&db->db_changed, &db->db_mtx); - } - if (db->db_state == DB_UNCACHED) - err = EIO; - } - mutex_exit(&db->db_mtx); - } - - ASSERT(err || havepzio || db->db_state == DB_CACHED); - return (err); -} - -static void -dbuf_noread(dmu_buf_impl_t *db) -{ - ASSERT(!refcount_is_zero(&db->db_holds)); - ASSERT(db->db_blkid != DB_BONUS_BLKID); - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || db->db_state == DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - if (db->db_state == DB_UNCACHED) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, - db->db.db_size, db, type)); - db->db_state = DB_FILL; - } else { - ASSERT3U(db->db_state, ==, DB_CACHED); - } - mutex_exit(&db->db_mtx); -} - -/* - * This is our just-in-time copy function. It makes a copy of - * buffers, that have been modified in a previous transaction - * group, before we modify them in the current active group. - * - * This function is used in two places: when we are dirtying a - * buffer for the first time in a txg, and when we are freeing - * a range in a dnode that includes this buffer. - * - * Note that when we are called from dbuf_free_range() we do - * not put a hold on the buffer, we just traverse the active - * dbuf list for the dnode. - */ -static void -dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) -{ - dbuf_dirty_record_t *dr = db->db_last_dirty; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db.db_data != NULL); - ASSERT(db->db_level == 0); - ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); - - if (dr == NULL || - (dr->dt.dl.dr_data != - ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf))) - return; - - /* - * If the last dirty record for this dbuf has not yet synced - * and its referencing the dbuf data, either: - * reset the reference to point to a new copy, - * or (if there a no active holders) - * just null out the current db_data pointer. 
- */ - ASSERT(dr->dr_txg >= txg - 2); - if (db->db_blkid == DB_BONUS_BLKID) { - /* Note that the data bufs here are zio_bufs */ - dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); - bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); - } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { - int size = db->db.db_size; - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dr->dt.dl.dr_data = arc_buf_alloc( - db->db_dnode->dn_objset->os_spa, size, db, type); - bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); - } else { - dbuf_set_data(db, NULL); - } -} - -void -dbuf_unoverride(dbuf_dirty_record_t *dr) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - uint64_t txg = dr->dr_txg; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); - ASSERT(db->db_level == 0); - - if (db->db_blkid == DB_BONUS_BLKID || - dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) - return; - - /* free this block */ - if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { - /* XXX can get silent EIO here */ - (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, - txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); - } - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - /* - * Release the already-written buffer, so we leave it in - * a consistent dirty state. Note that all callers are - * modifying the buffer, so they will immediately do - * another (redundant) arc_release(). Therefore, leave - * the buf thawed to save the effort of freezing & - * immediately re-thawing it. - */ - arc_release(dr->dt.dl.dr_data, db); -} - -void -dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db, *db_next; - uint64_t txg = tx->tx_txg; - - dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks); - mutex_enter(&dn->dn_dbufs_mtx); - for (db = list_head(&dn->dn_dbufs); db; db = db_next) { - db_next = list_next(&dn->dn_dbufs, db); - ASSERT(db->db_blkid != DB_BONUS_BLKID); - if (db->db_level != 0) - continue; - dprintf_dbuf(db, "found buf %s\n", ""); - if (db->db_blkid < blkid || - db->db_blkid >= blkid+nblks) - continue; - - /* found a level 0 buffer in the range */ - if (dbuf_undirty(db, tx)) - continue; - - mutex_enter(&db->db_mtx); - if (db->db_state == DB_UNCACHED || - db->db_state == DB_EVICTING) { - ASSERT(db->db.db_data == NULL); - mutex_exit(&db->db_mtx); - continue; - } - if (db->db_state == DB_READ || db->db_state == DB_FILL) { - /* will be handled in dbuf_read_done or dbuf_rele */ - db->db_freed_in_flight = TRUE; - mutex_exit(&db->db_mtx); - continue; - } - if (refcount_count(&db->db_holds) == 0) { - ASSERT(db->db_buf); - dbuf_clear(db); - continue; - } - /* The dbuf is referenced */ - - if (db->db_last_dirty != NULL) { - dbuf_dirty_record_t *dr = db->db_last_dirty; - - if (dr->dr_txg == txg) { - /* - * This buffer is "in-use", re-adjust the file - * size to reflect that this buffer may - * contain new data when we sync. - */ - if (db->db_blkid > dn->dn_maxblkid) - dn->dn_maxblkid = db->db_blkid; - dbuf_unoverride(dr); - } else { - /* - * This dbuf is not dirty in the open context. - * Either uncache it (if its not referenced in - * the open context) or reset its contents to - * empty. 
- */ - dbuf_fix_old_data(db, txg); - } - } - /* clear the contents if its cached */ - if (db->db_state == DB_CACHED) { - ASSERT(db->db.db_data != NULL); - arc_release(db->db_buf, db); - bzero(db->db.db_data, db->db.db_size); - arc_buf_freeze(db->db_buf); - } - - mutex_exit(&db->db_mtx); - } - mutex_exit(&dn->dn_dbufs_mtx); -} - -static int -dbuf_new_block(dmu_buf_impl_t *db) -{ - dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; - uint64_t birth_txg = 0; - - /* Don't count meta-objects */ - if (ds == NULL) - return (FALSE); - - /* - * We don't need any locking to protect db_blkptr: - * If it's syncing, then db_last_dirty will be set - * so we'll ignore db_blkptr. - */ - ASSERT(MUTEX_HELD(&db->db_mtx)); - /* If we have been dirtied since the last snapshot, its not new */ - if (db->db_last_dirty) - birth_txg = db->db_last_dirty->dr_txg; - else if (db->db_blkptr) - birth_txg = db->db_blkptr->blk_birth; - - if (birth_txg) - return (!dsl_dataset_block_freeable(ds, birth_txg)); - else - return (TRUE); -} - -void -dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) -{ - arc_buf_t *buf, *obuf; - int osize = db->db.db_size; - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - - ASSERT(db->db_blkid != DB_BONUS_BLKID); - - /* XXX does *this* func really need the lock? */ - ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); - - /* - * This call to dbuf_will_dirty() with the dn_struct_rwlock held - * is OK, because there can be no other references to the db - * when we are changing its size, so no concurrent DB_FILL can - * be happening. - */ - /* - * XXX we should be doing a dbuf_read, checking the return - * value and returning that up to our callers - */ - dbuf_will_dirty(db, tx); - - /* create the data buffer for the new block */ - buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type); - - /* copy old block data to the new block */ - obuf = db->db_buf; - bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); - /* zero the remainder */ - if (size > osize) - bzero((uint8_t *)buf->b_data + osize, size - osize); - - mutex_enter(&db->db_mtx); - dbuf_set_data(db, buf); - VERIFY(arc_buf_remove_ref(obuf, db) == 1); - db->db.db_size = size; - - if (db->db_level == 0) { - ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); - db->db_last_dirty->dt.dl.dr_data = buf; - } - mutex_exit(&db->db_mtx); - - dnode_willuse_space(db->db_dnode, size-osize, tx); -} - -dbuf_dirty_record_t * -dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; - dbuf_dirty_record_t **drp, *dr; - int drop_struct_lock = FALSE; - int txgoff = tx->tx_txg & TXG_MASK; - - ASSERT(tx->tx_txg != 0); - ASSERT(!refcount_is_zero(&db->db_holds)); - DMU_TX_DIRTY_BUF(tx, db); - - /* - * Shouldn't dirty a regular buffer in syncing context. Private - * objects may be dirtied in syncing context, but only if they - * were already pre-dirtied in open context. - * XXX We may want to prohibit dirtying in syncing context even - * if they did pre-dirty. - */ - ASSERT(!dmu_tx_is_syncing(tx) || - BP_IS_HOLE(dn->dn_objset->os_rootbp) || - dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_objset->os_dsl_dataset == NULL || - dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir)); - - /* - * We make this assert for private objects as well, but after we - * check if we're already dirty. They are allowed to re-dirty - * in syncing context. - */ - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - - mutex_enter(&db->db_mtx); - /* - * XXX make this true for indirects too? The problem is that - * transactions created with dmu_tx_create_assigned() from - * syncing context don't bother holding ahead. - */ - ASSERT(db->db_level != 0 || - db->db_state == DB_CACHED || db->db_state == DB_FILL); - - mutex_enter(&dn->dn_mtx); - /* - * Don't set dirtyctx to SYNC if we're just modifying this as we - * initialize the objset. - */ - if (dn->dn_dirtyctx == DN_UNDIRTIED && - !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { - dn->dn_dirtyctx = - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); - ASSERT(dn->dn_dirtyctx_firstset == NULL); - dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); - } - mutex_exit(&dn->dn_mtx); - - /* - * If this buffer is already dirty, we're done. - */ - drp = &db->db_last_dirty; - ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || - db->db.db_object == DMU_META_DNODE_OBJECT); - while (*drp && (*drp)->dr_txg > tx->tx_txg) - drp = &(*drp)->dr_next; - if (*drp && (*drp)->dr_txg == tx->tx_txg) { - if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { - /* - * If this buffer has already been written out, - * we now need to reset its state. - */ - dbuf_unoverride(*drp); - if (db->db.db_object != DMU_META_DNODE_OBJECT) - arc_buf_thaw(db->db_buf); - } - mutex_exit(&db->db_mtx); - return (*drp); - } - - /* - * Only valid if not already dirty. - */ - ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - - ASSERT3U(dn->dn_nlevels, >, db->db_level); - ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || - dn->dn_phys->dn_nlevels > db->db_level || - dn->dn_next_nlevels[txgoff] > db->db_level || - dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || - dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); - - /* - * We should only be dirtying in syncing context if it's the - * mos, a spa os, or we're initializing the os. However, we are - * allowed to dirty in syncing context provided we already - * dirtied it in open context. Hence we must make this - * assertion only if we're not already dirty. - */ - ASSERT(!dmu_tx_is_syncing(tx) || - os->os_dsl_dataset == NULL || - !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || - !BP_IS_HOLE(os->os_rootbp)); - ASSERT(db->db.db_size != 0); - - dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - - /* - * If this buffer is dirty in an old transaction group we need - * to make a copy of it so that the changes we make in this - * transaction group won't leak out when we sync the older txg. - */ - dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); - if (db->db_level == 0) { - void *data_old = db->db_buf; - - if (db->db_blkid == DB_BONUS_BLKID) { - dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db.db_data; - } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { - /* - * Release the data buffer from the cache so that we - * can modify it without impacting possible other users - * of this cached data block. Note that indirect - * blocks and private objects are not released until the - * syncing state (since they are only modified then). 
- */ - arc_release(db->db_buf, db); - dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db_buf; - } - ASSERT(data_old != NULL); - dr->dt.dl.dr_data = data_old; - } else { - mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&dr->dt.di.dr_children, - sizeof (dbuf_dirty_record_t), - offsetof(dbuf_dirty_record_t, dr_dirty_node)); - } - dr->dr_dbuf = db; - dr->dr_txg = tx->tx_txg; - dr->dr_next = *drp; - *drp = dr; - - /* - * We could have been freed_in_flight between the dbuf_noread - * and dbuf_dirty. We win, as though the dbuf_noread() had - * happened after the free. - */ - if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { - mutex_enter(&dn->dn_mtx); - dnode_clear_range(dn, db->db_blkid, 1, tx); - mutex_exit(&dn->dn_mtx); - db->db_freed_in_flight = FALSE; - } - - if (db->db_blkid != DB_BONUS_BLKID) { - /* - * Update the accounting. - */ - if (!dbuf_new_block(db) && db->db_blkptr) { - /* - * This is only a guess -- if the dbuf is dirty - * in a previous txg, we don't know how much - * space it will use on disk yet. We should - * really have the struct_rwlock to access - * db_blkptr, but since this is just a guess, - * it's OK if we get an odd answer. - */ - dnode_willuse_space(dn, - -bp_get_dasize(os->os_spa, db->db_blkptr), tx); - } - dnode_willuse_space(dn, db->db.db_size, tx); - } - - /* - * This buffer is now part of this txg - */ - dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); - db->db_dirtycnt += 1; - ASSERT3U(db->db_dirtycnt, <=, 3); - - mutex_exit(&db->db_mtx); - - if (db->db_blkid == DB_BONUS_BLKID) { - mutex_enter(&dn->dn_mtx); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&dn->dn_dirty_records[txgoff], dr); - mutex_exit(&dn->dn_mtx); - dnode_setdirty(dn, tx); - return (dr); - } - - if (db->db_level == 0) { - dnode_new_blkid(dn, db->db_blkid, tx); - ASSERT(dn->dn_maxblkid >= db->db_blkid); - } - - if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; - } - - if (db->db_level+1 < dn->dn_nlevels) { - dmu_buf_impl_t *parent = db->db_parent; - dbuf_dirty_record_t *di; - int parent_held = FALSE; - - if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - parent = dbuf_hold_level(dn, db->db_level+1, - db->db_blkid >> epbs, FTAG); - parent_held = TRUE; - } - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); - ASSERT3U(db->db_level+1, ==, parent->db_level); - di = dbuf_dirty(parent, tx); - if (parent_held) - dbuf_rele(parent, FTAG); - - mutex_enter(&db->db_mtx); - /* possible race with dbuf_undirty() */ - if (db->db_last_dirty == dr || - dn->dn_object == DMU_META_DNODE_OBJECT) { - mutex_enter(&di->dt.di.dr_mtx); - ASSERT3U(di->dr_txg, ==, tx->tx_txg); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&di->dt.di.dr_children, dr); - mutex_exit(&di->dt.di.dr_mtx); - dr->dr_parent = di; - } - mutex_exit(&db->db_mtx); - } else { - ASSERT(db->db_level+1 == dn->dn_nlevels); - ASSERT(db->db_blkid < dn->dn_nblkptr); - ASSERT(db->db_parent == NULL || - db->db_parent == db->db_dnode->dn_dbuf); - mutex_enter(&dn->dn_mtx); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&dn->dn_dirty_records[txgoff], dr); - mutex_exit(&dn->dn_mtx); - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); - } - - dnode_setdirty(dn, tx); - return (dr); -} - -static int -dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - dnode_t *dn = db->db_dnode; - uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t 
*dr; - - ASSERT(txg != 0); - ASSERT(db->db_blkid != DB_BONUS_BLKID); - - mutex_enter(&db->db_mtx); - - /* - * If this buffer is not dirty, we're done. - */ - for (dr = db->db_last_dirty; dr; dr = dr->dr_next) - if (dr->dr_txg <= txg) - break; - if (dr == NULL || dr->dr_txg < txg) { - mutex_exit(&db->db_mtx); - return (0); - } - ASSERT(dr->dr_txg == txg); - - /* - * If this buffer is currently held, we cannot undirty - * it, since one of the current holders may be in the - * middle of an update. Note that users of dbuf_undirty() - * should not place a hold on the dbuf before the call. - */ - if (refcount_count(&db->db_holds) > db->db_dirtycnt) { - mutex_exit(&db->db_mtx); - /* Make sure we don't toss this buffer at sync phase */ - mutex_enter(&dn->dn_mtx); - dnode_clear_range(dn, db->db_blkid, 1, tx); - mutex_exit(&dn->dn_mtx); - return (0); - } - - dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - - ASSERT(db->db.db_size != 0); - - /* XXX would be nice to fix up dn_towrite_space[] */ - - db->db_last_dirty = dr->dr_next; - - if (dr->dr_parent) { - mutex_enter(&dr->dr_parent->dt.di.dr_mtx); - list_remove(&dr->dr_parent->dt.di.dr_children, dr); - mutex_exit(&dr->dr_parent->dt.di.dr_mtx); - } else if (db->db_level+1 == dn->dn_nlevels) { - ASSERT3P(db->db_parent, ==, dn->dn_dbuf); - mutex_enter(&dn->dn_mtx); - list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); - mutex_exit(&dn->dn_mtx); - } - - if (db->db_level == 0) { - dbuf_unoverride(dr); - - ASSERT(db->db_buf != NULL); - ASSERT(dr->dt.dl.dr_data != NULL); - if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); - } else { - ASSERT(db->db_buf != NULL); - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - list_destroy(&dr->dt.di.dr_children); - mutex_destroy(&dr->dt.di.dr_mtx); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - - if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - arc_buf_t *buf = db->db_buf; - - ASSERT(arc_released(buf)); - dbuf_set_data(db, NULL); - VERIFY(arc_buf_remove_ref(buf, db) == 1); - dbuf_evict(db); - return (1); - } - - mutex_exit(&db->db_mtx); - return (0); -} - -#pragma weak dmu_buf_will_dirty = dbuf_will_dirty -void -dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - int rf = DB_RF_MUST_SUCCEED; - - ASSERT(tx->tx_txg != 0); - ASSERT(!refcount_is_zero(&db->db_holds)); - - if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) - rf |= DB_RF_HAVESTRUCT; - (void) dbuf_read(db, NULL, rf); - (void) dbuf_dirty(db, tx); -} - -void -dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - ASSERT(db->db_blkid != DB_BONUS_BLKID); - ASSERT(tx->tx_txg != 0); - ASSERT(db->db_level == 0); - ASSERT(!refcount_is_zero(&db->db_holds)); - - ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || - dmu_tx_private_ok(tx)); - - dbuf_noread(db); - (void) dbuf_dirty(db, tx); -} - -#pragma weak dmu_buf_fill_done = dbuf_fill_done -/* ARGSUSED */ -void -dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - mutex_enter(&db->db_mtx); - DBUF_VERIFY(db); - - if (db->db_state == DB_FILL) { - if (db->db_level == 0 && db->db_freed_in_flight) { - ASSERT(db->db_blkid != DB_BONUS_BLKID); - /* we were freed while filling */ - /* XXX dbuf_undirty? */ - bzero(db->db.db_data, db->db.db_size); - db->db_freed_in_flight = FALSE; - } - db->db_state = DB_CACHED; - cv_broadcast(&db->db_changed); - } - mutex_exit(&db->db_mtx); -} - -/* - * "Clear" the contents of this dbuf. 
This will mark the dbuf - * EVICTING and clear *most* of its references. Unfortunetely, - * when we are not holding the dn_dbufs_mtx, we can't clear the - * entry in the dn_dbufs list. We have to wait until dbuf_destroy() - * in this case. For callers from the DMU we will usually see: - * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() - * For the arc callback, we will usually see: - * dbuf_do_evict()->dbuf_clear();dbuf_destroy() - * Sometimes, though, we will get a mix of these two: - * DMU: dbuf_clear()->arc_buf_evict() - * ARC: dbuf_do_evict()->dbuf_destroy() - */ -void -dbuf_clear(dmu_buf_impl_t *db) -{ - dnode_t *dn = db->db_dnode; - dmu_buf_impl_t *parent = db->db_parent; - dmu_buf_impl_t *dndb = dn->dn_dbuf; - int dbuf_gone = FALSE; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(refcount_is_zero(&db->db_holds)); - - dbuf_evict_user(db); - - if (db->db_state == DB_CACHED) { - ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DB_BONUS_BLKID) - zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); - db->db.db_data = NULL; - db->db_state = DB_UNCACHED; - } - - ASSERT3U(db->db_state, ==, DB_UNCACHED); - ASSERT(db->db_data_pending == NULL); - - db->db_state = DB_EVICTING; - db->db_blkptr = NULL; - - if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { - list_remove(&dn->dn_dbufs, db); - dnode_rele(dn, db); - } - - if (db->db_buf) - dbuf_gone = arc_buf_evict(db->db_buf); - - if (!dbuf_gone) - mutex_exit(&db->db_mtx); - - /* - * If this dbuf is referened from an indirect dbuf, - * decrement the ref count on the indirect dbuf. - */ - if (parent && parent != dndb) - dbuf_rele(parent, db); -} - -static int -dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, - dmu_buf_impl_t **parentp, blkptr_t **bpp) -{ - int nlevels, epbs; - - *parentp = NULL; - *bpp = NULL; - - ASSERT(blkid != DB_BONUS_BLKID); - - if (dn->dn_phys->dn_nlevels == 0) - nlevels = 1; - else - nlevels = dn->dn_phys->dn_nlevels; - - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - ASSERT3U(level * epbs, <, 64); - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - if (level >= nlevels || - (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { - /* the buffer has no parent yet */ - return (ENOENT); - } else if (level < nlevels-1) { - /* this block is referenced from an indirect block */ - int err = dbuf_hold_impl(dn, level+1, - blkid >> epbs, fail_sparse, NULL, parentp); - if (err) - return (err); - err = dbuf_read(*parentp, NULL, - (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); - if (err) { - dbuf_rele(*parentp, NULL); - *parentp = NULL; - return (err); - } - *bpp = ((blkptr_t *)(*parentp)->db.db_data) + - (blkid & ((1ULL << epbs) - 1)); - return (0); - } else { - /* the block is referenced from the dnode */ - ASSERT3U(level, ==, nlevels-1); - ASSERT(dn->dn_phys->dn_nblkptr == 0 || - blkid < dn->dn_phys->dn_nblkptr); - if (dn->dn_dbuf) { - dbuf_add_ref(dn->dn_dbuf, NULL); - *parentp = dn->dn_dbuf; - } - *bpp = &dn->dn_phys->dn_blkptr[blkid]; - return (0); - } -} - -static dmu_buf_impl_t * -dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, - dmu_buf_impl_t *parent, blkptr_t *blkptr) -{ - objset_impl_t *os = dn->dn_objset; - dmu_buf_impl_t *db, *odb; - - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - ASSERT(dn->dn_type != DMU_OT_NONE); - - db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); - - db->db_objset = os; - db->db.db_object = dn->dn_object; - db->db_level = level; - db->db_blkid = blkid; - db->db_last_dirty = NULL; - db->db_dirtycnt = 0; - db->db_dnode = dn; - 
db->db_parent = parent; - db->db_blkptr = blkptr; - - db->db_user_ptr = NULL; - db->db_user_data_ptr_ptr = NULL; - db->db_evict_func = NULL; - db->db_immediate_evict = 0; - db->db_freed_in_flight = 0; - - if (blkid == DB_BONUS_BLKID) { - ASSERT3P(parent, ==, dn->dn_dbuf); - db->db.db_size = dn->dn_bonuslen; - db->db.db_offset = DB_BONUS_BLKID; - db->db_state = DB_UNCACHED; - /* the bonus dbuf is not placed in the hash table */ - return (db); - } else { - int blocksize = - db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; - db->db.db_size = blocksize; - db->db.db_offset = db->db_blkid * blocksize; - } - - /* - * Hold the dn_dbufs_mtx while we get the new dbuf - * in the hash table *and* added to the dbufs list. - * This prevents a possible deadlock with someone - * trying to look up this dbuf before its added to the - * dn_dbufs list. - */ - mutex_enter(&dn->dn_dbufs_mtx); - db->db_state = DB_EVICTING; - if ((odb = dbuf_hash_insert(db)) != NULL) { - /* someone else inserted it first */ - kmem_cache_free(dbuf_cache, db); - mutex_exit(&dn->dn_dbufs_mtx); - return (odb); - } - list_insert_head(&dn->dn_dbufs, db); - db->db_state = DB_UNCACHED; - mutex_exit(&dn->dn_dbufs_mtx); - - if (parent && parent != dn->dn_dbuf) - dbuf_add_ref(parent, db); - - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - refcount_count(&dn->dn_holds) > 0); - (void) refcount_add(&dn->dn_holds, db); - - dprintf_dbuf(db, "db=%p\n", db); - - return (db); -} - -static int -dbuf_do_evict(void *private) -{ - arc_buf_t *buf = private; - dmu_buf_impl_t *db = buf->b_private; - - if (!MUTEX_HELD(&db->db_mtx)) - mutex_enter(&db->db_mtx); - - ASSERT(refcount_is_zero(&db->db_holds)); - - if (db->db_state != DB_EVICTING) { - ASSERT(db->db_state == DB_CACHED); - DBUF_VERIFY(db); - db->db_buf = NULL; - dbuf_evict(db); - } else { - mutex_exit(&db->db_mtx); - dbuf_destroy(db); - } - return (0); -} - -static void -dbuf_destroy(dmu_buf_impl_t *db) -{ - ASSERT(refcount_is_zero(&db->db_holds)); - - if (db->db_blkid != DB_BONUS_BLKID) { - dnode_t *dn = db->db_dnode; - - /* - * If this dbuf is still on the dn_dbufs list, - * remove it from that list. - */ - if (list_link_active(&db->db_link)) { - mutex_enter(&dn->dn_dbufs_mtx); - list_remove(&dn->dn_dbufs, db); - mutex_exit(&dn->dn_dbufs_mtx); - - dnode_rele(dn, db); - } - dbuf_hash_remove(db); - } - db->db_parent = NULL; - db->db_dnode = NULL; - db->db_buf = NULL; - - ASSERT(db->db.db_data == NULL); - ASSERT(db->db_hash_next == NULL); - ASSERT(db->db_blkptr == NULL); - ASSERT(db->db_data_pending == NULL); - - kmem_cache_free(dbuf_cache, db); -} - -void -dbuf_prefetch(dnode_t *dn, uint64_t blkid) -{ - dmu_buf_impl_t *db = NULL; - blkptr_t *bp = NULL; - - ASSERT(blkid != DB_BONUS_BLKID); - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - - if (dnode_block_freed(dn, blkid)) - return; - - /* dbuf_find() returns with db_mtx held */ - if (db = dbuf_find(dn, 0, blkid)) { - if (refcount_count(&db->db_holds) > 0) { - /* - * This dbuf is active. We assume that it is - * already CACHED, or else about to be either - * read or filled. - */ - mutex_exit(&db->db_mtx); - return; - } - mutex_exit(&db->db_mtx); - db = NULL; - } - - if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { - if (bp && !BP_IS_HOLE(bp)) { - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; - zbookmark_t zb; - zb.zb_objset = dn->dn_objset->os_dsl_dataset ? 
- dn->dn_objset->os_dsl_dataset->ds_object : 0; - zb.zb_object = dn->dn_object; - zb.zb_level = 0; - zb.zb_blkid = blkid; - - (void) arc_read(NULL, dn->dn_objset->os_spa, bp, - dmu_ot[dn->dn_type].ot_byteswap, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, &zb); - } - if (db) - dbuf_rele(db, NULL); - } -} - -/* - * Returns with db_holds incremented, and db_mtx not held. - * Note: dn_struct_rwlock must be held. - */ -int -dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, - void *tag, dmu_buf_impl_t **dbp) -{ - dmu_buf_impl_t *db, *parent = NULL; - - ASSERT(blkid != DB_BONUS_BLKID); - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - ASSERT3U(dn->dn_nlevels, >, level); - - *dbp = NULL; -top: - /* dbuf_find() returns with db_mtx held */ - db = dbuf_find(dn, level, blkid); - - if (db == NULL) { - blkptr_t *bp = NULL; - int err; - - ASSERT3P(parent, ==, NULL); - err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); - if (fail_sparse) { - if (err == 0 && bp && BP_IS_HOLE(bp)) - err = ENOENT; - if (err) { - if (parent) - dbuf_rele(parent, NULL); - return (err); - } - } - if (err && err != ENOENT) - return (err); - db = dbuf_create(dn, level, blkid, parent, bp); - } - - if (db->db_buf && refcount_is_zero(&db->db_holds)) { - arc_buf_add_ref(db->db_buf, db); - if (db->db_buf->b_data == NULL) { - dbuf_clear(db); - if (parent) { - dbuf_rele(parent, NULL); - parent = NULL; - } - goto top; - } - ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); - } - - ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); - - /* - * If this buffer is currently syncing out, and we are are - * still referencing it from db_data, we need to make a copy - * of it in case we decide we want to dirty it again in this txg. - */ - if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && - dn->dn_object != DMU_META_DNODE_OBJECT && - db->db_state == DB_CACHED && db->db_data_pending) { - dbuf_dirty_record_t *dr = db->db_data_pending; - - if (dr->dt.dl.dr_data == db->db_buf) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - - dbuf_set_data(db, - arc_buf_alloc(db->db_dnode->dn_objset->os_spa, - db->db.db_size, db, type)); - bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, - db->db.db_size); - } - } - - (void) refcount_add(&db->db_holds, tag); - dbuf_update_data(db); - DBUF_VERIFY(db); - mutex_exit(&db->db_mtx); - - /* NOTE: we can't rele the parent until after we drop the db_mtx */ - if (parent) - dbuf_rele(parent, NULL); - - ASSERT3P(db->db_dnode, ==, dn); - ASSERT3U(db->db_blkid, ==, blkid); - ASSERT3U(db->db_level, ==, level); - *dbp = db; - - return (0); -} - -dmu_buf_impl_t * -dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) -{ - dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); - return (err ? NULL : db); -} - -dmu_buf_impl_t * -dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) -{ - dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); - return (err ? 
NULL : db); -} - -dmu_buf_impl_t * -dbuf_create_bonus(dnode_t *dn) -{ - dmu_buf_impl_t *db = dn->dn_bonus; - - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - - ASSERT(dn->dn_bonus == NULL); - db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); - return (db); -} - -#pragma weak dmu_buf_add_ref = dbuf_add_ref -void -dbuf_add_ref(dmu_buf_impl_t *db, void *tag) -{ - int64_t holds = refcount_add(&db->db_holds, tag); - ASSERT(holds > 1); -} - -#pragma weak dmu_buf_rele = dbuf_rele -void -dbuf_rele(dmu_buf_impl_t *db, void *tag) -{ - int64_t holds; - - mutex_enter(&db->db_mtx); - DBUF_VERIFY(db); - - holds = refcount_remove(&db->db_holds, tag); - ASSERT(holds >= 0); - - /* - * We can't freeze indirects if there is a possibility that they - * may be modified in the current syncing context. - */ - if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) - arc_buf_freeze(db->db_buf); - - if (holds == db->db_dirtycnt && - db->db_level == 0 && db->db_immediate_evict) - dbuf_evict_user(db); - - if (holds == 0) { - if (db->db_blkid == DB_BONUS_BLKID) { - mutex_exit(&db->db_mtx); - dnode_rele(db->db_dnode, db); - } else if (db->db_buf == NULL) { - /* - * This is a special case: we never associated this - * dbuf with any data allocated from the ARC. - */ - ASSERT3U(db->db_state, ==, DB_UNCACHED); - dbuf_evict(db); - } else if (arc_released(db->db_buf)) { - arc_buf_t *buf = db->db_buf; - /* - * This dbuf has anonymous data associated with it. - */ - dbuf_set_data(db, NULL); - VERIFY(arc_buf_remove_ref(buf, db) == 1); - dbuf_evict(db); - } else { - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); - mutex_exit(&db->db_mtx); - } - } else { - mutex_exit(&db->db_mtx); - } -} - -#pragma weak dmu_buf_refcount = dbuf_refcount -uint64_t -dbuf_refcount(dmu_buf_impl_t *db) -{ - return (refcount_count(&db->db_holds)); -} - -void * -dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *evict_func) -{ - return (dmu_buf_update_user(db_fake, NULL, user_ptr, - user_data_ptr_ptr, evict_func)); -} - -void * -dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *evict_func) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - db->db_immediate_evict = TRUE; - return (dmu_buf_update_user(db_fake, NULL, user_ptr, - user_data_ptr_ptr, evict_func)); -} - -void * -dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, - void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(db->db_level == 0); - - ASSERT((user_ptr == NULL) == (evict_func == NULL)); - - mutex_enter(&db->db_mtx); - - if (db->db_user_ptr == old_user_ptr) { - db->db_user_ptr = user_ptr; - db->db_user_data_ptr_ptr = user_data_ptr_ptr; - db->db_evict_func = evict_func; - - dbuf_update_data(db); - } else { - old_user_ptr = db->db_user_ptr; - } - - mutex_exit(&db->db_mtx); - return (old_user_ptr); -} - -void * -dmu_buf_get_user(dmu_buf_t *db_fake) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(!refcount_is_zero(&db->db_holds)); - - return (db->db_user_ptr); -} - -static void -dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) -{ - /* ASSERT(dmu_tx_is_syncing(tx) */ - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (db->db_blkptr != NULL) - return; - - if (db->db_level == dn->dn_phys->dn_nlevels-1) { - /* - * This buffer was allocated at a time when there was - * no available blkptrs from the dnode, or it was - * inappropriate to hook it in (i.e., nlevels 
mis-match). - */ - ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); - ASSERT(db->db_parent == NULL); - db->db_parent = dn->dn_dbuf; - db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; - DBUF_VERIFY(db); - } else { - dmu_buf_impl_t *parent = db->db_parent; - int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - - ASSERT(dn->dn_phys->dn_nlevels > 1); - if (parent == NULL) { - mutex_exit(&db->db_mtx); - rw_enter(&dn->dn_struct_rwlock, RW_READER); - (void) dbuf_hold_impl(dn, db->db_level+1, - db->db_blkid >> epbs, FALSE, db, &parent); - rw_exit(&dn->dn_struct_rwlock); - mutex_enter(&db->db_mtx); - db->db_parent = parent; - } - db->db_blkptr = (blkptr_t *)parent->db.db_data + - (db->db_blkid & ((1ULL << epbs) - 1)); - DBUF_VERIFY(db); - } -} - -static void -dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; - zio_t *zio; - - ASSERT(dmu_tx_is_syncing(tx)); - - dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); - - mutex_enter(&db->db_mtx); - - ASSERT(db->db_level > 0); - DBUF_VERIFY(db); - - if (db->db_buf == NULL) { - mutex_exit(&db->db_mtx); - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - mutex_enter(&db->db_mtx); - } - ASSERT3U(db->db_state, ==, DB_CACHED); - ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); - ASSERT(db->db_buf != NULL); - - dbuf_check_blkptr(dn, db); - - db->db_data_pending = dr; - - arc_release(db->db_buf, db); - mutex_exit(&db->db_mtx); - - /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. - */ - dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4, - zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx); - - zio = dr->dr_zio; - mutex_enter(&dr->dt.di.dr_mtx); - dbuf_sync_list(&dr->dt.di.dr_children, tx); - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - mutex_exit(&dr->dt.di.dr_mtx); - zio_nowait(zio); -} - -static void -dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) -{ - arc_buf_t **datap = &dr->dt.dl.dr_data; - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; - uint64_t txg = tx->tx_txg; - int checksum, compress; - int blksz; - - ASSERT(dmu_tx_is_syncing(tx)); - - dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); - - mutex_enter(&db->db_mtx); - /* - * To be synced, we must be dirtied. But we - * might have been freed after the dirty. - */ - if (db->db_state == DB_UNCACHED) { - /* This buffer has been freed since it was dirtied */ - ASSERT(db->db.db_data == NULL); - } else if (db->db_state == DB_FILL) { - /* This buffer was freed and is now being re-filled */ - ASSERT(db->db.db_data != dr->dt.dl.dr_data); - } else { - ASSERT3U(db->db_state, ==, DB_CACHED); - } - DBUF_VERIFY(db); - - /* - * If this is a bonus buffer, simply copy the bonus data into the - * dnode. It will be written out when the dnode is synced (and it - * will be synced, since it must have been dirty for dbuf_sync to - * be called). - */ - if (db->db_blkid == DB_BONUS_BLKID) { - dbuf_dirty_record_t **drp; - /* - * Use dn_phys->dn_bonuslen since db.db_size is the length - * of the bonus buffer in the open transaction rather than - * the syncing transaction. 
- */ - ASSERT(*datap != NULL); - ASSERT3U(db->db_level, ==, 0); - ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); - bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); - if (*datap != db->db.db_data) - zio_buf_free(*datap, DN_MAX_BONUSLEN); - db->db_data_pending = NULL; - drp = &db->db_last_dirty; - while (*drp != dr) - drp = &(*drp)->dr_next; - ASSERT((*drp)->dr_next == NULL); - *drp = NULL; - if (dr->dr_dbuf->db_level != 0) { - list_destroy(&dr->dt.di.dr_children); - mutex_destroy(&dr->dt.di.dr_mtx); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - mutex_exit(&db->db_mtx); - dbuf_rele(db, (void *)(uintptr_t)txg); - return; - } - - /* - * If this buffer is in the middle of an immdiate write, - * wait for the synchronous IO to complete. - */ - while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { - ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); - cv_wait(&db->db_changed, &db->db_mtx); - ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); - } - - dbuf_check_blkptr(dn, db); - - /* - * If this dbuf has already been written out via an immediate write, - * just complete the write by copying over the new block pointer and - * updating the accounting via the write-completion functions. - */ - if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - zio_t zio_fake; - - zio_fake.io_private = &db; - zio_fake.io_error = 0; - zio_fake.io_bp = db->db_blkptr; - zio_fake.io_bp_orig = *db->db_blkptr; - zio_fake.io_txg = txg; - - *db->db_blkptr = dr->dt.dl.dr_overridden_by; - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - db->db_data_pending = dr; - dr->dr_zio = &zio_fake; - mutex_exit(&db->db_mtx); - - if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg)) - dsl_dataset_block_kill(os->os_dsl_dataset, - &zio_fake.io_bp_orig, dn->dn_zio, tx); - - dbuf_write_ready(&zio_fake, db->db_buf, db); - dbuf_write_done(&zio_fake, db->db_buf, db); - - return; - } - - blksz = arc_buf_size(*datap); - - if (dn->dn_object != DMU_META_DNODE_OBJECT) { - /* - * If this buffer is currently "in use" (i.e., there are - * active holds and db_data still references it), then make - * a copy before we start the write so that any modifications - * from the open txg will not leak into this write. - * - * NOTE: this copy does not need to be made for objects only - * modified in the syncing context (e.g. DNONE_DNODE blocks). - */ - if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - *datap = arc_buf_alloc(os->os_spa, blksz, db, type); - bcopy(db->db.db_data, (*datap)->b_data, blksz); - } - } else { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - arc_release(db->db_buf, db); - } - - ASSERT(*datap != NULL); - db->db_data_pending = dr; - - mutex_exit(&db->db_mtx); - - /* - * Allow dnode settings to override objset settings, - * except for metadata checksums. 
- */ - if (dmu_ot[dn->dn_type].ot_metadata) { - checksum = os->os_md_checksum; - compress = zio_compress_select(dn->dn_compress, - os->os_md_compress); - } else { - checksum = zio_checksum_select(dn->dn_checksum, - os->os_checksum); - compress = zio_compress_select(dn->dn_compress, - os->os_compress); - } - - dbuf_write(dr, *datap, checksum, compress, tx); - - ASSERT(!list_link_active(&dr->dr_dirty_node)); - if (dn->dn_object == DMU_META_DNODE_OBJECT) - list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); - else - zio_nowait(dr->dr_zio); -} - -void -dbuf_sync_list(list_t *list, dmu_tx_t *tx) -{ - dbuf_dirty_record_t *dr; - - while (dr = list_head(list)) { - if (dr->dr_zio != NULL) { - /* - * If we find an already initialized zio then we - * are processing the meta-dnode, and we have finished. - * The dbufs for all dnodes are put back on the list - * during processing, so that we can zio_wait() - * these IOs after initiating all child IOs. - */ - ASSERT3U(dr->dr_dbuf->db.db_object, ==, - DMU_META_DNODE_OBJECT); - break; - } - list_remove(list, dr); - if (dr->dr_dbuf->db_level > 0) - dbuf_sync_indirect(dr, tx); - else - dbuf_sync_leaf(dr, tx); - } -} - -static void -dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, - int compress, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; - dmu_buf_impl_t *parent = db->db_parent; - uint64_t txg = tx->tx_txg; - zbookmark_t zb; - zio_t *zio; - int zio_flags; - - if (parent != dn->dn_dbuf) { - ASSERT(parent && parent->db_data_pending); - ASSERT(db->db_level == parent->db_level-1); - ASSERT(arc_released(parent->db_buf)); - zio = parent->db_data_pending->dr_zio; - } else { - ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); - zio = dn->dn_zio; - } - - ASSERT(db->db_level == 0 || data == db->db_buf); - ASSERT3U(db->db_blkptr->blk_birth, <=, txg); - ASSERT(zio); - - zb.zb_objset = os->os_dsl_dataset ? 
os->os_dsl_dataset->ds_object : 0; - zb.zb_object = db->db.db_object; - zb.zb_level = db->db_level; - zb.zb_blkid = db->db_blkid; - - zio_flags = ZIO_FLAG_MUSTSUCCEED; - if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0) - zio_flags |= ZIO_FLAG_METADATA; - if (BP_IS_OLDER(db->db_blkptr, txg)) - dsl_dataset_block_kill( - os->os_dsl_dataset, db->db_blkptr, zio, tx); - - dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress, - dmu_get_replication_level(os, &zb, dn->dn_type), txg, - db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb); -} - -/* ARGSUSED */ -static void -dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; - blkptr_t *bp_orig = &zio->io_bp_orig; - uint64_t fill = 0; - int old_size, new_size, i; - - dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", ""); - - old_size = bp_get_dasize(os->os_spa, bp_orig); - new_size = bp_get_dasize(os->os_spa, zio->io_bp); - - dnode_diduse_space(dn, new_size-old_size); - - if (BP_IS_HOLE(zio->io_bp)) { - dsl_dataset_t *ds = os->os_dsl_dataset; - dmu_tx_t *tx = os->os_synctx; - - if (bp_orig->blk_birth == tx->tx_txg) - dsl_dataset_block_kill(ds, bp_orig, NULL, tx); - ASSERT3U(db->db_blkptr->blk_fill, ==, 0); - return; - } - - mutex_enter(&db->db_mtx); - - if (db->db_level == 0) { - mutex_enter(&dn->dn_mtx); - if (db->db_blkid > dn->dn_phys->dn_maxblkid) - dn->dn_phys->dn_maxblkid = db->db_blkid; - mutex_exit(&dn->dn_mtx); - - if (dn->dn_type == DMU_OT_DNODE) { - dnode_phys_t *dnp = db->db.db_data; - for (i = db->db.db_size >> DNODE_SHIFT; i > 0; - i--, dnp++) { - if (dnp->dn_type != DMU_OT_NONE) - fill++; - } - } else { - fill = 1; - } - } else { - blkptr_t *bp = db->db.db_data; - ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); - for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) { - if (BP_IS_HOLE(bp)) - continue; - ASSERT3U(BP_GET_LSIZE(bp), ==, - db->db_level == 1 ? 
dn->dn_datablksz : - (1<<dn->dn_phys->dn_indblkshift)); - fill += bp->blk_fill; - } - } - - db->db_blkptr->blk_fill = fill; - BP_SET_TYPE(db->db_blkptr, dn->dn_type); - BP_SET_LEVEL(db->db_blkptr, db->db_level); - - mutex_exit(&db->db_mtx); - - /* We must do this after we've set the bp's type and level */ - if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) { - dsl_dataset_t *ds = os->os_dsl_dataset; - dmu_tx_t *tx = os->os_synctx; - - if (bp_orig->blk_birth == tx->tx_txg) - dsl_dataset_block_kill(ds, bp_orig, NULL, tx); - dsl_dataset_block_born(ds, zio->io_bp, tx); - } -} - -/* ARGSUSED */ -static void -dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - uint64_t txg = zio->io_txg; - dbuf_dirty_record_t **drp, *dr; - - ASSERT3U(zio->io_error, ==, 0); - - mutex_enter(&db->db_mtx); - - drp = &db->db_last_dirty; - while (*drp != db->db_data_pending) - drp = &(*drp)->dr_next; - ASSERT(!list_link_active(&(*drp)->dr_dirty_node)); - ASSERT((*drp)->dr_txg == txg); - ASSERT((*drp)->dr_next == NULL); - dr = *drp; - *drp = NULL; - - if (db->db_level == 0) { - ASSERT(db->db_blkid != DB_BONUS_BLKID); - ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - - if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); - else if (!BP_IS_HOLE(db->db_blkptr)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); - else - ASSERT(arc_released(db->db_buf)); - } else { - dnode_t *dn = db->db_dnode; - - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); - if (!BP_IS_HOLE(db->db_blkptr)) { - int epbs = - dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, - db->db.db_size); - ASSERT3U(dn->dn_phys->dn_maxblkid - >> (db->db_level * epbs), >=, db->db_blkid); - arc_set_callback(db->db_buf, dbuf_do_evict, db); - } - list_destroy(&dr->dt.di.dr_children); - mutex_destroy(&dr->dt.di.dr_mtx); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - - cv_broadcast(&db->db_changed); - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - db->db_data_pending = NULL; - mutex_exit(&db->db_mtx); - - dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", ""); - - dbuf_rele(db, (void *)(uintptr_t)txg); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c deleted file mode 100644 index d3be6b4..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ /dev/null @@ -1,1029 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu.h> -#include <sys/dmu_impl.h> -#include <sys/dmu_tx.h> -#include <sys/dbuf.h> -#include <sys/dnode.h> -#include <sys/zfs_context.h> -#include <sys/dmu_objset.h> -#include <sys/dmu_traverse.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_pool.h> -#include <sys/dsl_synctask.h> -#include <sys/dsl_prop.h> -#include <sys/dmu_zfetch.h> -#include <sys/zfs_ioctl.h> -#include <sys/zap.h> -#include <sys/zio_checksum.h> - -const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { - { byteswap_uint8_array, TRUE, "unallocated" }, - { zap_byteswap, TRUE, "object directory" }, - { byteswap_uint64_array, TRUE, "object array" }, - { byteswap_uint8_array, TRUE, "packed nvlist" }, - { byteswap_uint64_array, TRUE, "packed nvlist size" }, - { byteswap_uint64_array, TRUE, "bplist" }, - { byteswap_uint64_array, TRUE, "bplist header" }, - { byteswap_uint64_array, TRUE, "SPA space map header" }, - { byteswap_uint64_array, TRUE, "SPA space map" }, - { byteswap_uint64_array, TRUE, "ZIL intent log" }, - { dnode_buf_byteswap, TRUE, "DMU dnode" }, - { dmu_objset_byteswap, TRUE, "DMU objset" }, - { byteswap_uint64_array, TRUE, "DSL directory" }, - { zap_byteswap, TRUE, "DSL directory child map"}, - { zap_byteswap, TRUE, "DSL dataset snap map" }, - { zap_byteswap, TRUE, "DSL props" }, - { byteswap_uint64_array, TRUE, "DSL dataset" }, - { zfs_znode_byteswap, TRUE, "ZFS znode" }, - { zfs_acl_byteswap, TRUE, "ZFS ACL" }, - { byteswap_uint8_array, FALSE, "ZFS plain file" }, - { zap_byteswap, TRUE, "ZFS directory" }, - { zap_byteswap, TRUE, "ZFS master node" }, - { zap_byteswap, TRUE, "ZFS delete queue" }, - { byteswap_uint8_array, FALSE, "zvol object" }, - { zap_byteswap, TRUE, "zvol prop" }, - { byteswap_uint8_array, FALSE, "other uint8[]" }, - { byteswap_uint64_array, FALSE, "other uint64[]" }, - { zap_byteswap, TRUE, "other ZAP" }, - { zap_byteswap, TRUE, "persistent error log" }, - { byteswap_uint8_array, TRUE, "SPA history" }, - { byteswap_uint64_array, TRUE, "SPA history offsets" }, - { zap_byteswap, TRUE, "Pool properties" }, -}; - -int -dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **dbp) -{ - dnode_t *dn; - uint64_t blkid; - dmu_buf_impl_t *db; - int err; - - err = dnode_hold(os->os, object, FTAG, &dn); - if (err) - return (err); - blkid = dbuf_whichblock(dn, offset); - rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold(dn, blkid, tag); - rw_exit(&dn->dn_struct_rwlock); - if (db == NULL) { - err = EIO; - } else { - err = dbuf_read(db, NULL, DB_RF_CANFAIL); - if (err) { - dbuf_rele(db, tag); - db = NULL; - } - } - - dnode_rele(dn, FTAG); - *dbp = &db->db; - return (err); -} - -int -dmu_bonus_max(void) -{ - return (DN_MAX_BONUSLEN); -} - -/* - * returns ENOENT, EIO, or 0. 
- */ -int -dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) -{ - dnode_t *dn; - int err, count; - dmu_buf_impl_t *db; - - err = dnode_hold(os->os, object, FTAG, &dn); - if (err) - return (err); - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_bonus == NULL) { - rw_exit(&dn->dn_struct_rwlock); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (dn->dn_bonus == NULL) - dn->dn_bonus = dbuf_create_bonus(dn); - } - db = dn->dn_bonus; - rw_exit(&dn->dn_struct_rwlock); - mutex_enter(&db->db_mtx); - count = refcount_add(&db->db_holds, tag); - mutex_exit(&db->db_mtx); - if (count == 1) - dnode_add_ref(dn, db); - dnode_rele(dn, FTAG); - - VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); - - *dbp = &db->db; - return (0); -} - -/* - * Note: longer-term, we should modify all of the dmu_buf_*() interfaces - * to take a held dnode rather than <os, object> -- the lookup is wasteful, - * and can induce severe lock contention when writing to several files - * whose dnodes are in the same block. - */ -static int -dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) -{ - dmu_buf_t **dbp; - uint64_t blkid, nblks, i; - uint32_t flags; - int err; - zio_t *zio; - - ASSERT(length <= DMU_MAX_ACCESS); - - flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; - if (length > zfetch_array_rd_sz) - flags |= DB_RF_NOPREFETCH; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_datablkshift) { - int blkshift = dn->dn_datablkshift; - nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) - - P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; - } else { - if (offset + length > dn->dn_datablksz) { - zfs_panic_recover("zfs: accessing past end of object " - "%llx/%llx (size=%u access=%llu+%llu)", - (longlong_t)dn->dn_objset-> - os_dsl_dataset->ds_object, - (longlong_t)dn->dn_object, dn->dn_datablksz, - (longlong_t)offset, (longlong_t)length); - return (EIO); - } - nblks = 1; - } - dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); - - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE); - blkid = dbuf_whichblock(dn, offset); - for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); - if (db == NULL) { - rw_exit(&dn->dn_struct_rwlock); - dmu_buf_rele_array(dbp, nblks, tag); - zio_nowait(zio); - return (EIO); - } - /* initiate async i/o */ - if (read) { - rw_exit(&dn->dn_struct_rwlock); - (void) dbuf_read(db, zio, flags); - rw_enter(&dn->dn_struct_rwlock, RW_READER); - } - dbp[i] = &db->db; - } - rw_exit(&dn->dn_struct_rwlock); - - /* wait for async i/o */ - err = zio_wait(zio); - if (err) { - dmu_buf_rele_array(dbp, nblks, tag); - return (err); - } - - /* wait for other io to complete */ - if (read) { - for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || - db->db_state == DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - if (db->db_state == DB_UNCACHED) - err = EIO; - mutex_exit(&db->db_mtx); - if (err) { - dmu_buf_rele_array(dbp, nblks, tag); - return (err); - } - } - } - - *numbufsp = nblks; - *dbpp = dbp; - return (0); -} - -static int -dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) -{ - dnode_t *dn; - int err; - - err = dnode_hold(os->os, object, FTAG, &dn); - if (err) - return (err); - - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp); - - dnode_rele(dn, FTAG); - 
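[Editor's note] dmu_buf_hold_array_by_dnode() above converts a byte range into a count of block ids by rounding the end of the range up and the start down to block boundaries, then shifting by the block size. A standalone sketch of that arithmetic follows, with local copies of the P2ROUNDUP/P2ALIGN power-of-two helpers from sysmacros.h; the geometry in main() is made up for illustration.

#include <stdint.h>
#include <stdio.h>

#define P2ALIGN(x, align)       ((x) & -(align))
#define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

/* number of fixed-size blocks touched by the byte range [off, off+len) */
static uint64_t
range_nblks(uint64_t off, uint64_t len, int blkshift)
{
        uint64_t blksz = 1ULL << blkshift;

        return ((P2ROUNDUP(off + len, blksz) - P2ALIGN(off, blksz)) >>
            blkshift);
}

int
main(void)
{
        /* a 300-byte access straddling the first two 128K blocks: prints 2 */
        printf("%llu\n", (unsigned long long)range_nblks(131000, 300, 17));
        return (0);
}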
- return (err); -} - -int -dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) -{ - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; - int err; - - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp); - - return (err); -} - -void -dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) -{ - int i; - dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; - - if (numbufs == 0) - return; - - for (i = 0; i < numbufs; i++) { - if (dbp[i]) - dbuf_rele(dbp[i], tag); - } - - kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); -} - -void -dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) -{ - dnode_t *dn; - uint64_t blkid; - int nblks, i, err; - - if (zfs_prefetch_disable) - return; - - if (len == 0) { /* they're interested in the bonus buffer */ - dn = os->os->os_meta_dnode; - - if (object == 0 || object >= DN_MAX_OBJECT) - return; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); - dbuf_prefetch(dn, blkid); - rw_exit(&dn->dn_struct_rwlock); - return; - } - - /* - * XXX - Note, if the dnode for the requested object is not - * already cached, we will do a *synchronous* read in the - * dnode_hold() call. The same is true for any indirects. - */ - err = dnode_hold(os->os, object, FTAG, &dn); - if (err != 0) - return; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_datablkshift) { - int blkshift = dn->dn_datablkshift; - nblks = (P2ROUNDUP(offset+len, 1<<blkshift) - - P2ALIGN(offset, 1<<blkshift)) >> blkshift; - } else { - nblks = (offset < dn->dn_datablksz); - } - - if (nblks != 0) { - blkid = dbuf_whichblock(dn, offset); - for (i = 0; i < nblks; i++) - dbuf_prefetch(dn, blkid+i); - } - - rw_exit(&dn->dn_struct_rwlock); - - dnode_rele(dn, FTAG); -} - -int -dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size, dmu_tx_t *tx) -{ - dnode_t *dn; - int err = dnode_hold(os->os, object, FTAG, &dn); - if (err) - return (err); - ASSERT(offset < UINT64_MAX); - ASSERT(size == -1ULL || size <= UINT64_MAX - offset); - dnode_free_range(dn, offset, size, tx); - dnode_rele(dn, FTAG); - return (0); -} - -int -dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf) -{ - dnode_t *dn; - dmu_buf_t **dbp; - int numbufs, i, err; - - err = dnode_hold(os->os, object, FTAG, &dn); - if (err) - return (err); - - /* - * Deal with odd block sizes, where there can't be data past the first - * block. If we ever do the tail block optimization, we will need to - * handle that here as well. - */ - if (dn->dn_datablkshift == 0) { - int newsz = offset > dn->dn_datablksz ? 0 : - MIN(size, dn->dn_datablksz - offset); - bzero((char *)buf + newsz, size - newsz); - size = newsz; - } - - while (size > 0) { - uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); - int err; - - /* - * NB: we could do this block-at-a-time, but it's nice - * to be reading in parallel. 
- */ - err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, FTAG, &numbufs, &dbp); - if (err) - return (err); - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - bcopy((char *)db->db_data + bufoff, buf, tocpy); - - offset += tocpy; - size -= tocpy; - buf = (char *)buf + tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - } - dnode_rele(dn, FTAG); - return (0); -} - -void -dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs, i; - - if (size == 0) - return; - - VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); - - bcopy(buf, (char *)db->db_data + bufoff, tocpy); - - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); - - offset += tocpy; - size -= tocpy; - buf = (char *)buf + tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); -} - -#ifdef _KERNEL -int -dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) -{ - dmu_buf_t **dbp; - int numbufs, i, err; - - /* - * NB: we could do this block-at-a-time, but it's nice - * to be reading in parallel. - */ - err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, - &numbufs, &dbp); - if (err) - return (err); - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = uio->uio_loffset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); - if (err) - break; - - size -= tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - - return (err); -} - -int -dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, - dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs, i; - int err = 0; - - if (size == 0) - return (0); - - err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, - FALSE, FTAG, &numbufs, &dbp); - if (err) - return (err); - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = uio->uio_loffset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); - - /* - * XXX uiomove could block forever (eg. nfs-backed - * pages). There needs to be a uiolockdown() function - * to lock the pages in memory, so that uiomove won't - * block. 
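[Editor's note] dmu_read(), dmu_write(), and the uio variants above all share the same inner loop: compute the request's offset within the current buffer (bufoff), clamp the copy size to what is left of that buffer and of the request (tocpy), copy, then advance offset/size/buf. A userland sketch of that chunking over a flat array standing in for the held dbufs; the fixed block size and contiguous layout are simplifications for illustration only.

#include <stdint.h>
#include <string.h>

#define BLKSZ   16      /* stand-in for the object's data block size */

/*
 * Copy 'size' bytes starting at byte 'offset' out of an object made of
 * BLKSZ-sized blocks (here just one contiguous array), using the same
 * bufoff/tocpy stepping as the read/write loops above.
 */
static void
chunked_read(const char *blocks, uint64_t offset, uint64_t size, char *buf)
{
        while (size > 0) {
                uint64_t blkid = offset / BLKSZ;
                const char *db_data = blocks + blkid * BLKSZ;
                uint64_t db_offset = blkid * BLKSZ;
                uint64_t bufoff = offset - db_offset;   /* offset in block */
                uint64_t tocpy = BLKSZ - bufoff;        /* rest of block */

                if (tocpy > size)
                        tocpy = size;                   /* final partial chunk */

                memcpy(buf, db_data + bufoff, tocpy);

                offset += tocpy;
                size -= tocpy;
                buf += tocpy;
        }
}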
- */ - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_WRITE, uio); - - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); - - if (err) - break; - - size -= tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - return (err); -} - -#ifndef __FreeBSD__ -int -dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - page_t *pp, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs, i; - int err; - - if (size == 0) - return (0); - - err = dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp); - if (err) - return (err); - - for (i = 0; i < numbufs; i++) { - int tocpy, copied, thiscpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - caddr_t va; - - ASSERT(size > 0); - ASSERT3U(db->db_size, >=, PAGESIZE); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); - - for (copied = 0; copied < tocpy; copied += PAGESIZE) { - ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); - thiscpy = MIN(PAGESIZE, tocpy - copied); - va = ppmapin(pp, PROT_READ, (caddr_t)-1); - bcopy(va, (char *)db->db_data + bufoff, thiscpy); - ppmapout(va); - pp = pp->p_next; - bufoff += PAGESIZE; - } - - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); - - if (err) - break; - - offset += tocpy; - size -= tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - return (err); -} -#endif /* !__FreeBSD__ */ -#endif /* _KERNEL */ - -typedef struct { - dbuf_dirty_record_t *dr; - dmu_sync_cb_t *done; - void *arg; -} dmu_sync_arg_t; - -/* ARGSUSED */ -static void -dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) -{ - dmu_sync_arg_t *in = varg; - dbuf_dirty_record_t *dr = in->dr; - dmu_buf_impl_t *db = dr->dr_dbuf; - dmu_sync_cb_t *done = in->done; - - if (!BP_IS_HOLE(zio->io_bp)) { - zio->io_bp->blk_fill = 1; - BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type); - BP_SET_LEVEL(zio->io_bp, 0); - } - - mutex_enter(&db->db_mtx); - ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); - dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ - dr->dt.dl.dr_override_state = DR_OVERRIDDEN; - cv_broadcast(&db->db_changed); - mutex_exit(&db->db_mtx); - - if (done) - done(&(db->db), in->arg); - - kmem_free(in, sizeof (dmu_sync_arg_t)); -} - -/* - * Intent log support: sync the block associated with db to disk. - * N.B. and XXX: the caller is responsible for making sure that the - * data isn't changing while dmu_sync() is writing it. - * - * Return values: - * - * EEXIST: this txg has already been synced, so there's nothing to to. - * The caller should not log the write. - * - * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. - * The caller should not log the write. - * - * EALREADY: this block is already in the process of being synced. - * The caller should track its progress (somehow). - * - * EINPROGRESS: the IO has been initiated. - * The caller should log this blkptr in the callback. - * - * 0: completed. Sets *bp to the blkptr just written. - * The caller should log this blkptr immediately. 
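[Editor's note] The block comment above spells out the dmu_sync() contract with its caller. The sketch below shows how a hypothetical caller might classify each return value; the enum and its names are placeholders for illustration, not actual intent-log interfaces.

#include <errno.h>

/* what a hypothetical intent-log caller does with each dmu_sync() result */
enum sync_action {
        LOG_NOW,        /* 0: *bp is final, log it immediately */
        LOG_LATER,      /* EINPROGRESS: blkptr arrives in the done callback */
        WAIT,           /* EALREADY: another sync of this block is in flight */
        SKIP            /* EEXIST or ENOENT: nothing to log for this write */
};

static enum sync_action
classify_dmu_sync(int err)
{
        switch (err) {
        case 0:
                return (LOG_NOW);
        case EINPROGRESS:
                return (LOG_LATER);
        case EALREADY:
                return (WAIT);
        case EEXIST:
        case ENOENT:
        default:                /* treat anything unexpected as "don't log" */
                return (SKIP);
        }
}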
- */ -int -dmu_sync(zio_t *pio, dmu_buf_t *db_fake, - blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - objset_impl_t *os = db->db_objset; - dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; - tx_state_t *tx = &dp->dp_tx; - dbuf_dirty_record_t *dr; - dmu_sync_arg_t *in; - zbookmark_t zb; - zio_t *zio; - int zio_flags; - int err; - - ASSERT(BP_IS_HOLE(bp)); - ASSERT(txg != 0); - - - dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", - txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); - - /* - * XXX - would be nice if we could do this without suspending... - */ - txg_suspend(dp); - - /* - * If this txg already synced, there's nothing to do. - */ - if (txg <= tx->tx_synced_txg) { - txg_resume(dp); - /* - * If we're running ziltest, we need the blkptr regardless. - */ - if (txg > spa_freeze_txg(dp->dp_spa)) { - /* if db_blkptr == NULL, this was an empty write */ - if (db->db_blkptr) - *bp = *db->db_blkptr; /* structure assignment */ - return (0); - } - return (EEXIST); - } - - mutex_enter(&db->db_mtx); - - if (txg == tx->tx_syncing_txg) { - while (db->db_data_pending) { - /* - * IO is in-progress. Wait for it to finish. - * XXX - would be nice to be able to somehow "attach" - * this zio to the parent zio passed in. - */ - cv_wait(&db->db_changed, &db->db_mtx); - if (!db->db_data_pending && - db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) { - /* - * IO was compressed away - */ - *bp = *db->db_blkptr; /* structure assignment */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (0); - } - ASSERT(db->db_data_pending || - (db->db_blkptr && db->db_blkptr->blk_birth == txg)); - } - - if (db->db_blkptr && db->db_blkptr->blk_birth == txg) { - /* - * IO is already completed. - */ - *bp = *db->db_blkptr; /* structure assignment */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (0); - } - } - - dr = db->db_last_dirty; - while (dr && dr->dr_txg > txg) - dr = dr->dr_next; - if (dr == NULL || dr->dr_txg < txg) { - /* - * This dbuf isn't dirty, must have been free_range'd. - * There's no need to log writes to freed blocks, so we're done. - */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (ENOENT); - } - - ASSERT(dr->dr_txg == txg); - if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { - /* - * We have already issued a sync write for this buffer. - */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (EALREADY); - } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - /* - * This buffer has already been synced. It could not - * have been dirtied since, or we would have cleared the state. 
- */ - *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (0); - } - - dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; - in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); - in->dr = dr; - in->done = done; - in->arg = arg; - mutex_exit(&db->db_mtx); - txg_resume(dp); - - zb.zb_objset = os->os_dsl_dataset->ds_object; - zb.zb_object = db->db.db_object; - zb.zb_level = db->db_level; - zb.zb_blkid = db->db_blkid; - zio_flags = ZIO_FLAG_MUSTSUCCEED; - if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0) - zio_flags |= ZIO_FLAG_METADATA; - zio = arc_write(pio, os->os_spa, - zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum), - zio_compress_select(db->db_dnode->dn_compress, os->os_compress), - dmu_get_replication_level(os, &zb, db->db_dnode->dn_type), - txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in, - ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb); - - if (pio) { - zio_nowait(zio); - err = EINPROGRESS; - } else { - err = zio_wait(zio); - ASSERT(err == 0); - } - return (err); -} - -int -dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, - dmu_tx_t *tx) -{ - dnode_t *dn; - int err; - - err = dnode_hold(os->os, object, FTAG, &dn); - if (err) - return (err); - err = dnode_set_blksz(dn, size, ibs, tx); - dnode_rele(dn, FTAG); - return (err); -} - -void -dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, - dmu_tx_t *tx) -{ - dnode_t *dn; - - /* XXX assumes dnode_hold will not get an i/o error */ - (void) dnode_hold(os->os, object, FTAG, &dn); - ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); - dn->dn_checksum = checksum; - dnode_setdirty(dn, tx); - dnode_rele(dn, FTAG); -} - -void -dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, - dmu_tx_t *tx) -{ - dnode_t *dn; - - /* XXX assumes dnode_hold will not get an i/o error */ - (void) dnode_hold(os->os, object, FTAG, &dn); - ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); - dn->dn_compress = compress; - dnode_setdirty(dn, tx); - dnode_rele(dn, FTAG); -} - -int -dmu_get_replication_level(objset_impl_t *os, - zbookmark_t *zb, dmu_object_type_t ot) -{ - int ncopies = os->os_copies; - - /* If it's the mos, it should have max copies set. */ - ASSERT(zb->zb_objset != 0 || - ncopies == spa_max_replication(os->os_spa)); - - if (dmu_ot[ot].ot_metadata || zb->zb_level != 0) - ncopies++; - return (MIN(ncopies, spa_max_replication(os->os_spa))); -} - -int -dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) -{ - dnode_t *dn; - int i, err; - - err = dnode_hold(os->os, object, FTAG, &dn); - if (err) - return (err); - /* - * Sync any current changes before - * we go trundling through the block pointers. - */ - for (i = 0; i < TXG_SIZE; i++) { - if (list_link_active(&dn->dn_dirty_link[i])) - break; - } - if (i != TXG_SIZE) { - dnode_rele(dn, FTAG); - txg_wait_synced(dmu_objset_pool(os), 0); - err = dnode_hold(os->os, object, FTAG, &dn); - if (err) - return (err); - } - - err = dnode_next_offset(dn, hole, off, 1, 1, 0); - dnode_rele(dn, FTAG); - - return (err); -} - -void -dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) -{ - rw_enter(&dn->dn_struct_rwlock, RW_READER); - mutex_enter(&dn->dn_mtx); - - doi->doi_data_block_size = dn->dn_datablksz; - doi->doi_metadata_block_size = dn->dn_indblkshift ? 
- 1ULL << dn->dn_indblkshift : 0; - doi->doi_indirection = dn->dn_nlevels; - doi->doi_checksum = dn->dn_checksum; - doi->doi_compress = dn->dn_compress; - doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) + - SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT; - doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; - doi->doi_type = dn->dn_type; - doi->doi_bonus_size = dn->dn_bonuslen; - doi->doi_bonus_type = dn->dn_bonustype; - - mutex_exit(&dn->dn_mtx); - rw_exit(&dn->dn_struct_rwlock); -} - -/* - * Get information on a DMU object. - * If doi is NULL, just indicates whether the object exists. - */ -int -dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) -{ - dnode_t *dn; - int err = dnode_hold(os->os, object, FTAG, &dn); - - if (err) - return (err); - - if (doi != NULL) - dmu_object_info_from_dnode(dn, doi); - - dnode_rele(dn, FTAG); - return (0); -} - -/* - * As above, but faster; can be used when you have a held dbuf in hand. - */ -void -dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) -{ - dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); -} - -/* - * Faster still when you only care about the size. - * This is specifically optimized for zfs_getattr(). - */ -void -dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) -{ - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; - - *blksize = dn->dn_datablksz; - /* add 1 for dnode space */ - *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> - SPA_MINBLOCKSHIFT) + 1; -} - -void -byteswap_uint64_array(void *vbuf, size_t size) -{ - uint64_t *buf = vbuf; - size_t count = size >> 3; - int i; - - ASSERT((size & 7) == 0); - - for (i = 0; i < count; i++) - buf[i] = BSWAP_64(buf[i]); -} - -void -byteswap_uint32_array(void *vbuf, size_t size) -{ - uint32_t *buf = vbuf; - size_t count = size >> 2; - int i; - - ASSERT((size & 3) == 0); - - for (i = 0; i < count; i++) - buf[i] = BSWAP_32(buf[i]); -} - -void -byteswap_uint16_array(void *vbuf, size_t size) -{ - uint16_t *buf = vbuf; - size_t count = size >> 1; - int i; - - ASSERT((size & 1) == 0); - - for (i = 0; i < count; i++) - buf[i] = BSWAP_16(buf[i]); -} - -/* ARGSUSED */ -void -byteswap_uint8_array(void *vbuf, size_t size) -{ -} - -void -dmu_init(void) -{ - dbuf_init(); - dnode_init(); - arc_init(); -} - -void -dmu_fini(void) -{ - arc_fini(); - dnode_fini(); - dbuf_fini(); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c deleted file mode 100644 index 93168cc..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. 
All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu.h> -#include <sys/dmu_objset.h> -#include <sys/dmu_tx.h> -#include <sys/dnode.h> - -uint64_t -dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - objset_impl_t *osi = os->os; - uint64_t object; - uint64_t L2_dnode_count = DNODES_PER_BLOCK << - (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); - dnode_t *dn = NULL; - int restarted = B_FALSE; - - mutex_enter(&osi->os_obj_lock); - for (;;) { - object = osi->os_obj_next; - /* - * Each time we polish off an L2 bp worth of dnodes - * (2^13 objects), move to another L2 bp that's still - * reasonably sparse (at most 1/4 full). Look from the - * beginning once, but after that keep looking from here. - * If we can't find one, just keep going from here. - */ - if (P2PHASE(object, L2_dnode_count) == 0) { - uint64_t offset = restarted ? object << DNODE_SHIFT : 0; - int error = dnode_next_offset(osi->os_meta_dnode, - B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0); - restarted = B_TRUE; - if (error == 0) - object = offset >> DNODE_SHIFT; - } - osi->os_obj_next = ++object; - - /* - * XXX We should check for an i/o error here and return - * up to our caller. Actually we should pre-read it in - * dmu_tx_assign(), but there is currently no mechanism - * to do so. - */ - (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, - FTAG, &dn); - if (dn) - break; - - if (dmu_object_next(os, &object, B_TRUE, 0) == 0) - osi->os_obj_next = object - 1; - } - - dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); - dnode_rele(dn, FTAG); - - mutex_exit(&osi->os_obj_lock); - - dmu_tx_add_new_object(tx, os, object); - return (object); -} - -int -dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - dnode_t *dn; - int err; - - if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) - return (EBADF); - - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn); - if (err) - return (err); - dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); - dnode_rele(dn, FTAG); - - dmu_tx_add_new_object(tx, os, object); - return (0); -} - -int -dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - dnode_t *dn; - int err; - - if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) - return (EBADF); - - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, - FTAG, &dn); - if (err) - return (err); - dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); - dnode_rele(dn, FTAG); - - return (0); -} - -int -dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) -{ - dnode_t *dn; - int err; - - ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, - FTAG, &dn); - if (err) - return (err); - - ASSERT(dn->dn_type != DMU_OT_NONE); - dnode_free(dn, tx); - dnode_rele(dn, FTAG); - - return (0); -} - -int -dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) -{ - uint64_t offset = (*objectp + 1) << DNODE_SHIFT; - int error; - - error = dnode_next_offset(os->os->os_meta_dnode, - hole, &offset, 0, DNODES_PER_BLOCK, txg); - - *objectp = offset >> DNODE_SHIFT; - - return (error); -} diff --git 
a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c deleted file mode 100644 index 378fe8c..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ /dev/null @@ -1,1037 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/dmu_objset.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_prop.h> -#include <sys/dsl_pool.h> -#include <sys/dsl_synctask.h> -#include <sys/dnode.h> -#include <sys/dbuf.h> -#include <sys/zvol.h> -#include <sys/dmu_tx.h> -#include <sys/zio_checksum.h> -#include <sys/zap.h> -#include <sys/zil.h> -#include <sys/dmu_impl.h> - - -spa_t * -dmu_objset_spa(objset_t *os) -{ - return (os->os->os_spa); -} - -zilog_t * -dmu_objset_zil(objset_t *os) -{ - return (os->os->os_zil); -} - -dsl_pool_t * -dmu_objset_pool(objset_t *os) -{ - dsl_dataset_t *ds; - - if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir) - return (ds->ds_dir->dd_pool); - else - return (spa_get_dsl(os->os->os_spa)); -} - -dsl_dataset_t * -dmu_objset_ds(objset_t *os) -{ - return (os->os->os_dsl_dataset); -} - -dmu_objset_type_t -dmu_objset_type(objset_t *os) -{ - return (os->os->os_phys->os_type); -} - -void -dmu_objset_name(objset_t *os, char *buf) -{ - dsl_dataset_name(os->os->os_dsl_dataset, buf); -} - -uint64_t -dmu_objset_id(objset_t *os) -{ - dsl_dataset_t *ds = os->os->os_dsl_dataset; - - return (ds ? ds->ds_object : 0); -} - -static void -checksum_changed_cb(void *arg, uint64_t newval) -{ - objset_impl_t *osi = arg; - - /* - * Inheritance should have been done by now. - */ - ASSERT(newval != ZIO_CHECKSUM_INHERIT); - - osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); -} - -static void -compression_changed_cb(void *arg, uint64_t newval) -{ - objset_impl_t *osi = arg; - - /* - * Inheritance and range checking should have been done by now. - */ - ASSERT(newval != ZIO_COMPRESS_INHERIT); - - osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); -} - -static void -copies_changed_cb(void *arg, uint64_t newval) -{ - objset_impl_t *osi = arg; - - /* - * Inheritance and range checking should have been done by now. 
- */ - ASSERT(newval > 0); - ASSERT(newval <= spa_max_replication(osi->os_spa)); - - osi->os_copies = newval; -} - -void -dmu_objset_byteswap(void *buf, size_t size) -{ - objset_phys_t *osp = buf; - - ASSERT(size == sizeof (objset_phys_t)); - dnode_byteswap(&osp->os_meta_dnode); - byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); - osp->os_type = BSWAP_64(osp->os_type); -} - -int -dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - objset_impl_t **osip) -{ - objset_impl_t *winner, *osi; - int i, err, checksum; - - osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP); - osi->os.os = osi; - osi->os_dsl_dataset = ds; - osi->os_spa = spa; - osi->os_rootbp = bp; - if (!BP_IS_HOLE(osi->os_rootbp)) { - uint32_t aflags = ARC_WAIT; - zbookmark_t zb; - zb.zb_objset = ds ? ds->ds_object : 0; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = 0; - - dprintf_bp(osi->os_rootbp, "reading %s", ""); - err = arc_read(NULL, spa, osi->os_rootbp, - dmu_ot[DMU_OT_OBJSET].ot_byteswap, - arc_getbuf_func, &osi->os_phys_buf, - ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); - if (err) { - kmem_free(osi, sizeof (objset_impl_t)); - return (err); - } - osi->os_phys = osi->os_phys_buf->b_data; - arc_release(osi->os_phys_buf, &osi->os_phys_buf); - } else { - osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t), - &osi->os_phys_buf, ARC_BUFC_METADATA); - osi->os_phys = osi->os_phys_buf->b_data; - bzero(osi->os_phys, sizeof (objset_phys_t)); - } - - /* - * Note: the changed_cb will be called once before the register - * func returns, thus changing the checksum/compression from the - * default (fletcher2/off). Snapshots don't need to know, and - * registering would complicate clone promotion. - */ - if (ds && ds->ds_phys->ds_num_children == 0) { - err = dsl_prop_register(ds, "checksum", - checksum_changed_cb, osi); - if (err == 0) - err = dsl_prop_register(ds, "compression", - compression_changed_cb, osi); - if (err == 0) - err = dsl_prop_register(ds, "copies", - copies_changed_cb, osi); - if (err) { - VERIFY(arc_buf_remove_ref(osi->os_phys_buf, - &osi->os_phys_buf) == 1); - kmem_free(osi, sizeof (objset_impl_t)); - return (err); - } - } else if (ds == NULL) { - /* It's the meta-objset. */ - osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4; - osi->os_compress = ZIO_COMPRESS_LZJB; - osi->os_copies = spa_max_replication(spa); - } - - osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header); - - /* - * Metadata always gets compressed and checksummed. - * If the data checksum is multi-bit correctable, and it's not - * a ZBT-style checksum, then it's suitable for metadata as well. - * Otherwise, the metadata checksum defaults to fletcher4. 
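[Editor's note] The comment above describes how the objset picks its metadata checksum: reuse the data checksum when it is multi-bit correctable and not a ZBT-embedded checksum, otherwise fall back to fletcher4; the actual selection follows immediately in the diff. Below is a standalone sketch of that decision with a tiny stand-in table whose contents are illustrative only, not the real zio_checksum_table values.

#include <stdio.h>

enum cksum { CK_FLETCHER2, CK_FLETCHER4, CK_SHA256, CK_ZILOG, CK_NFUNCS };

struct cksum_info {
        int             correctable;    /* survives multi-bit damage */
        int             zbt;            /* checksum lives in a ZBT tail */
        const char      *name;
};

/* illustrative properties only; see zio_checksum_table for the real ones */
static const struct cksum_info cksum_table[CK_NFUNCS] = {
        [CK_FLETCHER2]  = { 0, 0, "fletcher2" },
        [CK_FLETCHER4]  = { 1, 0, "fletcher4" },
        [CK_SHA256]     = { 1, 0, "sha256" },
        [CK_ZILOG]      = { 1, 1, "zilog" },
};

/* metadata reuses the data checksum only if it is correctable and not ZBT */
static enum cksum
md_checksum(enum cksum data_cksum)
{
        if (cksum_table[data_cksum].correctable &&
            !cksum_table[data_cksum].zbt)
                return (data_cksum);
        return (CK_FLETCHER4);
}

int
main(void)
{
        printf("%s\n", cksum_table[md_checksum(CK_FLETCHER2)].name);
        return (0);
}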
- */ - checksum = osi->os_checksum; - - if (zio_checksum_table[checksum].ci_correctable && - !zio_checksum_table[checksum].ci_zbt) - osi->os_md_checksum = checksum; - else - osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4; - osi->os_md_compress = ZIO_COMPRESS_LZJB; - - for (i = 0; i < TXG_SIZE; i++) { - list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t), - offsetof(dnode_t, dn_dirty_link[i])); - list_create(&osi->os_free_dnodes[i], sizeof (dnode_t), - offsetof(dnode_t, dn_dirty_link[i])); - } - list_create(&osi->os_dnodes, sizeof (dnode_t), - offsetof(dnode_t, dn_link)); - list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_link)); - - mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); - - osi->os_meta_dnode = dnode_special_open(osi, - &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); - - if (ds != NULL) { - winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict); - if (winner) { - dmu_objset_evict(ds, osi); - osi = winner; - } - } - - *osip = osi; - return (0); -} - -/* called from zpl */ -int -dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp) -{ - dsl_dataset_t *ds; - int err; - objset_t *os; - objset_impl_t *osi; - - os = kmem_alloc(sizeof (objset_t), KM_SLEEP); - err = dsl_dataset_open(name, mode, os, &ds); - if (err) { - kmem_free(os, sizeof (objset_t)); - return (err); - } - - osi = dsl_dataset_get_user_ptr(ds); - if (osi == NULL) { - err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), - ds, &ds->ds_phys->ds_bp, &osi); - if (err) { - dsl_dataset_close(ds, mode, os); - kmem_free(os, sizeof (objset_t)); - return (err); - } - } - - os->os = osi; - os->os_mode = mode; - - if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) { - dmu_objset_close(os); - return (EINVAL); - } - *osp = os; - return (0); -} - -void -dmu_objset_close(objset_t *os) -{ - dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os); - kmem_free(os, sizeof (objset_t)); -} - -int -dmu_objset_evict_dbufs(objset_t *os, int try) -{ - objset_impl_t *osi = os->os; - dnode_t *dn; - - mutex_enter(&osi->os_lock); - - /* process the mdn last, since the other dnodes have holds on it */ - list_remove(&osi->os_dnodes, osi->os_meta_dnode); - list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode); - - /* - * Find the first dnode with holds. We have to do this dance - * because dnode_add_ref() only works if you already have a - * hold. If there are no holds then it has no dbufs so OK to - * skip. 
- */ - for (dn = list_head(&osi->os_dnodes); - dn && refcount_is_zero(&dn->dn_holds); - dn = list_next(&osi->os_dnodes, dn)) - continue; - if (dn) - dnode_add_ref(dn, FTAG); - - while (dn) { - dnode_t *next_dn = dn; - - do { - next_dn = list_next(&osi->os_dnodes, next_dn); - } while (next_dn && refcount_is_zero(&next_dn->dn_holds)); - if (next_dn) - dnode_add_ref(next_dn, FTAG); - - mutex_exit(&osi->os_lock); - if (dnode_evict_dbufs(dn, try)) { - dnode_rele(dn, FTAG); - if (next_dn) - dnode_rele(next_dn, FTAG); - return (1); - } - dnode_rele(dn, FTAG); - mutex_enter(&osi->os_lock); - dn = next_dn; - } - mutex_exit(&osi->os_lock); - return (0); -} - -void -dmu_objset_evict(dsl_dataset_t *ds, void *arg) -{ - objset_impl_t *osi = arg; - objset_t os; - int i; - - for (i = 0; i < TXG_SIZE; i++) { - ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL); - ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL); - } - - if (ds && ds->ds_phys->ds_num_children == 0) { - VERIFY(0 == dsl_prop_unregister(ds, "checksum", - checksum_changed_cb, osi)); - VERIFY(0 == dsl_prop_unregister(ds, "compression", - compression_changed_cb, osi)); - VERIFY(0 == dsl_prop_unregister(ds, "copies", - copies_changed_cb, osi)); - } - - /* - * We should need only a single pass over the dnode list, since - * nothing can be added to the list at this point. - */ - os.os = osi; - (void) dmu_objset_evict_dbufs(&os, 0); - - ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); - ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); - ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL); - - dnode_special_close(osi->os_meta_dnode); - zil_free(osi->os_zil); - - VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); - mutex_destroy(&osi->os_lock); - mutex_destroy(&osi->os_obj_lock); - kmem_free(osi, sizeof (objset_impl_t)); -} - -/* called from dsl for meta-objset */ -objset_impl_t * -dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - dmu_objset_type_t type, dmu_tx_t *tx) -{ - objset_impl_t *osi; - dnode_t *mdn; - - ASSERT(dmu_tx_is_syncing(tx)); - VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi)); - mdn = osi->os_meta_dnode; - - dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, - DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); - - /* - * We don't want to have to increase the meta-dnode's nlevels - * later, because then we could do it in quescing context while - * we are also accessing it in open context. - * - * This precaution is not necessary for the MOS (ds == NULL), - * because the MOS is only updated in syncing context. - * This is most fortunate: the MOS is the only objset that - * needs to be synced multiple times as spa_sync() iterates - * to convergence, so minimizing its dn_nlevels matters. - */ - if (ds != NULL) { - int levels = 1; - - /* - * Determine the number of levels necessary for the meta-dnode - * to contain DN_MAX_OBJECT dnodes. 
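[Editor's note] dmu_objset_evict_dbufs() above walks os_dnodes while repeatedly dropping os_lock, so it pins the *next* interesting dnode with an extra hold before releasing the current one. The toy sketch below reproduces only that hand-over-hand hold pattern; there is no real mutex or refcount type here, the node type is invented, and a comment marks where the list lock would be dropped.

#include <stddef.h>

struct node {
        int             holds;  /* stand-in for dn_holds */
        struct node     *next;
};

static void process(struct node *n) { (void)n; }  /* heavy per-node work */

static void
walk_with_holds(struct node *head)
{
        struct node *n = head;

        /* skip nodes nobody holds; they have nothing worth processing */
        while (n != NULL && n->holds == 0)
                n = n->next;
        if (n != NULL)
                n->holds++;             /* our hold keeps 'n' alive */

        while (n != NULL) {
                struct node *next = n->next;

                while (next != NULL && next->holds == 0)
                        next = next->next;
                if (next != NULL)
                        next->holds++;  /* pin the successor first */

                /* the real code drops os_lock around this call */
                process(n);
                n->holds--;             /* release the current node */

                n = next;
        }
}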
- */ - while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + - (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < - DN_MAX_OBJECT * sizeof (dnode_phys_t)) - levels++; - - mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = - mdn->dn_nlevels = levels; - } - - ASSERT(type != DMU_OST_NONE); - ASSERT(type != DMU_OST_ANY); - ASSERT(type < DMU_OST_NUMTYPES); - osi->os_phys->os_type = type; - - dsl_dataset_dirty(ds, tx); - - return (osi); -} - -struct oscarg { - void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx); - void *userarg; - dsl_dataset_t *clone_parent; - const char *lastname; - dmu_objset_type_t type; -}; - -/* ARGSUSED */ -static int -dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - struct oscarg *oa = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - int err; - uint64_t ddobj; - - err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, - oa->lastname, sizeof (uint64_t), 1, &ddobj); - if (err != ENOENT) - return (err ? err : EEXIST); - - if (oa->clone_parent != NULL) { - /* - * You can't clone across pools. - */ - if (oa->clone_parent->ds_dir->dd_pool != dd->dd_pool) - return (EXDEV); - - /* - * You can only clone snapshots, not the head datasets. - */ - if (oa->clone_parent->ds_phys->ds_num_children == 0) - return (EINVAL); - } - return (0); -} - -static void -dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - struct oscarg *oa = arg2; - dsl_dataset_t *ds; - blkptr_t *bp; - uint64_t dsobj; - - ASSERT(dmu_tx_is_syncing(tx)); - - dsobj = dsl_dataset_create_sync(dd, oa->lastname, - oa->clone_parent, tx); - - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL, - DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds)); - bp = dsl_dataset_get_blkptr(ds); - if (BP_IS_HOLE(bp)) { - objset_impl_t *osi; - - /* This is an empty dmu_objset; not a clone. */ - osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds), - ds, bp, oa->type, tx); - - if (oa->userfunc) - oa->userfunc(&osi->os, oa->userarg, tx); - } - dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG); -} - -int -dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, - void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg) -{ - dsl_dir_t *pdd; - const char *tail; - int err = 0; - struct oscarg oa = { 0 }; - - ASSERT(strchr(name, '@') == NULL); - err = dsl_dir_open(name, FTAG, &pdd, &tail); - if (err) - return (err); - if (tail == NULL) { - dsl_dir_close(pdd, FTAG); - return (EEXIST); - } - - dprintf("name=%s\n", name); - - oa.userfunc = func; - oa.userarg = arg; - oa.lastname = tail; - oa.type = type; - if (clone_parent != NULL) { - /* - * You can't clone to a different type. - */ - if (clone_parent->os->os_phys->os_type != type) { - dsl_dir_close(pdd, FTAG); - return (EINVAL); - } - oa.clone_parent = clone_parent->os->os_dsl_dataset; - } - err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, - dmu_objset_create_sync, pdd, &oa, 5); - dsl_dir_close(pdd, FTAG); - return (err); -} - -int -dmu_objset_destroy(const char *name) -{ - objset_t *os; - int error; - - /* - * If it looks like we'll be able to destroy it, and there's - * an unplayed replay log sitting around, destroy the log. - * It would be nicer to do this in dsl_dataset_destroy_sync(), - * but the replay log objset is modified in open context. 
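[Editor's note] The loop just above, in dmu_objset_create_impl(), sizes the meta-dnode's indirection up front: keep adding levels until nblkptr top-level block pointers, each covering datablksz << ((levels - 1) * epbs) bytes, span the space needed for DN_MAX_OBJECT dnodes. A standalone version of the same calculation with made-up geometry constants in main():

#include <stdint.h>
#include <stdio.h>

/*
 * Levels of indirection needed so that 'nblkptr' top-level block pointers
 * can address 'needed' bytes, when a data block holds 1 << datablkshift
 * bytes and each indirect block fans out by 1 << epbs pointers.
 */
static int
levels_needed(uint64_t needed, int nblkptr, int datablkshift, int epbs)
{
        int levels = 1;

        while (((uint64_t)nblkptr << (datablkshift + (levels - 1) * epbs)) <
            needed)
                levels++;
        return (levels);
}

int
main(void)
{
        /* e.g. 3 blkptrs, 16K data blocks, 128 pointers per indirect block */
        uint64_t needed = 1ULL << 48;   /* stand-in for the required size */

        printf("%d\n", levels_needed(needed, 3, 14, 7));
        return (0);
}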
- */ - error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os); - if (error == 0) { - zil_destroy(dmu_objset_zil(os), B_FALSE); - dmu_objset_close(os); - } - - return (dsl_dataset_destroy(name)); -} - -int -dmu_objset_rollback(const char *name) -{ - int err; - objset_t *os; - - err = dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os); - if (err == 0) { - err = zil_suspend(dmu_objset_zil(os)); - if (err == 0) - zil_resume(dmu_objset_zil(os)); - if (err == 0) { - /* XXX uncache everything? */ - err = dsl_dataset_rollback(os->os->os_dsl_dataset); - } - dmu_objset_close(os); - } - return (err); -} - -struct snaparg { - dsl_sync_task_group_t *dstg; - char *snapname; - char failed[MAXPATHLEN]; -}; - -static int -dmu_objset_snapshot_one(char *name, void *arg) -{ - struct snaparg *sn = arg; - objset_t *os; - dmu_objset_stats_t stat; - int err; - - (void) strcpy(sn->failed, name); - - err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_STANDARD, &os); - if (err != 0) - return (err); - - /* - * If the objset is in an inconsistent state, return busy. - */ - dmu_objset_fast_stat(os, &stat); - if (stat.dds_inconsistent) { - dmu_objset_close(os); - return (EBUSY); - } - - /* - * NB: we need to wait for all in-flight changes to get to disk, - * so that we snapshot those changes. zil_suspend does this as - * a side effect. - */ - err = zil_suspend(dmu_objset_zil(os)); - if (err == 0) { - dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check, - dsl_dataset_snapshot_sync, os, sn->snapname, 3); - } else { - dmu_objset_close(os); - } - - return (err); -} - -int -dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) -{ - dsl_sync_task_t *dst; - struct snaparg sn = { 0 }; - char *cp; - spa_t *spa; - int err; - - (void) strcpy(sn.failed, fsname); - - cp = strchr(fsname, '/'); - if (cp) { - *cp = '\0'; - err = spa_open(fsname, &spa, FTAG); - *cp = '/'; - } else { - err = spa_open(fsname, &spa, FTAG); - } - if (err) - return (err); - - sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - sn.snapname = snapname; - - if (recursive) { - err = dmu_objset_find(fsname, - dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN); - } else { - err = dmu_objset_snapshot_one(fsname, &sn); - } - - if (err) - goto out; - - err = dsl_sync_task_group_wait(sn.dstg); - - for (dst = list_head(&sn.dstg->dstg_tasks); dst; - dst = list_next(&sn.dstg->dstg_tasks, dst)) { - objset_t *os = dst->dst_arg1; - if (dst->dst_err) - dmu_objset_name(os, sn.failed); - zil_resume(dmu_objset_zil(os)); - dmu_objset_close(os); - } -out: - if (err) - (void) strcpy(fsname, sn.failed); - dsl_sync_task_group_destroy(sn.dstg); - spa_close(spa, FTAG); - return (err); -} - -static void -dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) -{ - dnode_t *dn; - - while (dn = list_head(list)) { - ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); - ASSERT(dn->dn_dbuf->db_data_pending); - /* - * Initialize dn_zio outside dnode_sync() - * to accomodate meta-dnode - */ - dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; - ASSERT(dn->dn_zio); - - ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); - list_remove(list, dn); - dnode_sync(dn, tx); - } -} - -/* ARGSUSED */ -static void -ready(zio_t *zio, arc_buf_t *abuf, void *arg) -{ - objset_impl_t *os = arg; - blkptr_t *bp = os->os_rootbp; - dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; - int i; - - /* - * Update rootbp fill count. 
- */ - bp->blk_fill = 1; /* count the meta-dnode */ - for (i = 0; i < dnp->dn_nblkptr; i++) - bp->blk_fill += dnp->dn_blkptr[i].blk_fill; -} - -/* ARGSUSED */ -static void -killer(zio_t *zio, arc_buf_t *abuf, void *arg) -{ - objset_impl_t *os = arg; - - ASSERT3U(zio->io_error, ==, 0); - - BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET); - BP_SET_LEVEL(zio->io_bp, 0); - - if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), - BP_IDENTITY(&zio->io_bp_orig))) { - if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) - dsl_dataset_block_kill(os->os_dsl_dataset, - &zio->io_bp_orig, NULL, os->os_synctx); - dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp, - os->os_synctx); - } - arc_release(os->os_phys_buf, &os->os_phys_buf); -} - -/* called from dsl */ -void -dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) -{ - int txgoff; - zbookmark_t zb; - zio_t *zio; - list_t *list; - dbuf_dirty_record_t *dr; - int zio_flags; - - dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); - - ASSERT(dmu_tx_is_syncing(tx)); - /* XXX the write_done callback should really give us the tx... */ - os->os_synctx = tx; - - if (os->os_dsl_dataset == NULL) { - /* - * This is the MOS. If we have upgraded, - * spa_max_replication() could change, so reset - * os_copies here. - */ - os->os_copies = spa_max_replication(os->os_spa); - } - - /* - * Create the root block IO - */ - zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = 0; - zio_flags = ZIO_FLAG_MUSTSUCCEED; - if (dmu_ot[DMU_OT_OBJSET].ot_metadata || zb.zb_level != 0) - zio_flags |= ZIO_FLAG_METADATA; - if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) - dsl_dataset_block_kill(os->os_dsl_dataset, - os->os_rootbp, pio, tx); - zio = arc_write(pio, os->os_spa, os->os_md_checksum, - os->os_md_compress, - dmu_get_replication_level(os, &zb, DMU_OT_OBJSET), - tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os, - ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb); - - /* - * Sync meta-dnode - the parent IO for the sync is the root block - */ - os->os_meta_dnode->dn_zio = zio; - dnode_sync(os->os_meta_dnode, tx); - - txgoff = tx->tx_txg & TXG_MASK; - - dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx); - dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx); - - list = &os->os_meta_dnode->dn_dirty_records[txgoff]; - while (dr = list_head(list)) { - ASSERT(dr->dr_dbuf->db_level == 0); - list_remove(list, dr); - if (dr->dr_zio) - zio_nowait(dr->dr_zio); - } - /* - * Free intent log blocks up to this tx. 
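[Editor's note] The ready() callback above computes the objset root block's fill count as the meta-dnode itself plus the blk_fill of each of its block pointers, and dbuf_write_ready() earlier in this diff performs the analogous sum for ordinary indirect blocks. A small sketch of that aggregation, with a plain struct standing in for blkptr_t:

#include <stdint.h>

struct fake_bp {
        uint64_t blk_fill;      /* non-hole blocks beneath this pointer */
};

/*
 * Fill count for an objset root block: 1 for the meta-dnode itself, plus
 * whatever each of the meta-dnode's block pointers already accounted for.
 */
static uint64_t
rootbp_fill(const struct fake_bp *blkptrs, int nblkptr)
{
        uint64_t fill = 1;      /* count the meta-dnode */
        int i;

        for (i = 0; i < nblkptr; i++)
                fill += blkptrs[i].blk_fill;
        return (fill);
}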
- */ - zil_sync(os->os_zil, tx); - zio_nowait(zio); -} - -void -dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp) -{ - dsl_dataset_space(os->os->os_dsl_dataset, refdbytesp, availbytesp, - usedobjsp, availobjsp); -} - -uint64_t -dmu_objset_fsid_guid(objset_t *os) -{ - return (dsl_dataset_fsid_guid(os->os->os_dsl_dataset)); -} - -void -dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) -{ - stat->dds_type = os->os->os_phys->os_type; - if (os->os->os_dsl_dataset) - dsl_dataset_fast_stat(os->os->os_dsl_dataset, stat); -} - -void -dmu_objset_stats(objset_t *os, nvlist_t *nv) -{ - ASSERT(os->os->os_dsl_dataset || - os->os->os_phys->os_type == DMU_OST_META); - - if (os->os->os_dsl_dataset != NULL) - dsl_dataset_stats(os->os->os_dsl_dataset, nv); - - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, - os->os->os_phys->os_type); -} - -int -dmu_objset_is_snapshot(objset_t *os) -{ - if (os->os->os_dsl_dataset != NULL) - return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset)); - else - return (B_FALSE); -} - -int -dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - uint64_t *idp, uint64_t *offp) -{ - dsl_dataset_t *ds = os->os->os_dsl_dataset; - zap_cursor_t cursor; - zap_attribute_t attr; - - if (ds->ds_phys->ds_snapnames_zapobj == 0) - return (ENOENT); - - zap_cursor_init_serialized(&cursor, - ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, *offp); - - if (zap_cursor_retrieve(&cursor, &attr) != 0) { - zap_cursor_fini(&cursor); - return (ENOENT); - } - - if (strlen(attr.za_name) + 1 > namelen) { - zap_cursor_fini(&cursor); - return (ENAMETOOLONG); - } - - (void) strcpy(name, attr.za_name); - if (idp) - *idp = attr.za_first_integer; - zap_cursor_advance(&cursor); - *offp = zap_cursor_serialize(&cursor); - zap_cursor_fini(&cursor); - - return (0); -} - -int -dmu_dir_list_next(objset_t *os, int namelen, char *name, - uint64_t *idp, uint64_t *offp) -{ - dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir; - zap_cursor_t cursor; - zap_attribute_t attr; - - /* there is no next dir on a snapshot! */ - if (os->os->os_dsl_dataset->ds_object != - dd->dd_phys->dd_head_dataset_obj) - return (ENOENT); - - zap_cursor_init_serialized(&cursor, - dd->dd_pool->dp_meta_objset, - dd->dd_phys->dd_child_dir_zapobj, *offp); - - if (zap_cursor_retrieve(&cursor, &attr) != 0) { - zap_cursor_fini(&cursor); - return (ENOENT); - } - - if (strlen(attr.za_name) + 1 > namelen) { - zap_cursor_fini(&cursor); - return (ENAMETOOLONG); - } - - (void) strcpy(name, attr.za_name); - if (idp) - *idp = attr.za_first_integer; - zap_cursor_advance(&cursor); - *offp = zap_cursor_serialize(&cursor); - zap_cursor_fini(&cursor); - - return (0); -} - -/* - * Find all objsets under name, and for each, call 'func(child_name, arg)'. - */ -int -dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) -{ - dsl_dir_t *dd; - objset_t *os; - uint64_t snapobj; - zap_cursor_t zc; - zap_attribute_t *attr; - char *child; - int do_self, err; - - err = dsl_dir_open(name, FTAG, &dd, NULL); - if (err) - return (err); - - /* NB: the $MOS dir doesn't have a head dataset */ - do_self = (dd->dd_phys->dd_head_dataset_obj != 0); - attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - - /* - * Iterate over all children. 
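[Editor's note] dmu_snapshot_list_next() and dmu_dir_list_next() above return one entry per call and hand the caller a serialized cursor offset so iteration can resume later. Below is a toy, self-contained version of that resumable-listing contract (0 / ENOENT / ENAMETOOLONG) over an in-memory table; the ZAP cursor machinery is replaced by a plain index and the entry names are invented.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static const char *names[] = { "monday", "tuesday", "payroll" };

/*
 * Copy the entry at cursor *offp into name[] and advance *offp.
 * Returns 0, ENOENT when the listing is exhausted, or ENAMETOOLONG if the
 * caller's buffer is too small -- the same contract as the routines above.
 */
static int
list_next(int namelen, char *name, uint64_t *offp)
{
        uint64_t i = *offp;

        if (i >= sizeof (names) / sizeof (names[0]))
                return (ENOENT);
        if (strlen(names[i]) + 1 > (size_t)namelen)
                return (ENAMETOOLONG);
        (void) strcpy(name, names[i]);
        *offp = i + 1;          /* serialized cursor for the next call */
        return (0);
}

int
main(void)
{
        char buf[32];
        uint64_t off = 0;

        while (list_next(sizeof (buf), buf, &off) == 0)
                printf("%s\n", buf);
        return (0);
}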
- */ - if (flags & DS_FIND_CHILDREN) { - for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, - dd->dd_phys->dd_child_dir_zapobj); - zap_cursor_retrieve(&zc, attr) == 0; - (void) zap_cursor_advance(&zc)) { - ASSERT(attr->za_integer_length == sizeof (uint64_t)); - ASSERT(attr->za_num_integers == 1); - - /* - * No separating '/' because parent's name ends in /. - */ - child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - /* XXX could probably just use name here */ - dsl_dir_name(dd, child); - (void) strcat(child, "/"); - (void) strcat(child, attr->za_name); - err = dmu_objset_find(child, func, arg, flags); - kmem_free(child, MAXPATHLEN); - if (err) - break; - } - zap_cursor_fini(&zc); - - if (err) { - dsl_dir_close(dd, FTAG); - kmem_free(attr, sizeof (zap_attribute_t)); - return (err); - } - } - - /* - * Iterate over all snapshots. - */ - if ((flags & DS_FIND_SNAPSHOTS) && - dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) { - - snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj; - dmu_objset_close(os); - - for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj); - zap_cursor_retrieve(&zc, attr) == 0; - (void) zap_cursor_advance(&zc)) { - ASSERT(attr->za_integer_length == sizeof (uint64_t)); - ASSERT(attr->za_num_integers == 1); - - child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - /* XXX could probably just use name here */ - dsl_dir_name(dd, child); - (void) strcat(child, "@"); - (void) strcat(child, attr->za_name); - err = func(child, arg); - kmem_free(child, MAXPATHLEN); - if (err) - break; - } - zap_cursor_fini(&zc); - } - - dsl_dir_close(dd, FTAG); - kmem_free(attr, sizeof (zap_attribute_t)); - - if (err) - return (err); - - /* - * Apply to self if appropriate. - */ - if (do_self) - err = func(name, arg); - return (err); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c deleted file mode 100644 index 3e55dc3..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ /dev/null @@ -1,1009 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu.h> -#include <sys/dmu_impl.h> -#include <sys/dmu_tx.h> -#include <sys/dbuf.h> -#include <sys/dnode.h> -#include <sys/zfs_context.h> -#include <sys/dmu_objset.h> -#include <sys/dmu_traverse.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_pool.h> -#include <sys/dsl_synctask.h> -#include <sys/zfs_ioctl.h> -#include <sys/zap.h> -#include <sys/zio_checksum.h> - -struct backuparg { - dmu_replay_record_t *drr; - kthread_t *td; - struct file *fp; - objset_t *os; - zio_cksum_t zc; - int err; -}; - -static int -dump_bytes(struct backuparg *ba, void *buf, int len) -{ - struct uio auio; - struct iovec aiov; - - ASSERT3U(len % 8, ==, 0); - - fletcher_4_incremental_native(buf, len, &ba->zc); - - aiov.iov_base = buf; - aiov.iov_len = len; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_resid = len; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_WRITE; - auio.uio_offset = (off_t)-1; - auio.uio_td = ba->td; -#ifdef _KERNEL - if (ba->fp->f_type == DTYPE_VNODE) - bwillwrite(); - ba->err = fo_write(ba->fp, &auio, ba->td->td_ucred, 0, ba->td); -#else - fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); - ba->err = EOPNOTSUPP; -#endif - - return (ba->err); -} - -static int -dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, - uint64_t length) -{ - /* write a FREE record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_FREE; - ba->drr->drr_u.drr_free.drr_object = object; - ba->drr->drr_u.drr_free.drr_offset = offset; - ba->drr->drr_u.drr_free.drr_length = length; - - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) - return (EINTR); - return (0); -} - -static int -dump_data(struct backuparg *ba, dmu_object_type_t type, - uint64_t object, uint64_t offset, int blksz, void *data) -{ - /* write a DATA record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_WRITE; - ba->drr->drr_u.drr_write.drr_object = object; - ba->drr->drr_u.drr_write.drr_type = type; - ba->drr->drr_u.drr_write.drr_offset = offset; - ba->drr->drr_u.drr_write.drr_length = blksz; - - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) - return (EINTR); - if (dump_bytes(ba, data, blksz)) - return (EINTR); - return (0); -} - -static int -dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) -{ - /* write a FREEOBJECTS record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_FREEOBJECTS; - ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj; - ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs; - - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) - return (EINTR); - return (0); -} - -static int -dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) -{ - if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) - return (dump_freeobjects(ba, object, 1)); - - /* write an OBJECT record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_OBJECT; - ba->drr->drr_u.drr_object.drr_object = object; - ba->drr->drr_u.drr_object.drr_type = dnp->dn_type; - ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype; - ba->drr->drr_u.drr_object.drr_blksz = - dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; - ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen; - ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum; - ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress; - - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) - return (EINTR); - - if 
(dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8))) - return (EINTR); - - /* free anything past the end of the file */ - if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) - return (EINTR); - if (ba->err) - return (EINTR); - return (0); -} - -#define BP_SPAN(dnp, level) \ - (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ - (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) - -static int -backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) -{ - struct backuparg *ba = arg; - uint64_t object = bc->bc_bookmark.zb_object; - int level = bc->bc_bookmark.zb_level; - uint64_t blkid = bc->bc_bookmark.zb_blkid; - blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL; - dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; - void *data = bc->bc_data; - int err = 0; - - if (SIGPENDING(curthread)) - return (EINTR); - - ASSERT(data || bp == NULL); - - if (bp == NULL && object == 0) { - uint64_t span = BP_SPAN(bc->bc_dnode, level); - uint64_t dnobj = (blkid * span) >> DNODE_SHIFT; - err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); - } else if (bp == NULL) { - uint64_t span = BP_SPAN(bc->bc_dnode, level); - err = dump_free(ba, object, blkid * span, span); - } else if (data && level == 0 && type == DMU_OT_DNODE) { - dnode_phys_t *blk = data; - int i; - int blksz = BP_GET_LSIZE(bp); - - for (i = 0; i < blksz >> DNODE_SHIFT; i++) { - uint64_t dnobj = - (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; - err = dump_dnode(ba, dnobj, blk+i); - if (err) - break; - } - } else if (level == 0 && - type != DMU_OT_DNODE && type != DMU_OT_OBJSET) { - int blksz = BP_GET_LSIZE(bp); - if (data == NULL) { - uint32_t aflags = ARC_WAIT; - arc_buf_t *abuf; - zbookmark_t zb; - - zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object; - zb.zb_object = object; - zb.zb_level = level; - zb.zb_blkid = blkid; - (void) arc_read(NULL, spa, bp, - dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED, - &aflags, &zb); - - if (abuf) { - err = dump_data(ba, type, object, blkid * blksz, - blksz, abuf->b_data); - (void) arc_buf_remove_ref(abuf, &abuf); - } - } else { - err = dump_data(ba, type, object, blkid * blksz, - blksz, data); - } - } - - ASSERT(err == 0 || err == EINTR); - return (err); -} - -int -dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp) -{ - dsl_dataset_t *ds = tosnap->os->os_dsl_dataset; - dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os->os_dsl_dataset : NULL; - dmu_replay_record_t *drr; - struct backuparg ba; - int err; - - /* tosnap must be a snapshot */ - if (ds->ds_phys->ds_next_snap_obj == 0) - return (EINVAL); - - /* fromsnap must be an earlier snapshot from the same fs as tosnap */ - if (fromds && (ds->ds_dir != fromds->ds_dir || - fromds->ds_phys->ds_creation_txg >= - ds->ds_phys->ds_creation_txg)) - return (EXDEV); - - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); - drr->drr_type = DRR_BEGIN; - drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION; - drr->drr_u.drr_begin.drr_creation_time = - ds->ds_phys->ds_creation_time; - drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type; - drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; - if (fromds) - drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; - dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); - - ba.drr = drr; - ba.td = curthread; - ba.fp = fp; - ba.os = tosnap; - ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); - - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { - kmem_free(drr, sizeof (dmu_replay_record_t)); - return (ba.err); - } - - err = traverse_dsl_dataset(ds, - fromds ? fromds->ds_phys->ds_creation_txg : 0, - ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK, - backup_cb, &ba); - - if (err) { - if (err == EINTR && ba.err) - err = ba.err; - kmem_free(drr, sizeof (dmu_replay_record_t)); - return (err); - } - - bzero(drr, sizeof (dmu_replay_record_t)); - drr->drr_type = DRR_END; - drr->drr_u.drr_end.drr_checksum = ba.zc; - - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { - kmem_free(drr, sizeof (dmu_replay_record_t)); - return (ba.err); - } - - kmem_free(drr, sizeof (dmu_replay_record_t)); - - return (0); -} - -struct restorearg { - int err; - int byteswap; - kthread_t *td; - struct file *fp; - char *buf; - uint64_t voff; - int buflen; /* number of valid bytes in buf */ - int bufoff; /* next offset to read */ - int bufsize; /* amount of memory allocated for buf */ - zio_cksum_t zc; -}; - -/* ARGSUSED */ -static int -replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct drr_begin *drrb = arg2; - const char *snapname; - int err; - uint64_t val; - - /* must already be a snapshot of this fs */ - if (ds->ds_phys->ds_prev_snap_obj == 0) - return (ENODEV); - - /* most recent snapshot must match fromguid */ - if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) - return (ENODEV); - /* must not have any changes since most recent snapshot */ - if (ds->ds_phys->ds_bp.blk_birth > - ds->ds_prev->ds_phys->ds_creation_txg) - return (ETXTBSY); - - /* new snapshot name must not exist */ - snapname = strrchr(drrb->drr_toname, '@'); - if (snapname == NULL) - return (EEXIST); - - snapname++; - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) - return (err); - - return (0); -} - -/* ARGSUSED */ -static void -replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; -} - -/* ARGSUSED */ -static int -replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - struct drr_begin *drrb = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - char *cp; - uint64_t val; - int err; - - cp = strchr(drrb->drr_toname, '@'); - *cp = '\0'; - err = 
zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, - strrchr(drrb->drr_toname, '/') + 1, - sizeof (uint64_t), 1, &val); - *cp = '@'; - - if (err != ENOENT) - return (err ? err : EEXIST); - - return (0); -} - -static void -replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - struct drr_begin *drrb = arg2; - char *cp; - dsl_dataset_t *ds; - uint64_t dsobj; - - cp = strchr(drrb->drr_toname, '@'); - *cp = '\0'; - dsobj = dsl_dataset_create_sync(dd, strrchr(drrb->drr_toname, '/') + 1, - NULL, tx); - *cp = '@'; - - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL, - DS_MODE_EXCLUSIVE, FTAG, &ds)); - - (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds), - ds, &ds->ds_phys->ds_bp, drrb->drr_type, tx); - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); -} - -static int -replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - objset_t *os = arg1; - struct drr_begin *drrb = arg2; - char *snapname; - - /* XXX verify that drr_toname is in dd */ - - snapname = strchr(drrb->drr_toname, '@'); - if (snapname == NULL) - return (EINVAL); - snapname++; - - return (dsl_dataset_snapshot_check(os, snapname, tx)); -} - -static void -replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - objset_t *os = arg1; - struct drr_begin *drrb = arg2; - char *snapname; - dsl_dataset_t *ds, *hds; - - snapname = strchr(drrb->drr_toname, '@') + 1; - - dsl_dataset_snapshot_sync(os, snapname, tx); - - /* set snapshot's creation time and guid */ - hds = os->os->os_dsl_dataset; - VERIFY(0 == dsl_dataset_open_obj(hds->ds_dir->dd_pool, - hds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT, - FTAG, &ds)); - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_creation_time = drrb->drr_creation_time; - ds->ds_phys->ds_guid = drrb->drr_toguid; - ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; - - dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG); - - dmu_buf_will_dirty(hds->ds_dbuf, tx); - hds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; -} - -static int -restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, int *resid) -{ - struct uio auio; - struct iovec aiov; - int error; - - aiov.iov_base = buf; - aiov.iov_len = len; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_resid = len; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_offset = off; - auio.uio_td = ra->td; -#ifdef _KERNEL - error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); -#else - fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); - error = EOPNOTSUPP; -#endif - *resid = auio.uio_resid; - return (error); -} - -static void * -restore_read(struct restorearg *ra, int len) -{ - void *rv; - - /* some things will require 8-byte alignment, so everything must */ - ASSERT3U(len % 8, ==, 0); - - while (ra->buflen - ra->bufoff < len) { - int resid; - int leftover = ra->buflen - ra->bufoff; - - (void) memmove(ra->buf, ra->buf + ra->bufoff, leftover); - - ra->err = restore_bytes(ra, (caddr_t)ra->buf + leftover, - ra->bufsize - leftover, ra->voff, &resid); - - ra->voff += ra->bufsize - leftover - resid; - ra->buflen = ra->bufsize - resid; - ra->bufoff = 0; - if (resid == ra->bufsize - leftover) - ra->err = EINVAL; - if (ra->err) - return (NULL); - /* Could compute checksum here? 
*/ - } - - ASSERT3U(ra->bufoff % 8, ==, 0); - ASSERT3U(ra->buflen - ra->bufoff, >=, len); - rv = ra->buf + ra->bufoff; - ra->bufoff += len; - if (ra->byteswap) - fletcher_4_incremental_byteswap(rv, len, &ra->zc); - else - fletcher_4_incremental_native(rv, len, &ra->zc); - return (rv); -} - -static void -backup_byteswap(dmu_replay_record_t *drr) -{ -#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) -#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) - drr->drr_type = BSWAP_32(drr->drr_type); - switch (drr->drr_type) { - case DRR_BEGIN: - DO64(drr_begin.drr_magic); - DO64(drr_begin.drr_version); - DO64(drr_begin.drr_creation_time); - DO32(drr_begin.drr_type); - DO64(drr_begin.drr_toguid); - DO64(drr_begin.drr_fromguid); - break; - case DRR_OBJECT: - DO64(drr_object.drr_object); - /* DO64(drr_object.drr_allocation_txg); */ - DO32(drr_object.drr_type); - DO32(drr_object.drr_bonustype); - DO32(drr_object.drr_blksz); - DO32(drr_object.drr_bonuslen); - break; - case DRR_FREEOBJECTS: - DO64(drr_freeobjects.drr_firstobj); - DO64(drr_freeobjects.drr_numobjs); - break; - case DRR_WRITE: - DO64(drr_write.drr_object); - DO32(drr_write.drr_type); - DO64(drr_write.drr_offset); - DO64(drr_write.drr_length); - break; - case DRR_FREE: - DO64(drr_free.drr_object); - DO64(drr_free.drr_offset); - DO64(drr_free.drr_length); - break; - case DRR_END: - DO64(drr_end.drr_checksum.zc_word[0]); - DO64(drr_end.drr_checksum.zc_word[1]); - DO64(drr_end.drr_checksum.zc_word[2]); - DO64(drr_end.drr_checksum.zc_word[3]); - break; - } -#undef DO64 -#undef DO32 -} - -static int -restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) -{ - int err; - dmu_tx_t *tx; - - err = dmu_object_info(os, drro->drr_object, NULL); - - if (err != 0 && err != ENOENT) - return (EINVAL); - - if (drro->drr_type == DMU_OT_NONE || - drro->drr_type >= DMU_OT_NUMTYPES || - drro->drr_bonustype >= DMU_OT_NUMTYPES || - drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS || - drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || - P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || - drro->drr_blksz < SPA_MINBLOCKSIZE || - drro->drr_blksz > SPA_MAXBLOCKSIZE || - drro->drr_bonuslen > DN_MAX_BONUSLEN) { - return (EINVAL); - } - - tx = dmu_tx_create(os); - - if (err == ENOENT) { - /* currently free, want to be allocated */ - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - err = dmu_object_claim(os, drro->drr_object, - drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, tx); - } else { - /* currently allocated, want to be allocated */ - dmu_tx_hold_bonus(tx, drro->drr_object); - /* - * We may change blocksize, so need to - * hold_write - */ - dmu_tx_hold_write(tx, drro->drr_object, 0, 1); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - - err = dmu_object_reclaim(os, drro->drr_object, - drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, tx); - } - if (err) { - dmu_tx_commit(tx); - return (EINVAL); - } - - dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx); - dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); - - if (drro->drr_bonuslen) { - dmu_buf_t *db; - void *data; - VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - - ASSERT3U(db->db_size, ==, drro->drr_bonuslen); - data = restore_read(ra, P2ROUNDUP(db->db_size, 8)); - if (data == NULL) { - 
dmu_tx_commit(tx); - return (ra->err); - } - bcopy(data, db->db_data, db->db_size); - if (ra->byteswap) { - dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, - drro->drr_bonuslen); - } - dmu_buf_rele(db, FTAG); - } - dmu_tx_commit(tx); - return (0); -} - -/* ARGSUSED */ -static int -restore_freeobjects(struct restorearg *ra, objset_t *os, - struct drr_freeobjects *drrfo) -{ - uint64_t obj; - - if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) - return (EINVAL); - - for (obj = drrfo->drr_firstobj; - obj < drrfo->drr_firstobj + drrfo->drr_numobjs; - (void) dmu_object_next(os, &obj, FALSE, 0)) { - dmu_tx_t *tx; - int err; - - if (dmu_object_info(os, obj, NULL) != 0) - continue; - - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, obj); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - err = dmu_object_free(os, obj, tx); - dmu_tx_commit(tx); - if (err && err != ENOENT) - return (EINVAL); - } - return (0); -} - -static int -restore_write(struct restorearg *ra, objset_t *os, - struct drr_write *drrw) -{ - dmu_tx_t *tx; - void *data; - int err; - - if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || - drrw->drr_type >= DMU_OT_NUMTYPES) - return (EINVAL); - - data = restore_read(ra, drrw->drr_length); - if (data == NULL) - return (ra->err); - - if (dmu_object_info(os, drrw->drr_object, NULL) != 0) - return (EINVAL); - - tx = dmu_tx_create(os); - - dmu_tx_hold_write(tx, drrw->drr_object, - drrw->drr_offset, drrw->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - if (ra->byteswap) - dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); - dmu_write(os, drrw->drr_object, - drrw->drr_offset, drrw->drr_length, data, tx); - dmu_tx_commit(tx); - return (0); -} - -/* ARGSUSED */ -static int -restore_free(struct restorearg *ra, objset_t *os, - struct drr_free *drrf) -{ - dmu_tx_t *tx; - int err; - - if (drrf->drr_length != -1ULL && - drrf->drr_offset + drrf->drr_length < drrf->drr_offset) - return (EINVAL); - - if (dmu_object_info(os, drrf->drr_object, NULL) != 0) - return (EINVAL); - - tx = dmu_tx_create(os); - - dmu_tx_hold_free(tx, drrf->drr_object, - drrf->drr_offset, drrf->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - err = dmu_free_range(os, drrf->drr_object, - drrf->drr_offset, drrf->drr_length, tx); - dmu_tx_commit(tx); - return (err); -} - -int -dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, - boolean_t force, struct file *fp, uint64_t voffset) -{ - kthread_t *td = curthread; - struct restorearg ra; - dmu_replay_record_t *drr; - char *cp; - objset_t *os = NULL; - zio_cksum_t pzc; - - bzero(&ra, sizeof (ra)); - ra.td = td; - ra.fp = fp; - ra.voff = voffset; - ra.bufsize = 1<<20; - ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); - - if (drrb->drr_magic == DMU_BACKUP_MAGIC) { - ra.byteswap = FALSE; - } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { - ra.byteswap = TRUE; - } else { - ra.err = EINVAL; - goto out; - } - - /* - * NB: this assumes that struct drr_begin will be the largest in - * dmu_replay_record_t's drr_u, and thus we don't need to pad it - * with zeros to make it the same length as we wrote out. 
- */ - ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN; - ((dmu_replay_record_t *)ra.buf)->drr_pad = 0; - ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb; - if (ra.byteswap) { - fletcher_4_incremental_byteswap(ra.buf, - sizeof (dmu_replay_record_t), &ra.zc); - } else { - fletcher_4_incremental_native(ra.buf, - sizeof (dmu_replay_record_t), &ra.zc); - } - (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */ - - if (ra.byteswap) { - drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_version = BSWAP_64(drrb->drr_version); - drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); - drrb->drr_type = BSWAP_32(drrb->drr_type); - drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); - drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); - } - - ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - - if (drrb->drr_version != DMU_BACKUP_VERSION || - drrb->drr_type >= DMU_OST_NUMTYPES || - strchr(drrb->drr_toname, '@') == NULL) { - ra.err = EINVAL; - goto out; - } - - /* - * Process the begin in syncing context. - */ - if (drrb->drr_fromguid) { - /* incremental backup */ - dsl_dataset_t *ds = NULL; - - cp = strchr(tosnap, '@'); - *cp = '\0'; - ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds); - *cp = '@'; - if (ra.err) - goto out; - - /* - * Only do the rollback if the most recent snapshot - * matches the incremental source - */ - if (force) { - if (ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_guid != - drrb->drr_fromguid) { - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - kmem_free(ra.buf, ra.bufsize); - return (ENODEV); - } - (void) dsl_dataset_rollback(ds); - } - ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool, - replay_incremental_check, replay_incremental_sync, - ds, drrb, 1); - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - } else { - /* full backup */ - dsl_dir_t *dd = NULL; - const char *tail; - - /* can't restore full backup into topmost fs, for now */ - if (strrchr(drrb->drr_toname, '/') == NULL) { - ra.err = EINVAL; - goto out; - } - - cp = strchr(tosnap, '@'); - *cp = '\0'; - ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail); - *cp = '@'; - if (ra.err) - goto out; - if (tail == NULL) { - ra.err = EEXIST; - goto out; - } - - ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check, - replay_full_sync, dd, drrb, 5); - dsl_dir_close(dd, FTAG); - } - if (ra.err) - goto out; - - /* - * Open the objset we are modifying. - */ - - cp = strchr(tosnap, '@'); - *cp = '\0'; - ra.err = dmu_objset_open(tosnap, DMU_OST_ANY, - DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os); - *cp = '@'; - ASSERT3U(ra.err, ==, 0); - - /* - * Read records and process them. - */ - pzc = ra.zc; - while (ra.err == 0 && - NULL != (drr = restore_read(&ra, sizeof (*drr)))) { - if (SIGPENDING(td)) { - ra.err = EINTR; - goto out; - } - - if (ra.byteswap) - backup_byteswap(drr); - - switch (drr->drr_type) { - case DRR_OBJECT: - { - /* - * We need to make a copy of the record header, - * because restore_{object,write} may need to - * restore_read(), which will invalidate drr. 
- */ - struct drr_object drro = drr->drr_u.drr_object; - ra.err = restore_object(&ra, os, &drro); - break; - } - case DRR_FREEOBJECTS: - { - struct drr_freeobjects drrfo = - drr->drr_u.drr_freeobjects; - ra.err = restore_freeobjects(&ra, os, &drrfo); - break; - } - case DRR_WRITE: - { - struct drr_write drrw = drr->drr_u.drr_write; - ra.err = restore_write(&ra, os, &drrw); - break; - } - case DRR_FREE: - { - struct drr_free drrf = drr->drr_u.drr_free; - ra.err = restore_free(&ra, os, &drrf); - break; - } - case DRR_END: - { - struct drr_end drre = drr->drr_u.drr_end; - /* - * We compare against the *previous* checksum - * value, because the stored checksum is of - * everything before the DRR_END record. - */ - if (drre.drr_checksum.zc_word[0] != 0 && - !ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pzc)) { - ra.err = ECKSUM; - goto out; - } - - ra.err = dsl_sync_task_do(dmu_objset_ds(os)-> - ds_dir->dd_pool, replay_end_check, replay_end_sync, - os, drrb, 3); - goto out; - } - default: - ra.err = EINVAL; - goto out; - } - pzc = ra.zc; - } - -out: - if (os) - dmu_objset_close(os); - - /* - * Make sure we don't rollback/destroy unless we actually - * processed the begin properly. 'os' will only be set if this - * is the case. - */ - if (ra.err && os && tosnap && strchr(tosnap, '@')) { - /* - * rollback or destroy what we created, so we don't - * leave it in the restoring state. - */ - dsl_dataset_t *ds; - int err; - - cp = strchr(tosnap, '@'); - *cp = '\0'; - err = dsl_dataset_open(tosnap, - DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, - FTAG, &ds); - if (err == 0) { - txg_wait_synced(ds->ds_dir->dd_pool, 0); - if (drrb->drr_fromguid) { - /* incremental: rollback to most recent snap */ - (void) dsl_dataset_rollback(ds); - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - } else { - /* full: destroy whole fs */ - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - (void) dsl_dataset_destroy(tosnap); - } - } - *cp = '@'; - } - - kmem_free(ra.buf, ra.bufsize); - if (sizep) - *sizep = ra.voff; - return (ra.err); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c deleted file mode 100644 index 3d2bc3e..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ /dev/null @@ -1,888 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/dmu_objset.h> -#include <sys/dmu_traverse.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_pool.h> -#include <sys/dnode.h> -#include <sys/spa.h> -#include <sys/zio.h> -#include <sys/dmu_impl.h> - -#define BP_SPAN_SHIFT(level, width) ((level) * (width)) - -#define BP_EQUAL(b1, b2) \ - (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \ - (b1)->blk_birth == (b2)->blk_birth) - -/* - * Compare two bookmarks. - * - * For ADVANCE_PRE, the visitation order is: - * - * objset 0, 1, 2, ..., ZB_MAXOBJSET. - * object 0, 1, 2, ..., ZB_MAXOBJECT. - * blkoff 0, 1, 2, ... - * level ZB_MAXLEVEL, ..., 2, 1, 0. - * - * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid - * ordering vector is: - * - * < objset, object, blkoff, -level > - * - * For ADVANCE_POST, the starting offsets aren't sequential but ending - * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are. - * The visitation order is: - * - * objset 1, 2, ..., ZB_MAXOBJSET, 0. - * object 1, 2, ..., ZB_MAXOBJECT, 0. - * blkoff 1, 2, ... - * level 0, 1, 2, ..., ZB_MAXLEVEL. - * - * and thus a valid ordering vector is: - * - * < objset - 1, object - 1, blkoff, level > - * - * Both orderings can be expressed as: - * - * < objset + bias, object + bias, blkoff, level ^ bias > - * - * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST) - * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift). - * - * Special case: an objset's osphys is represented as level -1 of object 0. - * It is always either the very first or very last block we visit in an objset. - * Therefore, if either bookmark's level is -1, level alone determines order. - */ -static int -compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp, - int advance) -{ - int bias = (advance & ADVANCE_PRE) ? 
0 : -1; - uint64_t sblkoff, eblkoff; - int slevel, elevel, wshift; - - if (szb->zb_objset + bias < ezb->zb_objset + bias) - return (-1); - - if (szb->zb_objset + bias > ezb->zb_objset + bias) - return (1); - - slevel = szb->zb_level; - elevel = ezb->zb_level; - - if ((slevel | elevel) < 0) - return ((slevel ^ bias) - (elevel ^ bias)); - - if (szb->zb_object + bias < ezb->zb_object + bias) - return (-1); - - if (szb->zb_object + bias > ezb->zb_object + bias) - return (1); - - if (dnp == NULL) - return (0); - - wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; - - sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift); - eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift); - - if (sblkoff < eblkoff) - return (-1); - - if (sblkoff > eblkoff) - return (1); - - return ((elevel ^ bias) - (slevel ^ bias)); -} - -#define SET_BOOKMARK(zb, objset, object, level, blkid) \ -{ \ - (zb)->zb_objset = objset; \ - (zb)->zb_object = object; \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - -#define SET_BOOKMARK_LB(zb, level, blkid) \ -{ \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - -static int -advance_objset(zseg_t *zseg, uint64_t objset, int advance) -{ - zbookmark_t *zb = &zseg->seg_start; - - if (advance & ADVANCE_PRE) { - if (objset >= ZB_MAXOBJSET) - return (ERANGE); - SET_BOOKMARK(zb, objset, 0, -1, 0); - } else { - if (objset >= ZB_MAXOBJSET) - objset = 0; - SET_BOOKMARK(zb, objset, 1, 0, 0); - } - - if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) - return (ERANGE); - - return (EAGAIN); -} - -static int -advance_object(zseg_t *zseg, uint64_t object, int advance) -{ - zbookmark_t *zb = &zseg->seg_start; - - if (advance & ADVANCE_PRE) { - if (object >= ZB_MAXOBJECT) { - SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0); - } else { - SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0); - } - } else { - if (zb->zb_object == 0) { - SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0); - } else { - if (object >= ZB_MAXOBJECT) - object = 0; - SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0); - } - } - - if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) - return (ERANGE); - - return (EAGAIN); -} - -static int -advance_from_osphys(zseg_t *zseg, int advance) -{ - zbookmark_t *zb = &zseg->seg_start; - - ASSERT(zb->zb_object == 0); - ASSERT(zb->zb_level == -1); - ASSERT(zb->zb_blkid == 0); - - if (advance & ADVANCE_PRE) { - SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0); - } else { - if (zb->zb_objset == 0) - return (ERANGE); - SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0); - } - - if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) - return (ERANGE); - - return (EAGAIN); -} - -static int -advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance) -{ - zbookmark_t *zb = &zseg->seg_start; - int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; - int maxlevel = dnp->dn_nlevels - 1; - int level = zb->zb_level; - uint64_t blkid = zb->zb_blkid; - - if (advance & ADVANCE_PRE) { - if (level > 0 && rc == 0) { - level--; - blkid <<= wshift; - } else { - blkid++; - - if ((blkid << BP_SPAN_SHIFT(level, wshift)) > - dnp->dn_maxblkid) - return (ERANGE); - - while (level < maxlevel) { - if (P2PHASE(blkid, 1ULL << wshift)) - break; - blkid >>= wshift; - level++; - } - } - } else { - if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) { - blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift); - level = 0; - } else { - blkid >>= wshift; - level++; - } - - while ((blkid << BP_SPAN_SHIFT(level, wshift)) > - dnp->dn_maxblkid) { - if (level == maxlevel) - 
return (ERANGE); - blkid >>= wshift; - level++; - } - } - SET_BOOKMARK_LB(zb, level, blkid); - - if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0) - return (ERANGE); - - return (EAGAIN); -} - -static int -traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc) -{ - /* - * Before we issue the callback, prune against maxtxg. - * - * We prune against mintxg before we get here because it's a big win. - * If a given block was born in txg 37, then we know that the entire - * subtree below that block must have been born in txg 37 or earlier. - * We can therefore lop off huge branches of the tree as we go. - * - * There's no corresponding optimization for maxtxg because knowing - * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's - * children. In fact, the copy-on-write design of ZFS ensures that - * top-level blocks will pretty much always be new. - * - * Therefore, in the name of simplicity we don't prune against - * maxtxg until the last possible moment -- that being right now. - */ - if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg) - return (0); - - /* - * Debugging: verify that the order we visit things agrees with the - * order defined by compare_bookmark(). We don't check this for - * log blocks because there's no defined ordering for them; they're - * always visited (or not) as part of visiting the objset_phys_t. - */ - if (bc->bc_errno == 0 && bc != &th->th_zil_cache) { - zbookmark_t *zb = &bc->bc_bookmark; - zbookmark_t *szb = &zseg->seg_start; - zbookmark_t *ezb = &zseg->seg_end; - zbookmark_t *lzb = &th->th_lastcb; - dnode_phys_t *dnp = bc->bc_dnode; - - ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0); - ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0); - ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 || - lzb->zb_level == ZB_NO_LEVEL); - *lzb = *zb; - } - - th->th_callbacks++; - return (th->th_func(bc, th->th_spa, th->th_arg)); -} - -static int -traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp, - dnode_phys_t *dnp) -{ - zbookmark_t *zb = &bc->bc_bookmark; - int error; - - th->th_hits++; - - bc->bc_dnode = dnp; - bc->bc_errno = 0; - - if (BP_EQUAL(&bc->bc_blkptr, bp)) - return (0); - - bc->bc_blkptr = *bp; - - if (bc->bc_data == NULL) - return (0); - - if (BP_IS_HOLE(bp)) { - ASSERT(th->th_advance & ADVANCE_HOLES); - return (0); - } - - if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) { - error = EIO; - } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) { - error = 0; - th->th_arc_hits++; - } else { - error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data, - BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, - th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb)); - - if (BP_SHOULD_BYTESWAP(bp) && error == 0) - (zb->zb_level > 0 ? 
byteswap_uint64_array : - dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data, - BP_GET_LSIZE(bp)); - th->th_reads++; - } - - if (error) { - bc->bc_errno = error; - error = traverse_callback(th, NULL, bc); - ASSERT(error == EAGAIN || error == EINTR || error == ERESTART); - bc->bc_blkptr.blk_birth = -1ULL; - } - - dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n", - bc - &th->th_cache[0][0], error, - zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); - - return (error); -} - -static int -find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth) -{ - zbookmark_t *zb = &zseg->seg_start; - traverse_blk_cache_t *bc; - blkptr_t *bp = dnp->dn_blkptr; - int i, first, level; - int nbp = dnp->dn_nblkptr; - int minlevel = zb->zb_level; - int maxlevel = dnp->dn_nlevels - 1; - int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; - int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift); - uint64_t blkid = zb->zb_blkid >> bp_shift; - int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE; - int rc; - - if (minlevel > maxlevel || blkid >= nbp) - return (ERANGE); - - for (level = maxlevel; level >= minlevel; level--) { - first = P2PHASE(blkid, 1ULL << wshift); - - for (i = first; i < nbp; i++) - if (bp[i].blk_birth > zseg->seg_mintxg || - BP_IS_HOLE(&bp[i]) && do_holes) - break; - - if (i != first) { - i--; - SET_BOOKMARK_LB(zb, level, blkid + (i - first)); - return (ENOTBLK); - } - - bc = &th->th_cache[depth][level]; - - SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object, - level, blkid); - - if (rc = traverse_read(th, bc, bp + i, dnp)) { - if (rc != EAGAIN) { - SET_BOOKMARK_LB(zb, level, blkid); - } - return (rc); - } - - if (BP_IS_HOLE(&bp[i])) { - SET_BOOKMARK_LB(zb, level, blkid); - th->th_lastcb.zb_level = ZB_NO_LEVEL; - return (0); - } - - nbp = 1 << wshift; - bp = bc->bc_data; - bp_shift -= wshift; - blkid = zb->zb_blkid >> bp_shift; - } - - return (0); -} - -static int -get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn, - uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth) -{ - zseg_t zseg; - zbookmark_t *zb = &zseg.seg_start; - uint64_t object = *objectp; - int i, rc; - - SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK); - SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID); - - zseg.seg_mintxg = txg; - zseg.seg_maxtxg = -1ULL; - - for (;;) { - rc = find_block(th, &zseg, mdn, depth); - - if (rc == EAGAIN || rc == EINTR || rc == ERANGE) - break; - - if (rc == 0 && zb->zb_level == 0) { - dnode_phys_t *dnp = th->th_cache[depth][0].bc_data; - for (i = 0; i < DNODES_PER_BLOCK; i++) { - object = (zb->zb_blkid * DNODES_PER_BLOCK) + i; - if (object >= *objectp && - dnp[i].dn_type != DMU_OT_NONE && - (type == -1 || dnp[i].dn_type == type)) { - *objectp = object; - *dnpp = &dnp[i]; - return (0); - } - } - } - - rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE); - - if (rc == ERANGE) - break; - } - - if (rc == ERANGE) - *objectp = ZB_MAXOBJECT; - - return (rc); -} - -/* ARGSUSED */ -static void -traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) -{ - traverse_handle_t *th = arg; - traverse_blk_cache_t *bc = &th->th_zil_cache; - zbookmark_t *zb = &bc->bc_bookmark; - zseg_t *zseg = list_head(&th->th_seglist); - - if (bp->blk_birth <= zseg->seg_mintxg) - return; - - if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) { - zb->zb_object = 0; - zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; - bc->bc_blkptr = *bp; - (void) traverse_callback(th, zseg, bc); - } -} 
- -/* ARGSUSED */ -static void -traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) -{ - traverse_handle_t *th = arg; - traverse_blk_cache_t *bc = &th->th_zil_cache; - zbookmark_t *zb = &bc->bc_bookmark; - zseg_t *zseg = list_head(&th->th_seglist); - - if (lrc->lrc_txtype == TX_WRITE) { - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - - if (bp->blk_birth <= zseg->seg_mintxg) - return; - - if (claim_txg != 0 && bp->blk_birth >= claim_txg) { - zb->zb_object = lr->lr_foid; - zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); - bc->bc_blkptr = *bp; - (void) traverse_callback(th, zseg, bc); - } - } -} - -static void -traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc) -{ - spa_t *spa = th->th_spa; - dsl_pool_t *dp = spa_get_dsl(spa); - objset_phys_t *osphys = bc->bc_data; - zil_header_t *zh = &osphys->os_zil_header; - uint64_t claim_txg = zh->zh_claim_txg; - zilog_t *zilog; - - ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]); - ASSERT(bc->bc_bookmark.zb_level == -1); - - /* - * We only want to visit blocks that have been claimed but not yet - * replayed (or, in read-only mode, blocks that *would* be claimed). - */ - if (claim_txg == 0 && (spa_mode & FWRITE)) - return; - - th->th_zil_cache.bc_bookmark = bc->bc_bookmark; - - zilog = zil_alloc(dp->dp_meta_objset, zh); - - (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th, - claim_txg); - - zil_free(zilog); -} - -static int -traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) -{ - zbookmark_t *zb = &zseg->seg_start; - traverse_blk_cache_t *bc; - dnode_phys_t *dn, *dn_tmp; - int worklimit = 100; - int rc; - - dprintf("<%llu, %llu, %d, %llx>\n", - zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); - - bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1]; - dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; - - SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0); - - rc = traverse_read(th, bc, mosbp, dn); - - if (rc) /* If we get ERESTART, we've got nowhere left to go */ - return (rc == ERESTART ? EINTR : rc); - - ASSERT(dn->dn_nlevels < ZB_MAXLEVEL); - - if (zb->zb_objset != 0) { - uint64_t objset = zb->zb_objset; - dsl_dataset_phys_t *dsp; - - rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0, - DMU_OT_DSL_DATASET, ZB_MOS_CACHE); - - if (objset != zb->zb_objset) - rc = advance_objset(zseg, objset, th->th_advance); - - if (rc != 0) - return (rc); - - dsp = DN_BONUS(dn_tmp); - - bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]; - dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; - - SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0); - - /* - * If we're traversing an open snapshot, we know that it - * can't be deleted (because it's open) and it can't change - * (because it's a snapshot). Therefore, once we've gotten - * from the uberblock down to the snapshot's objset_phys_t, - * we no longer need to synchronize with spa_sync(); we're - * traversing a completely static block tree from here on. 
- */ - if (th->th_advance & ADVANCE_NOLOCK) { - ASSERT(th->th_locked); - rw_exit(spa_traverse_rwlock(th->th_spa)); - th->th_locked = 0; - } - - rc = traverse_read(th, bc, &dsp->ds_bp, dn); - - if (rc != 0) { - if (rc == ERESTART) - rc = advance_objset(zseg, zb->zb_objset + 1, - th->th_advance); - return (rc); - } - - if (th->th_advance & ADVANCE_PRUNE) - zseg->seg_mintxg = - MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg); - } - - if (zb->zb_level == -1) { - ASSERT(zb->zb_object == 0); - ASSERT(zb->zb_blkid == 0); - ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET); - - if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) { - rc = traverse_callback(th, zseg, bc); - if (rc) { - ASSERT(rc == EINTR); - return (rc); - } - if ((th->th_advance & ADVANCE_ZIL) && - zb->zb_objset != 0) - traverse_zil(th, bc); - } - - return (advance_from_osphys(zseg, th->th_advance)); - } - - if (zb->zb_object != 0) { - uint64_t object = zb->zb_object; - - rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp, - zseg->seg_mintxg, -1, ZB_MDN_CACHE); - - if (object != zb->zb_object) - rc = advance_object(zseg, object, th->th_advance); - - if (rc != 0) - return (rc); - - dn = dn_tmp; - } - - if (zb->zb_level == ZB_MAXLEVEL) - zb->zb_level = dn->dn_nlevels - 1; - - for (;;) { - rc = find_block(th, zseg, dn, ZB_DN_CACHE); - - if (rc == EAGAIN || rc == EINTR || rc == ERANGE) - break; - - if (rc == 0) { - bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level]; - ASSERT(bc->bc_dnode == dn); - ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth); - rc = traverse_callback(th, zseg, bc); - if (rc) { - ASSERT(rc == EINTR); - return (rc); - } - if (BP_IS_HOLE(&bc->bc_blkptr)) { - ASSERT(th->th_advance & ADVANCE_HOLES); - rc = ENOTBLK; - } - } - - rc = advance_block(zseg, dn, rc, th->th_advance); - - if (rc == ERANGE) - break; - - /* - * Give spa_sync() a chance to run. - */ - if (th->th_locked && spa_traverse_wanted(th->th_spa)) { - th->th_syncs++; - return (EAGAIN); - } - - if (--worklimit == 0) - return (EAGAIN); - } - - if (rc == ERANGE) - rc = advance_object(zseg, zb->zb_object + 1, th->th_advance); - - return (rc); -} - -/* - * It is the caller's responsibility to ensure that the dsl_dataset_t - * doesn't go away during traversal. - */ -int -traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance, - blkptr_cb_t func, void *arg) -{ - spa_t *spa = ds->ds_dir->dd_pool->dp_spa; - traverse_handle_t *th; - int err; - - th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED); - - traverse_add_objset(th, txg_start, -1ULL, ds->ds_object); - - while ((err = traverse_more(th)) == EAGAIN) - continue; - - traverse_fini(th); - return (err); -} - -int -traverse_more(traverse_handle_t *th) -{ - zseg_t *zseg = list_head(&th->th_seglist); - uint64_t save_txg; /* XXX won't be necessary with real itinerary */ - krwlock_t *rw = spa_traverse_rwlock(th->th_spa); - blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa); - int rc; - - if (zseg == NULL) - return (0); - - th->th_restarts++; - - save_txg = zseg->seg_mintxg; - - rw_enter(rw, RW_READER); - th->th_locked = 1; - - rc = traverse_segment(th, zseg, mosbp); - ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR); - - if (th->th_locked) - rw_exit(rw); - th->th_locked = 0; - - zseg->seg_mintxg = save_txg; - - if (rc == ERANGE) { - list_remove(&th->th_seglist, zseg); - kmem_free(zseg, sizeof (*zseg)); - return (EAGAIN); - } - - return (rc); -} - -/* - * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves - * are not included. 
The blocks covered by this segment will all have - * mintxg < birth < maxtxg. - */ -static void -traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, - uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid, - uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid) -{ - zseg_t *zseg; - - zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP); - - zseg->seg_mintxg = mintxg; - zseg->seg_maxtxg = maxtxg; - - zseg->seg_start.zb_objset = sobjset; - zseg->seg_start.zb_object = sobject; - zseg->seg_start.zb_level = slevel; - zseg->seg_start.zb_blkid = sblkid; - - zseg->seg_end.zb_objset = eobjset; - zseg->seg_end.zb_object = eobject; - zseg->seg_end.zb_level = elevel; - zseg->seg_end.zb_blkid = eblkid; - - list_insert_tail(&th->th_seglist, zseg); -} - -void -traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, - uint64_t objset, uint64_t object) -{ - if (th->th_advance & ADVANCE_PRE) - traverse_add_segment(th, mintxg, maxtxg, - objset, object, ZB_MAXLEVEL, 0, - objset, object, 0, ZB_MAXBLKID); - else - traverse_add_segment(th, mintxg, maxtxg, - objset, object, 0, 0, - objset, object, 0, ZB_MAXBLKID); -} - -void -traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, - uint64_t objset) -{ - if (th->th_advance & ADVANCE_PRE) - traverse_add_segment(th, mintxg, maxtxg, - objset, 0, -1, 0, - objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID); - else - traverse_add_segment(th, mintxg, maxtxg, - objset, 1, 0, 0, - objset, 0, -1, 0); -} - -void -traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg) -{ - if (th->th_advance & ADVANCE_PRE) - traverse_add_segment(th, mintxg, maxtxg, - 0, 0, -1, 0, - ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID); - else - traverse_add_segment(th, mintxg, maxtxg, - 1, 1, 0, 0, - 0, 0, -1, 0); -} - -traverse_handle_t * -traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance, - int zio_flags) -{ - traverse_handle_t *th; - int d, l; - - th = kmem_zalloc(sizeof (*th), KM_SLEEP); - - th->th_spa = spa; - th->th_func = func; - th->th_arg = arg; - th->th_advance = advance; - th->th_lastcb.zb_level = ZB_NO_LEVEL; - th->th_noread.zb_level = ZB_NO_LEVEL; - th->th_zio_flags = zio_flags; - - list_create(&th->th_seglist, sizeof (zseg_t), - offsetof(zseg_t, seg_node)); - - for (d = 0; d < ZB_DEPTH; d++) { - for (l = 0; l < ZB_MAXLEVEL; l++) { - if ((advance & ADVANCE_DATA) || - l != 0 || d != ZB_DN_CACHE) - th->th_cache[d][l].bc_data = - zio_buf_alloc(SPA_MAXBLOCKSIZE); - } - } - - return (th); -} - -void -traverse_fini(traverse_handle_t *th) -{ - int d, l; - zseg_t *zseg; - - for (d = 0; d < ZB_DEPTH; d++) - for (l = 0; l < ZB_MAXLEVEL; l++) - if (th->th_cache[d][l].bc_data != NULL) - zio_buf_free(th->th_cache[d][l].bc_data, - SPA_MAXBLOCKSIZE); - - while ((zseg = list_head(&th->th_seglist)) != NULL) { - list_remove(&th->th_seglist, zseg); - kmem_free(zseg, sizeof (*zseg)); - } - - list_destroy(&th->th_seglist); - - dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n", - th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks, - th->th_syncs, th->th_restarts); - - kmem_free(th, sizeof (*th)); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c deleted file mode 100644 index 13fd8d4..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ /dev/null @@ -1,992 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and 
Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu.h> -#include <sys/dmu_impl.h> -#include <sys/dbuf.h> -#include <sys/dmu_tx.h> -#include <sys/dmu_objset.h> -#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */ -#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */ -#include <sys/dsl_pool.h> -#include <sys/zap_impl.h> /* for fzap_default_block_shift */ -#include <sys/spa.h> -#include <sys/zfs_context.h> - -typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, - uint64_t arg1, uint64_t arg2); - - -dmu_tx_t * -dmu_tx_create_dd(dsl_dir_t *dd) -{ - dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); - tx->tx_dir = dd; - if (dd) - tx->tx_pool = dd->dd_pool; - list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), - offsetof(dmu_tx_hold_t, txh_node)); -#ifdef ZFS_DEBUG - refcount_create(&tx->tx_space_written); - refcount_create(&tx->tx_space_freed); -#endif - return (tx); -} - -dmu_tx_t * -dmu_tx_create(objset_t *os) -{ - dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir); - tx->tx_objset = os; - tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset); - return (tx); -} - -dmu_tx_t * -dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) -{ - dmu_tx_t *tx = dmu_tx_create_dd(NULL); - - ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); - tx->tx_pool = dp; - tx->tx_txg = txg; - tx->tx_anyobj = TRUE; - - return (tx); -} - -int -dmu_tx_is_syncing(dmu_tx_t *tx) -{ - return (tx->tx_anyobj); -} - -int -dmu_tx_private_ok(dmu_tx_t *tx) -{ - return (tx->tx_anyobj); -} - -static dmu_tx_hold_t * -dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, - enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) -{ - dmu_tx_hold_t *txh; - dnode_t *dn = NULL; - int err; - - if (object != DMU_NEW_OBJECT) { - err = dnode_hold(os->os, object, tx, &dn); - if (err) { - tx->tx_err = err; - return (NULL); - } - - if (err == 0 && tx->tx_txg != 0) { - mutex_enter(&dn->dn_mtx); - /* - * dn->dn_assigned_txg == tx->tx_txg doesn't pose a - * problem, but there's no way for it to happen (for - * now, at least). - */ - ASSERT(dn->dn_assigned_txg == 0); - dn->dn_assigned_txg = tx->tx_txg; - (void) refcount_add(&dn->dn_tx_holds, tx); - mutex_exit(&dn->dn_mtx); - } - } - - txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); - txh->txh_tx = tx; - txh->txh_dnode = dn; -#ifdef ZFS_DEBUG - txh->txh_type = type; - txh->txh_arg1 = arg1; - txh->txh_arg2 = arg2; -#endif - list_insert_tail(&tx->tx_holds, txh); - - return (txh); -} - -void -dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) -{ - /* - * If we're syncing, they can manipulate any object anyhow, and - * the hold on the dnode_t can cause problems. 
- */ - if (!dmu_tx_is_syncing(tx)) { - (void) dmu_tx_hold_object_impl(tx, os, - object, THT_NEWOBJECT, 0, 0); - } -} - -static int -dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) -{ - int err; - dmu_buf_impl_t *db; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold_level(dn, level, blkid, FTAG); - rw_exit(&dn->dn_struct_rwlock); - if (db == NULL) - return (EIO); - err = dbuf_read(db, zio, DB_RF_CANFAIL); - dbuf_rele(db, FTAG); - return (err); -} - -/* ARGSUSED */ -static void -dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) -{ - dnode_t *dn = txh->txh_dnode; - uint64_t start, end, i; - int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; - int err = 0; - - if (len == 0) - return; - - min_bs = SPA_MINBLOCKSHIFT; - max_bs = SPA_MAXBLOCKSHIFT; - min_ibs = DN_MIN_INDBLKSHIFT; - max_ibs = DN_MAX_INDBLKSHIFT; - - - /* - * For i/o error checking, read the first and last level-0 - * blocks (if they are not aligned), and all the level-1 blocks. - */ - - if (dn) { - if (dn->dn_maxblkid == 0) { - err = dmu_tx_check_ioerr(NULL, dn, 0, 0); - if (err) - goto out; - } else { - zio_t *zio = zio_root(dn->dn_objset->os_spa, - NULL, NULL, ZIO_FLAG_CANFAIL); - - /* first level-0 block */ - start = off >> dn->dn_datablkshift; - if (P2PHASE(off, dn->dn_datablksz) || - len < dn->dn_datablksz) { - err = dmu_tx_check_ioerr(zio, dn, 0, start); - if (err) - goto out; - } - - /* last level-0 block */ - end = (off+len-1) >> dn->dn_datablkshift; - if (end != start && - P2PHASE(off+len, dn->dn_datablksz)) { - err = dmu_tx_check_ioerr(zio, dn, 0, end); - if (err) - goto out; - } - - /* level-1 blocks */ - if (dn->dn_nlevels > 1) { - start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; - end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (i = start+1; i < end; i++) { - err = dmu_tx_check_ioerr(zio, dn, 1, i); - if (err) - goto out; - } - } - - err = zio_wait(zio); - if (err) - goto out; - } - } - - /* - * If there's more than one block, the blocksize can't change, - * so we can make a more precise estimate. Alternatively, - * if the dnode's ibs is larger than max_ibs, always use that. - * This ensures that if we reduce DN_MAX_INDBLKSHIFT, - * the code will still work correctly on existing pools. - */ - if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) { - min_ibs = max_ibs = dn->dn_indblkshift; - if (dn->dn_datablkshift != 0) - min_bs = max_bs = dn->dn_datablkshift; - } - - /* - * 'end' is the last thing we will access, not one past. - * This way we won't overflow when accessing the last byte. - */ - start = P2ALIGN(off, 1ULL << max_bs); - end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; - txh->txh_space_towrite += end - start + 1; - - start >>= min_bs; - end >>= min_bs; - - epbs = min_ibs - SPA_BLKPTRSHIFT; - - /* - * The object contains at most 2^(64 - min_bs) blocks, - * and each indirect level maps 2^epbs. - */ - for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { - start >>= epbs; - end >>= epbs; - /* - * If we increase the number of levels of indirection, - * we'll need new blkid=0 indirect blocks. If start == 0, - * we're already accounting for that blocks; and if end == 0, - * we can't increase the number of levels beyond that. 
- */ - if (start != 0 && end != 0) - txh->txh_space_towrite += 1ULL << max_ibs; - txh->txh_space_towrite += (end - start + 1) << max_ibs; - } - - ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS); - -out: - if (err) - txh->txh_tx->tx_err = err; -} - -static void -dmu_tx_count_dnode(dmu_tx_hold_t *txh) -{ - dnode_t *dn = txh->txh_dnode; - dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode; - uint64_t space = mdn->dn_datablksz + - ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); - - if (dn && dn->dn_dbuf->db_blkptr && - dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_dbuf->db_blkptr->blk_birth)) { - txh->txh_space_tooverwrite += space; - } else { - txh->txh_space_towrite += space; - } -} - -void -dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) -{ - dmu_tx_hold_t *txh; - - ASSERT(tx->tx_txg == 0); - ASSERT(len < DMU_MAX_ACCESS); - ASSERT(len == 0 || UINT64_MAX - off >= len - 1); - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_WRITE, off, len); - if (txh == NULL) - return; - - dmu_tx_count_write(txh, off, len); - dmu_tx_count_dnode(txh); -} - -static void -dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) -{ - uint64_t blkid, nblks; - uint64_t space = 0; - dnode_t *dn = txh->txh_dnode; - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - spa_t *spa = txh->txh_tx->tx_pool->dp_spa; - int dirty; - - /* - * We don't need to use any locking to check for dirtyness - * because it's OK if we get stale data -- the dnode may become - * dirty immediately after our check anyway. This is just a - * means to avoid the expensive count when we aren't sure we - * need it. We need to be able to deal with a dirty dnode. - */ - dirty = list_link_active(&dn->dn_dirty_link[0]) | - list_link_active(&dn->dn_dirty_link[1]) | - list_link_active(&dn->dn_dirty_link[2]) | - list_link_active(&dn->dn_dirty_link[3]); - if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0) - return; - - /* - * the struct_rwlock protects us against dn_phys->dn_nlevels - * changing, in case (against all odds) we manage to dirty & - * sync out the changes after we check for being dirty. - * also, dbuf_hold_impl() wants us to have the struct_rwlock. - * - * It's fine to use dn_datablkshift rather than the dn_phys - * equivalent because if it is changing, maxblkid==0 and we will - * bail. 
- */ - rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_phys->dn_maxblkid == 0) { - if (off == 0 && len >= dn->dn_datablksz) { - blkid = 0; - nblks = 1; - } else { - rw_exit(&dn->dn_struct_rwlock); - return; - } - } else { - blkid = off >> dn->dn_datablkshift; - nblks = (off + len) >> dn->dn_datablkshift; - - if (blkid >= dn->dn_phys->dn_maxblkid) { - rw_exit(&dn->dn_struct_rwlock); - return; - } - if (blkid + nblks > dn->dn_phys->dn_maxblkid) - nblks = dn->dn_phys->dn_maxblkid - blkid; - - /* don't bother after 128,000 blocks */ - nblks = MIN(nblks, 128*1024); - } - - if (dn->dn_phys->dn_nlevels == 1) { - int i; - for (i = 0; i < nblks; i++) { - blkptr_t *bp = dn->dn_phys->dn_blkptr; - ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr); - bp += blkid + i; - if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { - dprintf_bp(bp, "can free old%s", ""); - space += bp_get_dasize(spa, bp); - } - } - nblks = 0; - } - - while (nblks) { - dmu_buf_impl_t *dbuf; - int err, epbs, blkoff, tochk; - - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - blkoff = P2PHASE(blkid, 1<<epbs); - tochk = MIN((1<<epbs) - blkoff, nblks); - - err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf); - if (err == 0) { - int i; - blkptr_t *bp; - - err = dbuf_read(dbuf, NULL, - DB_RF_HAVESTRUCT | DB_RF_CANFAIL); - if (err != 0) { - txh->txh_tx->tx_err = err; - dbuf_rele(dbuf, FTAG); - break; - } - - bp = dbuf->db.db_data; - bp += blkoff; - - for (i = 0; i < tochk; i++) { - if (dsl_dataset_block_freeable(ds, - bp[i].blk_birth)) { - dprintf_bp(&bp[i], - "can free old%s", ""); - space += bp_get_dasize(spa, &bp[i]); - } - } - dbuf_rele(dbuf, FTAG); - } - if (err && err != ENOENT) { - txh->txh_tx->tx_err = err; - break; - } - - blkid += tochk; - nblks -= tochk; - } - rw_exit(&dn->dn_struct_rwlock); - - txh->txh_space_tofree += space; -} - -void -dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) -{ - dmu_tx_hold_t *txh; - dnode_t *dn; - uint64_t start, end, i; - int err, shift; - zio_t *zio; - - ASSERT(tx->tx_txg == 0); - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_FREE, off, len); - if (txh == NULL) - return; - dn = txh->txh_dnode; - - /* first block */ - if (off != 0) - dmu_tx_count_write(txh, off, 1); - /* last block */ - if (len != DMU_OBJECT_END) - dmu_tx_count_write(txh, off+len, 1); - - if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) - return; - if (len == DMU_OBJECT_END) - len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; - - /* - * For i/o error checking, read the first and last level-0 - * blocks, and all the level-1 blocks. The above count_write's - * will take care of the level-0 blocks. - */ - if (dn->dn_nlevels > 1) { - shift = dn->dn_datablkshift + dn->dn_indblkshift - - SPA_BLKPTRSHIFT; - start = off >> shift; - end = dn->dn_datablkshift ? 
((off+len) >> shift) : 0; - - zio = zio_root(tx->tx_pool->dp_spa, - NULL, NULL, ZIO_FLAG_CANFAIL); - for (i = start; i <= end; i++) { - uint64_t ibyte = i << shift; - err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0); - i = ibyte >> shift; - if (err == ESRCH) - break; - if (err) { - tx->tx_err = err; - return; - } - - err = dmu_tx_check_ioerr(zio, dn, 1, i); - if (err) { - tx->tx_err = err; - return; - } - } - err = zio_wait(zio); - if (err) { - tx->tx_err = err; - return; - } - } - - dmu_tx_count_dnode(txh); - dmu_tx_count_free(txh, off, len); -} - -void -dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) -{ - dmu_tx_hold_t *txh; - dnode_t *dn; - uint64_t nblocks; - int epbs, err; - - ASSERT(tx->tx_txg == 0); - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_ZAP, add, (uintptr_t)name); - if (txh == NULL) - return; - dn = txh->txh_dnode; - - dmu_tx_count_dnode(txh); - - if (dn == NULL) { - /* - * We will be able to fit a new object's entries into one leaf - * block. So there will be at most 2 blocks total, - * including the header block. - */ - dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift); - return; - } - - ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); - - if (dn->dn_maxblkid == 0 && !add) { - /* - * If there is only one block (i.e. this is a micro-zap) - * and we are not adding anything, the accounting is simple. - */ - err = dmu_tx_check_ioerr(NULL, dn, 0, 0); - if (err) { - tx->tx_err = err; - return; - } - - /* - * Use max block size here, since we don't know how much - * the size will change between now and the dbuf dirty call. - */ - if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_phys->dn_blkptr[0].blk_birth)) - txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; - else - txh->txh_space_towrite += SPA_MAXBLOCKSIZE; - return; - } - - if (dn->dn_maxblkid > 0 && name) { - /* - * access the name in this fat-zap so that we'll check - * for i/o errors to the leaf blocks, etc. - */ - err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name, - 8, 0, NULL); - if (err == EIO) { - tx->tx_err = err; - return; - } - } - - /* - * 3 blocks overwritten: target leaf, ptrtbl block, header block - * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks - */ - dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz, - (3 + add ? 3 : 0) << dn->dn_datablkshift); - - /* - * If the modified blocks are scattered to the four winds, - * we'll have to modify an indirect twig for each. - */ - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) - txh->txh_space_towrite += 3 << dn->dn_indblkshift; -} - -void -dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) -{ - dmu_tx_hold_t *txh; - - ASSERT(tx->tx_txg == 0); - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_BONUS, 0, 0); - if (txh) - dmu_tx_count_dnode(txh); -} - -void -dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) -{ - dmu_tx_hold_t *txh; - ASSERT(tx->tx_txg == 0); - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - DMU_NEW_OBJECT, THT_SPACE, space, 0); - - txh->txh_space_towrite += space; -} - -int -dmu_tx_holds(dmu_tx_t *tx, uint64_t object) -{ - dmu_tx_hold_t *txh; - int holds = 0; - - /* - * By asserting that the tx is assigned, we're counting the - * number of dn_tx_holds, which is the same as the number of - * dn_holds. Otherwise, we'd be counting dn_holds, but - * dn_tx_holds could be 0. 
- */ - ASSERT(tx->tx_txg != 0); - - /* if (tx->tx_anyobj == TRUE) */ - /* return (0); */ - - for (txh = list_head(&tx->tx_holds); txh; - txh = list_next(&tx->tx_holds, txh)) { - if (txh->txh_dnode && txh->txh_dnode->dn_object == object) - holds++; - } - - return (holds); -} - -#ifdef ZFS_DEBUG -void -dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) -{ - dmu_tx_hold_t *txh; - int match_object = FALSE, match_offset = FALSE; - dnode_t *dn = db->db_dnode; - - ASSERT(tx->tx_txg != 0); - ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os); - ASSERT3U(dn->dn_object, ==, db->db.db_object); - - if (tx->tx_anyobj) - return; - - /* XXX No checking on the meta dnode for now */ - if (db->db.db_object == DMU_META_DNODE_OBJECT) - return; - - for (txh = list_head(&tx->tx_holds); txh; - txh = list_next(&tx->tx_holds, txh)) { - ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); - if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) - match_object = TRUE; - if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { - int datablkshift = dn->dn_datablkshift ? - dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - int shift = datablkshift + epbs * db->db_level; - uint64_t beginblk = shift >= 64 ? 0 : - (txh->txh_arg1 >> shift); - uint64_t endblk = shift >= 64 ? 0 : - ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); - uint64_t blkid = db->db_blkid; - - /* XXX txh_arg2 better not be zero... */ - - dprintf("found txh type %x beginblk=%llx endblk=%llx\n", - txh->txh_type, beginblk, endblk); - - switch (txh->txh_type) { - case THT_WRITE: - if (blkid >= beginblk && blkid <= endblk) - match_offset = TRUE; - /* - * We will let this hold work for the bonus - * buffer so that we don't need to hold it - * when creating a new object. - */ - if (blkid == DB_BONUS_BLKID) - match_offset = TRUE; - /* - * They might have to increase nlevels, - * thus dirtying the new TLIBs. Or the - * might have to change the block size, - * thus dirying the new lvl=0 blk=0. - */ - if (blkid == 0) - match_offset = TRUE; - break; - case THT_FREE: - if (blkid == beginblk && - (txh->txh_arg1 != 0 || - dn->dn_maxblkid == 0)) - match_offset = TRUE; - if (blkid == endblk && - txh->txh_arg2 != DMU_OBJECT_END) - match_offset = TRUE; - break; - case THT_BONUS: - if (blkid == DB_BONUS_BLKID) - match_offset = TRUE; - break; - case THT_ZAP: - match_offset = TRUE; - break; - case THT_NEWOBJECT: - match_object = TRUE; - break; - default: - ASSERT(!"bad txh_type"); - } - } - if (match_object && match_offset) - return; - } - panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", - (u_longlong_t)db->db.db_object, db->db_level, - (u_longlong_t)db->db_blkid); -} -#endif - -static int -dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) -{ - dmu_tx_hold_t *txh; - uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite; - - ASSERT3U(tx->tx_txg, ==, 0); - if (tx->tx_err) - return (tx->tx_err); - - tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); - tx->tx_needassign_txh = NULL; - - /* - * NB: No error returns are allowed after txg_hold_open, but - * before processing the dnode holds, due to the - * dmu_tx_unassign() logic. 
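For orientation, the block-range check in dmu_tx_dirty_buf() above reduces to shift arithmetic. As a purely illustrative example (the 128K block size and the offsets are invented, not taken from this change): with dn_datablkshift = 17, db_level = 0 and a THT_WRITE hold recorded with txh_arg1 = 1048576 (offset) and txh_arg2 = 262144 (length), shift = 17, beginblk = 1048576 >> 17 = 8 and endblk = (1048576 + 262144 - 1) >> 17 = 9, so dirtying a level-0 dbuf with db_blkid 8 or 9 is covered by the hold, while anything outside that window (and not otherwise matched) trips the panic at the end of the function.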
- */ - - towrite = tofree = tooverwrite = 0; - for (txh = list_head(&tx->tx_holds); txh; - txh = list_next(&tx->tx_holds, txh)) { - dnode_t *dn = txh->txh_dnode; - if (dn != NULL) { - mutex_enter(&dn->dn_mtx); - if (dn->dn_assigned_txg == tx->tx_txg - 1) { - mutex_exit(&dn->dn_mtx); - tx->tx_needassign_txh = txh; - return (ERESTART); - } - if (dn->dn_assigned_txg == 0) - dn->dn_assigned_txg = tx->tx_txg; - ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); - (void) refcount_add(&dn->dn_tx_holds, tx); - mutex_exit(&dn->dn_mtx); - } - towrite += txh->txh_space_towrite; - tofree += txh->txh_space_tofree; - tooverwrite += txh->txh_space_tooverwrite; - } - - /* - * NB: This check must be after we've held the dnodes, so that - * the dmu_tx_unassign() logic will work properly - */ - if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) - return (ERESTART); - - /* - * If a snapshot has been taken since we made our estimates, - * assume that we won't be able to free or overwrite anything. - */ - if (tx->tx_objset && - dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) > - tx->tx_lastsnap_txg) { - towrite += tooverwrite; - tooverwrite = tofree = 0; - } - - /* - * Convert logical size to worst-case allocated size. - */ - fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; - lsize = towrite + tooverwrite; - asize = spa_get_asize(tx->tx_pool->dp_spa, lsize); - -#ifdef ZFS_DEBUG - tx->tx_space_towrite = asize; - tx->tx_space_tofree = tofree; - tx->tx_space_tooverwrite = tooverwrite; -#endif - - if (tx->tx_dir && asize != 0) { - int err = dsl_dir_tempreserve_space(tx->tx_dir, - lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx); - if (err) - return (err); - } - - return (0); -} - -static void -dmu_tx_unassign(dmu_tx_t *tx) -{ - dmu_tx_hold_t *txh; - - if (tx->tx_txg == 0) - return; - - txg_rele_to_quiesce(&tx->tx_txgh); - - for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; - txh = list_next(&tx->tx_holds, txh)) { - dnode_t *dn = txh->txh_dnode; - - if (dn == NULL) - continue; - mutex_enter(&dn->dn_mtx); - ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); - - if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { - dn->dn_assigned_txg = 0; - cv_broadcast(&dn->dn_notxholds); - } - mutex_exit(&dn->dn_mtx); - } - - txg_rele_to_sync(&tx->tx_txgh); - - tx->tx_lasttried_txg = tx->tx_txg; - tx->tx_txg = 0; -} - -/* - * Assign tx to a transaction group. txg_how can be one of: - * - * (1) TXG_WAIT. If the current open txg is full, waits until there's - * a new one. This should be used when you're not holding locks. - * If will only fail if we're truly out of space (or over quota). - * - * (2) TXG_NOWAIT. If we can't assign into the current open txg without - * blocking, returns immediately with ERESTART. This should be used - * whenever you're holding locks. On an ERESTART error, the caller - * should drop locks, do a dmu_tx_wait(tx), and try again. - * - * (3) A specific txg. Use this if you need to ensure that multiple - * transactions all sync in the same txg. Like TXG_NOWAIT, it - * returns ERESTART if it can't assign you into the requested txg. 
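The comment above describes the contract from the caller's side; the following sketch (not part of the deleted file) shows the hold/assign/commit cycle it implies. The helper name, object number and byte range are hypothetical, and the TXG_NOWAIT retry loop follows the pattern the comment recommends.

#include <sys/dmu.h>	/* dmu_tx_*() prototypes in this tree */

/* Hypothetical caller illustrating the TXG_NOWAIT retry loop. */
static int
example_write_tx(objset_t *os, uint64_t object, uint64_t off, uint64_t len)
{
	dmu_tx_t *tx;
	int err;

top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, off, len);	/* declare intent up front */

	err = dmu_tx_assign(tx, TXG_NOWAIT);
	if (err == ERESTART) {
		/* drop any locks held by the caller before waiting */
		dmu_tx_wait(tx);
		dmu_tx_abort(tx);
		goto top;			/* build a fresh tx and retry */
	}
	if (err != 0) {
		dmu_tx_abort(tx);		/* genuinely out of space or over quota */
		return (err);
	}

	/* ... dirty the buffers covered by the hold here ... */

	dmu_tx_commit(tx);
	return (0);
}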
- */ -int -dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) -{ - int err; - - ASSERT(tx->tx_txg == 0); - ASSERT(txg_how != 0); - ASSERT(!dsl_pool_sync_context(tx->tx_pool)); - - while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { - dmu_tx_unassign(tx); - - if (err != ERESTART || txg_how != TXG_WAIT) - return (err); - - dmu_tx_wait(tx); - } - - txg_rele_to_quiesce(&tx->tx_txgh); - - return (0); -} - -void -dmu_tx_wait(dmu_tx_t *tx) -{ - ASSERT(tx->tx_txg == 0); - ASSERT(tx->tx_lasttried_txg != 0); - - if (tx->tx_needassign_txh) { - dnode_t *dn = tx->tx_needassign_txh->txh_dnode; - - mutex_enter(&dn->dn_mtx); - while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) - cv_wait(&dn->dn_notxholds, &dn->dn_mtx); - mutex_exit(&dn->dn_mtx); - tx->tx_needassign_txh = NULL; - } else { - txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); - } -} - -void -dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) -{ -#ifdef ZFS_DEBUG - if (tx->tx_dir == NULL || delta == 0) - return; - - if (delta > 0) { - ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=, - tx->tx_space_towrite); - (void) refcount_add_many(&tx->tx_space_written, delta, NULL); - } else { - (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL); - } -#endif -} - -void -dmu_tx_commit(dmu_tx_t *tx) -{ - dmu_tx_hold_t *txh; - - ASSERT(tx->tx_txg != 0); - - while (txh = list_head(&tx->tx_holds)) { - dnode_t *dn = txh->txh_dnode; - - list_remove(&tx->tx_holds, txh); - kmem_free(txh, sizeof (dmu_tx_hold_t)); - if (dn == NULL) - continue; - mutex_enter(&dn->dn_mtx); - ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); - - if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { - dn->dn_assigned_txg = 0; - cv_broadcast(&dn->dn_notxholds); - } - mutex_exit(&dn->dn_mtx); - dnode_rele(dn, tx); - } - - if (tx->tx_tempreserve_cookie) - dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); - - if (tx->tx_anyobj == FALSE) - txg_rele_to_sync(&tx->tx_txgh); -#ifdef ZFS_DEBUG - dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", - tx->tx_space_towrite, refcount_count(&tx->tx_space_written), - tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); - refcount_destroy_many(&tx->tx_space_written, - refcount_count(&tx->tx_space_written)); - refcount_destroy_many(&tx->tx_space_freed, - refcount_count(&tx->tx_space_freed)); -#endif - kmem_free(tx, sizeof (dmu_tx_t)); -} - -void -dmu_tx_abort(dmu_tx_t *tx) -{ - dmu_tx_hold_t *txh; - - ASSERT(tx->tx_txg == 0); - - while (txh = list_head(&tx->tx_holds)) { - dnode_t *dn = txh->txh_dnode; - - list_remove(&tx->tx_holds, txh); - kmem_free(txh, sizeof (dmu_tx_hold_t)); - if (dn != NULL) - dnode_rele(dn, tx); - } -#ifdef ZFS_DEBUG - refcount_destroy_many(&tx->tx_space_written, - refcount_count(&tx->tx_space_written)); - refcount_destroy_many(&tx->tx_space_freed, - refcount_count(&tx->tx_space_freed)); -#endif - kmem_free(tx, sizeof (dmu_tx_t)); -} - -uint64_t -dmu_tx_get_txg(dmu_tx_t *tx) -{ - ASSERT(tx->tx_txg != 0); - return (tx->tx_txg); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c deleted file mode 100644 index 78d625c..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c +++ /dev/null @@ -1,655 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/dnode.h> -#include <sys/dmu_objset.h> -#include <sys/dmu_zfetch.h> -#include <sys/dmu.h> -#include <sys/dbuf.h> - -/* - * I'm against tune-ables, but these should probably exist as tweakable globals - * until we can get this working the way we want it to. - */ - -int zfs_prefetch_disable = 0; -SYSCTL_DECL(_vfs_zfs); -TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable); -SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN, - &zfs_prefetch_disable, 0, "Disable prefetch"); - -/* max # of streams per zfetch */ -uint32_t zfetch_max_streams = 8; -/* min time before stream reclaim */ -uint32_t zfetch_min_sec_reap = 2; -/* max number of blocks to fetch at a time */ -uint32_t zfetch_block_cap = 256; -/* number of bytes in a array_read at which we stop prefetching (1Mb) */ -uint64_t zfetch_array_rd_sz = 1024 * 1024; - -/* forward decls for static routines */ -static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); -static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); -static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t); -static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); -static int dmu_zfetch_find(zfetch_t *, zstream_t *, int); -static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); -static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); -static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); -static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *); - -/* - * Given a zfetch structure and a zstream structure, determine whether the - * blocks to be read are part of a co-linear pair of existing prefetch - * streams. If a set is found, coalesce the streams, removing one, and - * configure the prefetch so it looks for a strided access pattern. - * - * In other words: if we find two sequential access streams that are - * the same length and distance N appart, and this read is N from the - * last stream, then we are probably in a strided access pattern. So - * combine the two sequential streams into a single strided stream. - * - * If no co-linear streams are found, return NULL. - */ -static int -dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) -{ - zstream_t *z_walk; - zstream_t *z_comp; - - if (! 
rw_tryenter(&zf->zf_rwlock, RW_WRITER)) - return (0); - - if (zh == NULL) { - rw_exit(&zf->zf_rwlock); - return (0); - } - - for (z_walk = list_head(&zf->zf_stream); z_walk; - z_walk = list_next(&zf->zf_stream, z_walk)) { - for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp; - z_comp = list_next(&zf->zf_stream, z_comp)) { - int64_t diff; - - if (z_walk->zst_len != z_walk->zst_stride || - z_comp->zst_len != z_comp->zst_stride) { - continue; - } - - diff = z_comp->zst_offset - z_walk->zst_offset; - if (z_comp->zst_offset + diff == zh->zst_offset) { - z_walk->zst_offset = zh->zst_offset; - z_walk->zst_direction = diff < 0 ? -1 : 1; - z_walk->zst_stride = - diff * z_walk->zst_direction; - z_walk->zst_ph_offset = - zh->zst_offset + z_walk->zst_stride; - dmu_zfetch_stream_remove(zf, z_comp); - mutex_destroy(&z_comp->zst_lock); - kmem_free(z_comp, sizeof (zstream_t)); - - dmu_zfetch_dofetch(zf, z_walk); - - rw_exit(&zf->zf_rwlock); - return (1); - } - - diff = z_walk->zst_offset - z_comp->zst_offset; - if (z_walk->zst_offset + diff == zh->zst_offset) { - z_walk->zst_offset = zh->zst_offset; - z_walk->zst_direction = diff < 0 ? -1 : 1; - z_walk->zst_stride = - diff * z_walk->zst_direction; - z_walk->zst_ph_offset = - zh->zst_offset + z_walk->zst_stride; - dmu_zfetch_stream_remove(zf, z_comp); - mutex_destroy(&z_comp->zst_lock); - kmem_free(z_comp, sizeof (zstream_t)); - - dmu_zfetch_dofetch(zf, z_walk); - - rw_exit(&zf->zf_rwlock); - return (1); - } - } - } - - rw_exit(&zf->zf_rwlock); - return (0); -} - -/* - * Given a zstream_t, determine the bounds of the prefetch. Then call the - * routine that actually prefetches the individual blocks. - */ -static void -dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs) -{ - uint64_t prefetch_tail; - uint64_t prefetch_limit; - uint64_t prefetch_ofst; - uint64_t prefetch_len; - uint64_t blocks_fetched; - - zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len); - zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap); - - prefetch_tail = MAX((int64_t)zs->zst_ph_offset, - (int64_t)(zs->zst_offset + zs->zst_stride)); - /* - * XXX: use a faster division method? - */ - prefetch_limit = zs->zst_offset + zs->zst_len + - (zs->zst_cap * zs->zst_stride) / zs->zst_len; - - while (prefetch_tail < prefetch_limit) { - prefetch_ofst = zs->zst_offset + zs->zst_direction * - (prefetch_tail - zs->zst_offset); - - prefetch_len = zs->zst_len; - - /* - * Don't prefetch beyond the end of the file, if working - * backwards. - */ - if ((zs->zst_direction == ZFETCH_BACKWARD) && - (prefetch_ofst > prefetch_tail)) { - prefetch_len += prefetch_ofst; - prefetch_ofst = 0; - } - - /* don't prefetch more than we're supposed to */ - if (prefetch_len > zs->zst_len) - break; - - blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode, - prefetch_ofst, zs->zst_len); - - prefetch_tail += zs->zst_stride; - /* stop if we've run out of stuff to prefetch */ - if (blocks_fetched < zs->zst_len) - break; - } - zs->zst_ph_offset = prefetch_tail; - zs->zst_last = lbolt; -} - -/* - * This takes a pointer to a zfetch structure and a dnode. It performs the - * necessary setup for the zfetch structure, grokking data from the - * associated dnode. 
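A concrete (invented) instance of the co-linear case handled by dmu_zfetch_colinear() above: suppose one stream has zst_offset = 0 with zst_len = zst_stride = 4, a second has zst_offset = 100 with zst_len = zst_stride = 4, and the new access arrives at block 200. Then diff = 100 - 0 = 100 and z_comp->zst_offset + diff = 200 matches the new offset, so the two sequential streams collapse into a single strided stream at offset 200 with stride 100, and dmu_zfetch_dofetch() begins prefetching ahead at offsets 300, 400, and so on.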
- */ -void -dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) -{ - if (zf == NULL) { - return; - } - - zf->zf_dnode = dno; - zf->zf_stream_cnt = 0; - zf->zf_alloc_fail = 0; - - list_create(&zf->zf_stream, sizeof (zstream_t), - offsetof(zstream_t, zst_node)); - - rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); -} - -/* - * This function computes the actual size, in blocks, that can be prefetched, - * and fetches it. - */ -static uint64_t -dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) -{ - uint64_t fetchsz; - uint64_t i; - - fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); - - for (i = 0; i < fetchsz; i++) { - dbuf_prefetch(dn, blkid + i); - } - - return (fetchsz); -} - -/* - * this function returns the number of blocks that would be prefetched, based - * upon the supplied dnode, blockid, and nblks. This is used so that we can - * update streams in place, and then prefetch with their old value after the - * fact. This way, we can delay the prefetch, but subsequent accesses to the - * stream won't result in the same data being prefetched multiple times. - */ -static uint64_t -dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) -{ - uint64_t fetchsz; - - if (blkid > dn->dn_maxblkid) { - return (0); - } - - /* compute fetch size */ - if (blkid + nblks + 1 > dn->dn_maxblkid) { - fetchsz = (dn->dn_maxblkid - blkid) + 1; - ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid); - } else { - fetchsz = nblks; - } - - - return (fetchsz); -} - -/* - * given a zfetch and a zsearch structure, see if there is an associated zstream - * for this block read. If so, it starts a prefetch for the stream it - * located and returns true, otherwise it returns false - */ -static int -dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) -{ - zstream_t *zs; - int64_t diff; - int reset = !prefetched; - int rc = 0; - - if (zh == NULL) - return (0); - - /* - * XXX: This locking strategy is a bit coarse; however, it's impact has - * yet to be tested. If this turns out to be an issue, it can be - * modified in a number of different ways. - */ - - rw_enter(&zf->zf_rwlock, RW_READER); -top: - - for (zs = list_head(&zf->zf_stream); zs; - zs = list_next(&zf->zf_stream, zs)) { - - /* - * XXX - should this be an assert? - */ - if (zs->zst_len == 0) { - /* bogus stream */ - continue; - } - - /* - * We hit this case when we are in a strided prefetch stream: - * we will read "len" blocks before "striding". - */ - if (zh->zst_offset >= zs->zst_offset && - zh->zst_offset < zs->zst_offset + zs->zst_len) { - /* already fetched */ - rc = 1; - goto out; - } - - /* - * This is the forward sequential read case: we increment - * len by one each time we hit here, so we will enter this - * case on every read. - */ - if (zh->zst_offset == zs->zst_offset + zs->zst_len) { - - reset = !prefetched && zs->zst_len > 1; - - mutex_enter(&zs->zst_lock); - - if (zh->zst_offset != zs->zst_offset + zs->zst_len) { - mutex_exit(&zs->zst_lock); - goto top; - } - zs->zst_len += zh->zst_len; - diff = zs->zst_len - zfetch_block_cap; - if (diff > 0) { - zs->zst_offset += diff; - zs->zst_len = zs->zst_len > diff ? - zs->zst_len - diff : 0; - } - zs->zst_direction = ZFETCH_FORWARD; - - break; - - /* - * Same as above, but reading backwards through the file. 
- */ - } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) { - /* backwards sequential access */ - - reset = !prefetched && zs->zst_len > 1; - - mutex_enter(&zs->zst_lock); - - if (zh->zst_offset != zs->zst_offset - zh->zst_len) { - mutex_exit(&zs->zst_lock); - goto top; - } - - zs->zst_offset = zs->zst_offset > zh->zst_len ? - zs->zst_offset - zh->zst_len : 0; - zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ? - zs->zst_ph_offset - zh->zst_len : 0; - zs->zst_len += zh->zst_len; - - diff = zs->zst_len - zfetch_block_cap; - if (diff > 0) { - zs->zst_ph_offset = zs->zst_ph_offset > diff ? - zs->zst_ph_offset - diff : 0; - zs->zst_len = zs->zst_len > diff ? - zs->zst_len - diff : zs->zst_len; - } - zs->zst_direction = ZFETCH_BACKWARD; - - break; - - } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride < - zs->zst_len) && (zs->zst_len != zs->zst_stride)) { - /* strided forward access */ - - mutex_enter(&zs->zst_lock); - - if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >= - zs->zst_len) || (zs->zst_len == zs->zst_stride)) { - mutex_exit(&zs->zst_lock); - goto top; - } - - zs->zst_offset += zs->zst_stride; - zs->zst_direction = ZFETCH_FORWARD; - - break; - - } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride < - zs->zst_len) && (zs->zst_len != zs->zst_stride)) { - /* strided reverse access */ - - mutex_enter(&zs->zst_lock); - - if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >= - zs->zst_len) || (zs->zst_len == zs->zst_stride)) { - mutex_exit(&zs->zst_lock); - goto top; - } - - zs->zst_offset = zs->zst_offset > zs->zst_stride ? - zs->zst_offset - zs->zst_stride : 0; - zs->zst_ph_offset = (zs->zst_ph_offset > - (2 * zs->zst_stride)) ? - (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0; - zs->zst_direction = ZFETCH_BACKWARD; - - break; - } - } - - if (zs) { - if (reset) { - zstream_t *remove = zs; - - rc = 0; - mutex_exit(&zs->zst_lock); - rw_exit(&zf->zf_rwlock); - rw_enter(&zf->zf_rwlock, RW_WRITER); - /* - * Relocate the stream, in case someone removes - * it while we were acquiring the WRITER lock. - */ - for (zs = list_head(&zf->zf_stream); zs; - zs = list_next(&zf->zf_stream, zs)) { - if (zs == remove) { - dmu_zfetch_stream_remove(zf, zs); - mutex_destroy(&zs->zst_lock); - kmem_free(zs, sizeof (zstream_t)); - break; - } - } - } else { - rc = 1; - dmu_zfetch_dofetch(zf, zs); - mutex_exit(&zs->zst_lock); - } - } -out: - rw_exit(&zf->zf_rwlock); - return (rc); -} - -/* - * Clean-up state associated with a zfetch structure. This frees allocated - * structure members, empties the zf_stream tree, and generally makes things - * nice. This doesn't free the zfetch_t itself, that's left to the caller. - */ -void -dmu_zfetch_rele(zfetch_t *zf) -{ - zstream_t *zs; - zstream_t *zs_next; - - ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); - - for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) { - zs_next = list_next(&zf->zf_stream, zs); - - list_remove(&zf->zf_stream, zs); - mutex_destroy(&zs->zst_lock); - kmem_free(zs, sizeof (zstream_t)); - } - list_destroy(&zf->zf_stream); - rw_destroy(&zf->zf_rwlock); - - zf->zf_dnode = NULL; -} - -/* - * Given a zfetch and zstream structure, insert the zstream structure into the - * AVL tree contained within the zfetch structure. Peform the appropriate - * book-keeping. It is possible that another thread has inserted a stream which - * matches one that we are about to insert, so we must be sure to check for this - * case. If one is found, return failure, and let the caller cleanup the - * duplicates. 
- */ -static int -dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs) -{ - zstream_t *zs_walk; - zstream_t *zs_next; - - ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); - - for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) { - zs_next = list_next(&zf->zf_stream, zs_walk); - - if (dmu_zfetch_streams_equal(zs_walk, zs)) { - return (0); - } - } - - list_insert_head(&zf->zf_stream, zs); - zf->zf_stream_cnt++; - - return (1); -} - - -/* - * Walk the list of zstreams in the given zfetch, find an old one (by time), and - * reclaim it for use by the caller. - */ -static zstream_t * -dmu_zfetch_stream_reclaim(zfetch_t *zf) -{ - zstream_t *zs; - - if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) - return (0); - - for (zs = list_head(&zf->zf_stream); zs; - zs = list_next(&zf->zf_stream, zs)) { - - if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap) - break; - } - - if (zs) { - dmu_zfetch_stream_remove(zf, zs); - mutex_destroy(&zs->zst_lock); - bzero(zs, sizeof (zstream_t)); - } else { - zf->zf_alloc_fail++; - } - rw_exit(&zf->zf_rwlock); - - return (zs); -} - -/* - * Given a zfetch and zstream structure, remove the zstream structure from its - * container in the zfetch structure. Perform the appropriate book-keeping. - */ -static void -dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) -{ - ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); - - list_remove(&zf->zf_stream, zs); - zf->zf_stream_cnt--; -} - -static int -dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2) -{ - if (zs1->zst_offset != zs2->zst_offset) - return (0); - - if (zs1->zst_len != zs2->zst_len) - return (0); - - if (zs1->zst_stride != zs2->zst_stride) - return (0); - - if (zs1->zst_ph_offset != zs2->zst_ph_offset) - return (0); - - if (zs1->zst_cap != zs2->zst_cap) - return (0); - - if (zs1->zst_direction != zs2->zst_direction) - return (0); - - return (1); -} - -/* - * This is the prefetch entry point. It calls all of the other dmu_zfetch - * routines to create, delete, find, or operate upon prefetch streams. - */ -void -dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) -{ - zstream_t zst; - zstream_t *newstream; - int fetched; - int inserted; - unsigned int blkshft; - uint64_t blksz; - - if (zfs_prefetch_disable) - return; - - /* files that aren't ln2 blocksz are only one block -- nothing to do */ - if (!zf->zf_dnode->dn_datablkshift) - return; - - /* convert offset and size, into blockid and nblocks */ - blkshft = zf->zf_dnode->dn_datablkshift; - blksz = (1 << blkshft); - - bzero(&zst, sizeof (zstream_t)); - zst.zst_offset = offset >> blkshft; - zst.zst_len = (P2ROUNDUP(offset + size, blksz) - - P2ALIGN(offset, blksz)) >> blkshft; - - fetched = dmu_zfetch_find(zf, &zst, prefetched); - if (!fetched) { - fetched = dmu_zfetch_colinear(zf, &zst); - } - - if (!fetched) { - newstream = dmu_zfetch_stream_reclaim(zf); - - /* - * we still couldn't find a stream, drop the lock, and allocate - * one if possible. Otherwise, give up and go home. 
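The offset-to-block conversion in dmu_zfetch() above is worth a quick worked example (the 16K block size and byte values are invented): with dn_datablkshift = 14 (blksz = 16384), a read of size = 50000 at offset = 20000 gives zst_offset = 20000 >> 14 = 1 and zst_len = (P2ROUNDUP(70000, 16384) - P2ALIGN(20000, 16384)) >> 14 = (81920 - 16384) >> 14 = 4; the access therefore touches blocks 1 through 4, and the stream matching above operates on that four-block window.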
- */ - if (newstream == NULL) { - uint64_t maxblocks; - uint32_t max_streams; - uint32_t cur_streams; - - cur_streams = zf->zf_stream_cnt; - maxblocks = zf->zf_dnode->dn_maxblkid; - - max_streams = MIN(zfetch_max_streams, - (maxblocks / zfetch_block_cap)); - if (max_streams == 0) { - max_streams++; - } - - if (cur_streams >= max_streams) { - return; - } - - newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP); - } - - newstream->zst_offset = zst.zst_offset; - newstream->zst_len = zst.zst_len; - newstream->zst_stride = zst.zst_len; - newstream->zst_ph_offset = zst.zst_len + zst.zst_offset; - newstream->zst_cap = zst.zst_len; - newstream->zst_direction = ZFETCH_FORWARD; - newstream->zst_last = lbolt; - - mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL); - - rw_enter(&zf->zf_rwlock, RW_WRITER); - inserted = dmu_zfetch_stream_insert(zf, newstream); - rw_exit(&zf->zf_rwlock); - - if (!inserted) { - mutex_destroy(&newstream->zst_lock); - kmem_free(newstream, sizeof (zstream_t)); - } - } -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c deleted file mode 100644 index ca50285..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ /dev/null @@ -1,1369 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/dbuf.h> -#include <sys/dnode.h> -#include <sys/dmu.h> -#include <sys/dmu_impl.h> -#include <sys/dmu_tx.h> -#include <sys/dmu_objset.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_dataset.h> -#include <sys/spa.h> -#include <sys/zio.h> -#include <sys/dmu_zfetch.h> - -static int free_range_compar(const void *node1, const void *node2); - -static kmem_cache_t *dnode_cache; - -static dnode_phys_t dnode_phys_zero; - -int zfs_default_bs = SPA_MINBLOCKSHIFT; -int zfs_default_ibs = DN_MAX_INDBLKSHIFT; - -/* ARGSUSED */ -static int -dnode_cons(void *arg, void *unused, int kmflag) -{ - int i; - dnode_t *dn = arg; - bzero(dn, sizeof (dnode_t)); - - cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); - rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); - mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); - refcount_create(&dn->dn_holds); - refcount_create(&dn->dn_tx_holds); - - for (i = 0; i < TXG_SIZE; i++) { - avl_create(&dn->dn_ranges[i], free_range_compar, - sizeof (free_range_t), - offsetof(struct free_range, fr_node)); - list_create(&dn->dn_dirty_records[i], - sizeof (dbuf_dirty_record_t), - offsetof(dbuf_dirty_record_t, dr_dirty_node)); - } - - list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_link)); - - return (0); -} - -/* ARGSUSED */ -static void -dnode_dest(void *arg, void *unused) -{ - int i; - dnode_t *dn = arg; - - cv_destroy(&dn->dn_notxholds); - rw_destroy(&dn->dn_struct_rwlock); - mutex_destroy(&dn->dn_mtx); - mutex_destroy(&dn->dn_dbufs_mtx); - refcount_destroy(&dn->dn_holds); - refcount_destroy(&dn->dn_tx_holds); - - for (i = 0; i < TXG_SIZE; i++) { - avl_destroy(&dn->dn_ranges[i]); - list_destroy(&dn->dn_dirty_records[i]); - } - - list_destroy(&dn->dn_dbufs); -} - -void -dnode_init(void) -{ - dnode_cache = kmem_cache_create("dnode_t", - sizeof (dnode_t), - 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); -} - -void -dnode_fini(void) -{ - kmem_cache_destroy(dnode_cache); -} - - -#ifdef ZFS_DEBUG -void -dnode_verify(dnode_t *dn) -{ - int drop_struct_lock = FALSE; - - ASSERT(dn->dn_phys); - ASSERT(dn->dn_objset); - - ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); - - if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) - return; - - if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; - } - if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { - int i; - ASSERT3U(dn->dn_indblkshift, >=, 0); - ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); - if (dn->dn_datablkshift) { - ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); - ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); - ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz); - } - ASSERT3U(dn->dn_nlevels, <=, 30); - ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES); - ASSERT3U(dn->dn_nblkptr, >=, 1); - ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); - ASSERT3U(dn->dn_datablksz, ==, - dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); - ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); - ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + - dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); - for (i = 0; i < TXG_SIZE; i++) { - ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); - } - } - if (dn->dn_phys->dn_type != DMU_OT_NONE) - ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); - ASSERT(dn->dn_object == 
DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL); - if (dn->dn_dbuf != NULL) { - ASSERT3P(dn->dn_phys, ==, - (dnode_phys_t *)dn->dn_dbuf->db.db_data + - (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); - } - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); -} -#endif - -void -dnode_byteswap(dnode_phys_t *dnp) -{ - uint64_t *buf64 = (void*)&dnp->dn_blkptr; - int i; - - if (dnp->dn_type == DMU_OT_NONE) { - bzero(dnp, sizeof (dnode_phys_t)); - return; - } - - dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); - dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); - dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); - dnp->dn_used = BSWAP_64(dnp->dn_used); - - /* - * dn_nblkptr is only one byte, so it's OK to read it in either - * byte order. We can't read dn_bouslen. - */ - ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); - ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); - for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) - buf64[i] = BSWAP_64(buf64[i]); - - /* - * OK to check dn_bonuslen for zero, because it won't matter if - * we have the wrong byte order. This is necessary because the - * dnode dnode is smaller than a regular dnode. - */ - if (dnp->dn_bonuslen != 0) { - /* - * Note that the bonus length calculated here may be - * longer than the actual bonus buffer. This is because - * we always put the bonus buffer after the last block - * pointer (instead of packing it against the end of the - * dnode buffer). - */ - int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); - size_t len = DN_MAX_BONUSLEN - off; - ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES); - dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len); - } -} - -void -dnode_buf_byteswap(void *vbuf, size_t size) -{ - dnode_phys_t *buf = vbuf; - int i; - - ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT)); - ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0); - - size >>= DNODE_SHIFT; - for (i = 0; i < size; i++) { - dnode_byteswap(buf); - buf++; - } -} - -static int -free_range_compar(const void *node1, const void *node2) -{ - const free_range_t *rp1 = node1; - const free_range_t *rp2 = node2; - - if (rp1->fr_blkid < rp2->fr_blkid) - return (-1); - else if (rp1->fr_blkid > rp2->fr_blkid) - return (1); - else return (0); -} - -static void -dnode_setdblksz(dnode_t *dn, int size) -{ - ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0); - ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); - ASSERT3U(size, >=, SPA_MINBLOCKSIZE); - ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, - 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); - dn->dn_datablksz = size; - dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; - dn->dn_datablkshift = ISP2(size) ? 
highbit(size - 1) : 0; -} - -static dnode_t * -dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, - uint64_t object) -{ - dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); - - dn->dn_objset = os; - dn->dn_object = object; - dn->dn_dbuf = db; - dn->dn_phys = dnp; - - if (dnp->dn_datablkszsec) - dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); - dn->dn_indblkshift = dnp->dn_indblkshift; - dn->dn_nlevels = dnp->dn_nlevels; - dn->dn_type = dnp->dn_type; - dn->dn_nblkptr = dnp->dn_nblkptr; - dn->dn_checksum = dnp->dn_checksum; - dn->dn_compress = dnp->dn_compress; - dn->dn_bonustype = dnp->dn_bonustype; - dn->dn_bonuslen = dnp->dn_bonuslen; - dn->dn_maxblkid = dnp->dn_maxblkid; - - dmu_zfetch_init(&dn->dn_zfetch, dn); - - ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); - mutex_enter(&os->os_lock); - list_insert_head(&os->os_dnodes, dn); - mutex_exit(&os->os_lock); - - return (dn); -} - -static void -dnode_destroy(dnode_t *dn) -{ - objset_impl_t *os = dn->dn_objset; - -#ifdef ZFS_DEBUG - int i; - - for (i = 0; i < TXG_SIZE; i++) { - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); - ASSERT(NULL == list_head(&dn->dn_dirty_records[i])); - ASSERT(0 == avl_numnodes(&dn->dn_ranges[i])); - } - ASSERT(NULL == list_head(&dn->dn_dbufs)); -#endif - - mutex_enter(&os->os_lock); - list_remove(&os->os_dnodes, dn); - mutex_exit(&os->os_lock); - - if (dn->dn_dirtyctx_firstset) { - kmem_free(dn->dn_dirtyctx_firstset, 1); - dn->dn_dirtyctx_firstset = NULL; - } - dmu_zfetch_rele(&dn->dn_zfetch); - if (dn->dn_bonus) { - mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); - dn->dn_bonus = NULL; - } - kmem_cache_free(dnode_cache, dn); -} - -void -dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - int i; - - if (blocksize == 0) - blocksize = 1 << zfs_default_bs; - else if (blocksize > SPA_MAXBLOCKSIZE) - blocksize = SPA_MAXBLOCKSIZE; - else - blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); - - if (ibs == 0) - ibs = zfs_default_ibs; - - ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); - - dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset, - dn->dn_object, tx->tx_txg, blocksize, ibs); - - ASSERT(dn->dn_type == DMU_OT_NONE); - ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); - ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); - ASSERT(ot != DMU_OT_NONE); - ASSERT3U(ot, <, DMU_OT_NUMTYPES); - ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || - (bonustype != DMU_OT_NONE && bonuslen != 0)); - ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); - ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); - ASSERT(dn->dn_type == DMU_OT_NONE); - ASSERT3U(dn->dn_maxblkid, ==, 0); - ASSERT3U(dn->dn_allocated_txg, ==, 0); - ASSERT3U(dn->dn_assigned_txg, ==, 0); - ASSERT(refcount_is_zero(&dn->dn_tx_holds)); - ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); - - for (i = 0; i < TXG_SIZE; i++) { - ASSERT3U(dn->dn_next_nlevels[i], ==, 0); - ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); - ASSERT3U(dn->dn_next_blksz[i], ==, 0); - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); - ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); - ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0); - } - - dn->dn_type = ot; - dnode_setdblksz(dn, blocksize); - dn->dn_indblkshift = ibs; - dn->dn_nlevels = 1; - dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); - dn->dn_bonustype = bonustype; - dn->dn_bonuslen = 
bonuslen; - dn->dn_checksum = ZIO_CHECKSUM_INHERIT; - dn->dn_compress = ZIO_COMPRESS_INHERIT; - dn->dn_dirtyctx = 0; - - dn->dn_free_txg = 0; - if (dn->dn_dirtyctx_firstset) { - kmem_free(dn->dn_dirtyctx_firstset, 1); - dn->dn_dirtyctx_firstset = NULL; - } - - dn->dn_allocated_txg = tx->tx_txg; - - dnode_setdirty(dn, tx); - dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; - dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; -} - -void -dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - int i; - dmu_buf_impl_t *db = NULL; - - ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); - ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE); - ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0); - ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - ASSERT(tx->tx_txg != 0); - ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || - (bonustype != DMU_OT_NONE && bonuslen != 0)); - ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); - ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); - - for (i = 0; i < TXG_SIZE; i++) - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); - - /* clean up any unreferenced dbufs */ - (void) dnode_evict_dbufs(dn, 0); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); - - /* - * XXX I should really have a generation number to tell if we - * need to do this... - */ - if (blocksize != dn->dn_datablksz || - dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) { - /* free all old data */ - dnode_free_range(dn, 0, -1ULL, tx); - } - - /* change blocksize */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (blocksize != dn->dn_datablksz && - (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || - list_head(&dn->dn_dbufs) != NULL)) { - db = dbuf_hold(dn, 0, FTAG); - dbuf_new_size(db, blocksize, tx); - } - dnode_setdblksz(dn, blocksize); - dnode_setdirty(dn, tx); - dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; - rw_exit(&dn->dn_struct_rwlock); - if (db) { - dbuf_rele(db, FTAG); - db = NULL; - } - - /* change type */ - dn->dn_type = ot; - - if (dn->dn_bonuslen != bonuslen) { - /* change bonus size */ - if (bonuslen == 0) - bonuslen = 1; /* XXX */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (dn->dn_bonus == NULL) - dn->dn_bonus = dbuf_create_bonus(dn); - db = dn->dn_bonus; - rw_exit(&dn->dn_struct_rwlock); - if (refcount_add(&db->db_holds, FTAG) == 1) - dnode_add_ref(dn, db); - VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); - mutex_enter(&db->db_mtx); - ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); - ASSERT(db->db.db_data != NULL); - db->db.db_size = bonuslen; - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); - } - - /* change bonus size and type */ - mutex_enter(&dn->dn_mtx); - dn->dn_bonustype = bonustype; - dn->dn_bonuslen = bonuslen; - dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); - dn->dn_checksum = ZIO_CHECKSUM_INHERIT; - dn->dn_compress = ZIO_COMPRESS_INHERIT; - ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - - /* - * NB: we have to do the dbuf_rele after we've changed the - * dn_bonuslen, for the sake of dbuf_verify(). - */ - if (db) - dbuf_rele(db, FTAG); - - dn->dn_allocated_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); -} - -void -dnode_special_close(dnode_t *dn) -{ - /* - * Wait for final references to the dnode to clear. This can - * only happen if the arc is asyncronously evicting state that - * has a hold on this dnode while we are trying to evict this - * dnode. 
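The dn_nblkptr arithmetic shared by dnode_allocate() and dnode_reallocate() above can be sanity-checked by hand, assuming the classic 512-byte dnode_phys_t layout in which DN_MAX_BONUSLEN is 320 bytes and a blkptr_t is 128 bytes (SPA_BLKPTRSHIFT = 7): with bonuslen = 0 the dnode gets 1 + (320 >> 7) = 3 block pointers, while a full 320-byte bonus buffer leaves only 1 + (0 >> 7) = 1, since the bonus data overlays the space of the other two pointers.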
- */ - while (refcount_count(&dn->dn_holds) > 0) - delay(1); - dnode_destroy(dn); -} - -dnode_t * -dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object) -{ - dnode_t *dn = dnode_create(os, dnp, NULL, object); - DNODE_VERIFY(dn); - return (dn); -} - -static void -dnode_buf_pageout(dmu_buf_t *db, void *arg) -{ - dnode_t **children_dnodes = arg; - int i; - int epb = db->db_size >> DNODE_SHIFT; - - for (i = 0; i < epb; i++) { - dnode_t *dn = children_dnodes[i]; - int n; - - if (dn == NULL) - continue; -#ifdef ZFS_DEBUG - /* - * If there are holds on this dnode, then there should - * be holds on the dnode's containing dbuf as well; thus - * it wouldn't be eligable for eviction and this function - * would not have been called. - */ - ASSERT(refcount_is_zero(&dn->dn_holds)); - ASSERT(list_head(&dn->dn_dbufs) == NULL); - ASSERT(refcount_is_zero(&dn->dn_tx_holds)); - - for (n = 0; n < TXG_SIZE; n++) - ASSERT(!list_link_active(&dn->dn_dirty_link[n])); -#endif - children_dnodes[i] = NULL; - dnode_destroy(dn); - } - kmem_free(children_dnodes, epb * sizeof (dnode_t *)); -} - -/* - * errors: - * EINVAL - invalid object number. - * EIO - i/o error. - * succeeds even for free dnodes. - */ -int -dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, - void *tag, dnode_t **dnp) -{ - int epb, idx, err; - int drop_struct_lock = FALSE; - int type; - uint64_t blk; - dnode_t *mdn, *dn; - dmu_buf_impl_t *db; - dnode_t **children_dnodes; - - if (object == 0 || object >= DN_MAX_OBJECT) - return (EINVAL); - - mdn = os->os_meta_dnode; - - DNODE_VERIFY(mdn); - - if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { - rw_enter(&mdn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; - } - - blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); - - db = dbuf_hold(mdn, blk, FTAG); - if (drop_struct_lock) - rw_exit(&mdn->dn_struct_rwlock); - if (db == NULL) - return (EIO); - err = dbuf_read(db, NULL, DB_RF_CANFAIL); - if (err) { - dbuf_rele(db, FTAG); - return (err); - } - - ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT); - epb = db->db.db_size >> DNODE_SHIFT; - - idx = object & (epb-1); - - children_dnodes = dmu_buf_get_user(&db->db); - if (children_dnodes == NULL) { - dnode_t **winner; - children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *), - KM_SLEEP); - if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL, - dnode_buf_pageout)) { - kmem_free(children_dnodes, epb * sizeof (dnode_t *)); - children_dnodes = winner; - } - } - - if ((dn = children_dnodes[idx]) == NULL) { - dnode_t *winner; - dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx, - db, object); - winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn); - if (winner != NULL) { - dnode_destroy(dn); - dn = winner; - } - } - - mutex_enter(&dn->dn_mtx); - type = dn->dn_type; - if (dn->dn_free_txg || - ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || - ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) { - mutex_exit(&dn->dn_mtx); - dbuf_rele(db, FTAG); - return (type == DMU_OT_NONE ? ENOENT : EEXIST); - } - mutex_exit(&dn->dn_mtx); - - if (refcount_add(&dn->dn_holds, tag) == 1) - dbuf_add_ref(db, dn); - - DNODE_VERIFY(dn); - ASSERT3P(dn->dn_dbuf, ==, db); - ASSERT3U(dn->dn_object, ==, object); - dbuf_rele(db, FTAG); - - *dnp = dn; - return (0); -} - -/* - * Return held dnode if the object is allocated, NULL if not. 
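As a hedged illustration of the hold discipline that dnode_hold_impl() above enforces (the helper is hypothetical and assumes this file's usual includes): every hold is tagged, and the same tag must be passed to dnode_rele().

static int
example_touch_object(objset_impl_t *os, uint64_t object)
{
	dnode_t *dn;
	int err;

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn);
	if (err != 0)
		return (err);		/* EINVAL, EIO, ENOENT, ... */

	/* ... the dnode stays pinned while the tagged hold is outstanding ... */

	dnode_rele(dn, FTAG);		/* dropping the last hold also releases dn_dbuf */
	return (0);
}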
- */ -int -dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp) -{ - return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); -} - -void -dnode_add_ref(dnode_t *dn, void *tag) -{ - ASSERT(refcount_count(&dn->dn_holds) > 0); - (void) refcount_add(&dn->dn_holds, tag); -} - -void -dnode_rele(dnode_t *dn, void *tag) -{ - uint64_t refs; - - refs = refcount_remove(&dn->dn_holds, tag); - /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ - if (refs == 0 && dn->dn_dbuf) - dbuf_rele(dn->dn_dbuf, dn); -} - -void -dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) -{ - objset_impl_t *os = dn->dn_objset; - uint64_t txg = tx->tx_txg; - - if (dn->dn_object == DMU_META_DNODE_OBJECT) - return; - - DNODE_VERIFY(dn); - -#ifdef ZFS_DEBUG - mutex_enter(&dn->dn_mtx); - ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); - /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */ - mutex_exit(&dn->dn_mtx); -#endif - - mutex_enter(&os->os_lock); - - /* - * If we are already marked dirty, we're done. - */ - if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { - mutex_exit(&os->os_lock); - return; - } - - ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); - ASSERT(dn->dn_datablksz != 0); - ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0); - - dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", - dn->dn_object, txg); - - if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) { - list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn); - } else { - list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn); - } - - mutex_exit(&os->os_lock); - - /* - * The dnode maintains a hold on its containing dbuf as - * long as there are holds on it. Each instantiated child - * dbuf maintaines a hold on the dnode. When the last child - * drops its hold, the dnode will drop its hold on the - * containing dbuf. We add a "dirty hold" here so that the - * dnode will hang around after we finish processing its - * children. - */ - dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg); - - (void) dbuf_dirty(dn->dn_dbuf, tx); - - dsl_dataset_dirty(os->os_dsl_dataset, tx); -} - -void -dnode_free(dnode_t *dn, dmu_tx_t *tx) -{ - int txgoff = tx->tx_txg & TXG_MASK; - - dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg); - - /* we should be the only holder... hopefully */ - /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */ - - mutex_enter(&dn->dn_mtx); - if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { - mutex_exit(&dn->dn_mtx); - return; - } - dn->dn_free_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); - - /* - * If the dnode is already dirty, it needs to be moved from - * the dirty list to the free list. - */ - mutex_enter(&dn->dn_objset->os_lock); - if (list_link_active(&dn->dn_dirty_link[txgoff])) { - list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn); - list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn); - mutex_exit(&dn->dn_objset->os_lock); - } else { - mutex_exit(&dn->dn_objset->os_lock); - dnode_setdirty(dn, tx); - } -} - -/* - * Try to change the block size for the indicated dnode. 
This can only - * succeed if there are no blocks allocated or dirty beyond first block - */ -int -dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db, *db_next; - int have_db0 = FALSE; - - if (size == 0) - size = SPA_MINBLOCKSIZE; - if (size > SPA_MAXBLOCKSIZE) - size = SPA_MAXBLOCKSIZE; - else - size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); - - if (ibs == dn->dn_indblkshift) - ibs = 0; - - if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) - return (0); - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - - /* Check for any allocated blocks beyond the first */ - if (dn->dn_phys->dn_maxblkid != 0) - goto fail; - - mutex_enter(&dn->dn_dbufs_mtx); - for (db = list_head(&dn->dn_dbufs); db; db = db_next) { - db_next = list_next(&dn->dn_dbufs, db); - - if (db->db_blkid == 0) { - have_db0 = TRUE; - } else if (db->db_blkid != DB_BONUS_BLKID) { - mutex_exit(&dn->dn_dbufs_mtx); - goto fail; - } - } - mutex_exit(&dn->dn_dbufs_mtx); - - if (ibs && dn->dn_nlevels != 1) - goto fail; - - db = NULL; - if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) { - /* obtain the old block */ - db = dbuf_hold(dn, 0, FTAG); - dbuf_new_size(db, size, tx); - } - - dnode_setdblksz(dn, size); - dnode_setdirty(dn, tx); - dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; - if (ibs) { - dn->dn_indblkshift = ibs; - dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; - } - - if (db) - dbuf_rele(db, FTAG); - - rw_exit(&dn->dn_struct_rwlock); - return (0); - -fail: - rw_exit(&dn->dn_struct_rwlock); - return (ENOTSUP); -} - -void -dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) -{ - uint64_t txgoff = tx->tx_txg & TXG_MASK; - int drop_struct_lock = FALSE; - int epbs, new_nlevels; - uint64_t sz; - - ASSERT(blkid != DB_BONUS_BLKID); - - if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - drop_struct_lock = TRUE; - } - - if (blkid <= dn->dn_maxblkid) - goto out; - - dn->dn_maxblkid = blkid; - - /* - * Compute the number of levels necessary to support the new maxblkid. 
- */ - new_nlevels = 1; - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (sz = dn->dn_nblkptr; - sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) - new_nlevels++; - - if (new_nlevels > dn->dn_nlevels) { - int old_nlevels = dn->dn_nlevels; - dmu_buf_impl_t *db; - list_t *list; - dbuf_dirty_record_t *new, *dr, *dr_next; - - dn->dn_nlevels = new_nlevels; - - ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); - dn->dn_next_nlevels[txgoff] = new_nlevels; - - /* dirty the left indirects */ - db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); - new = dbuf_dirty(db, tx); - dbuf_rele(db, FTAG); - - /* transfer the dirty records to the new indirect */ - mutex_enter(&dn->dn_mtx); - mutex_enter(&new->dt.di.dr_mtx); - list = &dn->dn_dirty_records[txgoff]; - for (dr = list_head(list); dr; dr = dr_next) { - dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); - if (dr->dr_dbuf->db_level != new_nlevels-1 && - dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) { - ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); - list_remove(&dn->dn_dirty_records[txgoff], dr); - list_insert_tail(&new->dt.di.dr_children, dr); - dr->dr_parent = new; - } - } - mutex_exit(&new->dt.di.dr_mtx); - mutex_exit(&dn->dn_mtx); - } - -out: - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); -} - -void -dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) -{ - avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK]; - avl_index_t where; - free_range_t *rp; - free_range_t rp_tofind; - uint64_t endblk = blkid + nblks; - - ASSERT(MUTEX_HELD(&dn->dn_mtx)); - ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */ - - dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", - blkid, nblks, tx->tx_txg); - rp_tofind.fr_blkid = blkid; - rp = avl_find(tree, &rp_tofind, &where); - if (rp == NULL) - rp = avl_nearest(tree, where, AVL_BEFORE); - if (rp == NULL) - rp = avl_nearest(tree, where, AVL_AFTER); - - while (rp && (rp->fr_blkid <= blkid + nblks)) { - uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks; - free_range_t *nrp = AVL_NEXT(tree, rp); - - if (blkid <= rp->fr_blkid && endblk >= fr_endblk) { - /* clear this entire range */ - avl_remove(tree, rp); - kmem_free(rp, sizeof (free_range_t)); - } else if (blkid <= rp->fr_blkid && - endblk > rp->fr_blkid && endblk < fr_endblk) { - /* clear the beginning of this range */ - rp->fr_blkid = endblk; - rp->fr_nblks = fr_endblk - endblk; - } else if (blkid > rp->fr_blkid && blkid < fr_endblk && - endblk >= fr_endblk) { - /* clear the end of this range */ - rp->fr_nblks = blkid - rp->fr_blkid; - } else if (blkid > rp->fr_blkid && endblk < fr_endblk) { - /* clear a chunk out of this range */ - free_range_t *new_rp = - kmem_alloc(sizeof (free_range_t), KM_SLEEP); - - new_rp->fr_blkid = endblk; - new_rp->fr_nblks = fr_endblk - endblk; - avl_insert_here(tree, new_rp, rp, AVL_AFTER); - rp->fr_nblks = blkid - rp->fr_blkid; - } - /* there may be no overlap */ - rp = nrp; - } -} - -void -dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db; - uint64_t blkoff, blkid, nblks; - int blksz, head; - int trunc = FALSE; - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - blksz = dn->dn_datablksz; - - /* If the range is past the end of the file, this is a no-op */ - if (off >= blksz * (dn->dn_maxblkid+1)) - goto out; - if (len == -1ULL) { - len = UINT64_MAX - off; - trunc = TRUE; - } - - /* - * First, block align the region to free: - */ - if (ISP2(blksz)) { - head = P2NPHASE(off, blksz); - blkoff = P2PHASE(off, blksz); - } else { - ASSERT(dn->dn_maxblkid 
== 0); - if (off == 0 && len >= blksz) { - /* Freeing the whole block; don't do any head. */ - head = 0; - } else { - /* Freeing part of the block. */ - head = blksz - off; - ASSERT3U(head, >, 0); - } - blkoff = off; - } - /* zero out any partial block data at the start of the range */ - if (head) { - ASSERT3U(blkoff + head, ==, blksz); - if (len < head) - head = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, - FTAG, &db) == 0) { - caddr_t data; - - /* don't dirty if it isn't on disk and isn't dirty */ - if (db->db_last_dirty || - (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); - dbuf_will_dirty(db, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - data = db->db.db_data; - bzero(data + blkoff, head); - } - dbuf_rele(db, FTAG); - } - off += head; - len -= head; - } - - /* If the range was less than one block, we're done */ - if (len == 0 || off >= blksz * (dn->dn_maxblkid+1)) - goto out; - - if (!ISP2(blksz)) { - /* - * They are freeing the whole block of a - * non-power-of-two blocksize file. Skip all the messy - * math. - */ - ASSERT3U(off, ==, 0); - ASSERT3U(len, >=, blksz); - blkid = 0; - nblks = 1; - } else { - int tail; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - int blkshift = dn->dn_datablkshift; - - /* If the remaining range is past end of file, we're done */ - if (off > dn->dn_maxblkid << blkshift) - goto out; - - if (off + len == UINT64_MAX) - tail = 0; - else - tail = P2PHASE(len, blksz); - - ASSERT3U(P2PHASE(off, blksz), ==, 0); - /* zero out any partial block data at the end of the range */ - if (tail) { - if (len < tail) - tail = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), - TRUE, FTAG, &db) == 0) { - /* don't dirty if not on disk and not dirty */ - if (db->db_last_dirty || - (db->db_blkptr && - !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); - dbuf_will_dirty(db, tx); - rw_enter(&dn->dn_struct_rwlock, - RW_WRITER); - bzero(db->db.db_data, tail); - } - dbuf_rele(db, FTAG); - } - len -= tail; - } - /* If the range did not include a full block, we are done */ - if (len == 0) - goto out; - - /* dirty the left indirects */ - if (dn->dn_nlevels > 1 && off != 0) { - db = dbuf_hold_level(dn, 1, - (off - head) >> (blkshift + epbs), FTAG); - dbuf_will_dirty(db, tx); - dbuf_rele(db, FTAG); - } - - /* dirty the right indirects */ - if (dn->dn_nlevels > 1 && !trunc) { - db = dbuf_hold_level(dn, 1, - (off + len + tail - 1) >> (blkshift + epbs), FTAG); - dbuf_will_dirty(db, tx); - dbuf_rele(db, FTAG); - } - - /* - * Finally, add this range to the dnode range list, we - * will finish up this free operation in the syncing phase. - */ - ASSERT(IS_P2ALIGNED(off, 1<<blkshift)); - ASSERT(off + len == UINT64_MAX || - IS_P2ALIGNED(len, 1<<blkshift)); - blkid = off >> blkshift; - nblks = len >> blkshift; - - if (trunc) - dn->dn_maxblkid = (blkid ? 
blkid - 1 : 0); - } - - mutex_enter(&dn->dn_mtx); - dnode_clear_range(dn, blkid, nblks, tx); - { - free_range_t *rp, *found; - avl_index_t where; - avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK]; - - /* Add new range to dn_ranges */ - rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP); - rp->fr_blkid = blkid; - rp->fr_nblks = nblks; - found = avl_find(tree, rp, &where); - ASSERT(found == NULL); - avl_insert(tree, rp, where); - dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", - blkid, nblks, tx->tx_txg); - } - mutex_exit(&dn->dn_mtx); - - dbuf_free_range(dn, blkid, nblks, tx); - dnode_setdirty(dn, tx); -out: - rw_exit(&dn->dn_struct_rwlock); -} - -/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ -uint64_t -dnode_block_freed(dnode_t *dn, uint64_t blkid) -{ - free_range_t range_tofind; - void *dp = spa_get_dsl(dn->dn_objset->os_spa); - int i; - - if (blkid == DB_BONUS_BLKID) - return (FALSE); - - /* - * If we're in the process of opening the pool, dp will not be - * set yet, but there shouldn't be anything dirty. - */ - if (dp == NULL) - return (FALSE); - - if (dn->dn_free_txg) - return (TRUE); - - /* - * If dn_datablkshift is not set, then there's only a single - * block, in which case there will never be a free range so it - * won't matter. - */ - range_tofind.fr_blkid = blkid; - mutex_enter(&dn->dn_mtx); - for (i = 0; i < TXG_SIZE; i++) { - free_range_t *range_found; - avl_index_t idx; - - range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx); - if (range_found) { - ASSERT(range_found->fr_nblks > 0); - break; - } - range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE); - if (range_found && - range_found->fr_blkid + range_found->fr_nblks > blkid) - break; - } - mutex_exit(&dn->dn_mtx); - return (i < TXG_SIZE); -} - -/* call from syncing context when we actually write/free space for this dnode */ -void -dnode_diduse_space(dnode_t *dn, int64_t delta) -{ - uint64_t space; - dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n", - dn, dn->dn_phys, - (u_longlong_t)dn->dn_phys->dn_used, - (longlong_t)delta); - - mutex_enter(&dn->dn_mtx); - space = DN_USED_BYTES(dn->dn_phys); - if (delta > 0) { - ASSERT3U(space + delta, >=, space); /* no overflow */ - } else { - ASSERT3U(space, >=, -delta); /* no underflow */ - } - space += delta; - if (spa_version(dn->dn_objset->os_spa) < ZFS_VERSION_DNODE_BYTES) { - ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); - ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0); - dn->dn_phys->dn_used = space >> DEV_BSHIFT; - } else { - dn->dn_phys->dn_used = space; - dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES; - } - mutex_exit(&dn->dn_mtx); -} - -/* - * Call when we think we're going to write/free space in open context. - * Be conservative (ie. OK to write less than this or free more than - * this, but don't write more or free less). 
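A small worked example of the version split in dnode_diduse_space() above, assuming the conventional DEV_BSHIFT of 9: crediting a delta of 131072 bytes records dn_used = 131072 >> 9 = 256 (512-byte sectors) on a pool older than ZFS_VERSION_DNODE_BYTES, whereas a newer pool stores dn_used = 131072 directly and sets DNODE_FLAG_USED_BYTES so later readers know the field is in bytes.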
- */ -void -dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) -{ - objset_impl_t *os = dn->dn_objset; - dsl_dataset_t *ds = os->os_dsl_dataset; - - if (space > 0) - space = spa_get_asize(os->os_spa, space); - - if (ds) - dsl_dir_willuse_space(ds->ds_dir, space, tx); - - dmu_tx_willuse_space(tx, space); -} - -static int -dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, - int lvl, uint64_t blkfill, uint64_t txg) -{ - dmu_buf_impl_t *db = NULL; - void *data = NULL; - uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - uint64_t epb = 1ULL << epbs; - uint64_t minfill, maxfill; - int i, error, span; - - dprintf("probing object %llu offset %llx level %d of %u\n", - dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); - - if (lvl == dn->dn_phys->dn_nlevels) { - error = 0; - epb = dn->dn_phys->dn_nblkptr; - data = dn->dn_phys->dn_blkptr; - } else { - uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); - error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); - if (error) { - if (error == ENOENT) - return (hole ? 0 : ESRCH); - return (error); - } - error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); - if (error) { - dbuf_rele(db, FTAG); - return (error); - } - data = db->db.db_data; - } - - if (db && txg && - (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) { - error = ESRCH; - } else if (lvl == 0) { - dnode_phys_t *dnp = data; - span = DNODE_SHIFT; - ASSERT(dn->dn_type == DMU_OT_DNODE); - - for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) { - boolean_t newcontents = B_TRUE; - if (txg) { - int j; - newcontents = B_FALSE; - for (j = 0; j < dnp[i].dn_nblkptr; j++) { - if (dnp[i].dn_blkptr[j].blk_birth > txg) - newcontents = B_TRUE; - } - } - if (!dnp[i].dn_type == hole && newcontents) - break; - *offset += 1ULL << span; - } - if (i == blkfill) - error = ESRCH; - } else { - blkptr_t *bp = data; - span = (lvl - 1) * epbs + dn->dn_datablkshift; - minfill = 0; - maxfill = blkfill << ((lvl - 1) * epbs); - - if (hole) - maxfill--; - else - minfill++; - - for (i = (*offset >> span) & ((1ULL << epbs) - 1); - i < epb; i++) { - if (bp[i].blk_fill >= minfill && - bp[i].blk_fill <= maxfill && - bp[i].blk_birth > txg) - break; - *offset += 1ULL << span; - } - if (i >= epb) - error = ESRCH; - } - - if (db) - dbuf_rele(db, FTAG); - - return (error); -} - -/* - * Find the next hole, data, or sparse region at or after *offset. - * The value 'blkfill' tells us how many items we expect to find - * in an L0 data block; this value is 1 for normal objects, - * DNODES_PER_BLOCK for the meta dnode, and some fraction of - * DNODES_PER_BLOCK when searching for sparse regions thereof. - * - * Examples: - * - * dnode_next_offset(dn, hole, offset, 1, 1, 0); - * Finds the next hole/data in a file. - * Used in dmu_offset_next(). - * - * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg); - * Finds the next free/allocated dnode an objset's meta-dnode. - * Only finds objects that have new contents since txg (ie. - * bonus buffer changes and content removal are ignored). - * Used in dmu_object_next(). - * - * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0); - * Finds the next L2 meta-dnode bp that's at most 1/4 full. - * Used in dmu_object_alloc(). 
- */ -int -dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset, - int minlvl, uint64_t blkfill, uint64_t txg) -{ - int lvl, maxlvl; - int error = 0; - uint64_t initial_offset = *offset; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - - if (dn->dn_phys->dn_nlevels == 0) { - rw_exit(&dn->dn_struct_rwlock); - return (ESRCH); - } - - if (dn->dn_datablkshift == 0) { - if (*offset < dn->dn_datablksz) { - if (hole) - *offset = dn->dn_datablksz; - } else { - error = ESRCH; - } - rw_exit(&dn->dn_struct_rwlock); - return (error); - } - - maxlvl = dn->dn_phys->dn_nlevels; - - for (lvl = minlvl; lvl <= maxlvl; lvl++) { - error = dnode_next_offset_level(dn, - hole, offset, lvl, blkfill, txg); - if (error != ESRCH) - break; - } - - while (--lvl >= minlvl && error == 0) { - error = dnode_next_offset_level(dn, - hole, offset, lvl, blkfill, txg); - } - - rw_exit(&dn->dn_struct_rwlock); - - if (error == 0 && initial_offset > *offset) - error = ESRCH; - - return (error); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c deleted file mode 100644 index 9e8c7ad..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ /dev/null @@ -1,623 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/dbuf.h> -#include <sys/dnode.h> -#include <sys/dmu.h> -#include <sys/dmu_tx.h> -#include <sys/dmu_objset.h> -#include <sys/dsl_dataset.h> -#include <sys/spa.h> - -static void -dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db; - int txgoff = tx->tx_txg & TXG_MASK; - int nblkptr = dn->dn_phys->dn_nblkptr; - int old_toplvl = dn->dn_phys->dn_nlevels - 1; - int new_level = dn->dn_next_nlevels[txgoff]; - int i; - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - - /* this dnode can't be paged out because it's dirty */ - ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); - - db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); - ASSERT(db != NULL); - - dn->dn_phys->dn_nlevels = new_level; - dprintf("os=%p obj=%llu, increase to %d\n", - dn->dn_objset, dn->dn_object, - dn->dn_phys->dn_nlevels); - - /* check for existing blkptrs in the dnode */ - for (i = 0; i < nblkptr; i++) - if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i])) - break; - if (i != nblkptr) { - /* transfer dnode's block pointers to new indirect block */ - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); - ASSERT(db->db.db_data); - ASSERT(arc_released(db->db_buf)); - ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); - bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, - sizeof (blkptr_t) * nblkptr); - arc_buf_freeze(db->db_buf); - } - - /* set dbuf's parent pointers to new indirect buf */ - for (i = 0; i < nblkptr; i++) { - dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i); - - if (child == NULL) - continue; - ASSERT3P(child->db_dnode, ==, dn); - if (child->db_parent && child->db_parent != dn->dn_dbuf) { - ASSERT(child->db_parent->db_level == db->db_level); - ASSERT(child->db_blkptr != - &dn->dn_phys->dn_blkptr[child->db_blkid]); - mutex_exit(&child->db_mtx); - continue; - } - ASSERT(child->db_parent == NULL || - child->db_parent == dn->dn_dbuf); - - child->db_parent = db; - dbuf_add_ref(db, child); - if (db->db.db_data) - child->db_blkptr = (blkptr_t *)db->db.db_data + i; - else - child->db_blkptr = NULL; - dprintf_dbuf_bp(child, child->db_blkptr, - "changed db_blkptr to new indirect %s", ""); - - mutex_exit(&child->db_mtx); - } - - bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); - - dbuf_rele(db, FTAG); - - rw_exit(&dn->dn_struct_rwlock); -} - -static void -free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) -{ - objset_impl_t *os = dn->dn_objset; - uint64_t bytesfreed = 0; - int i; - - dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num); - - for (i = 0; i < num; i++, bp++) { - if (BP_IS_HOLE(bp)) - continue; - - bytesfreed += bp_get_dasize(os->os_spa, bp); - ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); - dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx); - bzero(bp, sizeof (blkptr_t)); - } - dnode_diduse_space(dn, -bytesfreed); -} - -#ifdef ZFS_DEBUG -static void -free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) -{ - int off, num; - int i, err, epbs; - uint64_t txg = tx->tx_txg; - - epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - off = start - (db->db_blkid * 1<<epbs); - num = end - start + 1; - - ASSERT3U(off, >=, 0); - ASSERT3U(num, >=, 0); - ASSERT3U(db->db_level, >, 0); - ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift); - ASSERT3U(off+num, <=, db->db.db_size >> 
SPA_BLKPTRSHIFT); - ASSERT(db->db_blkptr != NULL); - - for (i = off; i < off+num; i++) { - uint64_t *buf; - dmu_buf_impl_t *child; - dbuf_dirty_record_t *dr; - int j; - - ASSERT(db->db_level == 1); - - rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(db->db_dnode, db->db_level-1, - (db->db_blkid << epbs) + i, TRUE, FTAG, &child); - rw_exit(&db->db_dnode->dn_struct_rwlock); - if (err == ENOENT) - continue; - ASSERT(err == 0); - ASSERT(child->db_level == 0); - dr = child->db_last_dirty; - while (dr && dr->dr_txg > txg) - dr = dr->dr_next; - ASSERT(dr == NULL || dr->dr_txg == txg); - - /* data_old better be zeroed */ - if (dr) { - buf = dr->dt.dl.dr_data->b_data; - for (j = 0; j < child->db.db_size >> 3; j++) { - if (buf[j] != 0) { - panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - child, i, off, num); - } - } - } - - /* - * db_data better be zeroed unless it's dirty in a - * future txg. - */ - mutex_enter(&child->db_mtx); - buf = child->db.db_data; - if (buf != NULL && child->db_state != DB_FILL && - child->db_last_dirty == NULL) { - for (j = 0; j < child->db.db_size >> 3; j++) { - if (buf[j] != 0) { - panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - child, i, off, num); - } - } - } - mutex_exit(&child->db_mtx); - - dbuf_rele(child, FTAG); - } -} -#endif - -static int -free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, - dmu_tx_t *tx) -{ - dnode_t *dn = db->db_dnode; - blkptr_t *bp; - dmu_buf_impl_t *subdb; - uint64_t start, end, dbstart, dbend, i; - int epbs, shift, err; - int all = TRUE; - - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - arc_release(db->db_buf, db); - bp = (blkptr_t *)db->db.db_data; - - epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - shift = (db->db_level - 1) * epbs; - dbstart = db->db_blkid << epbs; - start = blkid >> shift; - if (dbstart < start) { - bp += start - dbstart; - all = FALSE; - } else { - start = dbstart; - } - dbend = ((db->db_blkid + 1) << epbs) - 1; - end = (blkid + nblks - 1) >> shift; - if (dbend <= end) - end = dbend; - else if (all) - all = trunc; - ASSERT3U(start, <=, end); - - if (db->db_level == 1) { - FREE_VERIFY(db, start, end, tx); - free_blocks(dn, bp, end-start+1, tx); - arc_buf_freeze(db->db_buf); - ASSERT(all || db->db_last_dirty); - return (all); - } - - for (i = start; i <= end; i++, bp++) { - if (BP_IS_HOLE(bp)) - continue; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb); - ASSERT3U(err, ==, 0); - rw_exit(&dn->dn_struct_rwlock); - - if (free_children(subdb, blkid, nblks, trunc, tx)) { - ASSERT3P(subdb->db_blkptr, ==, bp); - free_blocks(dn, bp, 1, tx); - } else { - all = FALSE; - } - dbuf_rele(subdb, FTAG); - } - arc_buf_freeze(db->db_buf); -#ifdef ZFS_DEBUG - bp -= (end-start)+1; - for (i = start; i <= end; i++, bp++) { - if (i == start && blkid != 0) - continue; - else if (i == end && !trunc) - continue; - ASSERT3U(bp->blk_birth, ==, 0); - } -#endif - ASSERT(all || db->db_last_dirty); - return (all); -} - -/* - * free_range: Traverse the indicated range of the provided file - * and "free" all the blocks contained there. 
- */ -static void -dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) -{ - blkptr_t *bp = dn->dn_phys->dn_blkptr; - dmu_buf_impl_t *db; - int trunc, start, end, shift, i, err; - int dnlevel = dn->dn_phys->dn_nlevels; - - if (blkid > dn->dn_phys->dn_maxblkid) - return; - - ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX); - trunc = blkid + nblks > dn->dn_phys->dn_maxblkid; - if (trunc) - nblks = dn->dn_phys->dn_maxblkid - blkid + 1; - - /* There are no indirect blocks in the object */ - if (dnlevel == 1) { - if (blkid >= dn->dn_phys->dn_nblkptr) { - /* this range was never made persistent */ - return; - } - ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); - free_blocks(dn, bp + blkid, nblks, tx); - if (trunc) { - uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * - (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); - dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); - ASSERT(off < dn->dn_phys->dn_maxblkid || - dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, FALSE, &off, - 1, 1, 0) != 0); - } - return; - } - - shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); - start = blkid >> shift; - ASSERT(start < dn->dn_phys->dn_nblkptr); - end = (blkid + nblks - 1) >> shift; - bp += start; - for (i = start; i <= end; i++, bp++) { - if (BP_IS_HOLE(bp)) - continue; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db); - ASSERT3U(err, ==, 0); - rw_exit(&dn->dn_struct_rwlock); - - if (free_children(db, blkid, nblks, trunc, tx)) { - ASSERT3P(db->db_blkptr, ==, bp); - free_blocks(dn, bp, 1, tx); - } - dbuf_rele(db, FTAG); - } - if (trunc) { - uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * - (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); - dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); - ASSERT(off < dn->dn_phys->dn_maxblkid || - dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0); - } -} - -/* - * Try to kick all the dnode's dbufs out of the cache... - */ -int -dnode_evict_dbufs(dnode_t *dn, int try) -{ - int progress; - int pass = 0; - - do { - dmu_buf_impl_t *db, marker; - int evicting = FALSE; - - progress = FALSE; - mutex_enter(&dn->dn_dbufs_mtx); - list_insert_tail(&dn->dn_dbufs, &marker); - db = list_head(&dn->dn_dbufs); - for (; db != &marker; db = list_head(&dn->dn_dbufs)) { - list_remove(&dn->dn_dbufs, db); - list_insert_tail(&dn->dn_dbufs, db); - - mutex_enter(&db->db_mtx); - if (db->db_state == DB_EVICTING) { - progress = TRUE; - evicting = TRUE; - mutex_exit(&db->db_mtx); - } else if (refcount_is_zero(&db->db_holds)) { - progress = TRUE; - ASSERT(!arc_released(db->db_buf)); - dbuf_clear(db); /* exits db_mtx for us */ - } else { - mutex_exit(&db->db_mtx); - } - - } - list_remove(&dn->dn_dbufs, &marker); - /* - * NB: we need to drop dn_dbufs_mtx between passes so - * that any DB_EVICTING dbufs can make progress. - * Ideally, we would have some cv we could wait on, but - * since we don't, just wait a bit to give the other - * thread a chance to run. - */ - mutex_exit(&dn->dn_dbufs_mtx); - if (evicting) - delay(1); - pass++; - ASSERT(pass < 100); /* sanity check */ - } while (progress); - - /* - * This function works fine even if it can't evict everything. - * If we were only asked to try to evict everything then - * return an error if we can't. Otherwise panic as the caller - * expects total eviction.
- */ - if (list_head(&dn->dn_dbufs) != NULL) { - if (try) { - return (1); - } else { - panic("dangling dbufs (dn=%p, dbuf=%p)\n", - dn, list_head(&dn->dn_dbufs)); - } - } - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { - mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); - dn->dn_bonus = NULL; - } - rw_exit(&dn->dn_struct_rwlock); - return (0); -} - -static void -dnode_undirty_dbufs(list_t *list) -{ - dbuf_dirty_record_t *dr; - - while (dr = list_head(list)) { - dmu_buf_impl_t *db = dr->dr_dbuf; - uint64_t txg = dr->dr_txg; - - mutex_enter(&db->db_mtx); - /* XXX - use dbuf_undirty()? */ - list_remove(list, dr); - ASSERT(db->db_last_dirty == dr); - db->db_last_dirty = NULL; - db->db_dirtycnt -= 1; - if (db->db_level == 0) { - ASSERT(db->db_blkid == DB_BONUS_BLKID || - dr->dt.dl.dr_data == db->db_buf); - dbuf_unoverride(dr); - mutex_exit(&db->db_mtx); - } else { - mutex_exit(&db->db_mtx); - dnode_undirty_dbufs(&dr->dt.di.dr_children); - list_destroy(&dr->dt.di.dr_children); - mutex_destroy(&dr->dt.di.dr_mtx); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - dbuf_rele(db, (void *)(uintptr_t)txg); - } -} - -static void -dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) -{ - int txgoff = tx->tx_txg & TXG_MASK; - - ASSERT(dmu_tx_is_syncing(tx)); - - dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); - (void) dnode_evict_dbufs(dn, 0); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); - - /* - * XXX - It would be nice to assert this, but we may still - * have residual holds from async evictions from the arc... - * - * zfs_obj_to_path() also depends on this being - * commented out. - * - * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); - */ - - /* Undirty next bits */ - dn->dn_next_nlevels[txgoff] = 0; - dn->dn_next_indblkshift[txgoff] = 0; - dn->dn_next_blksz[txgoff] = 0; - - /* free up all the blocks in the file. */ - dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx); - ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0); - - /* ASSERT(blkptrs are zero); */ - ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); - ASSERT(dn->dn_type != DMU_OT_NONE); - - ASSERT(dn->dn_free_txg > 0); - if (dn->dn_allocated_txg != dn->dn_free_txg) - dbuf_will_dirty(dn->dn_dbuf, tx); - bzero(dn->dn_phys, sizeof (dnode_phys_t)); - - mutex_enter(&dn->dn_mtx); - dn->dn_type = DMU_OT_NONE; - dn->dn_maxblkid = 0; - dn->dn_allocated_txg = 0; - mutex_exit(&dn->dn_mtx); - - ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); - - dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); - /* - * Now that we've released our hold, the dnode may - * be evicted, so we musn't access it. - */ -} - -/* - * Write out the dnode's dirty buffers. - * - * NOTE: The dnode is kept in memory by being dirty. Once the - * dirty bit is cleared, it may be evicted. Beware of this! - */ -void -dnode_sync(dnode_t *dn, dmu_tx_t *tx) -{ - free_range_t *rp; - dnode_phys_t *dnp = dn->dn_phys; - int txgoff = tx->tx_txg & TXG_MASK; - list_t *list = &dn->dn_dirty_records[txgoff]; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); - DNODE_VERIFY(dn); - - ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); - - mutex_enter(&dn->dn_mtx); - if (dn->dn_allocated_txg == tx->tx_txg) { - /* The dnode is newly allocated or reallocated */ - if (dnp->dn_type == DMU_OT_NONE) { - /* this is a first alloc, not a realloc */ - /* XXX shouldn't the phys already be zeroed? 
*/ - bzero(dnp, DNODE_CORE_SIZE); - dnp->dn_nlevels = 1; - } - - if (dn->dn_nblkptr > dnp->dn_nblkptr) { - /* zero the new blkptrs we are gaining */ - bzero(dnp->dn_blkptr + dnp->dn_nblkptr, - sizeof (blkptr_t) * - (dn->dn_nblkptr - dnp->dn_nblkptr)); - } - dnp->dn_type = dn->dn_type; - dnp->dn_bonustype = dn->dn_bonustype; - dnp->dn_bonuslen = dn->dn_bonuslen; - dnp->dn_nblkptr = dn->dn_nblkptr; - } - - ASSERT(dnp->dn_nlevels > 1 || - BP_IS_HOLE(&dnp->dn_blkptr[0]) || - BP_GET_LSIZE(&dnp->dn_blkptr[0]) == - dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); - - if (dn->dn_next_blksz[txgoff]) { - ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], - SPA_MINBLOCKSIZE) == 0); - ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || - list_head(list) != NULL || - dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == - dnp->dn_datablkszsec); - dnp->dn_datablkszsec = - dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; - dn->dn_next_blksz[txgoff] = 0; - } - - if (dn->dn_next_indblkshift[txgoff]) { - ASSERT(dnp->dn_nlevels == 1); - dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; - dn->dn_next_indblkshift[txgoff] = 0; - } - - /* - * Just take the live (open-context) values for checksum and compress. - * Strictly speaking it's a future leak, but nothing bad happens if we - * start using the new checksum or compress algorithm a little early. - */ - dnp->dn_checksum = dn->dn_checksum; - dnp->dn_compress = dn->dn_compress; - - mutex_exit(&dn->dn_mtx); - - /* process all the "freed" ranges in the file */ - if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) { - for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL; - rp = AVL_PREV(&dn->dn_ranges[txgoff], rp)) - dnode_sync_free_range(dn, - rp->fr_blkid, rp->fr_nblks, tx); - } - mutex_enter(&dn->dn_mtx); - for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) { - free_range_t *last = rp; - rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp); - avl_remove(&dn->dn_ranges[txgoff], last); - kmem_free(last, sizeof (free_range_t)); - } - mutex_exit(&dn->dn_mtx); - - if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) { - dnode_sync_free(dn, tx); - return; - } - - if (dn->dn_next_nlevels[txgoff]) { - dnode_increase_indirection(dn, tx); - dn->dn_next_nlevels[txgoff] = 0; - } - - dbuf_sync_list(list, tx); - - if (dn->dn_object != DMU_META_DNODE_OBJECT) { - ASSERT3P(list_head(list), ==, NULL); - dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); - } - - /* - * Although we have dropped our reference to the dnode, it - * can't be evicted until its written, and we haven't yet - * initiated the IO for the dnode's dbuf. - */ -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c deleted file mode 100644 index 7d4689f..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ /dev/null @@ -1,2035 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu_objset.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_prop.h> -#include <sys/dsl_synctask.h> -#include <sys/dmu_traverse.h> -#include <sys/dmu_tx.h> -#include <sys/arc.h> -#include <sys/zio.h> -#include <sys/zap.h> -#include <sys/unique.h> -#include <sys/zfs_context.h> -#include <sys/zfs_ioctl.h> - -static dsl_checkfunc_t dsl_dataset_destroy_begin_check; -static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; -static dsl_checkfunc_t dsl_dataset_rollback_check; -static dsl_syncfunc_t dsl_dataset_rollback_sync; -static dsl_checkfunc_t dsl_dataset_destroy_check; -static dsl_syncfunc_t dsl_dataset_destroy_sync; - -#define DS_REF_MAX (1ULL << 62) - -#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE - -/* - * We use weighted reference counts to express the various forms of exclusion - * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open - * is DS_REF_MAX, and a PRIMARY open is little more than half of an EXCLUSIVE. - * This makes the exclusion logic simple: the total refcnt for all opens cannot - * exceed DS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their - * weight (DS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume - * just over half of the refcnt space, so there can't be more than one, but it - * can peacefully coexist with any number of STANDARD opens. - */ -static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = { - 0, /* DS_MODE_NONE - invalid */ - 1, /* DS_MODE_STANDARD - unlimited number */ - (DS_REF_MAX >> 1) + 1, /* DS_MODE_PRIMARY - only one of these */ - DS_REF_MAX /* DS_MODE_EXCLUSIVE - no other opens */ -}; - - -void -dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) -{ - int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); - int compressed = BP_GET_PSIZE(bp); - int uncompressed = BP_GET_UCSIZE(bp); - - dprintf_bp(bp, "born, ds=%p\n", ds); - - ASSERT(dmu_tx_is_syncing(tx)); - /* It could have been compressed away to nothing */ - if (BP_IS_HOLE(bp)) - return; - ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); - ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES); - if (ds == NULL) { - /* - * Account for the meta-objset space in its placeholder - * dsl_dir. 
- */ - ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, - used, compressed, uncompressed, tx); - dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); - return; - } - dmu_buf_will_dirty(ds->ds_dbuf, tx); - mutex_enter(&ds->ds_lock); - ds->ds_phys->ds_used_bytes += used; - ds->ds_phys->ds_compressed_bytes += compressed; - ds->ds_phys->ds_uncompressed_bytes += uncompressed; - ds->ds_phys->ds_unique_bytes += used; - mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, - used, compressed, uncompressed, tx); -} - -void -dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, - dmu_tx_t *tx) -{ - int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); - int compressed = BP_GET_PSIZE(bp); - int uncompressed = BP_GET_UCSIZE(bp); - - ASSERT(dmu_tx_is_syncing(tx)); - /* No block pointer => nothing to free */ - if (BP_IS_HOLE(bp)) - return; - - ASSERT(used > 0); - if (ds == NULL) { - int err; - /* - * Account for the meta-objset space in its placeholder - * dataset. - */ - err = arc_free(pio, tx->tx_pool->dp_spa, - tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT); - ASSERT(err == 0); - - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, - -used, -compressed, -uncompressed, tx); - dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); - return; - } - ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - - if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { - int err; - - dprintf_bp(bp, "freeing: %s", ""); - err = arc_free(pio, tx->tx_pool->dp_spa, - tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT); - ASSERT(err == 0); - - mutex_enter(&ds->ds_lock); - /* XXX unique_bytes is not accurate for head datasets */ - /* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */ - ds->ds_phys->ds_unique_bytes -= used; - mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, - -used, -compressed, -uncompressed, tx); - } else { - dprintf_bp(bp, "putting on dead list: %s", ""); - VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); - /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ - if (ds->ds_phys->ds_prev_snap_obj != 0) { - ASSERT3U(ds->ds_prev->ds_object, ==, - ds->ds_phys->ds_prev_snap_obj); - ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); - if (ds->ds_prev->ds_phys->ds_next_snap_obj == - ds->ds_object && bp->blk_birth > - ds->ds_prev->ds_phys->ds_prev_snap_txg) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - mutex_enter(&ds->ds_prev->ds_lock); - ds->ds_prev->ds_phys->ds_unique_bytes += - used; - mutex_exit(&ds->ds_prev->ds_lock); - } - } - } - mutex_enter(&ds->ds_lock); - ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); - ds->ds_phys->ds_used_bytes -= used; - ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); - ds->ds_phys->ds_compressed_bytes -= compressed; - ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); - ds->ds_phys->ds_uncompressed_bytes -= uncompressed; - mutex_exit(&ds->ds_lock); -} - -uint64_t -dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) -{ - uint64_t trysnap = 0; - - if (ds == NULL) - return (0); - /* - * The snapshot creation could fail, but that would cause an - * incorrect FALSE return, which would only result in an - * overestimation of the amount of space that an operation would - * consume, which is OK. - * - * There's also a small window where we could miss a pending - * snapshot, because we could set the sync task in the quiescing - * phase. So this should only be used as a guess. 
- */ - if (ds->ds_trysnap_txg > - spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) - trysnap = ds->ds_trysnap_txg; - return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); -} - -int -dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth) -{ - return (blk_birth > dsl_dataset_prev_snap_txg(ds)); -} - -/* ARGSUSED */ -static void -dsl_dataset_evict(dmu_buf_t *db, void *dsv) -{ - dsl_dataset_t *ds = dsv; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - /* open_refcount == DS_REF_MAX when deleting */ - ASSERT(ds->ds_open_refcount == 0 || - ds->ds_open_refcount == DS_REF_MAX); - - dprintf_ds(ds, "evicting %s\n", ""); - - unique_remove(ds->ds_phys->ds_fsid_guid); - - if (ds->ds_user_ptr != NULL) - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - - if (ds->ds_prev) { - dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds); - ds->ds_prev = NULL; - } - - bplist_close(&ds->ds_deadlist); - dsl_dir_close(ds->ds_dir, ds); - - if (list_link_active(&ds->ds_synced_link)) - list_remove(&dp->dp_synced_objsets, ds); - - mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_deadlist.bpl_lock); - - kmem_free(ds, sizeof (dsl_dataset_t)); -} - -static int -dsl_dataset_get_snapname(dsl_dataset_t *ds) -{ - dsl_dataset_phys_t *headphys; - int err; - dmu_buf_t *headdbuf; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - - if (ds->ds_snapname[0]) - return (0); - if (ds->ds_phys->ds_next_snap_obj == 0) - return (0); - - err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, - FTAG, &headdbuf); - if (err) - return (err); - headphys = headdbuf->db_data; - err = zap_value_search(dp->dp_meta_objset, - headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname); - dmu_buf_rele(headdbuf, FTAG); - return (err); -} - -int -dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, - int mode, void *tag, dsl_dataset_t **dsp) -{ - uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)]; - objset_t *mos = dp->dp_meta_objset; - dmu_buf_t *dbuf; - dsl_dataset_t *ds; - int err; - - ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || - dsl_pool_sync_context(dp)); - - err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); - if (err) - return (err); - ds = dmu_buf_get_user(dbuf); - if (ds == NULL) { - dsl_dataset_t *winner; - - ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); - ds->ds_dbuf = dbuf; - ds->ds_object = dsobj; - ds->ds_phys = dbuf->db_data; - - mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT, - NULL); - - err = bplist_open(&ds->ds_deadlist, - mos, ds->ds_phys->ds_deadlist_obj); - if (err == 0) { - err = dsl_dir_open_obj(dp, - ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); - } - if (err) { - /* - * we don't really need to close the blist if we - * just opened it. 
- */ - mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_deadlist.bpl_lock); - kmem_free(ds, sizeof (dsl_dataset_t)); - dmu_buf_rele(dbuf, tag); - return (err); - } - - if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) { - ds->ds_snapname[0] = '\0'; - if (ds->ds_phys->ds_prev_snap_obj) { - err = dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_NONE, ds, &ds->ds_prev); - } - } else { - if (snapname) { -#ifdef ZFS_DEBUG - dsl_dataset_phys_t *headphys; - dmu_buf_t *headdbuf; - err = dmu_bonus_hold(mos, - ds->ds_dir->dd_phys->dd_head_dataset_obj, - FTAG, &headdbuf); - if (err == 0) { - headphys = headdbuf->db_data; - uint64_t foundobj; - err = zap_lookup(dp->dp_meta_objset, - headphys->ds_snapnames_zapobj, - snapname, sizeof (foundobj), 1, - &foundobj); - ASSERT3U(foundobj, ==, dsobj); - dmu_buf_rele(headdbuf, FTAG); - } -#endif - (void) strcat(ds->ds_snapname, snapname); - } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { - err = dsl_dataset_get_snapname(ds); - } - } - - if (err == 0) { - winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, - dsl_dataset_evict); - } - if (err || winner) { - bplist_close(&ds->ds_deadlist); - if (ds->ds_prev) { - dsl_dataset_close(ds->ds_prev, - DS_MODE_NONE, ds); - } - dsl_dir_close(ds->ds_dir, ds); - mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_deadlist.bpl_lock); - kmem_free(ds, sizeof (dsl_dataset_t)); - if (err) { - dmu_buf_rele(dbuf, tag); - return (err); - } - ds = winner; - } else { - uint64_t new = - unique_insert(ds->ds_phys->ds_fsid_guid); - if (new != ds->ds_phys->ds_fsid_guid) { - /* XXX it won't necessarily be synced... */ - ds->ds_phys->ds_fsid_guid = new; - } - } - } - ASSERT3P(ds->ds_dbuf, ==, dbuf); - ASSERT3P(ds->ds_phys, ==, dbuf->db_data); - - mutex_enter(&ds->ds_lock); - if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY && - (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) && - !DS_MODE_IS_INCONSISTENT(mode)) || - (ds->ds_open_refcount + weight > DS_REF_MAX)) { - mutex_exit(&ds->ds_lock); - dsl_dataset_close(ds, DS_MODE_NONE, tag); - return (EBUSY); - } - ds->ds_open_refcount += weight; - mutex_exit(&ds->ds_lock); - - *dsp = ds; - return (0); -} - -int -dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, - void *tag, dsl_dataset_t **dsp) -{ - dsl_dir_t *dd; - dsl_pool_t *dp; - const char *tail; - uint64_t obj; - dsl_dataset_t *ds = NULL; - int err = 0; - - err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail); - if (err) - return (err); - - dp = dd->dd_pool; - obj = dd->dd_phys->dd_head_dataset_obj; - rw_enter(&dp->dp_config_rwlock, RW_READER); - if (obj == 0) { - /* A dataset with no associated objset */ - err = ENOENT; - goto out; - } - - if (tail != NULL) { - objset_t *mos = dp->dp_meta_objset; - - err = dsl_dataset_open_obj(dp, obj, NULL, - DS_MODE_NONE, tag, &ds); - if (err) - goto out; - obj = ds->ds_phys->ds_snapnames_zapobj; - dsl_dataset_close(ds, DS_MODE_NONE, tag); - ds = NULL; - - if (tail[0] != '@') { - err = ENOENT; - goto out; - } - tail++; - - /* Look for a snapshot */ - if (!DS_MODE_IS_READONLY(mode)) { - err = EROFS; - goto out; - } - dprintf("looking for snapshot '%s'\n", tail); - err = zap_lookup(mos, obj, tail, 8, 1, &obj); - if (err) - goto out; - } - err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds); - -out: - rw_exit(&dp->dp_config_rwlock); - dsl_dir_close(dd, FTAG); - - ASSERT3U((err == 0), ==, (ds != NULL)); - /* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */ - - *dsp = ds; - return (err); -} - -int -dsl_dataset_open(const char *name, int mode, void *tag, 
dsl_dataset_t **dsp) -{ - return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp)); -} - -void -dsl_dataset_name(dsl_dataset_t *ds, char *name) -{ - if (ds == NULL) { - (void) strcpy(name, "mos"); - } else { - dsl_dir_name(ds->ds_dir, name); - VERIFY(0 == dsl_dataset_get_snapname(ds)); - if (ds->ds_snapname[0]) { - (void) strcat(name, "@"); - if (!MUTEX_HELD(&ds->ds_lock)) { - /* - * We use a "recursive" mutex so that we - * can call dprintf_ds() with ds_lock held. - */ - mutex_enter(&ds->ds_lock); - (void) strcat(name, ds->ds_snapname); - mutex_exit(&ds->ds_lock); - } else { - (void) strcat(name, ds->ds_snapname); - } - } - } -} - -static int -dsl_dataset_namelen(dsl_dataset_t *ds) -{ - int result; - - if (ds == NULL) { - result = 3; /* "mos" */ - } else { - result = dsl_dir_namelen(ds->ds_dir); - VERIFY(0 == dsl_dataset_get_snapname(ds)); - if (ds->ds_snapname[0]) { - ++result; /* adding one for the @-sign */ - if (!MUTEX_HELD(&ds->ds_lock)) { - /* see dsl_datset_name */ - mutex_enter(&ds->ds_lock); - result += strlen(ds->ds_snapname); - mutex_exit(&ds->ds_lock); - } else { - result += strlen(ds->ds_snapname); - } - } - } - - return (result); -} - -void -dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag) -{ - uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)]; - mutex_enter(&ds->ds_lock); - ASSERT3U(ds->ds_open_refcount, >=, weight); - ds->ds_open_refcount -= weight; - dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n", - mode, ds->ds_open_refcount); - mutex_exit(&ds->ds_lock); - - dmu_buf_rele(ds->ds_dbuf, tag); -} - -void -dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx) -{ - objset_t *mos = dp->dp_meta_objset; - dmu_buf_t *dbuf; - dsl_dataset_phys_t *dsphys; - dsl_dataset_t *ds; - uint64_t dsobj; - dsl_dir_t *dd; - - dsl_dir_create_root(mos, ddobjp, tx); - VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd)); - - dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, - DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; - dsphys->ds_dir_obj = dd->dd_object; - dsphys->ds_fsid_guid = unique_create(); - unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ - (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, - sizeof (dsphys->ds_guid)); - dsphys->ds_snapnames_zapobj = - zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); - dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = tx->tx_txg; - dsphys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - dmu_buf_rele(dbuf, FTAG); - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_head_dataset_obj = dsobj; - dsl_dir_close(dd, FTAG); - - VERIFY(0 == - dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds)); - (void) dmu_objset_create_impl(dp->dp_spa, ds, - &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); -} - -uint64_t -dsl_dataset_create_sync(dsl_dir_t *pdd, - const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx) -{ - dsl_pool_t *dp = pdd->dd_pool; - dmu_buf_t *dbuf; - dsl_dataset_phys_t *dsphys; - uint64_t dsobj, ddobj; - objset_t *mos = dp->dp_meta_objset; - dsl_dir_t *dd; - - ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp); - ASSERT(clone_parent == NULL || - clone_parent->ds_phys->ds_num_children > 0); - ASSERT(lastname[0] != '@'); - ASSERT(dmu_tx_is_syncing(tx)); - - ddobj = dsl_dir_create_sync(pdd, lastname, tx); - VERIFY(0 == 
dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); - - dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, - DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; - dsphys->ds_dir_obj = dd->dd_object; - dsphys->ds_fsid_guid = unique_create(); - unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ - (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, - sizeof (dsphys->ds_guid)); - dsphys->ds_snapnames_zapobj = - zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); - dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = tx->tx_txg; - dsphys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - if (clone_parent) { - dsphys->ds_prev_snap_obj = clone_parent->ds_object; - dsphys->ds_prev_snap_txg = - clone_parent->ds_phys->ds_creation_txg; - dsphys->ds_used_bytes = - clone_parent->ds_phys->ds_used_bytes; - dsphys->ds_compressed_bytes = - clone_parent->ds_phys->ds_compressed_bytes; - dsphys->ds_uncompressed_bytes = - clone_parent->ds_phys->ds_uncompressed_bytes; - dsphys->ds_bp = clone_parent->ds_phys->ds_bp; - - dmu_buf_will_dirty(clone_parent->ds_dbuf, tx); - clone_parent->ds_phys->ds_num_children++; - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object; - } - dmu_buf_rele(dbuf, FTAG); - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_head_dataset_obj = dsobj; - dsl_dir_close(dd, FTAG); - - return (dsobj); -} - -struct destroyarg { - dsl_sync_task_group_t *dstg; - char *snapname; - char *failed; -}; - -static int -dsl_snapshot_destroy_one(char *name, void *arg) -{ - struct destroyarg *da = arg; - dsl_dataset_t *ds; - char *cp; - int err; - - (void) strcat(name, "@"); - (void) strcat(name, da->snapname); - err = dsl_dataset_open(name, - DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, - da->dstg, &ds); - cp = strchr(name, '@'); - *cp = '\0'; - if (err == ENOENT) - return (0); - if (err) { - (void) strcpy(da->failed, name); - return (err); - } - - dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, da->dstg, 0); - return (0); -} - -/* - * Destroy 'snapname' in all descendants of 'fsname'. 
- */ -#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy -int -dsl_snapshots_destroy(char *fsname, char *snapname) -{ - int err; - struct destroyarg da; - dsl_sync_task_t *dst; - spa_t *spa; - char *cp; - - cp = strchr(fsname, '/'); - if (cp) { - *cp = '\0'; - err = spa_open(fsname, &spa, FTAG); - *cp = '/'; - } else { - err = spa_open(fsname, &spa, FTAG); - } - if (err) - return (err); - da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - da.snapname = snapname; - da.failed = fsname; - - err = dmu_objset_find(fsname, - dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); - - if (err == 0) - err = dsl_sync_task_group_wait(da.dstg); - - for (dst = list_head(&da.dstg->dstg_tasks); dst; - dst = list_next(&da.dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; - if (dst->dst_err) { - dsl_dataset_name(ds, fsname); - cp = strchr(fsname, '@'); - *cp = '\0'; - } - /* - * If it was successful, destroy_sync would have - * closed the ds - */ - if (err) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, da.dstg); - } - - dsl_sync_task_group_destroy(da.dstg); - spa_close(spa, FTAG); - return (err); -} - -int -dsl_dataset_destroy(const char *name) -{ - int err; - dsl_sync_task_group_t *dstg; - objset_t *os; - dsl_dataset_t *ds; - dsl_dir_t *dd; - uint64_t obj; - - if (strchr(name, '@')) { - /* Destroying a snapshot is simpler */ - err = dsl_dataset_open(name, - DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, - FTAG, &ds); - if (err) - return (err); - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_destroy_check, dsl_dataset_destroy_sync, - ds, FTAG, 0); - if (err) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - return (err); - } - - err = dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os); - if (err) - return (err); - ds = os->os->os_dsl_dataset; - dd = ds->ds_dir; - - /* - * Check for errors and mark this ds as inconsistent, in - * case we crash while freeing the objects. - */ - err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, - dsl_dataset_destroy_begin_sync, ds, NULL, 0); - if (err) { - dmu_objset_close(os); - return (err); - } - - /* - * remove the objects in open context, so that we won't - * have too much to do in syncing context. - */ - for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, - ds->ds_phys->ds_prev_snap_txg)) { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END); - dmu_tx_hold_bonus(tx, obj); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - /* - * Perhaps there is not enough disk - * space. Just deal with it from - * dsl_dataset_destroy_sync(). - */ - dmu_tx_abort(tx); - continue; - } - VERIFY(0 == dmu_object_free(os, obj, tx)); - dmu_tx_commit(tx); - } - /* Make sure it's not dirty before we finish destroying it. */ - txg_wait_synced(dd->dd_pool, 0); - - dmu_objset_close(os); - if (err != ESRCH) - return (err); - - err = dsl_dataset_open(name, - DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, - FTAG, &ds); - if (err) - return (err); - - err = dsl_dir_open(name, FTAG, &dd, NULL); - if (err) { - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - return (err); - } - - /* - * Blow away the dsl_dir + head dataset. 
- */ - dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); - dsl_sync_task_create(dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, FTAG, 0); - dsl_sync_task_create(dstg, dsl_dir_destroy_check, - dsl_dir_destroy_sync, dd, FTAG, 0); - err = dsl_sync_task_group_wait(dstg); - dsl_sync_task_group_destroy(dstg); - /* if it is successful, *destroy_sync will close the ds+dd */ - if (err) { - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - dsl_dir_close(dd, FTAG); - } - return (err); -} - -int -dsl_dataset_rollback(dsl_dataset_t *ds) -{ - ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX); - return (dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_rollback_check, dsl_dataset_rollback_sync, - ds, NULL, 0)); -} - -void * -dsl_dataset_set_user_ptr(dsl_dataset_t *ds, - void *p, dsl_dataset_evict_func_t func) -{ - void *old; - - mutex_enter(&ds->ds_lock); - old = ds->ds_user_ptr; - if (old == NULL) { - ds->ds_user_ptr = p; - ds->ds_user_evict_func = func; - } - mutex_exit(&ds->ds_lock); - return (old); -} - -void * -dsl_dataset_get_user_ptr(dsl_dataset_t *ds) -{ - return (ds->ds_user_ptr); -} - - -blkptr_t * -dsl_dataset_get_blkptr(dsl_dataset_t *ds) -{ - return (&ds->ds_phys->ds_bp); -} - -void -dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - /* If it's the meta-objset, set dp_meta_rootbp */ - if (ds == NULL) { - tx->tx_pool->dp_meta_rootbp = *bp; - } else { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_bp = *bp; - } -} - -spa_t * -dsl_dataset_get_spa(dsl_dataset_t *ds) -{ - return (ds->ds_dir->dd_pool->dp_spa); -} - -void -dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp; - - if (ds == NULL) /* this is the meta-objset */ - return; - - ASSERT(ds->ds_user_ptr != NULL); - - if (ds->ds_phys->ds_next_snap_obj != 0) - panic("dirtying snapshot!"); - - dp = ds->ds_dir->dd_pool; - - if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { - /* up the hold count until we can be written out */ - dmu_buf_add_ref(ds->ds_dbuf, ds); - } -} - -struct killarg { - uint64_t *usedp; - uint64_t *compressedp; - uint64_t *uncompressedp; - zio_t *zio; - dmu_tx_t *tx; -}; - -static int -kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg) -{ - struct killarg *ka = arg; - blkptr_t *bp = &bc->bc_blkptr; - - ASSERT3U(bc->bc_errno, ==, 0); - - /* - * Since this callback is not called concurrently, no lock is - * needed on the accounting values. - */ - *ka->usedp += bp_get_dasize(spa, bp); - *ka->compressedp += BP_GET_PSIZE(bp); - *ka->uncompressedp += BP_GET_UCSIZE(bp); - /* XXX check for EIO? */ - (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL, - ARC_NOWAIT); - return (0); -} - -/* ARGSUSED */ -static int -dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - - /* - * There must be a previous snapshot. I suppose we could roll - * it back to being empty (and re-initialize the upper (ZPL) - * layer). But for now there's no way to do this via the user - * interface. - */ - if (ds->ds_phys->ds_prev_snap_txg == 0) - return (EINVAL); - - /* - * This must not be a snapshot. - */ - if (ds->ds_phys->ds_next_snap_obj != 0) - return (EINVAL); - - /* - * If we made changes this txg, traverse_dsl_dataset won't find - * them. Try again. 
- */ - if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) - return (EAGAIN); - - return (0); -} - -/* ARGSUSED */ -static void -dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - - /* Zero out the deadlist. */ - bplist_close(&ds->ds_deadlist); - bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); - ds->ds_phys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, - ds->ds_phys->ds_deadlist_obj)); - - { - /* Free blkptrs that we gave birth to */ - zio_t *zio; - uint64_t used = 0, compressed = 0, uncompressed = 0; - struct killarg ka; - - zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, - ZIO_FLAG_MUSTSUCCEED); - ka.usedp = &used; - ka.compressedp = &compressed; - ka.uncompressedp = &uncompressed; - ka.zio = zio; - ka.tx = tx; - (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, - ADVANCE_POST, kill_blkptr, &ka); - (void) zio_wait(zio); - - dsl_dir_diduse_space(ds->ds_dir, - -used, -compressed, -uncompressed, tx); - } - - /* Change our contents to that of the prev snapshot */ - ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj); - ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; - ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes; - ds->ds_phys->ds_compressed_bytes = - ds->ds_prev->ds_phys->ds_compressed_bytes; - ds->ds_phys->ds_uncompressed_bytes = - ds->ds_prev->ds_phys->ds_uncompressed_bytes; - ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags; - ds->ds_phys->ds_unique_bytes = 0; - - if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ds->ds_prev->ds_phys->ds_unique_bytes = 0; - } -} - -/* ARGSUSED */ -static int -dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - - /* - * Can't delete a head dataset if there are snapshots of it. - * (Except if the only snapshots are from the branch we cloned - * from.) - */ - if (ds->ds_prev != NULL && - ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) - return (EINVAL); - - return (0); -} - -/* ARGSUSED */ -static void -dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - - /* Mark it as inconsistent on-disk, in case we crash */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; -} - -/* ARGSUSED */ -static int -dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - - /* Can't delete a branch point. */ - if (ds->ds_phys->ds_num_children > 1) - return (EEXIST); - - /* - * Can't delete a head dataset if there are snapshots of it. - * (Except if the only snapshots are from the branch we cloned - * from.) - */ - if (ds->ds_prev != NULL && - ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) - return (EINVAL); - - /* - * If we made changes this txg, traverse_dsl_dataset won't find - * them. Try again. - */ - if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) - return (EAGAIN); - - /* XXX we should do some i/o error checking... 
*/ - return (0); -} - -static void -dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - uint64_t used = 0, compressed = 0, uncompressed = 0; - zio_t *zio; - int err; - int after_branch_point = FALSE; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - dsl_dataset_t *ds_prev = NULL; - uint64_t obj; - - ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX); - ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); - ASSERT(ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); - ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); - - ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - - obj = ds->ds_object; - - if (ds->ds_phys->ds_prev_snap_obj != 0) { - if (ds->ds_prev) { - ds_prev = ds->ds_prev; - } else { - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_NONE, FTAG, &ds_prev)); - } - after_branch_point = - (ds_prev->ds_phys->ds_next_snap_obj != obj); - - dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); - if (after_branch_point && - ds->ds_phys->ds_next_snap_obj == 0) { - /* This clone is toast. */ - ASSERT(ds_prev->ds_phys->ds_num_children > 1); - ds_prev->ds_phys->ds_num_children--; - } else if (!after_branch_point) { - ds_prev->ds_phys->ds_next_snap_obj = - ds->ds_phys->ds_next_snap_obj; - } - } - - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - - if (ds->ds_phys->ds_next_snap_obj != 0) { - blkptr_t bp; - dsl_dataset_t *ds_next; - uint64_t itor = 0; - - spa_scrub_restart(dp->dp_spa, tx->tx_txg); - - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_next_snap_obj, NULL, - DS_MODE_NONE, FTAG, &ds_next)); - ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); - - dmu_buf_will_dirty(ds_next->ds_dbuf, tx); - ds_next->ds_phys->ds_prev_snap_obj = - ds->ds_phys->ds_prev_snap_obj; - ds_next->ds_phys->ds_prev_snap_txg = - ds->ds_phys->ds_prev_snap_txg; - ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, - ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); - - /* - * Transfer to our deadlist (which will become next's - * new deadlist) any entries from next's current - * deadlist which were born before prev, and free the - * other entries. - * - * XXX we're doing this long task with the config lock held - */ - while (bplist_iterate(&ds_next->ds_deadlist, &itor, - &bp) == 0) { - if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) { - VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, - &bp, tx)); - if (ds_prev && !after_branch_point && - bp.blk_birth > - ds_prev->ds_phys->ds_prev_snap_txg) { - ds_prev->ds_phys->ds_unique_bytes += - bp_get_dasize(dp->dp_spa, &bp); - } - } else { - used += bp_get_dasize(dp->dp_spa, &bp); - compressed += BP_GET_PSIZE(&bp); - uncompressed += BP_GET_UCSIZE(&bp); - /* XXX check return value? */ - (void) arc_free(zio, dp->dp_spa, tx->tx_txg, - &bp, NULL, NULL, ARC_NOWAIT); - } - } - - /* free next's deadlist */ - bplist_close(&ds_next->ds_deadlist); - bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx); - - /* set next's deadlist to our deadlist */ - ds_next->ds_phys->ds_deadlist_obj = - ds->ds_phys->ds_deadlist_obj; - VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos, - ds_next->ds_phys->ds_deadlist_obj)); - ds->ds_phys->ds_deadlist_obj = 0; - - if (ds_next->ds_phys->ds_next_snap_obj != 0) { - /* - * Update next's unique to include blocks which - * were previously shared by only this snapshot - * and it. 
Those blocks will be born after the - * prev snap and before this snap, and will have - * died after the next snap and before the one - * after that (ie. be on the snap after next's - * deadlist). - * - * XXX we're doing this long task with the - * config lock held - */ - dsl_dataset_t *ds_after_next; - - VERIFY(0 == dsl_dataset_open_obj(dp, - ds_next->ds_phys->ds_next_snap_obj, NULL, - DS_MODE_NONE, FTAG, &ds_after_next)); - itor = 0; - while (bplist_iterate(&ds_after_next->ds_deadlist, - &itor, &bp) == 0) { - if (bp.blk_birth > - ds->ds_phys->ds_prev_snap_txg && - bp.blk_birth <= - ds->ds_phys->ds_creation_txg) { - ds_next->ds_phys->ds_unique_bytes += - bp_get_dasize(dp->dp_spa, &bp); - } - } - - dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG); - ASSERT3P(ds_next->ds_prev, ==, NULL); - } else { - /* - * It would be nice to update the head dataset's - * unique. To do so we would have to traverse - * it for blocks born after ds_prev, which is - * pretty expensive just to maintain something - * for debugging purposes. - */ - ASSERT3P(ds_next->ds_prev, ==, ds); - dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE, - ds_next); - if (ds_prev) { - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_NONE, ds_next, &ds_next->ds_prev)); - } else { - ds_next->ds_prev = NULL; - } - } - dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG); - - /* - * NB: unique_bytes is not accurate for head objsets - * because we don't update it when we delete the most - * recent snapshot -- see above comment. - */ - ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); - } else { - /* - * There's no next snapshot, so this is a head dataset. - * Destroy the deadlist. Unless it's a clone, the - * deadlist should be empty. (If it's a clone, it's - * safe to ignore the deadlist contents.) - */ - struct killarg ka; - - ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist)); - bplist_close(&ds->ds_deadlist); - bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); - ds->ds_phys->ds_deadlist_obj = 0; - - /* - * Free everything that we point to (that's born after - * the previous snapshot, if we are a clone) - * - * XXX we're doing this long task with the config lock held - */ - ka.usedp = &used; - ka.compressedp = &compressed; - ka.uncompressedp = &uncompressed; - ka.zio = zio; - ka.tx = tx; - err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, - ADVANCE_POST, kill_blkptr, &ka); - ASSERT3U(err, ==, 0); - } - - err = zio_wait(zio); - ASSERT3U(err, ==, 0); - - dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx); - - if (ds->ds_phys->ds_snapnames_zapobj) { - err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); - ASSERT(err == 0); - } - - if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { - /* Erase the link in the dataset */ - dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); - ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; - /* - * dsl_dir_sync_destroy() called us, they'll destroy - * the dataset. 
- */ - } else { - /* remove from snapshot namespace */ - dsl_dataset_t *ds_head; - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL, - DS_MODE_NONE, FTAG, &ds_head)); - VERIFY(0 == dsl_dataset_get_snapname(ds)); -#ifdef ZFS_DEBUG - { - uint64_t val; - err = zap_lookup(mos, - ds_head->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, 8, 1, &val); - ASSERT3U(err, ==, 0); - ASSERT3U(val, ==, obj); - } -#endif - err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, tx); - ASSERT(err == 0); - dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG); - } - - if (ds_prev && ds->ds_prev != ds_prev) - dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG); - - spa_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag); - VERIFY(0 == dmu_object_free(mos, obj, tx)); - -} - -/* ARGSUSED */ -int -dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - objset_t *os = arg1; - dsl_dataset_t *ds = os->os->os_dsl_dataset; - const char *snapname = arg2; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - int err; - uint64_t value; - - /* - * We don't allow multiple snapshots of the same txg. If there - * is already one, try again. - */ - if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) - return (EAGAIN); - - /* - * Check for conflicting name snapshot name. - */ - err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj, - snapname, 8, 1, &value); - if (err == 0) - return (EEXIST); - if (err != ENOENT) - return (err); - - /* - * Check that the dataset's name is not too long. Name consists - * of the dataset's length + 1 for the @-sign + snapshot name's length - */ - if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) - return (ENAMETOOLONG); - - ds->ds_trysnap_txg = tx->tx_txg; - return (0); -} - -void -dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - objset_t *os = arg1; - dsl_dataset_t *ds = os->os->os_dsl_dataset; - const char *snapname = arg2; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - dmu_buf_t *dbuf; - dsl_dataset_phys_t *dsphys; - uint64_t dsobj; - objset_t *mos = dp->dp_meta_objset; - int err; - - spa_scrub_restart(dp->dp_spa, tx->tx_txg); - ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - - dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, - DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; - dsphys->ds_dir_obj = ds->ds_dir->dd_object; - dsphys->ds_fsid_guid = unique_create(); - unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ - (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, - sizeof (dsphys->ds_guid)); - dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; - dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; - dsphys->ds_next_snap_obj = ds->ds_object; - dsphys->ds_num_children = 1; - dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = tx->tx_txg; - dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; - dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; - dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; - dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; - dsphys->ds_flags = ds->ds_phys->ds_flags; - dsphys->ds_bp = ds->ds_phys->ds_bp; - dmu_buf_rele(dbuf, FTAG); - - ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); - if (ds->ds_prev) { - ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == - ds->ds_object || - ds->ds_prev->ds_phys->ds_num_children > 
1); - if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, - ds->ds_prev->ds_phys->ds_creation_txg); - ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; - } - } - - bplist_close(&ds->ds_deadlist); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg); - ds->ds_phys->ds_prev_snap_obj = dsobj; - ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg; - ds->ds_phys->ds_unique_bytes = 0; - ds->ds_phys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, - ds->ds_phys->ds_deadlist_obj)); - - dprintf("snap '%s' -> obj %llu\n", snapname, dsobj); - err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, - snapname, 8, 1, &dsobj, tx); - ASSERT(err == 0); - - if (ds->ds_prev) - dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds); - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, snapname, - DS_MODE_NONE, ds, &ds->ds_prev)); -} - -void -dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(ds->ds_user_ptr != NULL); - ASSERT(ds->ds_phys->ds_next_snap_obj == 0); - - dsl_dir_dirty(ds->ds_dir, tx); - dmu_objset_sync(ds->ds_user_ptr, zio, tx); - /* Unneeded? bplist_close(&ds->ds_deadlist); */ -} - -void -dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) -{ - dsl_dir_stats(ds->ds_dir, nv); - - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, - ds->ds_phys->ds_creation_time); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, - ds->ds_phys->ds_creation_txg); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, - ds->ds_phys->ds_used_bytes); - - if (ds->ds_phys->ds_next_snap_obj) { - /* - * This is a snapshot; override the dd's space used with - * our unique space and compression ratio. - */ - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, - ds->ds_phys->ds_unique_bytes); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, - ds->ds_phys->ds_compressed_bytes == 0 ? 100 : - (ds->ds_phys->ds_uncompressed_bytes * 100 / - ds->ds_phys->ds_compressed_bytes)); - } -} - -void -dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) -{ - stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; - stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; - if (ds->ds_phys->ds_next_snap_obj) { - stat->dds_is_snapshot = B_TRUE; - stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; - } - - /* clone origin is really a dsl_dir thing... 
*/ - if (ds->ds_dir->dd_phys->dd_clone_parent_obj) { - dsl_dataset_t *ods; - - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool, - ds->ds_dir->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_NONE, FTAG, &ods)); - dsl_dataset_name(ods, stat->dds_clone_of); - dsl_dataset_close(ods, DS_MODE_NONE, FTAG); - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); - } -} - -uint64_t -dsl_dataset_fsid_guid(dsl_dataset_t *ds) -{ - return (ds->ds_phys->ds_fsid_guid); -} - -void -dsl_dataset_space(dsl_dataset_t *ds, - uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp) -{ - *refdbytesp = ds->ds_phys->ds_used_bytes; - *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); - *usedobjsp = ds->ds_phys->ds_bp.blk_fill; - *availobjsp = DN_MAX_OBJECT - *usedobjsp; -} - -/* ARGSUSED */ -static int -dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - char *newsnapname = arg2; - dsl_dir_t *dd = ds->ds_dir; - objset_t *mos = dd->dd_pool->dp_meta_objset; - dsl_dataset_t *hds; - uint64_t val; - int err; - - err = dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds); - if (err) - return (err); - - /* new name better not be in use */ - err = zap_lookup(mos, hds->ds_phys->ds_snapnames_zapobj, - newsnapname, 8, 1, &val); - dsl_dataset_close(hds, DS_MODE_NONE, FTAG); - - if (err == 0) - err = EEXIST; - else if (err == ENOENT) - err = 0; - - /* dataset name + 1 for the "@" + the new snapshot name must fit */ - if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) - err = ENAMETOOLONG; - - return (err); -} - -static void -dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - char *newsnapname = arg2; - dsl_dir_t *dd = ds->ds_dir; - objset_t *mos = dd->dd_pool->dp_meta_objset; - dsl_dataset_t *hds; - int err; - - ASSERT(ds->ds_phys->ds_next_snap_obj != 0); - - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds)); - - VERIFY(0 == dsl_dataset_get_snapname(ds)); - err = zap_remove(mos, hds->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, tx); - ASSERT3U(err, ==, 0); - mutex_enter(&ds->ds_lock); - (void) strcpy(ds->ds_snapname, newsnapname); - mutex_exit(&ds->ds_lock); - err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, 8, 1, &ds->ds_object, tx); - ASSERT3U(err, ==, 0); - - dsl_dataset_close(hds, DS_MODE_NONE, FTAG); -} - -struct renamearg { - dsl_sync_task_group_t *dstg; - char failed[MAXPATHLEN]; - char *oldsnap; - char *newsnap; -}; - -static int -dsl_snapshot_rename_one(char *name, void *arg) -{ - struct renamearg *ra = arg; - dsl_dataset_t *ds = NULL; - char *cp; - int err; - - cp = name + strlen(name); - *cp = '@'; - (void) strcpy(cp + 1, ra->oldsnap); - err = dsl_dataset_open(name, DS_MODE_READONLY | DS_MODE_STANDARD, - ra->dstg, &ds); - if (err == ENOENT) { - *cp = '\0'; - return (0); - } - if (err) { - (void) strcpy(ra->failed, name); - *cp = '\0'; - dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg); - return (err); - } - -#ifdef _KERNEL - /* for all filesystems undergoing rename, we'll need to unmount it */ - (void) zfs_unmount_snap(name, NULL); -#endif - - *cp = '\0'; - - dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, - dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); - - return (0); -} - -static int -dsl_recursive_rename(char 
*oldname, const char *newname) -{ - int err; - struct renamearg *ra; - dsl_sync_task_t *dst; - spa_t *spa; - char *cp, *fsname = spa_strdup(oldname); - int len = strlen(oldname); - - /* truncate the snapshot name to get the fsname */ - cp = strchr(fsname, '@'); - *cp = '\0'; - - cp = strchr(fsname, '/'); - if (cp) { - *cp = '\0'; - err = spa_open(fsname, &spa, FTAG); - *cp = '/'; - } else { - err = spa_open(fsname, &spa, FTAG); - } - if (err) { - kmem_free(fsname, len + 1); - return (err); - } - ra = kmem_alloc(sizeof (struct renamearg), KM_SLEEP); - ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - - ra->oldsnap = strchr(oldname, '@') + 1; - ra->newsnap = strchr(newname, '@') + 1; - *ra->failed = '\0'; - - err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, - DS_FIND_CHILDREN); - kmem_free(fsname, len + 1); - - if (err == 0) { - err = dsl_sync_task_group_wait(ra->dstg); - } - - for (dst = list_head(&ra->dstg->dstg_tasks); dst; - dst = list_next(&ra->dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; - if (dst->dst_err) { - dsl_dir_name(ds->ds_dir, ra->failed); - (void) strcat(ra->failed, "@"); - (void) strcat(ra->failed, ra->newsnap); - } - dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg); - } - - (void) strcpy(oldname, ra->failed); - - dsl_sync_task_group_destroy(ra->dstg); - kmem_free(ra, sizeof (struct renamearg)); - spa_close(spa, FTAG); - return (err); -} - -#pragma weak dmu_objset_rename = dsl_dataset_rename -int -dsl_dataset_rename(char *oldname, const char *newname, - boolean_t recursive) -{ - dsl_dir_t *dd; - dsl_dataset_t *ds; - const char *tail; - int err; - - err = dsl_dir_open(oldname, FTAG, &dd, &tail); - if (err) - return (err); - if (tail == NULL) { - err = dsl_dir_rename(dd, newname); - dsl_dir_close(dd, FTAG); - return (err); - } - if (tail[0] != '@') { - /* the name ended in a nonexistant component */ - dsl_dir_close(dd, FTAG); - return (ENOENT); - } - - dsl_dir_close(dd, FTAG); - - /* new name must be snapshot in same filesystem */ - tail = strchr(newname, '@'); - if (tail == NULL) - return (EINVAL); - tail++; - if (strncmp(oldname, newname, tail - newname) != 0) - return (EXDEV); - - if (recursive) { - err = dsl_recursive_rename(oldname, newname); - } else { - err = dsl_dataset_open(oldname, - DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds); - if (err) - return (err); - - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_snapshot_rename_check, - dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); - - dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG); - } - - return (err); -} - -struct promotearg { - uint64_t used, comp, uncomp, unique; - uint64_t newnext_obj, snapnames_obj; -}; - -static int -dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *hds = arg1; - struct promotearg *pa = arg2; - dsl_dir_t *dd = hds->ds_dir; - dsl_pool_t *dp = hds->ds_dir->dd_pool; - dsl_dir_t *pdd = NULL; - dsl_dataset_t *ds = NULL; - dsl_dataset_t *pivot_ds = NULL; - dsl_dataset_t *newnext_ds = NULL; - int err; - char *name = NULL; - uint64_t itor = 0; - blkptr_t bp; - - bzero(pa, sizeof (*pa)); - - /* Check that it is a clone */ - if (dd->dd_phys->dd_clone_parent_obj == 0) - return (EINVAL); - - /* Since this is so expensive, don't do the preliminary check */ - if (!dmu_tx_is_syncing(tx)) - return (0); - - if (err = dsl_dataset_open_obj(dp, - dd->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds)) - goto out; - pdd = pivot_ds->ds_dir; - - { - dsl_dataset_t *phds; - if (err = 
dsl_dataset_open_obj(dd->dd_pool, - pdd->dd_phys->dd_head_dataset_obj, - NULL, DS_MODE_NONE, FTAG, &phds)) - goto out; - pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj; - dsl_dataset_close(phds, DS_MODE_NONE, FTAG); - } - - if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) { - err = EXDEV; - goto out; - } - - /* find pivot point's new next ds */ - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object, - NULL, DS_MODE_NONE, FTAG, &newnext_ds)); - while (newnext_ds->ds_phys->ds_prev_snap_obj != pivot_ds->ds_object) { - dsl_dataset_t *prev; - - if (err = dsl_dataset_open_obj(dd->dd_pool, - newnext_ds->ds_phys->ds_prev_snap_obj, - NULL, DS_MODE_NONE, FTAG, &prev)) - goto out; - dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG); - newnext_ds = prev; - } - pa->newnext_obj = newnext_ds->ds_object; - - /* compute pivot point's new unique space */ - while ((err = bplist_iterate(&newnext_ds->ds_deadlist, - &itor, &bp)) == 0) { - if (bp.blk_birth > pivot_ds->ds_phys->ds_prev_snap_txg) - pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp); - } - if (err != ENOENT) - goto out; - - /* Walk the snapshots that we are moving */ - name = kmem_alloc(MAXPATHLEN, KM_SLEEP); - ds = pivot_ds; - /* CONSTCOND */ - while (TRUE) { - uint64_t val, dlused, dlcomp, dluncomp; - dsl_dataset_t *prev; - - /* Check that the snapshot name does not conflict */ - dsl_dataset_name(ds, name); - err = zap_lookup(dd->dd_pool->dp_meta_objset, - hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, - 8, 1, &val); - if (err != ENOENT) { - if (err == 0) - err = EEXIST; - goto out; - } - - /* - * compute space to transfer. Each snapshot gave birth to: - * (my used) - (prev's used) + (deadlist's used) - */ - pa->used += ds->ds_phys->ds_used_bytes; - pa->comp += ds->ds_phys->ds_compressed_bytes; - pa->uncomp += ds->ds_phys->ds_uncompressed_bytes; - - /* If we reach the first snapshot, we're done. */ - if (ds->ds_phys->ds_prev_snap_obj == 0) - break; - - if (err = bplist_space(&ds->ds_deadlist, - &dlused, &dlcomp, &dluncomp)) - goto out; - if (err = dsl_dataset_open_obj(dd->dd_pool, - ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE, - FTAG, &prev)) - goto out; - pa->used += dlused - prev->ds_phys->ds_used_bytes; - pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes; - pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes; - - /* - * We could be a clone of a clone. If we reach our - * parent's branch point, we're done. 
- */ - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { - dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG); - break; - } - if (ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - ds = prev; - } - - /* Check that there is enough space here */ - err = dsl_dir_transfer_possible(pdd, dd, pa->used); - -out: - if (ds && ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - if (pivot_ds) - dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG); - if (newnext_ds) - dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG); - if (name) - kmem_free(name, MAXPATHLEN); - return (err); -} - -static void -dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *hds = arg1; - struct promotearg *pa = arg2; - dsl_dir_t *dd = hds->ds_dir; - dsl_pool_t *dp = hds->ds_dir->dd_pool; - dsl_dir_t *pdd = NULL; - dsl_dataset_t *ds, *pivot_ds; - char *name; - - ASSERT(dd->dd_phys->dd_clone_parent_obj != 0); - ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); - - VERIFY(0 == dsl_dataset_open_obj(dp, - dd->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds)); - /* - * We need to explicitly open pdd, since pivot_ds's pdd will be - * changing. - */ - VERIFY(0 == dsl_dir_open_obj(dp, pivot_ds->ds_dir->dd_object, - NULL, FTAG, &pdd)); - - /* move snapshots to this dir */ - name = kmem_alloc(MAXPATHLEN, KM_SLEEP); - ds = pivot_ds; - /* CONSTCOND */ - while (TRUE) { - dsl_dataset_t *prev; - - /* move snap name entry */ - dsl_dataset_name(ds, name); - VERIFY(0 == zap_remove(dp->dp_meta_objset, - pa->snapnames_obj, ds->ds_snapname, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, - hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, - 8, 1, &ds->ds_object, tx)); - - /* change containing dsl_dir */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ASSERT3U(ds->ds_phys->ds_dir_obj, ==, pdd->dd_object); - ds->ds_phys->ds_dir_obj = dd->dd_object; - ASSERT3P(ds->ds_dir, ==, pdd); - dsl_dir_close(ds->ds_dir, ds); - VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, - NULL, ds, &ds->ds_dir)); - - ASSERT3U(dsl_prop_numcb(ds), ==, 0); - - if (ds->ds_phys->ds_prev_snap_obj == 0) - break; - - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE, - FTAG, &prev)); - - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { - dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG); - break; - } - if (ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - ds = prev; - } - if (ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - - /* change pivot point's next snap */ - dmu_buf_will_dirty(pivot_ds->ds_dbuf, tx); - pivot_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj; - - /* change clone_parent-age */ - dmu_buf_will_dirty(dd->dd_dbuf, tx); - ASSERT3U(dd->dd_phys->dd_clone_parent_obj, ==, pivot_ds->ds_object); - dd->dd_phys->dd_clone_parent_obj = pdd->dd_phys->dd_clone_parent_obj; - dmu_buf_will_dirty(pdd->dd_dbuf, tx); - pdd->dd_phys->dd_clone_parent_obj = pivot_ds->ds_object; - - /* change space accounting */ - dsl_dir_diduse_space(pdd, -pa->used, -pa->comp, -pa->uncomp, tx); - dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx); - pivot_ds->ds_phys->ds_unique_bytes = pa->unique; - - dsl_dir_close(pdd, FTAG); - dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG); - kmem_free(name, MAXPATHLEN); -} - -int -dsl_dataset_promote(const char *name) -{ - dsl_dataset_t *ds; - int err; - dmu_object_info_t doi; - struct promotearg pa; - - err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds); - if (err) - return 
(err); - - err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, &doi); - if (err) { - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); - return (err); - } - - /* - * Add in 128x the snapnames zapobj size, since we will be moving - * a bunch of snapnames to the promoted ds, and dirtying their - * bonus buffers. - */ - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_promote_check, - dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); - return (err); -} - -/* - * Given a pool name and a dataset object number in that pool, - * return the name of that dataset. - */ -int -dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) -{ - spa_t *spa; - dsl_pool_t *dp; - dsl_dataset_t *ds = NULL; - int error; - - if ((error = spa_open(pname, &spa, FTAG)) != 0) - return (error); - dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); - if ((error = dsl_dataset_open_obj(dp, obj, - NULL, DS_MODE_NONE, FTAG, &ds)) != 0) { - rw_exit(&dp->dp_config_rwlock); - spa_close(spa, FTAG); - return (error); - } - dsl_dataset_name(ds, buf); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); - rw_exit(&dp->dp_config_rwlock); - spa_close(spa, FTAG); - - return (0); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c deleted file mode 100644 index 5e563b6..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c +++ /dev/null @@ -1,1215 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu.h> -#include <sys/dmu_tx.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_prop.h> -#include <sys/dsl_synctask.h> -#include <sys/spa.h> -#include <sys/zap.h> -#include <sys/zio.h> -#include <sys/arc.h> -#include "zfs_namecheck.h" - -static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd); -static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx); - - -/* ARGSUSED */ -static void -dsl_dir_evict(dmu_buf_t *db, void *arg) -{ - dsl_dir_t *dd = arg; - dsl_pool_t *dp = dd->dd_pool; - int t; - - for (t = 0; t < TXG_SIZE; t++) { - ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); - ASSERT(dd->dd_tempreserved[t] == 0); - ASSERT(dd->dd_space_towrite[t] == 0); - } - - ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes); - - if (dd->dd_parent) - dsl_dir_close(dd->dd_parent, dd); - - spa_close(dd->dd_pool->dp_spa, dd); - - /* - * The props callback list should be empty since they hold the - * dir open. 
- */ - list_destroy(&dd->dd_prop_cbs); - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); -} - -int -dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag, dsl_dir_t **ddp) -{ - dmu_buf_t *dbuf; - dsl_dir_t *dd; - int err; - - ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || - dsl_pool_sync_context(dp)); - - err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); - if (err) - return (err); - dd = dmu_buf_get_user(dbuf); -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(dbuf, &doi); - ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR); - } -#endif - /* XXX assert bonus buffer size is correct */ - if (dd == NULL) { - dsl_dir_t *winner; - int err; - - dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); - dd->dd_object = ddobj; - dd->dd_dbuf = dbuf; - dd->dd_pool = dp; - dd->dd_phys = dbuf->db_data; - dd->dd_used_bytes = dd->dd_phys->dd_used_bytes; - mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); - - list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), - offsetof(dsl_prop_cb_record_t, cbr_node)); - - if (dd->dd_phys->dd_parent_obj) { - err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, - NULL, dd, &dd->dd_parent); - if (err) { - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dmu_buf_rele(dbuf, tag); - return (err); - } - if (tail) { -#ifdef ZFS_DEBUG - uint64_t foundobj; - - err = zap_lookup(dp->dp_meta_objset, - dd->dd_parent->dd_phys-> - dd_child_dir_zapobj, - tail, sizeof (foundobj), 1, &foundobj); - ASSERT(err || foundobj == ddobj); -#endif - (void) strcpy(dd->dd_myname, tail); - } else { - err = zap_value_search(dp->dp_meta_objset, - dd->dd_parent->dd_phys-> - dd_child_dir_zapobj, - ddobj, dd->dd_myname); - } - if (err) { - dsl_dir_close(dd->dd_parent, dd); - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dmu_buf_rele(dbuf, tag); - return (err); - } - } else { - (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); - } - - winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, - dsl_dir_evict); - if (winner) { - if (dd->dd_parent) - dsl_dir_close(dd->dd_parent, dd); - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dd = winner; - } else { - spa_open_ref(dp->dp_spa, dd); - } - } - - /* - * The dsl_dir_t has both open-to-close and instantiate-to-evict - * holds on the spa. We need the open-to-close holds because - * otherwise the spa_refcnt wouldn't change when we open a - * dir which the spa also has open, so we could incorrectly - * think it was OK to unload/export/destroy the pool. We need - * the instantiate-to-evict hold because the dsl_dir_t has a - * pointer to the dd_pool, which has a pointer to the spa_t. 
- */ - spa_open_ref(dp->dp_spa, tag); - ASSERT3P(dd->dd_pool, ==, dp); - ASSERT3U(dd->dd_object, ==, ddobj); - ASSERT3P(dd->dd_dbuf, ==, dbuf); - *ddp = dd; - return (0); -} - -void -dsl_dir_close(dsl_dir_t *dd, void *tag) -{ - dprintf_dd(dd, "%s\n", ""); - spa_close(dd->dd_pool->dp_spa, tag); - dmu_buf_rele(dd->dd_dbuf, tag); -} - -/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ -void -dsl_dir_name(dsl_dir_t *dd, char *buf) -{ - if (dd->dd_parent) { - dsl_dir_name(dd->dd_parent, buf); - (void) strcat(buf, "/"); - } else { - buf[0] = '\0'; - } - if (!MUTEX_HELD(&dd->dd_lock)) { - /* - * recursive mutex so that we can use - * dprintf_dd() with dd_lock held - */ - mutex_enter(&dd->dd_lock); - (void) strcat(buf, dd->dd_myname); - mutex_exit(&dd->dd_lock); - } else { - (void) strcat(buf, dd->dd_myname); - } -} - -/* Calculate name legnth, avoiding all the strcat calls of dsl_dir_name */ -int -dsl_dir_namelen(dsl_dir_t *dd) -{ - int result = 0; - - if (dd->dd_parent) { - /* parent's name + 1 for the "/" */ - result = dsl_dir_namelen(dd->dd_parent) + 1; - } - - if (!MUTEX_HELD(&dd->dd_lock)) { - /* see dsl_dir_name */ - mutex_enter(&dd->dd_lock); - result += strlen(dd->dd_myname); - mutex_exit(&dd->dd_lock); - } else { - result += strlen(dd->dd_myname); - } - - return (result); -} - -int -dsl_dir_is_private(dsl_dir_t *dd) -{ - int rv = FALSE; - - if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent)) - rv = TRUE; - if (dataset_name_hidden(dd->dd_myname)) - rv = TRUE; - return (rv); -} - - -static int -getcomponent(const char *path, char *component, const char **nextp) -{ - char *p; - if (path == NULL) - return (ENOENT); - /* This would be a good place to reserve some namespace... */ - p = strpbrk(path, "/@"); - if (p && (p[1] == '/' || p[1] == '@')) { - /* two separators in a row */ - return (EINVAL); - } - if (p == NULL || p == path) { - /* - * if the first thing is an @ or /, it had better be an - * @ and it had better not have any more ats or slashes, - * and it had better have something after the @. - */ - if (p != NULL && - (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) - return (EINVAL); - if (strlen(path) >= MAXNAMELEN) - return (ENAMETOOLONG); - (void) strcpy(component, path); - p = NULL; - } else if (p[0] == '/') { - if (p-path >= MAXNAMELEN) - return (ENAMETOOLONG); - (void) strncpy(component, path, p - path); - component[p-path] = '\0'; - p++; - } else if (p[0] == '@') { - /* - * if the next separator is an @, there better not be - * any more slashes. 
- */ - if (strchr(path, '/')) - return (EINVAL); - if (p-path >= MAXNAMELEN) - return (ENAMETOOLONG); - (void) strncpy(component, path, p - path); - component[p-path] = '\0'; - } else { - ASSERT(!"invalid p"); - } - *nextp = p; - return (0); -} - -/* - * same as dsl_open_dir, ignore the first component of name and use the - * spa instead - */ -int -dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - dsl_dir_t **ddp, const char **tailp) -{ - char buf[MAXNAMELEN]; - const char *next, *nextnext = NULL; - int err; - dsl_dir_t *dd; - dsl_pool_t *dp; - uint64_t ddobj; - int openedspa = FALSE; - - dprintf("%s\n", name); - - err = getcomponent(name, buf, &next); - if (err) - return (err); - if (spa == NULL) { - err = spa_open(buf, &spa, FTAG); - if (err) { - dprintf("spa_open(%s) failed\n", buf); - return (err); - } - openedspa = TRUE; - - /* XXX this assertion belongs in spa_open */ - ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa))); - } - - dp = spa_get_dsl(spa); - - rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); - if (err) { - rw_exit(&dp->dp_config_rwlock); - if (openedspa) - spa_close(spa, FTAG); - return (err); - } - - while (next != NULL) { - dsl_dir_t *child_ds; - err = getcomponent(next, buf, &nextnext); - if (err) - break; - ASSERT(next[0] != '\0'); - if (next[0] == '@') - break; - dprintf("looking up %s in obj%lld\n", - buf, dd->dd_phys->dd_child_dir_zapobj); - - err = zap_lookup(dp->dp_meta_objset, - dd->dd_phys->dd_child_dir_zapobj, - buf, sizeof (ddobj), 1, &ddobj); - if (err) { - if (err == ENOENT) - err = 0; - break; - } - - err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds); - if (err) - break; - dsl_dir_close(dd, tag); - dd = child_ds; - next = nextnext; - } - rw_exit(&dp->dp_config_rwlock); - - if (err) { - dsl_dir_close(dd, tag); - if (openedspa) - spa_close(spa, FTAG); - return (err); - } - - /* - * It's an error if there's more than one component left, or - * tailp==NULL and there's any component left. - */ - if (next != NULL && - (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { - /* bad path name */ - dsl_dir_close(dd, tag); - dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); - err = ENOENT; - } - if (tailp) - *tailp = next; - if (openedspa) - spa_close(spa, FTAG); - *ddp = dd; - return (err); -} - -/* - * Return the dsl_dir_t, and possibly the last component which couldn't - * be found in *tail. Return NULL if the path is bogus, or if - * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' - * means that the last component is a snapshot. 
- */ -int -dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) -{ - return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp)); -} - -uint64_t -dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx) -{ - objset_t *mos = pds->dd_pool->dp_meta_objset; - uint64_t ddobj; - dsl_dir_phys_t *dsphys; - dmu_buf_t *dbuf; - - ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, - DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); - VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, - name, sizeof (uint64_t), 1, &ddobj, tx)); - VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; - - dsphys->dd_creation_time = gethrestime_sec(); - dsphys->dd_parent_obj = pds->dd_object; - dsphys->dd_props_zapobj = zap_create(mos, - DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); - dsphys->dd_child_dir_zapobj = zap_create(mos, - DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); - dmu_buf_rele(dbuf, FTAG); - - return (ddobj); -} - -/* ARGSUSED */ -int -dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - dsl_pool_t *dp = dd->dd_pool; - objset_t *mos = dp->dp_meta_objset; - int err; - uint64_t count; - - /* - * There should be exactly two holds, both from - * dsl_dataset_destroy: one on the dd directory, and one on its - * head ds. Otherwise, someone is trying to lookup something - * inside this dir while we want to destroy it. The - * config_rwlock ensures that nobody else opens it after we - * check. - */ - if (dmu_buf_refcount(dd->dd_dbuf) > 2) - return (EBUSY); - - err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count); - if (err) - return (err); - if (count != 0) - return (EEXIST); - - return (0); -} - -void -dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t val, obj; - - ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); - ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); - - /* Remove our reservation. 
*/ - val = 0; - dsl_dir_set_reservation_sync(dd, &val, tx); - ASSERT3U(dd->dd_used_bytes, ==, 0); - ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); - - VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); - VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); - VERIFY(0 == zap_remove(mos, - dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); - - obj = dd->dd_object; - dsl_dir_close(dd, tag); - VERIFY(0 == dmu_object_free(mos, obj, tx)); -} - -void -dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx) -{ - dsl_dir_phys_t *dsp; - dmu_buf_t *dbuf; - int error; - - *ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, - DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); - - error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, - sizeof (uint64_t), 1, ddobjp, tx); - ASSERT3U(error, ==, 0); - - VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - dsp = dbuf->db_data; - - dsp->dd_creation_time = gethrestime_sec(); - dsp->dd_props_zapobj = zap_create(mos, - DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); - dsp->dd_child_dir_zapobj = zap_create(mos, - DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); - - dmu_buf_rele(dbuf, FTAG); -} - -void -dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) -{ - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, - dsl_dir_space_available(dd, NULL, 0, TRUE)); - - mutex_enter(&dd->dd_lock); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_used_bytes); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, - dd->dd_phys->dd_quota); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, - dd->dd_phys->dd_reserved); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, - dd->dd_phys->dd_compressed_bytes == 0 ? 100 : - (dd->dd_phys->dd_uncompressed_bytes * 100 / - dd->dd_phys->dd_compressed_bytes)); - mutex_exit(&dd->dd_lock); - - if (dd->dd_phys->dd_clone_parent_obj) { - dsl_dataset_t *ds; - char buf[MAXNAMELEN]; - - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_NONE, FTAG, &ds)); - dsl_dataset_name(ds, buf); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); - rw_exit(&dd->dd_pool->dp_config_rwlock); - - dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); - } -} - -void -dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dd->dd_pool; - - ASSERT(dd->dd_phys); - - if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) { - /* up the hold count until we can be written out */ - dmu_buf_add_ref(dd->dd_dbuf, dd); - } -} - -static int64_t -parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) -{ - uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved); - uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved); - return (new_accounted - old_accounted); -} - -void -dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - - mutex_enter(&dd->dd_lock); - ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0); - dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, - dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); - dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; - dd->dd_phys->dd_used_bytes = dd->dd_used_bytes; - mutex_exit(&dd->dd_lock); - - /* release the hold from dsl_dir_dirty */ - dmu_buf_rele(dd->dd_dbuf, dd); -} - -static uint64_t -dsl_dir_estimated_space(dsl_dir_t *dd) -{ - int64_t space; - int i; - - ASSERT(MUTEX_HELD(&dd->dd_lock)); - - space = dd->dd_phys->dd_used_bytes; - 
ASSERT(space >= 0); - for (i = 0; i < TXG_SIZE; i++) { - space += dd->dd_space_towrite[i&TXG_MASK]; - ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); - } - return (space); -} - -/* - * How much space would dd have available if ancestor had delta applied - * to it? If ondiskonly is set, we're only interested in what's - * on-disk, not estimated pending changes. - */ -uint64_t -dsl_dir_space_available(dsl_dir_t *dd, - dsl_dir_t *ancestor, int64_t delta, int ondiskonly) -{ - uint64_t parentspace, myspace, quota, used; - - /* - * If there are no restrictions otherwise, assume we have - * unlimited space available. - */ - quota = UINT64_MAX; - parentspace = UINT64_MAX; - - if (dd->dd_parent != NULL) { - parentspace = dsl_dir_space_available(dd->dd_parent, - ancestor, delta, ondiskonly); - } - - mutex_enter(&dd->dd_lock); - if (dd->dd_phys->dd_quota != 0) - quota = dd->dd_phys->dd_quota; - if (ondiskonly) { - used = dd->dd_used_bytes; - } else { - used = dsl_dir_estimated_space(dd); - } - if (dd == ancestor) - used += delta; - - if (dd->dd_parent == NULL) { - uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); - quota = MIN(quota, poolsize); - } - - if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) { - /* - * We have some space reserved, in addition to what our - * parent gave us. - */ - parentspace += dd->dd_phys->dd_reserved - used; - } - - if (used > quota) { - /* over quota */ - myspace = 0; - - /* - * While it's OK to be a little over quota, if - * we think we are using more space than there - * is in the pool (which is already 1.6% more than - * dsl_pool_adjustedsize()), something is very - * wrong. - */ - ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa)); - } else { - /* - * the lesser of the space provided by our parent and - * the space left in our quota - */ - myspace = MIN(parentspace, quota - used); - } - - mutex_exit(&dd->dd_lock); - - return (myspace); -} - -struct tempreserve { - list_node_t tr_node; - dsl_dir_t *tr_ds; - uint64_t tr_size; -}; - -/* - * Reserve space in this dsl_dir, to be used in this tx's txg. - * After the space has been dirtied (and thus - * dsl_dir_willuse_space() has been called), the reservation should - * be canceled, using dsl_dir_tempreserve_clear(). - */ -static int -dsl_dir_tempreserve_impl(dsl_dir_t *dd, - uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx) -{ - uint64_t txg = tx->tx_txg; - uint64_t est_used, quota, parent_rsrv; - int edquot = EDQUOT; - int txgidx = txg & TXG_MASK; - int i; - struct tempreserve *tr; - - ASSERT3U(txg, !=, 0); - ASSERT3S(asize, >=, 0); - - mutex_enter(&dd->dd_lock); - /* - * Check against the dsl_dir's quota. We don't add in the delta - * when checking for over-quota because they get one free hit. - */ - est_used = dsl_dir_estimated_space(dd); - for (i = 0; i < TXG_SIZE; i++) - est_used += dd->dd_tempreserved[i]; - - quota = UINT64_MAX; - - if (dd->dd_phys->dd_quota) - quota = dd->dd_phys->dd_quota; - - /* - * If this transaction will result in a net free of space, we want - * to let it through, but we have to be careful: the space that it - * frees won't become available until *after* this txg syncs. - * Therefore, to ensure that it's possible to remove files from - * a full pool without inducing transient overcommits, we throttle - * netfree transactions against a quota that is slightly larger, - * but still within the pool's allocation slop. In cases where - * we're very close to full, this will allow a steady trickle of - * removes to get through. 
- */ - if (dd->dd_parent == NULL) { - uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); - if (poolsize < quota) { - quota = poolsize; - edquot = ENOSPC; - } - } else if (netfree) { - quota = UINT64_MAX; - } - - /* - * If they are requesting more space, and our current estimate - * is over quota. They get to try again unless the actual - * on-disk is over quota and there are no pending changes (which - * may free up space for us). - */ - if (asize > 0 && est_used > quota) { - if (dd->dd_space_towrite[txg & TXG_MASK] != 0 || - dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 || - dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 || - dd->dd_used_bytes < quota) - edquot = ERESTART; - dprintf_dd(dd, "failing: used=%lluK est_used = %lluK " - "quota=%lluK tr=%lluK err=%d\n", - dd->dd_used_bytes>>10, est_used>>10, - quota>>10, asize>>10, edquot); - mutex_exit(&dd->dd_lock); - return (edquot); - } - - /* We need to up our estimated delta before dropping dd_lock */ - dd->dd_tempreserved[txgidx] += asize; - - parent_rsrv = parent_delta(dd, est_used, asize); - mutex_exit(&dd->dd_lock); - - tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); - tr->tr_ds = dd; - tr->tr_size = asize; - list_insert_tail(tr_list, tr); - - /* see if it's OK with our parent */ - if (dd->dd_parent && parent_rsrv) { - return (dsl_dir_tempreserve_impl(dd->dd_parent, - parent_rsrv, netfree, tr_list, tx)); - } else { - return (0); - } -} - -/* - * Reserve space in this dsl_dir, to be used in this tx's txg. - * After the space has been dirtied (and thus - * dsl_dir_willuse_space() has been called), the reservation should - * be canceled, using dsl_dir_tempreserve_clear(). - */ -int -dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, - uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx) -{ - int err = 0; - list_t *tr_list; - - tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); - list_create(tr_list, sizeof (struct tempreserve), - offsetof(struct tempreserve, tr_node)); - ASSERT3S(asize, >=, 0); - ASSERT3S(fsize, >=, 0); - - err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, - tr_list, tx); - - if (err == 0) { - struct tempreserve *tr; - - err = arc_tempreserve_space(lsize); - if (err == 0) { - tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); - tr->tr_ds = NULL; - tr->tr_size = lsize; - list_insert_tail(tr_list, tr); - } - } - - if (err) - dsl_dir_tempreserve_clear(tr_list, tx); - else - *tr_cookiep = tr_list; - return (err); -} - -/* - * Clear a temporary reservation that we previously made with - * dsl_dir_tempreserve_space(). - */ -void -dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) -{ - int txgidx = tx->tx_txg & TXG_MASK; - list_t *tr_list = tr_cookie; - struct tempreserve *tr; - - ASSERT3U(tx->tx_txg, !=, 0); - - while (tr = list_head(tr_list)) { - if (tr->tr_ds == NULL) { - arc_tempreserve_clear(tr->tr_size); - } else { - mutex_enter(&tr->tr_ds->dd_lock); - ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, - tr->tr_size); - tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; - mutex_exit(&tr->tr_ds->dd_lock); - } - list_remove(tr_list, tr); - kmem_free(tr, sizeof (struct tempreserve)); - } - - kmem_free(tr_list, sizeof (list_t)); -} - -/* - * Call in open context when we think we're going to write/free space, - * eg. when dirtying data. Be conservative (ie. OK to write less than - * this or free more than this, but don't write more or free less). 
- */ -void -dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) -{ - int64_t parent_space; - uint64_t est_used; - - mutex_enter(&dd->dd_lock); - if (space > 0) - dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; - - est_used = dsl_dir_estimated_space(dd); - parent_space = parent_delta(dd, est_used, space); - mutex_exit(&dd->dd_lock); - - /* Make sure that we clean up dd_space_to* */ - dsl_dir_dirty(dd, tx); - - /* XXX this is potentially expensive and unnecessary... */ - if (parent_space && dd->dd_parent) - dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); -} - -/* call from syncing context when we actually write/free space for this dd */ -void -dsl_dir_diduse_space(dsl_dir_t *dd, - int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) -{ - int64_t accounted_delta; - - ASSERT(dmu_tx_is_syncing(tx)); - - dsl_dir_dirty(dd, tx); - - mutex_enter(&dd->dd_lock); - accounted_delta = parent_delta(dd, dd->dd_used_bytes, used); - ASSERT(used >= 0 || dd->dd_used_bytes >= -used); - ASSERT(compressed >= 0 || - dd->dd_phys->dd_compressed_bytes >= -compressed); - ASSERT(uncompressed >= 0 || - dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); - dd->dd_used_bytes += used; - dd->dd_phys->dd_uncompressed_bytes += uncompressed; - dd->dd_phys->dd_compressed_bytes += compressed; - mutex_exit(&dd->dd_lock); - - if (dd->dd_parent != NULL) { - dsl_dir_diduse_space(dd->dd_parent, - accounted_delta, compressed, uncompressed, tx); - } -} - -/* ARGSUSED */ -static int -dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - uint64_t *quotap = arg2; - uint64_t new_quota = *quotap; - int err = 0; - uint64_t towrite; - - if (new_quota == 0) - return (0); - - mutex_enter(&dd->dd_lock); - /* - * If we are doing the preliminary check in open context, and - * there are pending changes, then don't fail it, since the - * pending changes could under-estimat the amount of space to be - * freed up. - */ - towrite = dd->dd_space_towrite[0] + dd->dd_space_towrite[1] + - dd->dd_space_towrite[2] + dd->dd_space_towrite[3]; - if ((dmu_tx_is_syncing(tx) || towrite == 0) && - (new_quota < dd->dd_phys->dd_reserved || - new_quota < dsl_dir_estimated_space(dd))) { - err = ENOSPC; - } - mutex_exit(&dd->dd_lock); - return (err); -} - -static void -dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - uint64_t *quotap = arg2; - uint64_t new_quota = *quotap; - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - - mutex_enter(&dd->dd_lock); - dd->dd_phys->dd_quota = new_quota; - mutex_exit(&dd->dd_lock); -} - -int -dsl_dir_set_quota(const char *ddname, uint64_t quota) -{ - dsl_dir_t *dd; - int err; - - err = dsl_dir_open(ddname, FTAG, &dd, NULL); - if (err) - return (err); - /* - * If someone removes a file, then tries to set the quota, we - * want to make sure the file freeing takes effect. - */ - txg_wait_open(dd->dd_pool, 0); - - err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, - dsl_dir_set_quota_sync, dd, "a, 0); - dsl_dir_close(dd, FTAG); - return (err); -} - -/* ARGSUSED */ -static int -dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - uint64_t *reservationp = arg2; - uint64_t new_reservation = *reservationp; - uint64_t used, avail; - int64_t delta; - - if (new_reservation > INT64_MAX) - return (EOVERFLOW); - - /* - * If we are doing the preliminary check in open context, the - * space estimates may be inaccurate. 
- */ - if (!dmu_tx_is_syncing(tx)) - return (0); - - mutex_enter(&dd->dd_lock); - used = dd->dd_used_bytes; - delta = MAX(used, new_reservation) - - MAX(used, dd->dd_phys->dd_reserved); - mutex_exit(&dd->dd_lock); - - if (dd->dd_parent) { - avail = dsl_dir_space_available(dd->dd_parent, - NULL, 0, FALSE); - } else { - avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; - } - - if (delta > 0 && delta > avail) - return (ENOSPC); - if (delta > 0 && dd->dd_phys->dd_quota > 0 && - new_reservation > dd->dd_phys->dd_quota) - return (ENOSPC); - return (0); -} - -static void -dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - uint64_t *reservationp = arg2; - uint64_t new_reservation = *reservationp; - uint64_t used; - int64_t delta; - - mutex_enter(&dd->dd_lock); - used = dd->dd_used_bytes; - delta = MAX(used, new_reservation) - - MAX(used, dd->dd_phys->dd_reserved); - mutex_exit(&dd->dd_lock); - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_reserved = new_reservation; - - if (dd->dd_parent != NULL) { - /* Roll up this additional usage into our ancestors */ - dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx); - } -} - -int -dsl_dir_set_reservation(const char *ddname, uint64_t reservation) -{ - dsl_dir_t *dd; - int err; - - err = dsl_dir_open(ddname, FTAG, &dd, NULL); - if (err) - return (err); - err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check, - dsl_dir_set_reservation_sync, dd, &reservation, 0); - dsl_dir_close(dd, FTAG); - return (err); -} - -static dsl_dir_t * -closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) -{ - for (; ds1; ds1 = ds1->dd_parent) { - dsl_dir_t *dd; - for (dd = ds2; dd; dd = dd->dd_parent) { - if (ds1 == dd) - return (dd); - } - } - return (NULL); -} - -/* - * If delta is applied to dd, how much of that delta would be applied to - * ancestor? Syncing context only. - */ -static int64_t -would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) -{ - if (dd == ancestor) - return (delta); - - mutex_enter(&dd->dd_lock); - delta = parent_delta(dd, dd->dd_used_bytes, delta); - mutex_exit(&dd->dd_lock); - return (would_change(dd->dd_parent, delta, ancestor)); -} - -struct renamearg { - dsl_dir_t *newparent; - const char *mynewname; -}; - -/* ARGSUSED */ -static int -dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - struct renamearg *ra = arg2; - dsl_pool_t *dp = dd->dd_pool; - objset_t *mos = dp->dp_meta_objset; - int err; - uint64_t val; - - /* There should be 2 references: the open and the dirty */ - if (dmu_buf_refcount(dd->dd_dbuf) > 2) - return (EBUSY); - - /* check for existing name */ - err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, - ra->mynewname, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) - return (err); - - if (ra->newparent != dd->dd_parent) { - /* is there enough space? 
*/ - uint64_t myspace = - MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved); - - /* no rename into our descendant */ - if (closest_common_ancestor(dd, ra->newparent) == dd) - return (EINVAL); - - if (err = dsl_dir_transfer_possible(dd->dd_parent, - ra->newparent, myspace)) - return (err); - } - - return (0); -} - -static void -dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - struct renamearg *ra = arg2; - dsl_pool_t *dp = dd->dd_pool; - objset_t *mos = dp->dp_meta_objset; - int err; - - ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2); - - if (ra->newparent != dd->dd_parent) { - uint64_t myspace = - MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved); - - dsl_dir_diduse_space(dd->dd_parent, -myspace, - -dd->dd_phys->dd_compressed_bytes, - -dd->dd_phys->dd_uncompressed_bytes, tx); - dsl_dir_diduse_space(ra->newparent, myspace, - dd->dd_phys->dd_compressed_bytes, - dd->dd_phys->dd_uncompressed_bytes, tx); - } - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - - /* remove from old parent zapobj */ - err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, - dd->dd_myname, tx); - ASSERT3U(err, ==, 0); - - (void) strcpy(dd->dd_myname, ra->mynewname); - dsl_dir_close(dd->dd_parent, dd); - dd->dd_phys->dd_parent_obj = ra->newparent->dd_object; - VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, - ra->newparent->dd_object, NULL, dd, &dd->dd_parent)); - - /* add to new parent zapobj */ - err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, - dd->dd_myname, 8, 1, &dd->dd_object, tx); - ASSERT3U(err, ==, 0); -} - -int -dsl_dir_rename(dsl_dir_t *dd, const char *newname) -{ - struct renamearg ra; - int err; - - /* new parent should exist */ - err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname); - if (err) - return (err); - - /* can't rename to different pool */ - if (dd->dd_pool != ra.newparent->dd_pool) { - err = ENXIO; - goto out; - } - - /* new name should not already exist */ - if (ra.mynewname == NULL) { - err = EEXIST; - goto out; - } - - - err = dsl_sync_task_do(dd->dd_pool, - dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3); - -out: - dsl_dir_close(ra.newparent, FTAG); - return (err); -} - -int -dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) -{ - dsl_dir_t *ancestor; - int64_t adelta; - uint64_t avail; - - ancestor = closest_common_ancestor(sdd, tdd); - adelta = would_change(sdd, -space, ancestor); - avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); - if (avail < space) - return (ENOSPC); - - return (0); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c deleted file mode 100644 index 00abf7e..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c +++ /dev/null @@ -1,256 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dsl_pool.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_synctask.h> -#include <sys/dmu_tx.h> -#include <sys/dmu_objset.h> -#include <sys/arc.h> -#include <sys/zap.h> -#include <sys/zio.h> -#include <sys/zfs_context.h> -#include <sys/fs/zfs.h> - -static int -dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp) -{ - uint64_t obj; - int err; - - err = zap_lookup(dp->dp_meta_objset, - dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, - MOS_DIR_NAME, sizeof (obj), 1, &obj); - if (err) - return (err); - - return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp)); -} - -static dsl_pool_t * -dsl_pool_open_impl(spa_t *spa, uint64_t txg) -{ - dsl_pool_t *dp; - blkptr_t *bp = spa_get_rootblkptr(spa); - - dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); - dp->dp_spa = spa; - dp->dp_meta_rootbp = *bp; - rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); - txg_init(dp, txg); - - txg_list_create(&dp->dp_dirty_datasets, - offsetof(dsl_dataset_t, ds_dirty_link)); - txg_list_create(&dp->dp_dirty_dirs, - offsetof(dsl_dir_t, dd_dirty_link)); - txg_list_create(&dp->dp_sync_tasks, - offsetof(dsl_sync_task_group_t, dstg_node)); - list_create(&dp->dp_synced_objsets, sizeof (dsl_dataset_t), - offsetof(dsl_dataset_t, ds_synced_link)); - - return (dp); -} - -int -dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) -{ - int err; - dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); - objset_impl_t *osi; - - rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi); - if (err) - goto out; - dp->dp_meta_objset = &osi->os; - - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, - &dp->dp_root_dir_obj); - if (err) - goto out; - - err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, - NULL, dp, &dp->dp_root_dir); - if (err) - goto out; - - err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir); - if (err) - goto out; - -out: - rw_exit(&dp->dp_config_rwlock); - if (err) - dsl_pool_close(dp); - else - *dpp = dp; - - return (err); -} - -void -dsl_pool_close(dsl_pool_t *dp) -{ - /* drop our reference from dsl_pool_open() */ - if (dp->dp_mos_dir) - dsl_dir_close(dp->dp_mos_dir, dp); - if (dp->dp_root_dir) - dsl_dir_close(dp->dp_root_dir, dp); - - /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ - if (dp->dp_meta_objset) - dmu_objset_evict(NULL, dp->dp_meta_objset->os); - - txg_list_destroy(&dp->dp_dirty_datasets); - txg_list_destroy(&dp->dp_dirty_dirs); - txg_list_destroy(&dp->dp_sync_tasks); - list_destroy(&dp->dp_synced_objsets); - - arc_flush(); - txg_fini(dp); - rw_destroy(&dp->dp_config_rwlock); - kmem_free(dp, sizeof (dsl_pool_t)); -} - -dsl_pool_t * -dsl_pool_create(spa_t *spa, uint64_t txg) -{ - int err; - dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); - dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); - dp->dp_meta_objset = &dmu_objset_create_impl(spa, - NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os; - - /* create the pool directory */ - err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); - ASSERT3U(err, ==, 
0); - - /* create and open the root dir */ - dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx); - VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, - NULL, dp, &dp->dp_root_dir)); - - /* create and open the meta-objset dir */ - (void) dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx); - VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir)); - - dmu_tx_commit(tx); - - return (dp); -} - -void -dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) -{ - zio_t *zio; - dmu_tx_t *tx; - dsl_dir_t *dd; - dsl_dataset_t *ds; - dsl_sync_task_group_t *dstg; - objset_impl_t *mosi = dp->dp_meta_objset->os; - int err; - - tx = dmu_tx_create_assigned(dp, txg); - - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { - if (!list_link_active(&ds->ds_synced_link)) - list_insert_tail(&dp->dp_synced_objsets, ds); - else - dmu_buf_rele(ds->ds_dbuf, ds); - dsl_dataset_sync(ds, zio, tx); - } - err = zio_wait(zio); - ASSERT(err == 0); - - while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) - dsl_sync_task_group_sync(dstg, tx); - while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) - dsl_dir_sync(dd, tx); - - if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL || - list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) { - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - dmu_objset_sync(mosi, zio, tx); - err = zio_wait(zio); - ASSERT(err == 0); - dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); - spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); - } - - dmu_tx_commit(tx); -} - -void -dsl_pool_zil_clean(dsl_pool_t *dp) -{ - dsl_dataset_t *ds; - - while (ds = list_head(&dp->dp_synced_objsets)) { - list_remove(&dp->dp_synced_objsets, ds); - ASSERT(ds->ds_user_ptr != NULL); - zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil); - dmu_buf_rele(ds->ds_dbuf, ds); - } -} - -/* - * TRUE if the current thread is the tx_sync_thread or if we - * are being called from SPA context during pool initialization. - */ -int -dsl_pool_sync_context(dsl_pool_t *dp) -{ - return (curthread == dp->dp_tx.tx_sync_thread || - spa_get_dsl(dp->dp_spa) == NULL); -} - -uint64_t -dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) -{ - uint64_t space, resv; - - /* - * Reserve about 1.6% (1/64), or at least 32MB, for allocation - * efficiency. - * XXX The intent log is not accounted for, so it must fit - * within this slop. - * - * If we're trying to assess whether it's OK to do a free, - * cut the reservation in half to allow forward progress - * (e.g. make it possible to rm(1) files from a full pool). - */ - space = spa_get_dspace(dp->dp_spa); - resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1); - if (netfree) - resv >>= 1; - - return (space - resv); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c deleted file mode 100644 index 2fff66d..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c +++ /dev/null @@ -1,501 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu.h> -#include <sys/dmu_objset.h> -#include <sys/dmu_tx.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_prop.h> -#include <sys/dsl_synctask.h> -#include <sys/spa.h> -#include <sys/zio_checksum.h> /* for the default checksum value */ -#include <sys/zap.h> -#include <sys/fs/zfs.h> - -#include "zfs_prop.h" - -static int -dodefault(const char *propname, int intsz, int numint, void *buf) -{ - zfs_prop_t prop; - - if ((prop = zfs_name_to_prop(propname)) == ZFS_PROP_INVAL || - zfs_prop_readonly(prop)) - return (ENOENT); - - if (zfs_prop_get_type(prop) == prop_type_string) { - if (intsz != 1) - return (EOVERFLOW); - (void) strncpy(buf, zfs_prop_default_string(prop), numint); - } else { - if (intsz != 8 || numint < 1) - return (EOVERFLOW); - - *(uint64_t *)buf = zfs_prop_default_numeric(prop); - } - - return (0); -} - -static int -dsl_prop_get_impl(dsl_dir_t *dd, const char *propname, - int intsz, int numint, void *buf, char *setpoint) -{ - int err = ENOENT; - zfs_prop_t prop; - - if (setpoint) - setpoint[0] = '\0'; - - prop = zfs_name_to_prop(propname); - - /* - * Note: dd may be NULL, therefore we shouldn't dereference it - * ouside this loop. - */ - for (; dd != NULL; dd = dd->dd_parent) { - objset_t *mos = dd->dd_pool->dp_meta_objset; - ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); - err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, - propname, intsz, numint, buf); - if (err != ENOENT) { - if (setpoint) - dsl_dir_name(dd, setpoint); - break; - } - - /* - * Break out of this loop for non-inheritable properties. - */ - if (prop != ZFS_PROP_INVAL && - !zfs_prop_inheritable(prop)) - break; - } - if (err == ENOENT) - err = dodefault(propname, intsz, numint, buf); - - return (err); -} - -/* - * Register interest in the named property. We'll call the callback - * once to notify it of the current property value, and again each time - * the property changes, until this callback is unregistered. - * - * Return 0 on success, errno if the prop is not an integer value. 
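dsl_prop_get_impl() above resolves a property by walking from the dataset's directory up through dd_parent, taking the first value found in a directory's props ZAP, stopping early for non-inheritable properties, and falling back to the property's compiled-in default. A small sketch of that resolution order over a hypothetical in-memory hierarchy; the struct and names are stand-ins, not the ZFS data structures, and the non-inheritable short-circuit simply ends this walk after the first directory.

#include <stdio.h>

struct dir {
	const char *name;
	struct dir *parent;
	const char *propval;	/* locally set value, or NULL if inherited */
};

/* Walk up the hierarchy; the first explicit setting wins, else the default. */
static const char *
resolve(const struct dir *d, const char *def, const char **setpoint)
{
	for (; d != NULL; d = d->parent) {
		if (d->propval != NULL) {
			*setpoint = d->name;
			return (d->propval);
		}
	}
	*setpoint = "default";
	return (def);
}

int
main(void)
{
	struct dir pool = { "tank", NULL, "on" };
	struct dir fs = { "tank/home", &pool, NULL };
	const char *src;
	const char *val = resolve(&fs, "off", &src);

	printf("compression=%s (from %s)\n", val, src);
	return (0);
}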
- */ -int -dsl_prop_register(dsl_dataset_t *ds, const char *propname, - dsl_prop_changed_cb_t *callback, void *cbarg) -{ - dsl_dir_t *dd = ds->ds_dir; - uint64_t value; - dsl_prop_cb_record_t *cbr; - int err; - int need_rwlock; - - need_rwlock = !RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock); - if (need_rwlock) - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - - err = dsl_prop_get_impl(dd, propname, 8, 1, &value, NULL); - if (err != 0) { - rw_exit(&dd->dd_pool->dp_config_rwlock); - return (err); - } - - cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP); - cbr->cbr_ds = ds; - cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP); - (void) strcpy((char *)cbr->cbr_propname, propname); - cbr->cbr_func = callback; - cbr->cbr_arg = cbarg; - mutex_enter(&dd->dd_lock); - list_insert_head(&dd->dd_prop_cbs, cbr); - mutex_exit(&dd->dd_lock); - - cbr->cbr_func(cbr->cbr_arg, value); - - VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object, - NULL, cbr, &dd)); - if (need_rwlock) - rw_exit(&dd->dd_pool->dp_config_rwlock); - /* Leave dataset open until this callback is unregistered */ - return (0); -} - -int -dsl_prop_get_ds(dsl_dir_t *dd, const char *propname, - int intsz, int numints, void *buf, char *setpoint) -{ - int err; - - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - err = dsl_prop_get_impl(dd, propname, intsz, numints, buf, setpoint); - rw_exit(&dd->dd_pool->dp_config_rwlock); - - return (err); -} - -int -dsl_prop_get(const char *ddname, const char *propname, - int intsz, int numints, void *buf, char *setpoint) -{ - dsl_dir_t *dd; - const char *tail; - int err; - - err = dsl_dir_open(ddname, FTAG, &dd, &tail); - if (err) - return (err); - if (tail && tail[0] != '@') { - dsl_dir_close(dd, FTAG); - return (ENOENT); - } - - err = dsl_prop_get_ds(dd, propname, intsz, numints, buf, setpoint); - - dsl_dir_close(dd, FTAG); - return (err); -} - -/* - * Get the current property value. It may have changed by the time this - * function returns, so it is NOT safe to follow up with - * dsl_prop_register() and assume that the value has not changed in - * between. - * - * Return 0 on success, ENOENT if ddname is invalid. - */ -int -dsl_prop_get_integer(const char *ddname, const char *propname, - uint64_t *valuep, char *setpoint) -{ - return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); -} - -/* - * Unregister this callback. Return 0 on success, ENOENT if ddname is - * invalid, ENOMSG if no matching callback registered. - */ -int -dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, - dsl_prop_changed_cb_t *callback, void *cbarg) -{ - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_cb_record_t *cbr; - - mutex_enter(&dd->dd_lock); - for (cbr = list_head(&dd->dd_prop_cbs); - cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { - if (cbr->cbr_ds == ds && - cbr->cbr_func == callback && - cbr->cbr_arg == cbarg && - strcmp(cbr->cbr_propname, propname) == 0) - break; - } - - if (cbr == NULL) { - mutex_exit(&dd->dd_lock); - return (ENOMSG); - } - - list_remove(&dd->dd_prop_cbs, cbr); - mutex_exit(&dd->dd_lock); - kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1); - kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); - - /* Clean up from dsl_prop_register */ - dsl_dir_close(dd, cbr); - return (0); -} - -/* - * Return the number of callbacks that are registered for this dataset. 
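dsl_prop_register() delivers the current value to the callback once, links a dsl_prop_cb_record_t onto the directory's dd_prop_cbs list, and keeps the directory held until dsl_prop_unregister() removes the record. A bare-bones sketch of that register-then-notify life cycle with plain function pointers; the names are hypothetical, not the kernel interfaces.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef void (prop_cb_t)(void *arg, uint64_t newval);

struct cb_record {
	prop_cb_t *func;
	void *arg;
	struct cb_record *next;
};

static struct cb_record *callbacks;	/* one list per directory in the real code */

static void
prop_register(prop_cb_t *func, void *arg, uint64_t current)
{
	struct cb_record *r = malloc(sizeof (*r));

	if (r == NULL)
		return;
	r->func = func;
	r->arg = arg;
	r->next = callbacks;
	callbacks = r;
	func(arg, current);		/* one immediate notification with the current value */
}

static void
prop_changed(uint64_t newval)
{
	for (struct cb_record *r = callbacks; r != NULL; r = r->next)
		r->func(r->arg, newval);
}

static void
print_cb(void *arg, uint64_t v)
{
	printf("%s -> %llu\n", (const char *)arg, (unsigned long long)v);
}

int
main(void)
{
	prop_register(print_cb, "recordsize", 131072);
	prop_changed(8192);		/* a later property change fans out like this */
	return (0);
}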
- */ -int -dsl_prop_numcb(dsl_dataset_t *ds) -{ - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_cb_record_t *cbr; - int num = 0; - - mutex_enter(&dd->dd_lock); - for (cbr = list_head(&dd->dd_prop_cbs); - cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { - if (cbr->cbr_ds == ds) - num++; - } - mutex_exit(&dd->dd_lock); - - return (num); -} - -static void -dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, - const char *propname, uint64_t value, int first) -{ - dsl_dir_t *dd; - dsl_prop_cb_record_t *cbr; - objset_t *mos = dp->dp_meta_objset; - zap_cursor_t zc; - zap_attribute_t za; - int err; - - ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); - if (err) - return; - - if (!first) { - /* - * If the prop is set here, then this change is not - * being inherited here or below; stop the recursion. - */ - err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, - 8, 1, &value); - if (err == 0) { - dsl_dir_close(dd, FTAG); - return; - } - ASSERT3U(err, ==, ENOENT); - } - - mutex_enter(&dd->dd_lock); - for (cbr = list_head(&dd->dd_prop_cbs); - cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { - if (strcmp(cbr->cbr_propname, propname) == 0) { - cbr->cbr_func(cbr->cbr_arg, value); - } - } - mutex_exit(&dd->dd_lock); - - for (zap_cursor_init(&zc, mos, - dd->dd_phys->dd_child_dir_zapobj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - /* XXX recursion could blow stack; esp. za! */ - dsl_prop_changed_notify(dp, za.za_first_integer, - propname, value, FALSE); - } - zap_cursor_fini(&zc); - dsl_dir_close(dd, FTAG); -} - -struct prop_set_arg { - const char *name; - int intsz; - int numints; - const void *buf; -}; - - -static void -dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - struct prop_set_arg *psa = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t zapobj = dd->dd_phys->dd_props_zapobj; - uint64_t intval; - int isint; - - isint = (dodefault(psa->name, 8, 1, &intval) == 0); - - if (psa->numints == 0) { - int err = zap_remove(mos, zapobj, psa->name, tx); - ASSERT(err == 0 || err == ENOENT); - if (isint) { - VERIFY(0 == dsl_prop_get_impl(dd->dd_parent, - psa->name, 8, 1, &intval, NULL)); - } - } else { - VERIFY(0 == zap_update(mos, zapobj, psa->name, - psa->intsz, psa->numints, psa->buf, tx)); - if (isint) - intval = *(uint64_t *)psa->buf; - } - - if (isint) { - dsl_prop_changed_notify(dd->dd_pool, - dd->dd_object, psa->name, intval, TRUE); - } -} - -int -dsl_prop_set_dd(dsl_dir_t *dd, const char *propname, - int intsz, int numints, const void *buf) -{ - struct prop_set_arg psa; - - psa.name = propname; - psa.intsz = intsz; - psa.numints = numints; - psa.buf = buf; - - return (dsl_sync_task_do(dd->dd_pool, - NULL, dsl_prop_set_sync, dd, &psa, 2)); -} - -int -dsl_prop_set(const char *ddname, const char *propname, - int intsz, int numints, const void *buf) -{ - dsl_dir_t *dd; - int err; - - /* - * We must do these checks before we get to the syncfunc, since - * it can't fail. - */ - if (strlen(propname) >= ZAP_MAXNAMELEN) - return (ENAMETOOLONG); - if (intsz * numints >= ZAP_MAXVALUELEN) - return (E2BIG); - - err = dsl_dir_open(ddname, FTAG, &dd, NULL); - if (err) - return (err); - err = dsl_prop_set_dd(dd, propname, intsz, numints, buf); - dsl_dir_close(dd, FTAG); - return (err); -} - -/* - * Iterate over all properties for this dataset and return them in an nvlist. 
- */ -int -dsl_prop_get_all(objset_t *os, nvlist_t **nvp) -{ - dsl_dataset_t *ds = os->os->os_dsl_dataset; - dsl_dir_t *dd = ds->ds_dir; - int err = 0; - dsl_pool_t *dp; - objset_t *mos; - - if (dsl_dataset_is_snapshot(ds)) { - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - return (0); - } - - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - dp = dd->dd_pool; - mos = dp->dp_meta_objset; - - rw_enter(&dp->dp_config_rwlock, RW_READER); - for (; dd != NULL; dd = dd->dd_parent) { - char setpoint[MAXNAMELEN]; - zap_cursor_t zc; - zap_attribute_t za; - - dsl_dir_name(dd, setpoint); - - for (zap_cursor_init(&zc, mos, dd->dd_phys->dd_props_zapobj); - (err = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - nvlist_t *propval; - zfs_prop_t prop; - /* - * Skip non-inheritable properties. - */ - if ((prop = zfs_name_to_prop(za.za_name)) != - ZFS_PROP_INVAL && !zfs_prop_inheritable(prop) && - dd != ds->ds_dir) - continue; - - if (nvlist_lookup_nvlist(*nvp, za.za_name, - &propval) == 0) - continue; - - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - if (za.za_integer_length == 1) { - /* - * String property - */ - char *tmp = kmem_alloc(za.za_num_integers, - KM_SLEEP); - err = zap_lookup(mos, - dd->dd_phys->dd_props_zapobj, - za.za_name, 1, za.za_num_integers, - tmp); - if (err != 0) { - kmem_free(tmp, za.za_num_integers); - break; - } - VERIFY(nvlist_add_string(propval, - ZFS_PROP_VALUE, tmp) == 0); - kmem_free(tmp, za.za_num_integers); - } else { - /* - * Integer property - */ - ASSERT(za.za_integer_length == 8); - (void) nvlist_add_uint64(propval, - ZFS_PROP_VALUE, za.za_first_integer); - } - - VERIFY(nvlist_add_string(propval, - ZFS_PROP_SOURCE, setpoint) == 0); - VERIFY(nvlist_add_nvlist(*nvp, za.za_name, - propval) == 0); - nvlist_free(propval); - } - zap_cursor_fini(&zc); - - if (err != ENOENT) - break; - err = 0; - } - rw_exit(&dp->dp_config_rwlock); - - return (err); -} - -void -dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) -{ - nvlist_t *propval; - - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(propval, ZFS_PROP_VALUE, value) == 0); - VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); - nvlist_free(propval); -} - -void -dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) -{ - nvlist_t *propval; - - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(propval, ZFS_PROP_VALUE, value) == 0); - VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); - nvlist_free(propval); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c deleted file mode 100644 index 17deb56..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
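The dsl_prop_nvlist_add_uint64()/_add_string() helpers wrap each value in a nested nvlist keyed by ZFS_PROP_VALUE, and dsl_prop_get_all() adds a ZFS_PROP_SOURCE entry naming the directory the value was inherited from. A userland sketch of the same nvlist shape, assuming libnvpair is available (link with -lnvpair); the literal "value" and "source" strings stand in for the ZFS_PROP_VALUE and ZFS_PROP_SOURCE names.

/* cc prop_nv.c -lnvpair */
#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *nv, *propval;

	/* Outer list maps property name -> { value, source }, as dsl_prop_get_all builds it. */
	if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0 ||
	    nvlist_alloc(&propval, NV_UNIQUE_NAME, 0) != 0)
		return (1);

	(void) nvlist_add_uint64(propval, "value", 131072);
	(void) nvlist_add_string(propval, "source", "tank/home");
	(void) nvlist_add_nvlist(nv, "recordsize", propval);
	nvlist_free(propval);

	nvlist_print(stdout, nv);
	nvlist_free(nv);
	return (0);
}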
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu.h> -#include <sys/dmu_tx.h> -#include <sys/dsl_pool.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_synctask.h> - -#define DST_AVG_BLKSHIFT 14 - -/* ARGSUSED */ -static int -dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx) -{ - return (0); -} - -dsl_sync_task_group_t * -dsl_sync_task_group_create(dsl_pool_t *dp) -{ - dsl_sync_task_group_t *dstg; - - dstg = kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP); - list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), - offsetof(dsl_sync_task_t, dst_node)); - dstg->dstg_pool = dp; - - return (dstg); -} - -void -dsl_sync_task_create(dsl_sync_task_group_t *dstg, - dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, - void *arg1, void *arg2, int blocks_modified) -{ - dsl_sync_task_t *dst; - - if (checkfunc == NULL) - checkfunc = dsl_null_checkfunc; - dst = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP); - dst->dst_checkfunc = checkfunc; - dst->dst_syncfunc = syncfunc; - dst->dst_arg1 = arg1; - dst->dst_arg2 = arg2; - list_insert_tail(&dstg->dstg_tasks, dst); - - dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT; -} - -int -dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg) -{ - dmu_tx_t *tx; - uint64_t txg; - dsl_sync_task_t *dst; - -top: - tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir); - VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); - - txg = dmu_tx_get_txg(tx); - - /* Do a preliminary error check. */ - dstg->dstg_err = 0; - rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER); - for (dst = list_head(&dstg->dstg_tasks); dst; - dst = list_next(&dstg->dstg_tasks, dst)) { -#ifdef ZFS_DEBUG - /* - * Only check half the time, otherwise, the sync-context - * check will almost never fail. - */ - if (spa_get_random(2) == 0) - continue; -#endif - dst->dst_err = - dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); - if (dst->dst_err) - dstg->dstg_err = dst->dst_err; - } - rw_exit(&dstg->dstg_pool->dp_config_rwlock); - - if (dstg->dstg_err) { - dmu_tx_commit(tx); - return (dstg->dstg_err); - } - - VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg)); - - dmu_tx_commit(tx); - - txg_wait_synced(dstg->dstg_pool, txg); - - if (dstg->dstg_err == EAGAIN) - goto top; - - return (dstg->dstg_err); -} - -void -dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg) -{ - dsl_sync_task_t *dst; - - while (dst = list_head(&dstg->dstg_tasks)) { - list_remove(&dstg->dstg_tasks, dst); - kmem_free(dst, sizeof (dsl_sync_task_t)); - } - kmem_free(dstg, sizeof (dsl_sync_task_group_t)); -} - -void -dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) -{ - dsl_sync_task_t *dst; - void *tr_cookie; - - ASSERT3U(dstg->dstg_err, ==, 0); - - /* - * Check for sufficient space. - */ - dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir, - dstg->dstg_space, dstg->dstg_space * 3, 0, &tr_cookie, tx); - /* don't bother trying again */ - if (dstg->dstg_err == ERESTART) - dstg->dstg_err = EAGAIN; - if (dstg->dstg_err) - return; - - /* - * Check for errors by calling checkfuncs. 
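dsl_sync_task_group_wait() above assigns a transaction, runs each task's checkfunc as a cheap preliminary pass, queues the group on the pool's dp_sync_tasks list for that txg, waits for the txg to sync, and restarts the whole sequence if syncing context reported EAGAIN (which is how a failed temporary space reservation surfaces). A compact sketch of that retry shape with stand-in check and sync functions; nothing here is the kernel interface.

#include <errno.h>
#include <stdio.h>

/* Stand-ins for the checkfunc pass and the in-syncing-context execution. */
static int precheck(void) { return (0); }
static int sync_once(int attempt) { return (attempt == 0 ? EAGAIN : 0); }

static int
sync_task_wait(void)
{
	int attempt = 0;
	int err;

	for (;;) {
		if ((err = precheck()) != 0)
			return (err);		/* cheap early failure, no txg wait */
		err = sync_once(attempt++);	/* queued on a txg, then waited for */
		if (err != EAGAIN)
			return (err);
		/* ERESTART from the space reservation became EAGAIN: start over. */
	}
}

int
main(void)
{
	printf("sync task result: %d\n", sync_task_wait());
	return (0);
}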
- */ - rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER); - for (dst = list_head(&dstg->dstg_tasks); dst; - dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_err = - dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); - if (dst->dst_err) - dstg->dstg_err = dst->dst_err; - } - - if (dstg->dstg_err == 0) { - /* - * Execute sync tasks. - */ - for (dst = list_head(&dstg->dstg_tasks); dst; - dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx); - } - } - rw_exit(&dstg->dstg_pool->dp_config_rwlock); - - dsl_dir_tempreserve_clear(tr_cookie, tx); -} - -int -dsl_sync_task_do(dsl_pool_t *dp, - dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, - void *arg1, void *arg2, int blocks_modified) -{ - dsl_sync_task_group_t *dstg; - int err; - - dstg = dsl_sync_task_group_create(dp); - dsl_sync_task_create(dstg, checkfunc, syncfunc, - arg1, arg2, blocks_modified); - err = dsl_sync_task_group_wait(dstg); - dsl_sync_task_group_destroy(dstg); - return (err); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c b/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c deleted file mode 100644 index edda3c9..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <sys/sysmacros.h> -#include <sys/byteorder.h> -#include <sys/spa.h> - -void -fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) -{ - const uint64_t *ip = buf; - const uint64_t *ipend = ip + (size / sizeof (uint64_t)); - uint64_t a0, b0, a1, b1; - - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { - a0 += ip[0]; - a1 += ip[1]; - b0 += a0; - b1 += a1; - } - - ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); -} - -void -fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) -{ - const uint64_t *ip = buf; - const uint64_t *ipend = ip + (size / sizeof (uint64_t)); - uint64_t a0, b0, a1, b1; - - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { - a0 += BSWAP_64(ip[0]); - a1 += BSWAP_64(ip[1]); - b0 += a0; - b1 += a1; - } - - ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); -} - -void -fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) -{ - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - for (a = b = c = d = 0; ip < ipend; ip++) { - a += ip[0]; - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); -} - -void -fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) -{ - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - for (a = b = c = d = 0; ip < ipend; ip++) { - a += BSWAP_32(ip[0]); - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); -} - -void -fletcher_4_incremental_native(const void *buf, uint64_t size, - zio_cksum_t *zcp) -{ - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - a = zcp->zc_word[0]; - b = zcp->zc_word[1]; - c = zcp->zc_word[2]; - d = zcp->zc_word[3]; - - for (; ip < ipend; ip++) { - a += ip[0]; - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); -} - -void -fletcher_4_incremental_byteswap(const void *buf, uint64_t size, - zio_cksum_t *zcp) -{ - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - a = zcp->zc_word[0]; - b = zcp->zc_word[1]; - c = zcp->zc_word[2]; - d = zcp->zc_word[3]; - - for (; ip < ipend; ip++) { - a += BSWAP_32(ip[0]); - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c b/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c deleted file mode 100644 index b257d4a..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
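Because the Fletcher recurrences above only carry running sums, the incremental variants can resume from a previous zio_cksum_t and produce the same result as a single pass. A userland sketch that restates the 4-word recurrence and checks that streaming a buffer in two halves matches the one-shot checksum:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Userland restatement of the 4-word Fletcher recurrence from the removed fletcher.c. */
static void
fletcher4(const void *buf, uint64_t size, uint64_t sum[4])
{
	const uint32_t *ip = buf;
	const uint32_t *end = ip + (size / sizeof (uint32_t));

	/* sum[] carries (a, b, c, d); callers zero it for a fresh checksum. */
	for (; ip < end; ip++) {
		sum[0] += ip[0];
		sum[1] += sum[0];
		sum[2] += sum[1];
		sum[3] += sum[2];
	}
}

int
main(void)
{
	uint32_t data[256];
	uint64_t whole[4] = { 0 }, split[4] = { 0 };

	for (int i = 0; i < 256; i++)
		data[i] = (uint32_t)(i * 2654435761u);

	fletcher4(data, sizeof (data), whole);			/* one shot */
	fletcher4(data, sizeof (data) / 2, split);		/* first half ... */
	fletcher4(&data[128], sizeof (data) / 2, split);	/* ... then the rest, incrementally */

	printf("%s\n", memcmp(whole, split, sizeof (whole)) == 0 ?
	    "incremental matches one-shot" : "mismatch");
	return (0);
}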
- * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/debug.h> -#include <sys/types.h> -#include <sys/zmod.h> - -#ifdef _KERNEL -#include <sys/systm.h> -#else -#include <strings.h> -#endif - -size_t -gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - size_t dstlen = d_len; - - ASSERT(d_len <= s_len); - - if (z_compress_level(d_start, &dstlen, s_start, s_len, n) != Z_OK) { - if (d_len != s_len) - return (s_len); - - bcopy(s_start, d_start, s_len); - return (s_len); - } - - return (dstlen); -} - -/*ARGSUSED*/ -int -gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - size_t dstlen = d_len; - - ASSERT(d_len >= s_len); - - if (z_uncompress(d_start, &dstlen, s_start, s_len) != Z_OK) - return (-1); - - return (0); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c b/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c deleted file mode 100644 index a88b85c..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c +++ /dev/null @@ -1,129 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * We keep our own copy of this algorithm for 2 main reasons: - * 1. If we didn't, anyone modifying common/os/compress.c would - * directly break our on disk format - * 2. Our version of lzjb does not have a number of checks that the - * common/os version needs and uses - * In particular, we are adding the "feature" that compress() can - * take a destination buffer size and return -1 if the data will not - * compress to d_len or less. 
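gzip_compress() above treats any failure from the compression call (typically when the output would exceed d_len) as "not compressible" and the block ends up stored as-is. A userland sketch of the same contract using stock zlib, where compress2() stands in for the kernel's z_compress_level() (link with -lz); the fallback copy is left to the caller here, which is an assumption of this sketch rather than the removed code's exact behavior.

/* cc gz.c -lz */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

/*
 * Same contract as the removed gzip_compress(): return the compressed length,
 * or the original length if the data does not compress to d_len or less (the
 * caller then stores the block uncompressed).
 */
static size_t
gz_compress(const void *src, void *dst, size_t s_len, size_t d_len, int level)
{
	uLongf dstlen = d_len;

	if (compress2(dst, &dstlen, src, s_len, level) != Z_OK)
		return (s_len);
	return (dstlen);
}

int
main(void)
{
	char src[4096], dst[4096];

	memset(src, 'a', sizeof (src));		/* trivially compressible input */
	size_t n = gz_compress(src, dst, sizeof (src), sizeof (dst), 6);
	printf("4096 -> %zu bytes%s\n", n,
	    n == sizeof (src) ? " (stored uncompressed)" : "");
	return (0);
}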
- */ - -#include <sys/zfs_context.h> -#include <sys/types.h> - -#define MATCH_BITS 6 -#define MATCH_MIN 3 -#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1)) -#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) -#define LEMPEL_SIZE 256 - -/*ARGSUSED*/ -size_t -lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - uchar_t *src = s_start; - uchar_t *dst = d_start; - uchar_t *cpy, *copymap; - int copymask = 1 << (NBBY - 1); - int mlen, offset; - uint16_t *hp; - uint16_t lempel[LEMPEL_SIZE]; /* uninitialized; see above */ - - while (src < (uchar_t *)s_start + s_len) { - if ((copymask <<= 1) == (1 << NBBY)) { - if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) { - if (d_len != s_len) - return (s_len); - mlen = s_len; - for (src = s_start, dst = d_start; mlen; mlen--) - *dst++ = *src++; - return (s_len); - } - copymask = 1; - copymap = dst; - *dst++ = 0; - } - if (src > (uchar_t *)s_start + s_len - MATCH_MAX) { - *dst++ = *src++; - continue; - } - hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) & - (LEMPEL_SIZE - 1)]; - offset = (intptr_t)(src - *hp) & OFFSET_MASK; - *hp = (uint16_t)(uintptr_t)src; - cpy = src - offset; - if (cpy >= (uchar_t *)s_start && cpy != src && - src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) { - *copymap |= copymask; - for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++) - if (src[mlen] != cpy[mlen]) - break; - *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | - (offset >> NBBY); - *dst++ = (uchar_t)offset; - src += mlen; - } else { - *dst++ = *src++; - } - } - return (dst - (uchar_t *)d_start); -} - -/*ARGSUSED*/ -int -lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - uchar_t *src = s_start; - uchar_t *dst = d_start; - uchar_t *d_end = (uchar_t *)d_start + d_len; - uchar_t *cpy, copymap; - int copymask = 1 << (NBBY - 1); - - while (dst < d_end) { - if ((copymask <<= 1) == (1 << NBBY)) { - copymask = 1; - copymap = *src++; - } - if (copymap & copymask) { - int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN; - int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK; - src += 2; - if ((cpy = dst - offset) < (uchar_t *)d_start) - return (-1); - while (--mlen >= 0 && dst < d_end) - *dst++ = *cpy++; - } else { - *dst++ = *src++; - } - } - return (0); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c deleted file mode 100644 index 0dba134..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ /dev/null @@ -1,1023 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
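Each LZJB match in the stream above is a two-byte token: the upper MATCH_BITS (6) bits of the first byte carry the match length minus MATCH_MIN, and the remaining 10 bits carry the backward offset. A round-trip sketch of just that token encoding, using the same constants as the removed lzjb.c:

#include <stdint.h>
#include <stdio.h>

#define NBBY		8
#define MATCH_BITS	6
#define MATCH_MIN	3
#define OFFSET_MASK	((1 << (16 - MATCH_BITS)) - 1)

int
main(void)
{
	int mlen = 17, offset = 300;	/* any mlen in [3,66], offset in [1,1023] */
	uint8_t b0, b1;
	int dmlen, doffset;

	/* Encode as in lzjb_compress(): 6 bits of length, 10 bits of offset. */
	b0 = (uint8_t)(((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | (offset >> NBBY));
	b1 = (uint8_t)offset;

	/* Decode as in lzjb_decompress(). */
	dmlen = (b0 >> (NBBY - MATCH_BITS)) + MATCH_MIN;
	doffset = ((b0 << NBBY) | b1) & OFFSET_MASK;

	printf("round trip: mlen %d->%d, offset %d->%d\n", mlen, dmlen, offset, doffset);
	return (0);
}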
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa_impl.h> -#include <sys/dmu.h> -#include <sys/dmu_tx.h> -#include <sys/space_map.h> -#include <sys/metaslab_impl.h> -#include <sys/vdev_impl.h> -#include <sys/zio.h> - -uint64_t metaslab_aliquot = 512ULL << 10; - -/* - * ========================================================================== - * Metaslab classes - * ========================================================================== - */ -metaslab_class_t * -metaslab_class_create(void) -{ - metaslab_class_t *mc; - - mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); - - mc->mc_rotor = NULL; - - return (mc); -} - -void -metaslab_class_destroy(metaslab_class_t *mc) -{ - metaslab_group_t *mg; - - while ((mg = mc->mc_rotor) != NULL) { - metaslab_class_remove(mc, mg); - metaslab_group_destroy(mg); - } - - kmem_free(mc, sizeof (metaslab_class_t)); -} - -void -metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg) -{ - metaslab_group_t *mgprev, *mgnext; - - ASSERT(mg->mg_class == NULL); - - if ((mgprev = mc->mc_rotor) == NULL) { - mg->mg_prev = mg; - mg->mg_next = mg; - } else { - mgnext = mgprev->mg_next; - mg->mg_prev = mgprev; - mg->mg_next = mgnext; - mgprev->mg_next = mg; - mgnext->mg_prev = mg; - } - mc->mc_rotor = mg; - mg->mg_class = mc; -} - -void -metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg) -{ - metaslab_group_t *mgprev, *mgnext; - - ASSERT(mg->mg_class == mc); - - mgprev = mg->mg_prev; - mgnext = mg->mg_next; - - if (mg == mgnext) { - mc->mc_rotor = NULL; - } else { - mc->mc_rotor = mgnext; - mgprev->mg_next = mgnext; - mgnext->mg_prev = mgprev; - } - - mg->mg_prev = NULL; - mg->mg_next = NULL; - mg->mg_class = NULL; -} - -/* - * ========================================================================== - * Metaslab groups - * ========================================================================== - */ -static int -metaslab_compare(const void *x1, const void *x2) -{ - const metaslab_t *m1 = x1; - const metaslab_t *m2 = x2; - - if (m1->ms_weight < m2->ms_weight) - return (1); - if (m1->ms_weight > m2->ms_weight) - return (-1); - - /* - * If the weights are identical, use the offset to force uniqueness. 
- */ - if (m1->ms_map.sm_start < m2->ms_map.sm_start) - return (-1); - if (m1->ms_map.sm_start > m2->ms_map.sm_start) - return (1); - - ASSERT3P(m1, ==, m2); - - return (0); -} - -metaslab_group_t * -metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) -{ - metaslab_group_t *mg; - - mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); - mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&mg->mg_metaslab_tree, metaslab_compare, - sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); - mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children); - mg->mg_vd = vd; - metaslab_class_add(mc, mg); - - return (mg); -} - -void -metaslab_group_destroy(metaslab_group_t *mg) -{ - avl_destroy(&mg->mg_metaslab_tree); - mutex_destroy(&mg->mg_lock); - kmem_free(mg, sizeof (metaslab_group_t)); -} - -static void -metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) -{ - mutex_enter(&mg->mg_lock); - ASSERT(msp->ms_group == NULL); - msp->ms_group = mg; - msp->ms_weight = 0; - avl_add(&mg->mg_metaslab_tree, msp); - mutex_exit(&mg->mg_lock); -} - -static void -metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) -{ - mutex_enter(&mg->mg_lock); - ASSERT(msp->ms_group == mg); - avl_remove(&mg->mg_metaslab_tree, msp); - msp->ms_group = NULL; - mutex_exit(&mg->mg_lock); -} - -static void -metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) -{ - /* - * Although in principle the weight can be any value, in - * practice we do not use values in the range [1, 510]. - */ - ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - mutex_enter(&mg->mg_lock); - ASSERT(msp->ms_group == mg); - avl_remove(&mg->mg_metaslab_tree, msp); - msp->ms_weight = weight; - avl_add(&mg->mg_metaslab_tree, msp); - mutex_exit(&mg->mg_lock); -} - -/* - * ========================================================================== - * The first-fit block allocator - * ========================================================================== - */ -static void -metaslab_ff_load(space_map_t *sm) -{ - ASSERT(sm->sm_ppd == NULL); - sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); -} - -static void -metaslab_ff_unload(space_map_t *sm) -{ - kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); - sm->sm_ppd = NULL; -} - -static uint64_t -metaslab_ff_alloc(space_map_t *sm, uint64_t size) -{ - avl_tree_t *t = &sm->sm_root; - uint64_t align = size & -size; - uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; - space_seg_t *ss, ssearch; - avl_index_t where; - - ssearch.ss_start = *cursor; - ssearch.ss_end = *cursor + size; - - ss = avl_find(t, &ssearch, &where); - if (ss == NULL) - ss = avl_nearest(t, where, AVL_AFTER); - - while (ss != NULL) { - uint64_t offset = P2ROUNDUP(ss->ss_start, align); - - if (offset + size <= ss->ss_end) { - *cursor = offset + size; - return (offset); - } - ss = AVL_NEXT(t, ss); - } - - /* - * If we know we've searched the whole map (*cursor == 0), give up. - * Otherwise, reset the cursor to the beginning and try again. 
- */ - if (*cursor == 0) - return (-1ULL); - - *cursor = 0; - return (metaslab_ff_alloc(sm, size)); -} - -/* ARGSUSED */ -static void -metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size) -{ - /* No need to update cursor */ -} - -/* ARGSUSED */ -static void -metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size) -{ - /* No need to update cursor */ -} - -static space_map_ops_t metaslab_ff_ops = { - metaslab_ff_load, - metaslab_ff_unload, - metaslab_ff_alloc, - metaslab_ff_claim, - metaslab_ff_free -}; - -/* - * ========================================================================== - * Metaslabs - * ========================================================================== - */ -metaslab_t * -metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, - uint64_t start, uint64_t size, uint64_t txg) -{ - vdev_t *vd = mg->mg_vd; - metaslab_t *msp; - - msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); - mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); - - msp->ms_smo_syncing = *smo; - - /* - * We create the main space map here, but we don't create the - * allocmaps and freemaps until metaslab_sync_done(). This serves - * two purposes: it allows metaslab_sync_done() to detect the - * addition of new space; and for debugging, it ensures that we'd - * data fault on any attempt to use this metaslab before it's ready. - */ - space_map_create(&msp->ms_map, start, size, - vd->vdev_ashift, &msp->ms_lock); - - metaslab_group_add(mg, msp); - - /* - * If we're opening an existing pool (txg == 0) or creating - * a new one (txg == TXG_INITIAL), all space is available now. - * If we're adding space to an existing pool, the new space - * does not become available until after this txg has synced. - */ - if (txg <= TXG_INITIAL) - metaslab_sync_done(msp, 0); - - if (txg != 0) { - /* - * The vdev is dirty, but the metaslab isn't -- it just needs - * to have metaslab_sync_done() invoked from vdev_sync_done(). - * [We could just dirty the metaslab, but that would cause us - * to allocate a space map object for it, which is wasteful - * and would mess up the locality logic in metaslab_weight().] - */ - ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa)); - vdev_dirty(vd, 0, NULL, txg); - vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg)); - } - - return (msp); -} - -void -metaslab_fini(metaslab_t *msp) -{ - metaslab_group_t *mg = msp->ms_group; - int t; - - vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, - -msp->ms_smo.smo_alloc); - - metaslab_group_remove(mg, msp); - - mutex_enter(&msp->ms_lock); - - space_map_unload(&msp->ms_map); - space_map_destroy(&msp->ms_map); - - for (t = 0; t < TXG_SIZE; t++) { - space_map_destroy(&msp->ms_allocmap[t]); - space_map_destroy(&msp->ms_freemap[t]); - } - - mutex_exit(&msp->ms_lock); - mutex_destroy(&msp->ms_lock); - - kmem_free(msp, sizeof (metaslab_t)); -} - -#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) -#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) -#define METASLAB_ACTIVE_MASK \ - (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) -#define METASLAB_SMO_BONUS_MULTIPLIER 2 - -static uint64_t -metaslab_weight(metaslab_t *msp) -{ - metaslab_group_t *mg = msp->ms_group; - space_map_t *sm = &msp->ms_map; - space_map_obj_t *smo = &msp->ms_smo; - vdev_t *vd = mg->mg_vd; - uint64_t weight, space; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - /* - * The baseline weight is the metaslab's free space. - */ - space = sm->sm_size - smo->smo_alloc; - weight = space; - - /* - * Modern disks have uniform bit density and constant angular velocity. 
- * Therefore, the outer recording zones are faster (higher bandwidth) - * than the inner zones by the ratio of outer to inner track diameter, - * which is typically around 2:1. We account for this by assigning - * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). - * In effect, this means that we'll select the metaslab with the most - * free bandwidth rather than simply the one with the most free space. - */ - weight = 2 * weight - - ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; - ASSERT(weight >= space && weight <= 2 * space); - - /* - * For locality, assign higher weight to metaslabs we've used before. - */ - if (smo->smo_object != 0) - weight *= METASLAB_SMO_BONUS_MULTIPLIER; - ASSERT(weight >= space && - weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space); - - /* - * If this metaslab is one we're actively using, adjust its weight to - * make it preferable to any inactive metaslab so we'll polish it off. - */ - weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); - - return (weight); -} - -static int -metaslab_activate(metaslab_t *msp, uint64_t activation_weight) -{ - space_map_t *sm = &msp->ms_map; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = space_map_load(sm, &metaslab_ff_ops, - SM_FREE, &msp->ms_smo, - msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); - if (error) { - metaslab_group_sort(msp->ms_group, msp, 0); - return (error); - } - metaslab_group_sort(msp->ms_group, msp, - msp->ms_weight | activation_weight); - } - ASSERT(sm->sm_loaded); - ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); - - return (0); -} - -static void -metaslab_passivate(metaslab_t *msp, uint64_t size) -{ - /* - * If size < SPA_MINBLOCKSIZE, then we will not allocate from - * this metaslab again. In that case, it had better be empty, - * or we would be leaving space on the table. - */ - ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); - metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); - ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); -} - -/* - * Write a metaslab to disk in the context of the specified transaction group. - */ -void -metaslab_sync(metaslab_t *msp, uint64_t txg) -{ - vdev_t *vd = msp->ms_group->mg_vd; - spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; - space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; - space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - space_map_t *sm = &msp->ms_map; - space_map_obj_t *smo = &msp->ms_smo_syncing; - dmu_buf_t *db; - dmu_tx_t *tx; - int t; - - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); - - /* - * The only state that can actually be changing concurrently with - * metaslab_sync() is the metaslab's ms_map. No other thread can - * be modifying this txg's allocmap, freemap, freed_map, or smo. - * Therefore, we only hold ms_lock to satify space_map ASSERTs. - * We drop it whenever we call into the DMU, because the DMU - * can call down to us (e.g. via zio_free()) at any time. 
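Putting the weighting comments above into numbers: the free space is scaled from 2x for the lowest-offset (outermost, fastest) metaslab down toward 1x for the highest, then doubled again when the metaslab already has an on-disk space map object (the locality bonus). A small sketch of that arithmetic with hypothetical figures; the index stands in for (sm_start >> vdev_ms_shift).

#include <stdint.h>
#include <stdio.h>

static uint64_t
weight(uint64_t space, uint64_t index, uint64_t ms_count, int used_before)
{
	/* 2x for the outermost metaslab, shrinking toward 1x for the innermost. */
	uint64_t w = 2 * space - (index * space) / ms_count;

	if (used_before)
		w *= 2;		/* METASLAB_SMO_BONUS_MULTIPLIER */
	return (w);
}

int
main(void)
{
	uint64_t space = 1ULL << 30;	/* 1 GB free in each example metaslab */

	printf("outer, fresh : %llu\n", (unsigned long long)weight(space, 0, 200, 0));
	printf("inner, fresh : %llu\n", (unsigned long long)weight(space, 199, 200, 0));
	printf("outer, reused: %llu\n", (unsigned long long)weight(space, 0, 200, 1));
	return (0);
}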
- */ - mutex_enter(&msp->ms_lock); - - if (smo->smo_object == 0) { - ASSERT(smo->smo_objsize == 0); - ASSERT(smo->smo_alloc == 0); - mutex_exit(&msp->ms_lock); - smo->smo_object = dmu_object_alloc(mos, - DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, - DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); - ASSERT(smo->smo_object != 0); - dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * - (sm->sm_start >> vd->vdev_ms_shift), - sizeof (uint64_t), &smo->smo_object, tx); - mutex_enter(&msp->ms_lock); - } - - space_map_walk(freemap, space_map_add, freed_map); - - if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= - 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { - /* - * The in-core space map representation is twice as compact - * as the on-disk one, so it's time to condense the latter - * by generating a pure allocmap from first principles. - * - * This metaslab is 100% allocated, - * minus the content of the in-core map (sm), - * minus what's been freed this txg (freed_map), - * minus allocations from txgs in the future - * (because they haven't been committed yet). - */ - space_map_vacate(allocmap, NULL, NULL); - space_map_vacate(freemap, NULL, NULL); - - space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); - - space_map_walk(sm, space_map_remove, allocmap); - space_map_walk(freed_map, space_map_remove, allocmap); - - for (t = 1; t < TXG_CONCURRENT_STATES; t++) - space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], - space_map_remove, allocmap); - - mutex_exit(&msp->ms_lock); - space_map_truncate(smo, mos, tx); - mutex_enter(&msp->ms_lock); - } - - space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); - space_map_sync(freemap, SM_FREE, smo, mos, tx); - - mutex_exit(&msp->ms_lock); - - VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(smo, db->db_data, db->db_size); - dmu_buf_rele(db, FTAG); - - dmu_tx_commit(tx); -} - -/* - * Called after a transaction group has completely synced to mark - * all of the metaslab's free space as usable. - */ -void -metaslab_sync_done(metaslab_t *msp, uint64_t txg) -{ - space_map_obj_t *smo = &msp->ms_smo; - space_map_obj_t *smosync = &msp->ms_smo_syncing; - space_map_t *sm = &msp->ms_map; - space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - metaslab_group_t *mg = msp->ms_group; - vdev_t *vd = mg->mg_vd; - int t; - - mutex_enter(&msp->ms_lock); - - /* - * If this metaslab is just becoming available, initialize its - * allocmaps and freemaps and add its capacity to the vdev. - */ - if (freed_map->sm_size == 0) { - for (t = 0; t < TXG_SIZE; t++) { - space_map_create(&msp->ms_allocmap[t], sm->sm_start, - sm->sm_size, sm->sm_shift, sm->sm_lock); - space_map_create(&msp->ms_freemap[t], sm->sm_start, - sm->sm_size, sm->sm_shift, sm->sm_lock); - } - vdev_space_update(vd, sm->sm_size, 0); - } - - vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc); - - ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); - ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); - - /* - * If there's a space_map_load() in progress, wait for it to complete - * so that we have a consistent view of the in-core space map. - * Then, add everything we freed in this txg to the map. - */ - space_map_load_wait(sm); - space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm); - - *smo = *smosync; - - /* - * If the map is loaded but no longer active, evict it as soon as all - * future allocations have synced. 
(If we unloaded it now and then - * loaded a moment later, the map wouldn't reflect those allocations.) - */ - if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int evictable = 1; - - for (t = 1; t < TXG_CONCURRENT_STATES; t++) - if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) - evictable = 0; - - if (evictable) - space_map_unload(sm); - } - - metaslab_group_sort(mg, msp, metaslab_weight(msp)); - - mutex_exit(&msp->ms_lock); -} - -static uint64_t -metaslab_distance(metaslab_t *msp, dva_t *dva) -{ - uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; - uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; - uint64_t start = msp->ms_map.sm_start >> ms_shift; - - if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) - return (1ULL << 63); - - if (offset < start) - return ((start - offset) << ms_shift); - if (offset > start) - return ((offset - start) << ms_shift); - return (0); -} - -static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, - uint64_t min_distance, dva_t *dva, int d) -{ - metaslab_t *msp = NULL; - uint64_t offset = -1ULL; - avl_tree_t *t = &mg->mg_metaslab_tree; - uint64_t activation_weight; - uint64_t target_distance; - int i; - - activation_weight = METASLAB_WEIGHT_PRIMARY; - for (i = 0; i < d; i++) - if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) - activation_weight = METASLAB_WEIGHT_SECONDARY; - - for (;;) { - mutex_enter(&mg->mg_lock); - for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { - if (msp->ms_weight < size) { - mutex_exit(&mg->mg_lock); - return (-1ULL); - } - - if (activation_weight == METASLAB_WEIGHT_PRIMARY) - break; - - target_distance = min_distance + - (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); - - for (i = 0; i < d; i++) - if (metaslab_distance(msp, &dva[i]) < - target_distance) - break; - if (i == d) - break; - } - mutex_exit(&mg->mg_lock); - if (msp == NULL) - return (-1ULL); - - mutex_enter(&msp->ms_lock); - - /* - * Ensure that the metaslab we have selected is still - * capable of handling our request. It's possible that - * another thread may have changed the weight while we - * were blocked on the metaslab lock. - */ - if (msp->ms_weight < size) { - mutex_exit(&msp->ms_lock); - continue; - } - - if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && - activation_weight == METASLAB_WEIGHT_PRIMARY) { - metaslab_passivate(msp, - msp->ms_weight & ~METASLAB_ACTIVE_MASK); - mutex_exit(&msp->ms_lock); - continue; - } - - if (metaslab_activate(msp, activation_weight) != 0) { - mutex_exit(&msp->ms_lock); - continue; - } - - if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) - break; - - metaslab_passivate(msp, size - 1); - - mutex_exit(&msp->ms_lock); - } - - if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) - vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); - - mutex_exit(&msp->ms_lock); - - return (offset); -} - -/* - * Allocate a block for the specified i/o. - */ -static int -metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, - dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid) -{ - metaslab_group_t *mg, *rotor; - metaslab_class_t *mc; - vdev_t *vd; - int dshift = 3; - int all_zero; - uint64_t offset = -1ULL; - uint64_t asize; - uint64_t distance; - - ASSERT(!DVA_IS_VALID(&dva[d])); - - mc = spa_metaslab_class_select(spa); - - /* - * Start at the rotor and loop through all mgs until we find something. 
- * Note that there's no locking on mc_rotor or mc_allocated because - * nothing actually breaks if we miss a few updates -- we just won't - * allocate quite as evenly. It all balances out over time. - * - * If we are doing ditto or log blocks, try to spread them across - * consecutive vdevs. If we're forced to reuse a vdev before we've - * allocated all of our ditto blocks, then try and spread them out on - * that vdev as much as possible. If it turns out to not be possible, - * gradually lower our standards until anything becomes acceptable. - * Also, allocating on consecutive vdevs (as opposed to random vdevs) - * gives us hope of containing our fault domains to something we're - * able to reason about. Otherwise, any two top-level vdev failures - * will guarantee the loss of data. With consecutive allocation, - * only two adjacent top-level vdev failures will result in data loss. - * - * If we are doing gang blocks (hintdva is non-NULL), try to keep - * ourselves on the same vdev as our gang block header. That - * way, we can hope for locality in vdev_cache, plus it makes our - * fault domains something tractable. - */ - if (hintdva) { - vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); - if (hintdva_avoid) - mg = vd->vdev_mg->mg_next; - else - mg = vd->vdev_mg; - } else if (d != 0) { - vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); - mg = vd->vdev_mg->mg_next; - } else { - mg = mc->mc_rotor; - } - rotor = mg; - -top: - all_zero = B_TRUE; - do { - vd = mg->mg_vd; - - distance = vd->vdev_asize >> dshift; - if (distance <= (1ULL << vd->vdev_ms_shift)) - distance = 0; - else - all_zero = B_FALSE; - - asize = vdev_psize_to_asize(vd, psize); - ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - - offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); - if (offset != -1ULL) { - /* - * If we've just selected this metaslab group, - * figure out whether the corresponding vdev is - * over- or under-used relative to the pool, - * and set an allocation bias to even it out. - */ - if (mc->mc_allocated == 0) { - vdev_stat_t *vs = &vd->vdev_stat; - uint64_t alloc, space; - int64_t vu, su; - - alloc = spa_get_alloc(spa); - space = spa_get_space(spa); - - /* - * Determine percent used in units of 0..1024. - * (This is just to avoid floating point.) - */ - vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); - su = (alloc << 10) / (space + 1); - - /* - * Bias by at most +/- 25% of the aliquot. - */ - mg->mg_bias = ((su - vu) * - (int64_t)mg->mg_aliquot) / (1024 * 4); - } - - if (atomic_add_64_nv(&mc->mc_allocated, asize) >= - mg->mg_aliquot + mg->mg_bias) { - mc->mc_rotor = mg->mg_next; - mc->mc_allocated = 0; - } - - DVA_SET_VDEV(&dva[d], vd->vdev_id); - DVA_SET_OFFSET(&dva[d], offset); - DVA_SET_GANG(&dva[d], 0); - DVA_SET_ASIZE(&dva[d], asize); - - return (0); - } - mc->mc_rotor = mg->mg_next; - mc->mc_allocated = 0; - } while ((mg = mg->mg_next) != rotor); - - if (!all_zero) { - dshift++; - ASSERT(dshift < 64); - goto top; - } - - bzero(&dva[d], sizeof (dva_t)); - - return (ENOSPC); -} - -/* - * Free the block represented by DVA in the context of the specified - * transaction group. 
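The bias computation in metaslab_alloc_dva() compares a vdev's fill fraction with the pool's in units of 1/1024 and shifts that vdev's share of the rotor by at most a quarter of its aliquot. A standalone sketch of the arithmetic with made-up usage figures:

#include <stdint.h>
#include <stdio.h>

/*
 * Rotor bias: an under-used vdev (lower fill fraction than the pool) gets a
 * positive bias and absorbs more of the next allocations; the magnitude is
 * capped at +/- 25% of the group's aliquot.
 */
static int64_t
alloc_bias(uint64_t vdev_alloc, uint64_t vdev_space,
    uint64_t pool_alloc, uint64_t pool_space, uint64_t aliquot)
{
	int64_t vu = (int64_t)((vdev_alloc << 10) / (vdev_space + 1));
	int64_t su = (int64_t)((pool_alloc << 10) / (pool_space + 1));

	return (((su - vu) * (int64_t)aliquot) / (1024 * 4));
}

int
main(void)
{
	uint64_t aliquot = 512ULL << 10;	/* metaslab_aliquot default */

	/* A vdev 30% full in a pool 60% full: positive bias, use it more. */
	printf("bias: %lld of %llu\n",
	    (long long)alloc_bias(30, 100, 60, 100, aliquot),
	    (unsigned long long)aliquot);
	return (0);
}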
- */ -static void -metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) -{ - uint64_t vdev = DVA_GET_VDEV(dva); - uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t size = DVA_GET_ASIZE(dva); - vdev_t *vd; - metaslab_t *msp; - - ASSERT(DVA_IS_VALID(dva)); - - if (txg > spa_freeze_txg(spa)) - return; - - if ((vd = vdev_lookup_top(spa, vdev)) == NULL || - (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { - cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", - (u_longlong_t)vdev, (u_longlong_t)offset); - ASSERT(0); - return; - } - - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - - if (DVA_GET_GANG(dva)) - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - - mutex_enter(&msp->ms_lock); - - if (now) { - space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], - offset, size); - space_map_free(&msp->ms_map, offset, size); - } else { - if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) - vdev_dirty(vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); - - /* - * verify that this region is actually allocated in - * either a ms_allocmap or the ms_map - */ - if (msp->ms_map.sm_loaded) { - boolean_t allocd = B_FALSE; - int i; - - if (!space_map_contains(&msp->ms_map, offset, size)) { - allocd = B_TRUE; - } else { - for (i = 0; i < TXG_CONCURRENT_STATES; i++) { - space_map_t *sm = &msp->ms_allocmap - [(txg - i) & TXG_MASK]; - if (space_map_contains(sm, - offset, size)) { - allocd = B_TRUE; - break; - } - } - } - - if (!allocd) { - zfs_panic_recover("freeing free segment " - "(vdev=%llu offset=%llx size=%llx)", - (longlong_t)vdev, (longlong_t)offset, - (longlong_t)size); - } - } - - - } - - mutex_exit(&msp->ms_lock); -} - -/* - * Intent log support: upon opening the pool after a crash, notify the SPA - * of blocks that the intent log has allocated for immediate write, but - * which are still considered free by the SPA because the last transaction - * group didn't commit yet. 
- */ -static int -metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) -{ - uint64_t vdev = DVA_GET_VDEV(dva); - uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t size = DVA_GET_ASIZE(dva); - vdev_t *vd; - metaslab_t *msp; - int error; - - ASSERT(DVA_IS_VALID(dva)); - - if ((vd = vdev_lookup_top(spa, vdev)) == NULL || - (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) - return (ENXIO); - - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - - if (DVA_GET_GANG(dva)) - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - - mutex_enter(&msp->ms_lock); - - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); - if (error) { - mutex_exit(&msp->ms_lock); - return (error); - } - - if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) - vdev_dirty(vd, VDD_METASLAB, msp, txg); - - space_map_claim(&msp->ms_map, offset, size); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); - - mutex_exit(&msp->ms_lock); - - return (0); -} - -int -metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas, - uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid) -{ - dva_t *dva = bp->blk_dva; - dva_t *hintdva = hintbp->blk_dva; - int d; - int error = 0; - - ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); - ASSERT(BP_GET_NDVAS(bp) == 0); - ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); - - for (d = 0; d < ndvas; d++) { - error = metaslab_alloc_dva(spa, psize, dva, d, hintdva, - txg, hintbp_avoid); - if (error) { - for (d--; d >= 0; d--) { - metaslab_free_dva(spa, &dva[d], txg, B_TRUE); - bzero(&dva[d], sizeof (dva_t)); - } - return (error); - } - } - ASSERT(error == 0); - ASSERT(BP_GET_NDVAS(bp) == ndvas); - - return (0); -} - -void -metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - int d; - - ASSERT(!BP_IS_HOLE(bp)); - - for (d = 0; d < ndvas; d++) - metaslab_free_dva(spa, &dva[d], txg, now); -} - -int -metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - int d, error; - int last_error = 0; - - ASSERT(!BP_IS_HOLE(bp)); - - for (d = 0; d < ndvas; d++) - if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) - last_error = error; - - return (last_error); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c deleted file mode 100644 index 411ed46..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c +++ /dev/null @@ -1,194 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/refcount.h> - -#if defined(DEBUG) || !defined(_KERNEL) - -#ifdef _KERNEL -int reference_tracking_enable = FALSE; /* runs out of memory too easily */ -#else -int reference_tracking_enable = TRUE; -#endif -int reference_history = 4; /* tunable */ - -static kmem_cache_t *reference_cache; -static kmem_cache_t *reference_history_cache; - -void -refcount_init(void) -{ - reference_cache = kmem_cache_create("reference_cache", - sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - - reference_history_cache = kmem_cache_create("reference_history_cache", - sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -refcount_fini(void) -{ - kmem_cache_destroy(reference_cache); - kmem_cache_destroy(reference_history_cache); -} - -void -refcount_create(refcount_t *rc) -{ - list_create(&rc->rc_list, sizeof (reference_t), - offsetof(reference_t, ref_link)); - list_create(&rc->rc_removed, sizeof (reference_t), - offsetof(reference_t, ref_link)); - mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); -} - -void -refcount_destroy_many(refcount_t *rc, uint64_t number) -{ - reference_t *ref; - - ASSERT(rc->rc_count == number); - while (ref = list_head(&rc->rc_list)) { - list_remove(&rc->rc_list, ref); - kmem_cache_free(reference_cache, ref); - } - list_destroy(&rc->rc_list); - - while (ref = list_head(&rc->rc_removed)) { - list_remove(&rc->rc_removed, ref); - kmem_cache_free(reference_history_cache, ref->ref_removed); - kmem_cache_free(reference_cache, ref); - } - list_destroy(&rc->rc_removed); - mutex_destroy(&rc->rc_mtx); -} - -void -refcount_destroy(refcount_t *rc) -{ - refcount_destroy_many(rc, 0); -} - -int -refcount_is_zero(refcount_t *rc) -{ - ASSERT(rc->rc_count >= 0); - return (rc->rc_count == 0); -} - -int64_t -refcount_count(refcount_t *rc) -{ - ASSERT(rc->rc_count >= 0); - return (rc->rc_count); -} - -int64_t -refcount_add_many(refcount_t *rc, uint64_t number, void *holder) -{ - reference_t *ref; - int64_t count; - - if (reference_tracking_enable) { - ref = kmem_cache_alloc(reference_cache, KM_SLEEP); - ref->ref_holder = holder; - ref->ref_number = number; - } - mutex_enter(&rc->rc_mtx); - ASSERT(rc->rc_count >= 0); - if (reference_tracking_enable) - list_insert_head(&rc->rc_list, ref); - rc->rc_count += number; - count = rc->rc_count; - mutex_exit(&rc->rc_mtx); - - return (count); -} - -int64_t -refcount_add(refcount_t *rc, void *holder) -{ - return (refcount_add_many(rc, 1, holder)); -} - -int64_t -refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) -{ - reference_t *ref; - int64_t count; - - mutex_enter(&rc->rc_mtx); - ASSERT(rc->rc_count >= number); - - if (!reference_tracking_enable) { - rc->rc_count -= number; - count = rc->rc_count; - mutex_exit(&rc->rc_mtx); - return (count); - } - - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder && ref->ref_number == number) { - list_remove(&rc->rc_list, ref); - if (reference_history > 0) { - ref->ref_removed = - kmem_cache_alloc(reference_history_cache, - KM_SLEEP); - list_insert_head(&rc->rc_removed, ref); - rc->rc_removed_count++; - if (rc->rc_removed_count >= reference_history) { - ref = list_tail(&rc->rc_removed); - list_remove(&rc->rc_removed, ref); - kmem_cache_free(reference_history_cache, - ref->ref_removed); - kmem_cache_free(reference_cache, ref); - rc->rc_removed_count--; - } - } else { - kmem_cache_free(reference_cache, ref); - } - rc->rc_count -= number; - 
count = rc->rc_count; - mutex_exit(&rc->rc_mtx); - return (count); - } - } - panic("No such hold %p on refcount %llx", holder, - (u_longlong_t)(uintptr_t)rc); - return (-1); -} - -int64_t -refcount_remove(refcount_t *rc, void *holder) -{ - return (refcount_remove_many(rc, 1, holder)); -} - -#endif diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c deleted file mode 100644 index ce5c261..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/zio.h> -#include <sys/zio_checksum.h> - -/* - * SHA-256 checksum, as specified in FIPS 180-2, available at: - * http://csrc.nist.gov/cryptval - * - * This is a very compact implementation of SHA-256. - * It is designed to be simple and portable, not to be fast. - */ - -/* - * The literal definitions according to FIPS180-2 would be: - * - * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) - * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) - * - * We use logical equivalents which require one less op. 
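The refcount.c code above keeps, in debug builds, a list entry per hold recording who took it and for how much, so that a remove must name the same holder and count or the code panics. A simplified, single-threaded sketch of that holder-tracking idea is below; the ref_t/rc_t types and function names are invented for this example and it omits the kmem caches, locking, and removal history of the real code.

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

/* One record per outstanding hold: who took it, and how many. */
typedef struct ref {
	struct ref	*r_next;
	const void	*r_holder;
	unsigned long	r_number;
} ref_t;

typedef struct {
	long	rc_count;
	ref_t	*rc_list;
} rc_t;

static long
rc_add(rc_t *rc, unsigned long n, const void *holder)
{
	ref_t *r = malloc(sizeof (*r));

	assert(r != NULL);
	r->r_holder = holder;
	r->r_number = n;
	r->r_next = rc->rc_list;
	rc->rc_list = r;
	return (rc->rc_count += n);
}

static long
rc_remove(rc_t *rc, unsigned long n, const void *holder)
{
	ref_t **rp;

	for (rp = &rc->rc_list; *rp != NULL; rp = &(*rp)->r_next) {
		ref_t *r = *rp;
		if (r->r_holder == holder && r->r_number == n) {
			*rp = r->r_next;
			free(r);
			return (rc->rc_count -= n);
		}
	}
	fprintf(stderr, "no such hold %p\n", holder);
	abort();
}

int
main(void)
{
	rc_t rc = { 0, NULL };
	int holder_a, holder_b;

	rc_add(&rc, 1, &holder_a);
	rc_add(&rc, 2, &holder_b);
	rc_remove(&rc, 2, &holder_b);
	printf("count now %ld\n", rc_remove(&rc, 1, &holder_a));	/* 0 */
	return (0);
}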
- */ -#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) -#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y)))) -#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s))) -#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22)) -#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25)) -#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3)) -#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10)) - -static const uint32_t SHA256_K[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -static void -SHA256Transform(uint32_t *H, const uint8_t *cp) -{ - uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64]; - - for (t = 0; t < 16; t++, cp += 4) - W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3]; - - for (t = 16; t < 64; t++) - W[t] = sigma1(W[t - 2]) + W[t - 7] + - sigma0(W[t - 15]) + W[t - 16]; - - a = H[0]; b = H[1]; c = H[2]; d = H[3]; - e = H[4]; f = H[5]; g = H[6]; h = H[7]; - - for (t = 0; t < 64; t++) { - T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t]; - T2 = SIGMA0(a) + Maj(a, b, c); - h = g; g = f; f = e; e = d + T1; - d = c; c = b; b = a; a = T1 + T2; - } - - H[0] += a; H[1] += b; H[2] += c; H[3] += d; - H[4] += e; H[5] += f; H[6] += g; H[7] += h; -} - -void -zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp) -{ - uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, - 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; - uint8_t pad[128]; - int padsize = size & 63; - int i; - - for (i = 0; i < size - padsize; i += 64) - SHA256Transform(H, (uint8_t *)buf + i); - - for (i = 0; i < padsize; i++) - pad[i] = ((uint8_t *)buf)[i]; - - for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++) - pad[padsize] = 0; - - for (i = 0; i < 8; i++) - pad[padsize++] = (size << 3) >> (56 - 8 * i); - - for (i = 0; i < padsize; i += 64) - SHA256Transform(H, pad + i); - - ZIO_SET_CHECKSUM(zcp, - (uint64_t)H[0] << 32 | H[1], - (uint64_t)H[2] << 32 | H[3], - (uint64_t)H[4] << 32 | H[5], - (uint64_t)H[6] << 32 | H[7]); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c deleted file mode 100644 index 6a7c525..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ /dev/null @@ -1,3301 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
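The comment above notes that the Ch and Maj macros are logical rewrites of the FIPS 180-2 definitions that save one operation each. Because both forms are purely bitwise, checking every combination of a single bit position is enough to prove them equal for all 32-bit words; the small verification sketch below does exactly that. The CH_REF/CH_OPT macro names are mine, not part of the original file.

#include <stdio.h>
#include <stdint.h>

/* FIPS 180-2 literal forms */
#define CH_REF(x, y, z)		(((x) & (y)) ^ ((~(x)) & (z)))
#define MAJ_REF(x, y, z)	(((x) & (y)) | ((x) & (z)) | ((y) & (z)))

/* The rewritten forms used above, one logical op shorter */
#define CH_OPT(x, y, z)		((z) ^ ((x) & ((y) ^ (z))))
#define MAJ_OPT(x, y, z)	(((x) & (y)) ^ ((z) & ((x) ^ (y))))

int
main(void)
{
	uint32_t x, y, z;

	/* Bitwise functions: the 8 single-bit cases cover every word. */
	for (x = 0; x <= 1; x++)
		for (y = 0; y <= 1; y++)
			for (z = 0; z <= 1; z++)
				if (CH_REF(x, y, z) != CH_OPT(x, y, z) ||
				    MAJ_REF(x, y, z) != MAJ_OPT(x, y, z)) {
					printf("mismatch at %u%u%u\n", x, y, z);
					return (1);
				}
	printf("Ch and Maj rewrites match the FIPS 180-2 definitions\n");
	return (0);
}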
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * This file contains all the routines used when modifying on-disk SPA state. - * This includes opening, importing, destroying, exporting a pool, and syncing a - * pool. - */ - -#include <sys/zfs_context.h> -#include <sys/fm/fs/zfs.h> -#include <sys/spa_impl.h> -#include <sys/zio.h> -#include <sys/zio_checksum.h> -#include <sys/zio_compress.h> -#include <sys/dmu.h> -#include <sys/dmu_tx.h> -#include <sys/zap.h> -#include <sys/zil.h> -#include <sys/vdev_impl.h> -#include <sys/metaslab.h> -#include <sys/uberblock_impl.h> -#include <sys/txg.h> -#include <sys/avl.h> -#include <sys/dmu_traverse.h> -#include <sys/dmu_objset.h> -#include <sys/unique.h> -#include <sys/dsl_pool.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_prop.h> -#include <sys/dsl_synctask.h> -#include <sys/fs/zfs.h> -#include <sys/callb.h> -#include <sys/sunddi.h> - -int zio_taskq_threads = 0; -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); -TUNABLE_INT("vfs.zfs.zio.taskq_threads", &zio_taskq_threads); -SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_threads, CTLFLAG_RW, - &zio_taskq_threads, 0, "Number of ZIO threads per ZIO type"); - - -/* - * ========================================================================== - * SPA state manipulation (open/create/destroy/import/export) - * ========================================================================== - */ - -static int -spa_error_entry_compare(const void *a, const void *b) -{ - spa_error_entry_t *sa = (spa_error_entry_t *)a; - spa_error_entry_t *sb = (spa_error_entry_t *)b; - int ret; - - ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, - sizeof (zbookmark_t)); - - if (ret < 0) - return (-1); - else if (ret > 0) - return (1); - else - return (0); -} - -/* - * Utility function which retrieves copies of the current logs and - * re-initializes them in the process. - */ -void -spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) -{ - ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); - - bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); - bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); - - avl_create(&spa->spa_errlist_scrub, - spa_error_entry_compare, sizeof (spa_error_entry_t), - offsetof(spa_error_entry_t, se_avl)); - avl_create(&spa->spa_errlist_last, - spa_error_entry_compare, sizeof (spa_error_entry_t), - offsetof(spa_error_entry_t, se_avl)); -} - -/* - * Activate an uninitialized pool. 
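spa_error_entry_compare() above illustrates a common AVL idiom: the comparator must return exactly -1, 0, or 1, so the raw bcmp() result is collapsed before being returned. A standalone sketch of the same idiom is below; bookmark_t is an invented stand-in for zbookmark_t.

#include <stdio.h>
#include <string.h>

typedef struct { unsigned long objset, object, level, blkid; } bookmark_t;

/* Collapse the memcmp() result into the -1/0/1 an AVL tree expects. */
static int
bookmark_compare(const void *a, const void *b)
{
	int ret = memcmp(a, b, sizeof (bookmark_t));

	if (ret < 0)
		return (-1);
	if (ret > 0)
		return (1);
	return (0);
}

int
main(void)
{
	bookmark_t x = { 1, 2, 0, 10 };
	bookmark_t y = { 1, 2, 0, 11 };

	printf("compare(x, y) = %d\n", bookmark_compare(&x, &y));
	printf("compare(x, x) = %d\n", bookmark_compare(&x, &x));
	return (0);
}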
- */ -static void -spa_activate(spa_t *spa) -{ - int t; - int nthreads = zio_taskq_threads; - char name[32]; - - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); - - spa->spa_state = POOL_STATE_ACTIVE; - - spa->spa_normal_class = metaslab_class_create(); - - if (nthreads == 0) - nthreads = max_ncpus; - for (t = 0; t < ZIO_TYPES; t++) { - snprintf(name, sizeof(name), "spa_zio_issue %d", t); - spa->spa_zio_issue_taskq[t] = taskq_create(name, nthreads, - maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); - snprintf(name, sizeof(name), "spa_zio_intr %d", t); - spa->spa_zio_intr_taskq[t] = taskq_create(name, nthreads, - maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); - } - - rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); - - mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); - - list_create(&spa->spa_dirty_list, sizeof (vdev_t), - offsetof(vdev_t, vdev_dirty_node)); - - txg_list_create(&spa->spa_vdev_txg_list, - offsetof(struct vdev, vdev_txg_node)); - - avl_create(&spa->spa_errlist_scrub, - spa_error_entry_compare, sizeof (spa_error_entry_t), - offsetof(spa_error_entry_t, se_avl)); - avl_create(&spa->spa_errlist_last, - spa_error_entry_compare, sizeof (spa_error_entry_t), - offsetof(spa_error_entry_t, se_avl)); -} - -/* - * Opposite of spa_activate(). - */ -static void -spa_deactivate(spa_t *spa) -{ - int t; - - ASSERT(spa->spa_sync_on == B_FALSE); - ASSERT(spa->spa_dsl_pool == NULL); - ASSERT(spa->spa_root_vdev == NULL); - - ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); - - txg_list_destroy(&spa->spa_vdev_txg_list); - - list_destroy(&spa->spa_dirty_list); - - for (t = 0; t < ZIO_TYPES; t++) { - taskq_destroy(spa->spa_zio_issue_taskq[t]); - taskq_destroy(spa->spa_zio_intr_taskq[t]); - spa->spa_zio_issue_taskq[t] = NULL; - spa->spa_zio_intr_taskq[t] = NULL; - } - - metaslab_class_destroy(spa->spa_normal_class); - spa->spa_normal_class = NULL; - - /* - * If this was part of an import or the open otherwise failed, we may - * still have errors left in the queues. Empty them just in case. - */ - spa_errlog_drain(spa); - - avl_destroy(&spa->spa_errlist_scrub); - avl_destroy(&spa->spa_errlist_last); - - rw_destroy(&spa->spa_traverse_lock); - mutex_destroy(&spa->spa_uberblock_lock); - mutex_destroy(&spa->spa_errlog_lock); - mutex_destroy(&spa->spa_errlist_lock); - mutex_destroy(&spa->spa_config_lock.scl_lock); - cv_destroy(&spa->spa_config_lock.scl_cv); - mutex_destroy(&spa->spa_sync_bplist.bpl_lock); - mutex_destroy(&spa->spa_history_lock); - mutex_destroy(&spa->spa_props_lock); - - spa->spa_state = POOL_STATE_UNINITIALIZED; -} - -/* - * Verify a pool configuration, and construct the vdev tree appropriately. This - * will create all the necessary vdevs in the appropriate layout, with each vdev - * in the CLOSED state. This will prep the pool before open/creation/import. - * All vdev validation is done by the vdev_alloc() routine. 
- */ -static int -spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, - uint_t id, int atype) -{ - nvlist_t **child; - uint_t c, children; - int error; - - if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) - return (error); - - if ((*vdp)->vdev_ops->vdev_op_leaf) - return (0); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) { - vdev_free(*vdp); - *vdp = NULL; - return (EINVAL); - } - - for (c = 0; c < children; c++) { - vdev_t *vd; - if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, - atype)) != 0) { - vdev_free(*vdp); - *vdp = NULL; - return (error); - } - } - - ASSERT(*vdp != NULL); - - return (0); -} - -/* - * Opposite of spa_load(). - */ -static void -spa_unload(spa_t *spa) -{ - int i; - - /* - * Stop async tasks. - */ - spa_async_suspend(spa); - - /* - * Stop syncing. - */ - if (spa->spa_sync_on) { - txg_sync_stop(spa->spa_dsl_pool); - spa->spa_sync_on = B_FALSE; - } - - /* - * Wait for any outstanding prefetch I/O to complete. - */ - spa_config_enter(spa, RW_WRITER, FTAG); - spa_config_exit(spa, FTAG); - - /* - * Close the dsl pool. - */ - if (spa->spa_dsl_pool) { - dsl_pool_close(spa->spa_dsl_pool); - spa->spa_dsl_pool = NULL; - } - - /* - * Close all vdevs. - */ - if (spa->spa_root_vdev) - vdev_free(spa->spa_root_vdev); - ASSERT(spa->spa_root_vdev == NULL); - - for (i = 0; i < spa->spa_nspares; i++) - vdev_free(spa->spa_spares[i]); - if (spa->spa_spares) { - kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); - spa->spa_spares = NULL; - } - if (spa->spa_sparelist) { - nvlist_free(spa->spa_sparelist); - spa->spa_sparelist = NULL; - } - - spa->spa_async_suspended = 0; -} - -/* - * Load (or re-load) the current list of vdevs describing the active spares for - * this pool. When this is called, we have some form of basic information in - * 'spa_sparelist'. We parse this into vdevs, try to open them, and then - * re-generate a more complete list including status information. - */ -static void -spa_load_spares(spa_t *spa) -{ - nvlist_t **spares; - uint_t nspares; - int i; - vdev_t *vd, *tvd; - - /* - * First, close and free any existing spare vdevs. - */ - for (i = 0; i < spa->spa_nspares; i++) { - vd = spa->spa_spares[i]; - - /* Undo the call to spa_activate() below */ - if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && - tvd->vdev_isspare) - spa_spare_remove(tvd); - vdev_close(vd); - vdev_free(vd); - } - - if (spa->spa_spares) - kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); - - if (spa->spa_sparelist == NULL) - nspares = 0; - else - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); - - spa->spa_nspares = (int)nspares; - spa->spa_spares = NULL; - - if (nspares == 0) - return; - - /* - * Construct the array of vdevs, opening them to get status in the - * process. For each spare, there is potentially two different vdev_t - * structures associated with it: one in the list of spares (used only - * for basic validation purposes) and one in the active vdev - * configuration (if it's spared in). During this phase we open and - * validate each vdev on the spare list. If the vdev also exists in the - * active configuration, then we also mark this vdev as an active spare. 
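spa_config_parse() above builds the vdev tree recursively and, if any child fails to parse, frees the entire partially built subtree before propagating the error. A small self-contained sketch of that build-or-unwind pattern follows; the spec_t/node_t types and the "s_fail" knob simulating a vdev_alloc() failure are invented for this example.

#include <stdio.h>
#include <stdlib.h>

typedef struct spec {
	const char	*s_name;
	int		 s_fail;	/* simulate an allocation failure */
	struct spec	*s_children;
	int		 s_nchildren;
} spec_t;

typedef struct node {
	const char	 *n_name;
	struct node	**n_child;
	int		  n_children;
} node_t;

static void
node_free(node_t *n)
{
	int c;

	for (c = 0; c < n->n_children; c++)
		if (n->n_child[c] != NULL)
			node_free(n->n_child[c]);
	free(n->n_child);
	free(n);
}

static int
parse(const spec_t *sp, node_t **np)
{
	node_t *n;
	int c, error;

	*np = NULL;
	if (sp->s_fail)
		return (-1);		/* like vdev_alloc() failing */

	n = calloc(1, sizeof (*n));
	n->n_name = sp->s_name;
	n->n_children = sp->s_nchildren;
	n->n_child = calloc(sp->s_nchildren, sizeof (node_t *));

	for (c = 0; c < sp->s_nchildren; c++) {
		error = parse(&sp->s_children[c], &n->n_child[c]);
		if (error != 0) {
			node_free(n);	/* unwind everything built so far */
			return (error);
		}
	}
	*np = n;
	return (0);
}

int
main(void)
{
	spec_t leaves[2] = { { "disk0", 0, NULL, 0 }, { "disk1", 1, NULL, 0 } };
	spec_t root = { "mirror", 0, leaves, 2 };
	node_t *tree;

	printf("parse: %s\n",
	    parse(&root, &tree) == 0 ? "ok" : "failed, unwound");
	return (0);
}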
- */ - spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); - for (i = 0; i < spa->spa_nspares; i++) { - VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, - VDEV_ALLOC_SPARE) == 0); - ASSERT(vd != NULL); - - spa->spa_spares[i] = vd; - - if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { - if (!tvd->vdev_isspare) - spa_spare_add(tvd); - - /* - * We only mark the spare active if we were successfully - * able to load the vdev. Otherwise, importing a pool - * with a bad active spare would result in strange - * behavior, because multiple pool would think the spare - * is actively in use. - * - * There is a vulnerability here to an equally bizarre - * circumstance, where a dead active spare is later - * brought back to life (onlined or otherwise). Given - * the rarity of this scenario, and the extra complexity - * it adds, we ignore the possibility. - */ - if (!vdev_is_dead(tvd)) - spa_spare_activate(tvd); - } - - if (vdev_open(vd) != 0) - continue; - - vd->vdev_top = vd; - (void) vdev_validate_spare(vd); - } - - /* - * Recompute the stashed list of spares, with status information - * this time. - */ - VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - DATA_TYPE_NVLIST_ARRAY) == 0); - - spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); - for (i = 0; i < spa->spa_nspares; i++) - spares[i] = vdev_config_generate(spa, spa->spa_spares[i], - B_TRUE, B_TRUE); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - spares, spa->spa_nspares) == 0); - for (i = 0; i < spa->spa_nspares; i++) - nvlist_free(spares[i]); - kmem_free(spares, spa->spa_nspares * sizeof (void *)); -} - -static int -load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) -{ - dmu_buf_t *db; - char *packed = NULL; - size_t nvsize = 0; - int error; - *value = NULL; - - VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); - nvsize = *(uint64_t *)db->db_data; - dmu_buf_rele(db, FTAG); - - packed = kmem_alloc(nvsize, KM_SLEEP); - error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); - if (error == 0) - error = nvlist_unpack(packed, nvsize, value, 0); - kmem_free(packed, nvsize); - - return (error); -} - -/* - * Load an existing storage pool, using the pool's builtin spa_config as a - * source of configuration information. - */ -static int -spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) -{ - int error = 0; - nvlist_t *nvroot = NULL; - vdev_t *rvd; - uberblock_t *ub = &spa->spa_uberblock; - uint64_t config_cache_txg = spa->spa_config_txg; - uint64_t pool_guid; - uint64_t version; - zio_t *zio; - - spa->spa_load_state = state; - - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || - nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { - error = EINVAL; - goto out; - } - - /* - * Versioning wasn't explicitly added to the label until later, so if - * it's not present treat it as the initial version. - */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) - version = ZFS_VERSION_INITIAL; - - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, - &spa->spa_config_txg); - - if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && - spa_guid_exists(pool_guid, 0)) { - error = EEXIST; - goto out; - } - - spa->spa_load_guid = pool_guid; - - /* - * Parse the configuration into a vdev tree. We explicitly set the - * value that will be returned by spa_version() since parsing the - * configuration requires knowing the version number. 
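load_nvlist() above reads a packed nvlist in two steps: the packed size is taken from the object's bonus buffer, then exactly that many bytes are read and unpacked. The standalone sketch below mimics that size-then-payload pattern with a temporary file standing in for the DMU object and a plain string standing in for the packed nvlist; every name in it is invented.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	const char packed_in[] = "name=tank;version=6";
	unsigned long long size = sizeof (packed_in);
	FILE *f = tmpfile();
	char *packed_out;

	/* "sync" side: record the size, then the packed bytes */
	fwrite(&size, sizeof (size), 1, f);
	fwrite(packed_in, 1, (size_t)size, f);

	/* "load" side: read the size first, then exactly that many bytes */
	rewind(f);
	fread(&size, sizeof (size), 1, f);
	packed_out = malloc((size_t)size);
	fread(packed_out, 1, (size_t)size, f);

	printf("unpacked config: %s\n", packed_out);
	free(packed_out);
	fclose(f);
	return (0);
}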
- */ - spa_config_enter(spa, RW_WRITER, FTAG); - spa->spa_ubsync.ub_version = version; - error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); - spa_config_exit(spa, FTAG); - - if (error != 0) - goto out; - - ASSERT(spa->spa_root_vdev == rvd); - ASSERT(spa_guid(spa) == pool_guid); - - /* - * Try to open all vdevs, loading each label in the process. - */ - error = vdev_open(rvd); - if (error != 0) - goto out; - - /* - * Validate the labels for all leaf vdevs. We need to grab the config - * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD - * flag. - */ - spa_config_enter(spa, RW_READER, FTAG); - error = vdev_validate(rvd); - spa_config_exit(spa, FTAG); - - if (error != 0) - goto out; - - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - error = ENXIO; - goto out; - } - - /* - * Find the best uberblock. - */ - bzero(ub, sizeof (uberblock_t)); - - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); - vdev_uberblock_load(zio, rvd, ub); - error = zio_wait(zio); - - /* - * If we weren't able to find a single valid uberblock, return failure. - */ - if (ub->ub_txg == 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = ENXIO; - goto out; - } - - /* - * If the pool is newer than the code, we can't open it. - */ - if (ub->ub_version > ZFS_VERSION) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_VERSION_NEWER); - error = ENOTSUP; - goto out; - } - - /* - * If the vdev guid sum doesn't match the uberblock, we have an - * incomplete configuration. - */ - if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_GUID_SUM); - error = ENXIO; - goto out; - } - - /* - * Initialize internal SPA structures. - */ - spa->spa_state = POOL_STATE_ACTIVE; - spa->spa_ubsync = spa->spa_uberblock; - spa->spa_first_txg = spa_last_synced_txg(spa) + 1; - error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); - if (error) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - goto out; - } - spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; - - if (zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, - sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - - if (!mosconfig) { - nvlist_t *newconfig; - uint64_t hostid; - - if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - - /* - * hostid is set after the root file system is mounted, so - * ignore the check until it's done. - */ - if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, - &hostid) == 0 && root_mounted()) { - char *hostname; - unsigned long myhostid = 0; - - VERIFY(nvlist_lookup_string(newconfig, - ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); - - (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); - if ((unsigned long)hostid != myhostid) { - cmn_err(CE_WARN, "pool '%s' could not be " - "loaded as it was last accessed by " - "another system (host: %s hostid: 0x%lx). 
" - "See: http://www.sun.com/msg/ZFS-8000-EY", - spa->spa_name, hostname, - (unsigned long)hostid); - error = EBADF; - goto out; - } - } - - spa_config_set(spa, newconfig); - spa_unload(spa); - spa_deactivate(spa); - spa_activate(spa); - - return (spa_load(spa, newconfig, state, B_TRUE)); - } - - if (zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - - /* - * Load the bit that tells us to use the new accounting function - * (raid-z deflation). If we have an older pool, this will not - * be present. - */ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - - /* - * Load the persistent error log. If we have an older pool, this will - * not be present. - */ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, - sizeof (uint64_t), 1, &spa->spa_errlog_last); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, - sizeof (uint64_t), 1, &spa->spa_errlog_scrub); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - - /* - * Load the history object. If we have an older pool, this - * will not be present. - */ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, - sizeof (uint64_t), 1, &spa->spa_history); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - - /* - * Load any hot spares for this pool. - */ - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - if (error == 0) { - ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); - if (load_nvlist(spa, spa->spa_spares_object, - &spa->spa_sparelist) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - - spa_config_enter(spa, RW_WRITER, FTAG); - spa_load_spares(spa); - spa_config_exit(spa, FTAG); - } - - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); - - if (error && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - - if (error == 0) { - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), - sizeof (uint64_t), 1, &spa->spa_bootfs); - } - - /* - * Load the vdev state for all toplevel vdevs. - */ - vdev_load(rvd); - - /* - * Propagate the leaf DTLs we just loaded all the way up the tree. - */ - spa_config_enter(spa, RW_WRITER, FTAG); - vdev_dtl_reassess(rvd, 0, 0, B_FALSE); - spa_config_exit(spa, FTAG); - - /* - * Check the state of the root vdev. 
If it can't be opened, it - * indicates one or more toplevel vdevs are faulted. - */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - error = ENXIO; - goto out; - } - - if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { - dmu_tx_t *tx; - int need_update = B_FALSE; - int c; - - /* - * Claim log blocks that haven't been committed yet. - * This must all happen in a single txg. - */ - tx = dmu_tx_create_assigned(spa_get_dsl(spa), - spa_first_txg(spa)); - (void) dmu_objset_find(spa->spa_name, - zil_claim, tx, DS_FIND_CHILDREN); - dmu_tx_commit(tx); - - spa->spa_sync_on = B_TRUE; - txg_sync_start(spa->spa_dsl_pool); - - /* - * Wait for all claims to sync. - */ - txg_wait_synced(spa->spa_dsl_pool, 0); - - /* - * If the config cache is stale, or we have uninitialized - * metaslabs (see spa_vdev_add()), then update the config. - */ - if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT) - need_update = B_TRUE; - - for (c = 0; c < rvd->vdev_children; c++) - if (rvd->vdev_child[c]->vdev_ms_array == 0) - need_update = B_TRUE; - - /* - * Update the config cache asychronously in case we're the - * root pool, in which case the config cache isn't writable yet. - */ - if (need_update) - spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); - } - - error = 0; -out: - if (error && error != EBADF) - zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); - spa->spa_load_state = SPA_LOAD_NONE; - spa->spa_ena = 0; - - return (error); -} - -/* - * Pool Open/Import - * - * The import case is identical to an open except that the configuration is sent - * down from userland, instead of grabbed from the configuration cache. For the - * case of an open, the pool configuration will exist in the - * POOL_STATE_UNITIALIZED state. - * - * The stats information (gen/count/ustats) is used to gather vdev statistics at - * the same time open the pool, without having to keep around the spa_t in some - * ambiguous state. - */ -static int -spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) -{ - spa_t *spa; - int error; - int loaded = B_FALSE; - int locked = B_FALSE; - - *spapp = NULL; - - /* - * As disgusting as this is, we need to support recursive calls to this - * function because dsl_dir_open() is called during spa_load(), and ends - * up calling spa_open() again. The real fix is to figure out how to - * avoid dsl_dir_open() calling this in the first place. - */ - if (mutex_owner(&spa_namespace_lock) != curthread) { - mutex_enter(&spa_namespace_lock); - locked = B_TRUE; - } - - if ((spa = spa_lookup(pool)) == NULL) { - if (locked) - mutex_exit(&spa_namespace_lock); - return (ENOENT); - } - if (spa->spa_state == POOL_STATE_UNINITIALIZED) { - - spa_activate(spa); - - error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); - - if (error == EBADF) { - /* - * If vdev_validate() returns failure (indicated by - * EBADF), it indicates that one of the vdevs indicates - * that the pool has been exported or destroyed. If - * this is the case, the config cache is out of sync and - * we should remove the pool from the namespace. - */ - zfs_post_ok(spa, NULL); - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); - spa_config_sync(); - if (locked) - mutex_exit(&spa_namespace_lock); - return (ENOENT); - } - - if (error) { - /* - * We can't open the pool, but we still have useful - * information: the state of each vdev after the - * attempted vdev_open(). Return this to the user. 
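spa_open_common() above must tolerate being re-entered on the same thread (dsl_dir_open() can call back into spa_open() during spa_load()), so it only takes spa_namespace_lock when the current thread does not already own it. There is no portable mutex_owner(), so the single-threaded sketch below tracks the owner by hand just to show the shape of the pattern; rlock_t and its functions are invented, and this is not a general-purpose recursive lock.

#include <pthread.h>
#include <stdio.h>

typedef struct {
	pthread_mutex_t	rl_mtx;
	pthread_t	rl_owner;
	int		rl_held;
} rlock_t;

static rlock_t nslock = { .rl_mtx = PTHREAD_MUTEX_INITIALIZER };

static int
rlock_enter_if_needed(rlock_t *l)
{
	if (l->rl_held && pthread_equal(l->rl_owner, pthread_self()))
		return (0);		/* recursive call: we already hold it */
	pthread_mutex_lock(&l->rl_mtx);
	l->rl_owner = pthread_self();
	l->rl_held = 1;
	return (1);			/* caller must drop it later */
}

static void
rlock_exit(rlock_t *l, int locked_here)
{
	if (!locked_here)
		return;
	l->rl_held = 0;
	pthread_mutex_unlock(&l->rl_mtx);
}

static void
inner_open(void)
{
	int locked = rlock_enter_if_needed(&nslock);

	printf("inner_open: had to lock = %d\n", locked);	/* 0 */
	rlock_exit(&nslock, locked);
}

int
main(void)
{
	int locked = rlock_enter_if_needed(&nslock);

	inner_open();		/* re-entry, like dsl_dir_open() -> spa_open() */
	rlock_exit(&nslock, locked);
	return (0);
}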
- */ - if (config != NULL && spa->spa_root_vdev != NULL) { - spa_config_enter(spa, RW_READER, FTAG); - *config = spa_config_generate(spa, NULL, -1ULL, - B_TRUE); - spa_config_exit(spa, FTAG); - } - spa_unload(spa); - spa_deactivate(spa); - spa->spa_last_open_failed = B_TRUE; - if (locked) - mutex_exit(&spa_namespace_lock); - *spapp = NULL; - return (error); - } else { - zfs_post_ok(spa, NULL); - spa->spa_last_open_failed = B_FALSE; - } - - loaded = B_TRUE; - } - - spa_open_ref(spa, tag); - if (locked) - mutex_exit(&spa_namespace_lock); - - *spapp = spa; - - if (config != NULL) { - spa_config_enter(spa, RW_READER, FTAG); - *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - spa_config_exit(spa, FTAG); - } - - /* - * If we just loaded the pool, resilver anything that's out of date. - */ - if (loaded && (spa_mode & FWRITE)) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); - - return (0); -} - -int -spa_open(const char *name, spa_t **spapp, void *tag) -{ - return (spa_open_common(name, spapp, tag, NULL)); -} - -/* - * Lookup the given spa_t, incrementing the inject count in the process, - * preventing it from being exported or destroyed. - */ -spa_t * -spa_inject_addref(char *name) -{ - spa_t *spa; - - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(name)) == NULL) { - mutex_exit(&spa_namespace_lock); - return (NULL); - } - spa->spa_inject_ref++; - mutex_exit(&spa_namespace_lock); - - return (spa); -} - -void -spa_inject_delref(spa_t *spa) -{ - mutex_enter(&spa_namespace_lock); - spa->spa_inject_ref--; - mutex_exit(&spa_namespace_lock); -} - -static void -spa_add_spares(spa_t *spa, nvlist_t *config) -{ - nvlist_t **spares; - uint_t i, nspares; - nvlist_t *nvroot; - uint64_t guid; - vdev_stat_t *vs; - uint_t vsc; - uint64_t pool; - - if (spa->spa_nspares == 0) - return; - - VERIFY(nvlist_lookup_nvlist(config, - ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); - if (nspares != 0) { - VERIFY(nvlist_add_nvlist_array(nvroot, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - VERIFY(nvlist_lookup_nvlist_array(nvroot, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); - - /* - * Go through and find any spares which have since been - * repurposed as an active spare. If this is the case, update - * their status appropriately. - */ - for (i = 0; i < nspares; i++) { - VERIFY(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &guid) == 0); - if (spa_spare_exists(guid, &pool) && pool != 0ULL) { - VERIFY(nvlist_lookup_uint64_array( - spares[i], ZPOOL_CONFIG_STATS, - (uint64_t **)&vs, &vsc) == 0); - vs->vs_state = VDEV_STATE_CANT_OPEN; - vs->vs_aux = VDEV_AUX_SPARED; - } - } - } -} - -int -spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) -{ - int error; - spa_t *spa; - - *config = NULL; - error = spa_open_common(name, &spa, FTAG, config); - - if (spa && *config != NULL) { - VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, - spa_get_errlog_size(spa)) == 0); - - spa_add_spares(spa, *config); - } - - /* - * We want to get the alternate root even for faulted pools, so we cheat - * and call spa_lookup() directly. 
- */ - if (altroot) { - if (spa == NULL) { - mutex_enter(&spa_namespace_lock); - spa = spa_lookup(name); - if (spa) - spa_altroot(spa, altroot, buflen); - else - altroot[0] = '\0'; - spa = NULL; - mutex_exit(&spa_namespace_lock); - } else { - spa_altroot(spa, altroot, buflen); - } - } - - if (spa != NULL) - spa_close(spa, FTAG); - - return (error); -} - -/* - * Validate that the 'spares' array is well formed. We must have an array of - * nvlists, each which describes a valid leaf vdev. If this is an import (mode - * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long - * as they are well-formed. - */ -static int -spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) -{ - nvlist_t **spares; - uint_t i, nspares; - vdev_t *vd; - int error; - - /* - * It's acceptable to have no spares specified. - */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) != 0) - return (0); - - if (nspares == 0) - return (EINVAL); - - /* - * Make sure the pool is formatted with a version that supports hot - * spares. - */ - if (spa_version(spa) < ZFS_VERSION_SPARES) - return (ENOTSUP); - - /* - * Set the pending spare list so we correctly handle device in-use - * checking. - */ - spa->spa_pending_spares = spares; - spa->spa_pending_nspares = nspares; - - for (i = 0; i < nspares; i++) { - if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, - mode)) != 0) - goto out; - - if (!vd->vdev_ops->vdev_op_leaf) { - vdev_free(vd); - error = EINVAL; - goto out; - } - - vd->vdev_top = vd; - - if ((error = vdev_open(vd)) == 0 && - (error = vdev_label_init(vd, crtxg, - VDEV_LABEL_SPARE)) == 0) { - VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); - } - - vdev_free(vd); - - if (error && mode != VDEV_ALLOC_SPARE) - goto out; - else - error = 0; - } - -out: - spa->spa_pending_spares = NULL; - spa->spa_pending_nspares = 0; - return (error); -} - -/* - * Pool Creation - */ -int -spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) -{ - spa_t *spa; - vdev_t *rvd; - dsl_pool_t *dp; - dmu_tx_t *tx; - int c, error = 0; - uint64_t txg = TXG_INITIAL; - nvlist_t **spares; - uint_t nspares; - - /* - * If this pool already exists, return failure. - */ - mutex_enter(&spa_namespace_lock); - if (spa_lookup(pool) != NULL) { - mutex_exit(&spa_namespace_lock); - return (EEXIST); - } - - /* - * Allocate a new spa_t structure. - */ - spa = spa_add(pool, altroot); - spa_activate(spa); - - spa->spa_uberblock.ub_txg = txg - 1; - spa->spa_uberblock.ub_version = ZFS_VERSION; - spa->spa_ubsync = spa->spa_uberblock; - - /* - * Create the root vdev. - */ - spa_config_enter(spa, RW_WRITER, FTAG); - - error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); - - ASSERT(error != 0 || rvd != NULL); - ASSERT(error != 0 || spa->spa_root_vdev == rvd); - - if (error == 0 && rvd->vdev_children == 0) - error = EINVAL; - - if (error == 0 && - (error = vdev_create(rvd, txg, B_FALSE)) == 0 && - (error = spa_validate_spares(spa, nvroot, txg, - VDEV_ALLOC_ADD)) == 0) { - for (c = 0; c < rvd->vdev_children; c++) - vdev_init(rvd->vdev_child[c], txg); - vdev_config_dirty(rvd); - } - - spa_config_exit(spa, FTAG); - - if (error != 0) { - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); - mutex_exit(&spa_namespace_lock); - return (error); - } - - /* - * Get the list of spares, if specified. 
- */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - spa_config_enter(spa, RW_WRITER, FTAG); - spa_load_spares(spa); - spa_config_exit(spa, FTAG); - spa->spa_sync_spares = B_TRUE; - } - - spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); - spa->spa_meta_objset = dp->dp_meta_objset; - - tx = dmu_tx_create_assigned(dp, txg); - - /* - * Create the pool config object. - */ - spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_PACKED_NVLIST, 1 << 14, - DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); - - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, - sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { - cmn_err(CE_PANIC, "failed to add pool config"); - } - - /* Newly created pools are always deflated. */ - spa->spa_deflate = TRUE; - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { - cmn_err(CE_PANIC, "failed to add deflate"); - } - - /* - * Create the deferred-free bplist object. Turn off compression - * because sync-to-convergence takes longer if the blocksize - * keeps changing. - */ - spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, - 1 << 14, tx); - dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, - ZIO_COMPRESS_OFF, tx); - - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { - cmn_err(CE_PANIC, "failed to add bplist"); - } - - /* - * Create the pool's history object. - */ - spa_history_create_obj(spa, tx); - - dmu_tx_commit(tx); - - spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); - spa->spa_sync_on = B_TRUE; - txg_sync_start(spa->spa_dsl_pool); - - /* - * We explicitly wait for the first transaction to complete so that our - * bean counters are appropriately updated. - */ - txg_wait_synced(spa->spa_dsl_pool, txg); - - spa_config_sync(); - - mutex_exit(&spa_namespace_lock); - - return (0); -} - -/* - * Import the given pool into the system. We set up the necessary spa_t and - * then call spa_load() to do the dirty work. - */ -int -spa_import(const char *pool, nvlist_t *config, const char *altroot) -{ - spa_t *spa; - int error; - nvlist_t *nvroot; - nvlist_t **spares; - uint_t nspares; - - if (!(spa_mode & FWRITE)) - return (EROFS); - - /* - * If a pool with this name exists, return failure. - */ - mutex_enter(&spa_namespace_lock); - if (spa_lookup(pool) != NULL) { - mutex_exit(&spa_namespace_lock); - return (EEXIST); - } - - /* - * Create and initialize the spa structure. - */ - spa = spa_add(pool, altroot); - spa_activate(spa); - - /* - * Pass off the heavy lifting to spa_load(). - * Pass TRUE for mosconfig because the user-supplied config - * is actually the one to trust when doing an import. - */ - error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); - - spa_config_enter(spa, RW_WRITER, FTAG); - /* - * Toss any existing sparelist, as it doesn't have any validity anymore, - * and conflicts with spa_has_spare(). 
- */ - if (spa->spa_sparelist) { - nvlist_free(spa->spa_sparelist); - spa->spa_sparelist = NULL; - spa_load_spares(spa); - } - - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - if (error == 0) - error = spa_validate_spares(spa, nvroot, -1ULL, - VDEV_ALLOC_SPARE); - spa_config_exit(spa, FTAG); - - if (error != 0) { - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); - mutex_exit(&spa_namespace_lock); - return (error); - } - - /* - * Override any spares as specified by the user, as these may have - * correct device names/devids, etc. - */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - if (spa->spa_sparelist) - VERIFY(nvlist_remove(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); - else - VERIFY(nvlist_alloc(&spa->spa_sparelist, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - spa_config_enter(spa, RW_WRITER, FTAG); - spa_load_spares(spa); - spa_config_exit(spa, FTAG); - spa->spa_sync_spares = B_TRUE; - } - - /* - * Update the config cache to include the newly-imported pool. - */ - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); - - mutex_exit(&spa_namespace_lock); - - /* - * Resilver anything that's out of date. - */ - if (spa_mode & FWRITE) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); - - return (0); -} - -/* - * This (illegal) pool name is used when temporarily importing a spa_t in order - * to get the vdev stats associated with the imported devices. - */ -#define TRYIMPORT_NAME "$import" - -nvlist_t * -spa_tryimport(nvlist_t *tryconfig) -{ - nvlist_t *config = NULL; - char *poolname; - spa_t *spa; - uint64_t state; - - if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) - return (NULL); - - if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) - return (NULL); - - /* - * Create and initialize the spa structure. - */ - mutex_enter(&spa_namespace_lock); - spa = spa_add(TRYIMPORT_NAME, NULL); - spa_activate(spa); - - /* - * Pass off the heavy lifting to spa_load(). - * Pass TRUE for mosconfig because the user-supplied config - * is actually the one to trust when doing an import. - */ - (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); - - /* - * If 'tryconfig' was at least parsable, return the current config. - */ - if (spa->spa_root_vdev != NULL) { - spa_config_enter(spa, RW_READER, FTAG); - config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - spa_config_exit(spa, FTAG); - VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, - poolname) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, - state) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, - spa->spa_uberblock.ub_timestamp) == 0); - - /* - * Add the list of hot spares. - */ - spa_add_spares(spa, config); - } - - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); - mutex_exit(&spa_namespace_lock); - - return (config); -} - -/* - * Pool export/destroy - * - * The act of destroying or exporting a pool is very simple. We make sure there - * is no more pending I/O and any references to the pool are gone. Then, we - * update the pool state and sync all the labels to disk, removing the - * configuration from the cache afterwards. 
- */ -static int -spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) -{ - spa_t *spa; - - if (oldconfig) - *oldconfig = NULL; - - if (!(spa_mode & FWRITE)) - return (EROFS); - - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pool)) == NULL) { - mutex_exit(&spa_namespace_lock); - return (ENOENT); - } - - /* - * Put a hold on the pool, drop the namespace lock, stop async tasks, - * reacquire the namespace lock, and see if we can export. - */ - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - spa_async_suspend(spa); - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); - - /* - * The pool will be in core if it's openable, - * in which case we can modify its state. - */ - if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { - /* - * Objsets may be open only because they're dirty, so we - * have to force it to sync before checking spa_refcnt. - */ - spa_scrub_suspend(spa); - txg_wait_synced(spa->spa_dsl_pool, 0); - - /* - * A pool cannot be exported or destroyed if there are active - * references. If we are resetting a pool, allow references by - * fault injection handlers. - */ - if (!spa_refcount_zero(spa) || - (spa->spa_inject_ref != 0 && - new_state != POOL_STATE_UNINITIALIZED)) { - spa_scrub_resume(spa); - spa_async_resume(spa); - mutex_exit(&spa_namespace_lock); - return (EBUSY); - } - - spa_scrub_resume(spa); - VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); - - /* - * We want this to be reflected on every label, - * so mark them all dirty. spa_unload() will do the - * final sync that pushes these changes out. - */ - if (new_state != POOL_STATE_UNINITIALIZED) { - spa_config_enter(spa, RW_WRITER, FTAG); - spa->spa_state = new_state; - spa->spa_final_txg = spa_last_synced_txg(spa) + 1; - vdev_config_dirty(spa->spa_root_vdev); - spa_config_exit(spa, FTAG); - } - } - - if (spa->spa_state != POOL_STATE_UNINITIALIZED) { - spa_unload(spa); - spa_deactivate(spa); - } - - if (oldconfig && spa->spa_config) - VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); - - if (new_state != POOL_STATE_UNINITIALIZED) { - spa_remove(spa); - spa_config_sync(); - } - mutex_exit(&spa_namespace_lock); - - return (0); -} - -/* - * Destroy a storage pool. - */ -int -spa_destroy(char *pool) -{ - return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); -} - -/* - * Export a storage pool. - */ -int -spa_export(char *pool, nvlist_t **oldconfig) -{ - return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); -} - -/* - * Similar to spa_export(), this unloads the spa_t without actually removing it - * from the namespace in any way. - */ -int -spa_reset(char *pool) -{ - return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); -} - - -/* - * ========================================================================== - * Device manipulation - * ========================================================================== - */ - -/* - * Add capacity to a storage pool. 
- */ -int -spa_vdev_add(spa_t *spa, nvlist_t *nvroot) -{ - uint64_t txg; - int c, error; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *vd, *tvd; - nvlist_t **spares; - uint_t i, nspares; - - txg = spa_vdev_enter(spa); - - if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, - VDEV_ALLOC_ADD)) != 0) - return (spa_vdev_exit(spa, NULL, txg, error)); - - spa->spa_pending_vdev = vd; - - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) != 0) - nspares = 0; - - if (vd->vdev_children == 0 && nspares == 0) { - spa->spa_pending_vdev = NULL; - return (spa_vdev_exit(spa, vd, txg, EINVAL)); - } - - if (vd->vdev_children != 0) { - if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { - spa->spa_pending_vdev = NULL; - return (spa_vdev_exit(spa, vd, txg, error)); - } - } - - /* - * We must validate the spares after checking the children. Otherwise, - * vdev_inuse() will blindly overwrite the spare. - */ - if ((error = spa_validate_spares(spa, nvroot, txg, - VDEV_ALLOC_ADD)) != 0) { - spa->spa_pending_vdev = NULL; - return (spa_vdev_exit(spa, vd, txg, error)); - } - - spa->spa_pending_vdev = NULL; - - /* - * Transfer each new top-level vdev from vd to rvd. - */ - for (c = 0; c < vd->vdev_children; c++) { - tvd = vd->vdev_child[c]; - vdev_remove_child(vd, tvd); - tvd->vdev_id = rvd->vdev_children; - vdev_add_child(rvd, tvd); - vdev_config_dirty(tvd); - } - - if (nspares != 0) { - if (spa->spa_sparelist != NULL) { - nvlist_t **oldspares; - uint_t oldnspares; - nvlist_t **newspares; - - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); - - newspares = kmem_alloc(sizeof (void *) * - (nspares + oldnspares), KM_SLEEP); - for (i = 0; i < oldnspares; i++) - VERIFY(nvlist_dup(oldspares[i], - &newspares[i], KM_SLEEP) == 0); - for (i = 0; i < nspares; i++) - VERIFY(nvlist_dup(spares[i], - &newspares[i + oldnspares], - KM_SLEEP) == 0); - - VERIFY(nvlist_remove(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); - - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, newspares, - nspares + oldnspares) == 0); - for (i = 0; i < oldnspares + nspares; i++) - nvlist_free(newspares[i]); - kmem_free(newspares, (oldnspares + nspares) * - sizeof (void *)); - } else { - VERIFY(nvlist_alloc(&spa->spa_sparelist, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - } - - spa_load_spares(spa); - spa->spa_sync_spares = B_TRUE; - } - - /* - * We have to be careful when adding new vdevs to an existing pool. - * If other threads start allocating from these vdevs before we - * sync the config cache, and we lose power, then upon reboot we may - * fail to open the pool because there are DVAs that the config cache - * can't translate. Therefore, we first add the vdevs without - * initializing metaslabs; sync the config cache (via spa_vdev_exit()); - * and then let spa_config_update() initialize the new metaslabs. - * - * spa_load() checks for added-but-not-initialized vdevs, so that - * if we lose power at any point in this sequence, the remaining - * steps will be completed the next time we load the pool. - */ - (void) spa_vdev_exit(spa, vd, txg, 0); - - mutex_enter(&spa_namespace_lock); - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); - mutex_exit(&spa_namespace_lock); - - return (0); -} - -/* - * Attach a device to a mirror. The arguments are the path to any device - * in the mirror, and the nvroot for the new device. 
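When spa_vdev_add() above is given new spares and a spare list already exists, it copies the old entries and the new ones into a single freshly allocated array that replaces the stored list. The short sketch below shows that merge-by-copy shape with strings standing in for the duplicated nvlists; the device names are examples only.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	const char *oldspares[] = { "c1t9d0", "c1t10d0" };
	const char *addspares[] = { "c2t3d0" };
	size_t nold = 2, nadd = 1, i;
	char **newspares = malloc((nold + nadd) * sizeof (char *));

	for (i = 0; i < nold; i++)
		newspares[i] = strdup(oldspares[i]);		/* keep existing */
	for (i = 0; i < nadd; i++)
		newspares[nold + i] = strdup(addspares[i]);	/* append new */

	for (i = 0; i < nold + nadd; i++) {
		printf("spare %zu: %s\n", i, newspares[i]);
		free(newspares[i]);
	}
	free(newspares);
	return (0);
}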
If the path specifies - * a device that is not mirrored, we automatically insert the mirror vdev. - * - * If 'replacing' is specified, the new device is intended to replace the - * existing device; in this case the two devices are made into their own - * mirror using the 'replacing' vdev, which is functionally idendical to - * the mirror vdev (it actually reuses all the same ops) but has a few - * extra rules: you can't attach to it after it's been created, and upon - * completion of resilvering, the first disk (the one being replaced) - * is automatically detached. - */ -int -spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) -{ - uint64_t txg, open_txg; - int error; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; - vdev_ops_t *pvops; - - txg = spa_vdev_enter(spa); - - oldvd = vdev_lookup_by_guid(rvd, guid); - - if (oldvd == NULL) - return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - - if (!oldvd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - pvd = oldvd->vdev_parent; - - if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, - VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) - return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); - - newvd = newrootvd->vdev_child[0]; - - if (!newvd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); - - if ((error = vdev_create(newrootvd, txg, replacing)) != 0) - return (spa_vdev_exit(spa, newrootvd, txg, error)); - - if (!replacing) { - /* - * For attach, the only allowable parent is a mirror or the root - * vdev. - */ - if (pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_root_ops) - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - - pvops = &vdev_mirror_ops; - } else { - /* - * Active hot spares can only be replaced by inactive hot - * spares. - */ - if (pvd->vdev_ops == &vdev_spare_ops && - pvd->vdev_child[1] == oldvd && - !spa_has_spare(spa, newvd->vdev_guid)) - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - - /* - * If the source is a hot spare, and the parent isn't already a - * spare, then we want to create a new hot spare. Otherwise, we - * want to create a replacing vdev. The user is not allowed to - * attach to a spared vdev child unless the 'isspare' state is - * the same (spare replaces spare, non-spare replaces - * non-spare). - */ - if (pvd->vdev_ops == &vdev_replacing_ops) - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - else if (pvd->vdev_ops == &vdev_spare_ops && - newvd->vdev_isspare != oldvd->vdev_isspare) - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - else if (pvd->vdev_ops != &vdev_spare_ops && - newvd->vdev_isspare) - pvops = &vdev_spare_ops; - else - pvops = &vdev_replacing_ops; - } - - /* - * Compare the new device size with the replaceable/attachable - * device size. - */ - if (newvd->vdev_psize < vdev_get_rsize(oldvd)) - return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); - - /* - * The new device cannot have a higher alignment requirement - * than the top-level vdev. - */ - if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) - return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); - - /* - * If this is an in-place replacement, update oldvd's path and devid - * to make it distinguishable from newvd, and unopenable from now on. 
- */ - if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { - spa_strfree(oldvd->vdev_path); - oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, - KM_SLEEP); - (void) sprintf(oldvd->vdev_path, "%s/%s", - newvd->vdev_path, "old"); - if (oldvd->vdev_devid != NULL) { - spa_strfree(oldvd->vdev_devid); - oldvd->vdev_devid = NULL; - } - } - - /* - * If the parent is not a mirror, or if we're replacing, insert the new - * mirror/replacing/spare vdev above oldvd. - */ - if (pvd->vdev_ops != pvops) - pvd = vdev_add_parent(oldvd, pvops); - - ASSERT(pvd->vdev_top->vdev_parent == rvd); - ASSERT(pvd->vdev_ops == pvops); - ASSERT(oldvd->vdev_parent == pvd); - - /* - * Extract the new device from its root and add it to pvd. - */ - vdev_remove_child(newrootvd, newvd); - newvd->vdev_id = pvd->vdev_children; - vdev_add_child(pvd, newvd); - - /* - * If newvd is smaller than oldvd, but larger than its rsize, - * the addition of newvd may have decreased our parent's asize. - */ - pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); - - tvd = newvd->vdev_top; - ASSERT(pvd->vdev_top == tvd); - ASSERT(tvd->vdev_parent == rvd); - - vdev_config_dirty(tvd); - - /* - * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate - * upward when spa_vdev_exit() calls vdev_dtl_reassess(). - */ - open_txg = txg + TXG_CONCURRENT_STATES - 1; - - mutex_enter(&newvd->vdev_dtl_lock); - space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, - open_txg - TXG_INITIAL + 1); - mutex_exit(&newvd->vdev_dtl_lock); - - if (newvd->vdev_isspare) - spa_spare_activate(newvd); - - /* - * Mark newvd's DTL dirty in this txg. - */ - vdev_dirty(tvd, VDD_DTL, newvd, txg); - - (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); - - /* - * Kick off a resilver to update newvd. - */ - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); - - return (0); -} - -/* - * Detach a device from a mirror or replacing vdev. - * If 'replace_done' is specified, only detach if the parent - * is a replacing vdev. - */ -int -spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) -{ - uint64_t txg; - int c, t, error; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *vd, *pvd, *cvd, *tvd; - boolean_t unspare = B_FALSE; - uint64_t unspare_guid; - - txg = spa_vdev_enter(spa); - - vd = vdev_lookup_by_guid(rvd, guid); - - if (vd == NULL) - return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - pvd = vd->vdev_parent; - - /* - * If replace_done is specified, only remove this device if it's - * the first child of a replacing vdev. For the 'spare' vdev, either - * disk can be removed. - */ - if (replace_done) { - if (pvd->vdev_ops == &vdev_replacing_ops) { - if (vd->vdev_id != 0) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - } else if (pvd->vdev_ops != &vdev_spare_ops) { - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - } - } - - ASSERT(pvd->vdev_ops != &vdev_spare_ops || - spa_version(spa) >= ZFS_VERSION_SPARES); - - /* - * Only mirror, replacing, and spare vdevs support detach. - */ - if (pvd->vdev_ops != &vdev_replacing_ops && - pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_spare_ops) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - /* - * If there's only one replica, you can't detach it. - */ - if (pvd->vdev_children <= 1) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - - /* - * If all siblings have non-empty DTLs, this device may have the only - * valid copy of the data, which means we cannot safely detach it. 
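For an in-place replacement, spa_vdev_attach() above rewrites the old vdev's path by appending "/old" so it stays distinguishable from (and unopenable as) the new device. A tiny sketch of that rename, with an example device path:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	const char *newpath = "/dev/dsk/c1t2d0";
	char *oldpath = malloc(strlen(newpath) + 5);	/* "/old" plus NUL */

	sprintf(oldpath, "%s/%s", newpath, "old");
	printf("old vdev path becomes %s\n", oldpath);
	free(oldpath);
	return (0);
}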
- * - * XXX -- as in the vdev_offline() case, we really want a more - * precise DTL check. - */ - for (c = 0; c < pvd->vdev_children; c++) { - uint64_t dirty; - - cvd = pvd->vdev_child[c]; - if (cvd == vd) - continue; - if (vdev_is_dead(cvd)) - continue; - mutex_enter(&cvd->vdev_dtl_lock); - dirty = cvd->vdev_dtl_map.sm_space | - cvd->vdev_dtl_scrub.sm_space; - mutex_exit(&cvd->vdev_dtl_lock); - if (!dirty) - break; - } - - /* - * If we are a replacing or spare vdev, then we can always detach the - * latter child, as that is how one cancels the operation. - */ - if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && - c == pvd->vdev_children) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - - /* - * If we are detaching the original disk from a spare, then it implies - * that the spare should become a real disk, and be removed from the - * active spare list for the pool. - */ - if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0) - unspare = B_TRUE; - - /* - * Erase the disk labels so the disk can be used for other things. - * This must be done after all other error cases are handled, - * but before we disembowel vd (so we can still do I/O to it). - * But if we can't do it, don't treat the error as fatal -- - * it may be that the unwritability of the disk is the reason - * it's being detached! - */ - error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); - - /* - * Remove vd from its parent and compact the parent's children. - */ - vdev_remove_child(pvd, vd); - vdev_compact_children(pvd); - - /* - * Remember one of the remaining children so we can get tvd below. - */ - cvd = pvd->vdev_child[0]; - - /* - * If we need to remove the remaining child from the list of hot spares, - * do it now, marking the vdev as no longer a spare in the process. We - * must do this before vdev_remove_parent(), because that can change the - * GUID if it creates a new toplevel GUID. - */ - if (unspare) { - ASSERT(cvd->vdev_isspare); - spa_spare_remove(cvd); - unspare_guid = cvd->vdev_guid; - } - - /* - * If the parent mirror/replacing vdev only has one child, - * the parent is no longer needed. Remove it from the tree. - */ - if (pvd->vdev_children == 1) - vdev_remove_parent(cvd); - - /* - * We don't set tvd until now because the parent we just removed - * may have been the previous top-level vdev. - */ - tvd = cvd->vdev_top; - ASSERT(tvd->vdev_parent == rvd); - - /* - * Reevaluate the parent vdev state. - */ - vdev_propagate_state(cvd->vdev_parent); - - /* - * If the device we just detached was smaller than the others, it may be - * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() - * can't fail because the existing metaslabs are already in core, so - * there's nothing to read from disk. - */ - VERIFY(vdev_metaslab_init(tvd, txg) == 0); - - vdev_config_dirty(tvd); - - /* - * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that - * vd->vdev_detached is set and free vd's DTL object in syncing context. - * But first make sure we're not on any *other* txg's DTL list, to - * prevent vd from being accessed after it's freed. - */ - for (t = 0; t < TXG_SIZE; t++) - (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); - vd->vdev_detached = B_TRUE; - vdev_dirty(tvd, VDD_DTL, vd, txg); - - error = spa_vdev_exit(spa, vd, txg, 0); - - /* - * If this was the removal of the original device in a hot spare vdev, - * then we want to go through and remove the device from the hot spare - * list of every other pool. 
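The sibling walk in spa_vdev_detach() above breaks out as soon as one other child has an empty DTL; only when the loop runs to completion (c equals the child count) does the detach fail with EBUSY, because every remaining replica would then be missing data. The sketch below reproduces that loop-and-test idiom with made-up per-child DTL sizes.

#include <stdio.h>

int
main(void)
{
	unsigned long dtl[] = { 0, 12288, 0 };	/* per-child DTL space, invented */
	int children = 3, detach_id = 0, c;

	for (c = 0; c < children; c++) {
		if (c == detach_id)
			continue;
		if (dtl[c] == 0)
			break;			/* a fully caught-up sibling exists */
	}

	if (c == children)
		printf("EBUSY: every sibling has a non-empty DTL\n");
	else
		printf("safe to detach: child %d has a complete copy\n", c);
	return (0);
}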
- */ - if (unspare) { - spa = NULL; - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (spa->spa_state != POOL_STATE_ACTIVE) - continue; - - (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); - } - mutex_exit(&spa_namespace_lock); - } - - return (error); -} - -/* - * Remove a device from the pool. Currently, this supports removing only hot - * spares. - */ -int -spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) -{ - vdev_t *vd; - nvlist_t **spares, *nv, **newspares; - uint_t i, j, nspares; - int ret = 0; - - spa_config_enter(spa, RW_WRITER, FTAG); - - vd = spa_lookup_by_guid(spa, guid); - - nv = NULL; - if (spa->spa_spares != NULL && - nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - for (i = 0; i < nspares; i++) { - uint64_t theguid; - - VERIFY(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &theguid) == 0); - if (theguid == guid) { - nv = spares[i]; - break; - } - } - } - - /* - * We only support removing a hot spare, and only if it's not currently - * in use in this pool. - */ - if (nv == NULL && vd == NULL) { - ret = ENOENT; - goto out; - } - - if (nv == NULL && vd != NULL) { - ret = ENOTSUP; - goto out; - } - - if (!unspare && nv != NULL && vd != NULL) { - ret = EBUSY; - goto out; - } - - if (nspares == 1) { - newspares = NULL; - } else { - newspares = kmem_alloc((nspares - 1) * sizeof (void *), - KM_SLEEP); - for (i = 0, j = 0; i < nspares; i++) { - if (spares[i] != nv) - VERIFY(nvlist_dup(spares[i], - &newspares[j++], KM_SLEEP) == 0); - } - } - - VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - DATA_TYPE_NVLIST_ARRAY) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - newspares, nspares - 1) == 0); - for (i = 0; i < nspares - 1; i++) - nvlist_free(newspares[i]); - kmem_free(newspares, (nspares - 1) * sizeof (void *)); - spa_load_spares(spa); - spa->spa_sync_spares = B_TRUE; - -out: - spa_config_exit(spa, FTAG); - - return (ret); -} - -/* - * Find any device that's done replacing, so we can detach it. - */ -static vdev_t * -spa_vdev_replace_done_hunt(vdev_t *vd) -{ - vdev_t *newvd, *oldvd; - int c; - - for (c = 0; c < vd->vdev_children; c++) { - oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); - if (oldvd != NULL) - return (oldvd); - } - - if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { - oldvd = vd->vdev_child[0]; - newvd = vd->vdev_child[1]; - - mutex_enter(&newvd->vdev_dtl_lock); - if (newvd->vdev_dtl_map.sm_space == 0 && - newvd->vdev_dtl_scrub.sm_space == 0) { - mutex_exit(&newvd->vdev_dtl_lock); - return (oldvd); - } - mutex_exit(&newvd->vdev_dtl_lock); - } - - return (NULL); -} - -static void -spa_vdev_replace_done(spa_t *spa) -{ - vdev_t *vd; - vdev_t *pvd; - uint64_t guid; - uint64_t pguid = 0; - - spa_config_enter(spa, RW_READER, FTAG); - - while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { - guid = vd->vdev_guid; - /* - * If we have just finished replacing a hot spared device, then - * we need to detach the parent's first child (the original hot - * spare) as well. 
- */ - pvd = vd->vdev_parent; - if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && - pvd->vdev_id == 0) { - ASSERT(pvd->vdev_ops == &vdev_replacing_ops); - ASSERT(pvd->vdev_parent->vdev_children == 2); - pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; - } - spa_config_exit(spa, FTAG); - if (spa_vdev_detach(spa, guid, B_TRUE) != 0) - return; - if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) - return; - spa_config_enter(spa, RW_READER, FTAG); - } - - spa_config_exit(spa, FTAG); -} - -/* - * Update the stored path for this vdev. Dirty the vdev configuration, relying - * on spa_vdev_enter/exit() to synchronize the labels and cache. - */ -int -spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) -{ - vdev_t *rvd, *vd; - uint64_t txg; - - rvd = spa->spa_root_vdev; - - txg = spa_vdev_enter(spa); - - if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { - /* - * Determine if this is a reference to a hot spare. In that - * case, update the path as stored in the spare list. - */ - nvlist_t **spares; - uint_t i, nspares; - if (spa->spa_sparelist != NULL) { - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); - for (i = 0; i < nspares; i++) { - uint64_t theguid; - VERIFY(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &theguid) == 0); - if (theguid == guid) - break; - } - - if (i == nspares) - return (spa_vdev_exit(spa, NULL, txg, ENOENT)); - - VERIFY(nvlist_add_string(spares[i], - ZPOOL_CONFIG_PATH, newpath) == 0); - spa_load_spares(spa); - spa->spa_sync_spares = B_TRUE; - return (spa_vdev_exit(spa, NULL, txg, 0)); - } else { - return (spa_vdev_exit(spa, NULL, txg, ENOENT)); - } - } - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - spa_strfree(vd->vdev_path); - vd->vdev_path = spa_strdup(newpath); - - vdev_config_dirty(vd->vdev_top); - - return (spa_vdev_exit(spa, NULL, txg, 0)); -} - -/* - * ========================================================================== - * SPA Scrubbing - * ========================================================================== - */ - -static void -spa_scrub_io_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - - zio_data_buf_free(zio->io_data, zio->io_size); - - mutex_enter(&spa->spa_scrub_lock); - if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; - spa->spa_scrub_errors++; - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_errors++; - mutex_exit(&vd->vdev_stat_lock); - } - - if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) - cv_broadcast(&spa->spa_scrub_io_cv); - - ASSERT(spa->spa_scrub_inflight >= 0); - - mutex_exit(&spa->spa_scrub_lock); -} - -static void -spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, - zbookmark_t *zb) -{ - size_t size = BP_GET_LSIZE(bp); - void *data; - - mutex_enter(&spa->spa_scrub_lock); - /* - * Do not give too much work to vdev(s). 
- */ - while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - } - spa->spa_scrub_inflight++; - mutex_exit(&spa->spa_scrub_lock); - - data = zio_data_buf_alloc(size); - - if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) - flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ - - flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; - - zio_nowait(zio_read(NULL, spa, bp, data, size, - spa_scrub_io_done, NULL, priority, flags, zb)); -} - -/* ARGSUSED */ -static int -spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) -{ - blkptr_t *bp = &bc->bc_blkptr; - vdev_t *vd = spa->spa_root_vdev; - dva_t *dva = bp->blk_dva; - int needs_resilver = B_FALSE; - int d; - - if (bc->bc_errno) { - /* - * We can't scrub this block, but we can continue to scrub - * the rest of the pool. Note the error and move along. - */ - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_errors++; - mutex_exit(&spa->spa_scrub_lock); - - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_errors++; - mutex_exit(&vd->vdev_stat_lock); - - return (ERESTART); - } - - ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); - - for (d = 0; d < BP_GET_NDVAS(bp); d++) { - vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); - - ASSERT(vd != NULL); - - /* - * Keep track of how much data we've examined so that - * zpool(1M) status can make useful progress reports. - */ - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); - mutex_exit(&vd->vdev_stat_lock); - - if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { - if (DVA_GET_GANG(&dva[d])) { - /* - * Gang members may be spread across multiple - * vdevs, so the best we can do is look at the - * pool-wide DTL. - * XXX -- it would be better to change our - * allocation policy to ensure that this can't - * happen. - */ - vd = spa->spa_root_vdev; - } - if (vdev_dtl_contains(&vd->vdev_dtl_map, - bp->blk_birth, 1)) - needs_resilver = B_TRUE; - } - } - - if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) - spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, - ZIO_FLAG_SCRUB, &bc->bc_bookmark); - else if (needs_resilver) - spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, - ZIO_FLAG_RESILVER, &bc->bc_bookmark); - - return (0); -} - -static void -spa_scrub_thread(void *arg) -{ - spa_t *spa = arg; - callb_cpr_t cprinfo; - traverse_handle_t *th = spa->spa_scrub_th; - vdev_t *rvd = spa->spa_root_vdev; - pool_scrub_type_t scrub_type = spa->spa_scrub_type; - int error = 0; - boolean_t complete; - - CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); - - /* - * If we're restarting due to a snapshot create/delete, - * wait for that to complete. - */ - txg_wait_synced(spa_get_dsl(spa), 0); - - dprintf("start %s mintxg=%llu maxtxg=%llu\n", - scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", - spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); - - spa_config_enter(spa, RW_WRITER, FTAG); - vdev_reopen(rvd); /* purge all vdev caches */ - vdev_config_dirty(rvd); /* rewrite all disk labels */ - vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); - spa_config_exit(spa, FTAG); - - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_errors = 0; - spa->spa_scrub_active = 1; - ASSERT(spa->spa_scrub_inflight == 0); - - while (!spa->spa_scrub_stop) { - CALLB_CPR_SAFE_BEGIN(&cprinfo); - while (spa->spa_scrub_suspended) { - spa->spa_scrub_active = 0; - cv_broadcast(&spa->spa_scrub_cv); - cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); - spa->spa_scrub_active = 1; - } - CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); - - if (spa->spa_scrub_restart_txg != 0) - break; - - mutex_exit(&spa->spa_scrub_lock); - error = traverse_more(th); - mutex_enter(&spa->spa_scrub_lock); - if (error != EAGAIN) - break; - } - - while (spa->spa_scrub_inflight) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - - spa->spa_scrub_active = 0; - cv_broadcast(&spa->spa_scrub_cv); - - mutex_exit(&spa->spa_scrub_lock); - - spa_config_enter(spa, RW_WRITER, FTAG); - - mutex_enter(&spa->spa_scrub_lock); - - /* - * Note: we check spa_scrub_restart_txg under both spa_scrub_lock - * AND the spa config lock to synchronize with any config changes - * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). - */ - if (spa->spa_scrub_restart_txg != 0) - error = ERESTART; - - if (spa->spa_scrub_stop) - error = EINTR; - - /* - * Even if there were uncorrectable errors, we consider the scrub - * completed. The downside is that if there is a transient error during - * a resilver, we won't resilver the data properly to the target. But - * if the damage is permanent (more likely) we will resilver forever, - * which isn't really acceptable. Since there is enough information for - * the user to know what has failed and why, this seems like a more - * tractable approach. - */ - complete = (error == 0); - - dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", - scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", - spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", - error, spa->spa_scrub_errors, spa->spa_scrub_stop); - - mutex_exit(&spa->spa_scrub_lock); - - /* - * If the scrub/resilver completed, update all DTLs to reflect this. - * Whether it succeeded or not, vacate all temporary scrub DTLs. - */ - vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, - complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); - vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); - spa_errlog_rotate(spa); - - spa_config_exit(spa, FTAG); - - mutex_enter(&spa->spa_scrub_lock); - - /* - * We may have finished replacing a device. - * Let the async thread assess this and handle the detach. - */ - spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); - - /* - * If we were told to restart, our final act is to start a new scrub. - */ - if (error == ERESTART) - spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
- SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); - - spa->spa_scrub_type = POOL_SCRUB_NONE; - spa->spa_scrub_active = 0; - spa->spa_scrub_thread = NULL; - cv_broadcast(&spa->spa_scrub_cv); - CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ - thread_exit(); -} - -void -spa_scrub_suspend(spa_t *spa) -{ - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_suspended++; - while (spa->spa_scrub_active) { - cv_broadcast(&spa->spa_scrub_cv); - cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); - } - while (spa->spa_scrub_inflight) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - mutex_exit(&spa->spa_scrub_lock); -} - -void -spa_scrub_resume(spa_t *spa) -{ - mutex_enter(&spa->spa_scrub_lock); - ASSERT(spa->spa_scrub_suspended != 0); - if (--spa->spa_scrub_suspended == 0) - cv_broadcast(&spa->spa_scrub_cv); - mutex_exit(&spa->spa_scrub_lock); -} - -void -spa_scrub_restart(spa_t *spa, uint64_t txg) -{ - /* - * Something happened (e.g. snapshot create/delete) that means - * we must restart any in-progress scrubs. The itinerary will - * fix this properly. - */ - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_restart_txg = txg; - mutex_exit(&spa->spa_scrub_lock); -} - -int -spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) -{ - space_seg_t *ss; - uint64_t mintxg, maxtxg; - vdev_t *rvd = spa->spa_root_vdev; - - if ((uint_t)type >= POOL_SCRUB_TYPES) - return (ENOTSUP); - - mutex_enter(&spa->spa_scrub_lock); - - /* - * If there's a scrub or resilver already in progress, stop it. - */ - while (spa->spa_scrub_thread != NULL) { - /* - * Don't stop a resilver unless forced. - */ - if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { - mutex_exit(&spa->spa_scrub_lock); - return (EBUSY); - } - spa->spa_scrub_stop = 1; - cv_broadcast(&spa->spa_scrub_cv); - cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); - } - - /* - * Terminate the previous traverse. - */ - if (spa->spa_scrub_th != NULL) { - traverse_fini(spa->spa_scrub_th); - spa->spa_scrub_th = NULL; - } - - if (rvd == NULL) { - ASSERT(spa->spa_scrub_stop == 0); - ASSERT(spa->spa_scrub_type == type); - ASSERT(spa->spa_scrub_restart_txg == 0); - mutex_exit(&spa->spa_scrub_lock); - return (0); - } - - mintxg = TXG_INITIAL - 1; - maxtxg = spa_last_synced_txg(spa) + 1; - - mutex_enter(&rvd->vdev_dtl_lock); - - if (rvd->vdev_dtl_map.sm_space == 0) { - /* - * The pool-wide DTL is empty. - * If this is a resilver, there's nothing to do except - * check whether any in-progress replacements have completed. - */ - if (type == POOL_SCRUB_RESILVER) { - type = POOL_SCRUB_NONE; - spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); - } - } else { - /* - * The pool-wide DTL is non-empty. - * If this is a normal scrub, upgrade to a resilver instead. - */ - if (type == POOL_SCRUB_EVERYTHING) - type = POOL_SCRUB_RESILVER; - } - - if (type == POOL_SCRUB_RESILVER) { - /* - * Determine the resilvering boundaries. - * - * Note: (mintxg, maxtxg) is an open interval, - * i.e. mintxg and maxtxg themselves are not included. - * - * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 - * so we don't claim to resilver a txg that's still changing. 
- */ - ss = avl_first(&rvd->vdev_dtl_map.sm_root); - mintxg = ss->ss_start - 1; - ss = avl_last(&rvd->vdev_dtl_map.sm_root); - maxtxg = MIN(ss->ss_end, maxtxg); - } - - mutex_exit(&rvd->vdev_dtl_lock); - - spa->spa_scrub_stop = 0; - spa->spa_scrub_type = type; - spa->spa_scrub_restart_txg = 0; - - if (type != POOL_SCRUB_NONE) { - spa->spa_scrub_mintxg = mintxg; - spa->spa_scrub_maxtxg = maxtxg; - spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, - ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, - ZIO_FLAG_CANFAIL); - traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); - spa->spa_scrub_thread = thread_create(NULL, 0, - spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); - } - - mutex_exit(&spa->spa_scrub_lock); - - return (0); -} - -/* - * ========================================================================== - * SPA async task processing - * ========================================================================== - */ - -static void -spa_async_reopen(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *tvd; - int c; - - spa_config_enter(spa, RW_WRITER, FTAG); - - for (c = 0; c < rvd->vdev_children; c++) { - tvd = rvd->vdev_child[c]; - if (tvd->vdev_reopen_wanted) { - tvd->vdev_reopen_wanted = 0; - vdev_reopen(tvd); - } - } - - spa_config_exit(spa, FTAG); -} - -static void -spa_async_thread(void *arg) -{ - spa_t *spa = arg; - int tasks; - - ASSERT(spa->spa_sync_on); - - mutex_enter(&spa->spa_async_lock); - tasks = spa->spa_async_tasks; - spa->spa_async_tasks = 0; - mutex_exit(&spa->spa_async_lock); - - /* - * See if the config needs to be updated. - */ - if (tasks & SPA_ASYNC_CONFIG_UPDATE) { - mutex_enter(&spa_namespace_lock); - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); - mutex_exit(&spa_namespace_lock); - } - - /* - * See if any devices need to be reopened. - */ - if (tasks & SPA_ASYNC_REOPEN) - spa_async_reopen(spa); - - /* - * If any devices are done replacing, detach them. - */ - if (tasks & SPA_ASYNC_REPLACE_DONE) - spa_vdev_replace_done(spa); - - /* - * Kick off a scrub. - */ - if (tasks & SPA_ASYNC_SCRUB) - VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); - - /* - * Kick off a resilver. - */ - if (tasks & SPA_ASYNC_RESILVER) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); - - /* - * Let the world know that we're done. 
- */ - mutex_enter(&spa->spa_async_lock); - spa->spa_async_thread = NULL; - cv_broadcast(&spa->spa_async_cv); - mutex_exit(&spa->spa_async_lock); - thread_exit(); -} - -void -spa_async_suspend(spa_t *spa) -{ - mutex_enter(&spa->spa_async_lock); - spa->spa_async_suspended++; - while (spa->spa_async_thread != NULL) - cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); - mutex_exit(&spa->spa_async_lock); -} - -void -spa_async_resume(spa_t *spa) -{ - mutex_enter(&spa->spa_async_lock); - ASSERT(spa->spa_async_suspended != 0); - spa->spa_async_suspended--; - mutex_exit(&spa->spa_async_lock); -} - -static void -spa_async_dispatch(spa_t *spa) -{ - mutex_enter(&spa->spa_async_lock); - if (spa->spa_async_tasks && !spa->spa_async_suspended && - spa->spa_async_thread == NULL && - rootdir != NULL && !vn_is_readonly(rootdir)) - spa->spa_async_thread = thread_create(NULL, 0, - spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); - mutex_exit(&spa->spa_async_lock); -} - -void -spa_async_request(spa_t *spa, int task) -{ - mutex_enter(&spa->spa_async_lock); - spa->spa_async_tasks |= task; - mutex_exit(&spa->spa_async_lock); -} - -/* - * ========================================================================== - * SPA syncing routines - * ========================================================================== - */ - -static void -spa_sync_deferred_frees(spa_t *spa, uint64_t txg) -{ - bplist_t *bpl = &spa->spa_sync_bplist; - dmu_tx_t *tx; - blkptr_t blk; - uint64_t itor = 0; - zio_t *zio; - int error; - uint8_t c = 1; - - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); - - while (bplist_iterate(bpl, &itor, &blk) == 0) - zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); - - error = zio_wait(zio); - ASSERT3U(error, ==, 0); - - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - bplist_vacate(bpl, tx); - - /* - * Pre-dirty the first block so we sync to convergence faster. - * (Usually only the first block is needed.) - */ - dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); - dmu_tx_commit(tx); -} - -static void -spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) -{ - char *packed = NULL; - size_t nvsize = 0; - dmu_buf_t *db; - - VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); - - packed = kmem_alloc(nvsize, KM_SLEEP); - - VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, - KM_SLEEP) == 0); - - dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); - - kmem_free(packed, nvsize); - - VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = nvsize; - dmu_buf_rele(db, FTAG); -} - -static void -spa_sync_spares(spa_t *spa, dmu_tx_t *tx) -{ - nvlist_t *nvroot; - nvlist_t **spares; - int i; - - if (!spa->spa_sync_spares) - return; - - /* - * Update the MOS nvlist describing the list of available spares. - * spa_validate_spares() will have already made sure this nvlist is - * valid and the vdevs are labelled appropriately. 
- */ - if (spa->spa_spares_object == 0) { - spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_PACKED_NVLIST, 1 << 14, - DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); - VERIFY(zap_update(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, - sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); - } - - VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (spa->spa_nspares == 0) { - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - NULL, 0) == 0); - } else { - spares = kmem_alloc(spa->spa_nspares * sizeof (void *), - KM_SLEEP); - for (i = 0; i < spa->spa_nspares; i++) - spares[i] = vdev_config_generate(spa, - spa->spa_spares[i], B_FALSE, B_TRUE); - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - spares, spa->spa_nspares) == 0); - for (i = 0; i < spa->spa_nspares; i++) - nvlist_free(spares[i]); - kmem_free(spares, spa->spa_nspares * sizeof (void *)); - } - - spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); - nvlist_free(nvroot); - - spa->spa_sync_spares = B_FALSE; -} - -static void -spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) -{ - nvlist_t *config; - - if (list_is_empty(&spa->spa_dirty_list)) - return; - - config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); - - if (spa->spa_config_syncing) - nvlist_free(spa->spa_config_syncing); - spa->spa_config_syncing = config; - - spa_sync_nvlist(spa, spa->spa_config_object, config, tx); -} - -static void -spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) -{ - spa_t *spa = arg1; - nvlist_t *nvp = arg2; - nvpair_t *nvpair; - objset_t *mos = spa->spa_meta_objset; - uint64_t zapobj; - - mutex_enter(&spa->spa_props_lock); - if (spa->spa_pool_props_object == 0) { - zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); - VERIFY(zapobj > 0); - - spa->spa_pool_props_object = zapobj; - - VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_PROPS, 8, 1, - &spa->spa_pool_props_object, tx) == 0); - } - mutex_exit(&spa->spa_props_lock); - - nvpair = NULL; - while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { - switch (zpool_name_to_prop(nvpair_name(nvpair))) { - case ZFS_PROP_BOOTFS: - VERIFY(nvlist_lookup_uint64(nvp, - nvpair_name(nvpair), &spa->spa_bootfs) == 0); - VERIFY(zap_update(mos, - spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, - &spa->spa_bootfs, tx) == 0); - break; - } - } -} - -/* - * Sync the specified transaction group. New blocks may be dirtied as - * part of the process, so we iterate until it converges. - */ -void -spa_sync(spa_t *spa, uint64_t txg) -{ - dsl_pool_t *dp = spa->spa_dsl_pool; - objset_t *mos = spa->spa_meta_objset; - bplist_t *bpl = &spa->spa_sync_bplist; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *vd; - dmu_tx_t *tx; - int dirty_vdevs; - - /* - * Lock out configuration changes. - */ - spa_config_enter(spa, RW_READER, FTAG); - - spa->spa_syncing_txg = txg; - spa->spa_sync_pass = 0; - - VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); - - tx = dmu_tx_create_assigned(dp, txg); - - /* - * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, - * set spa_deflate if we have no raid-z vdevs. 
- */ - if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && - spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { - int i; - - for (i = 0; i < rvd->vdev_children; i++) { - vd = rvd->vdev_child[i]; - if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) - break; - } - if (i == rvd->vdev_children) { - spa->spa_deflate = TRUE; - VERIFY(0 == zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate, tx)); - } - } - - /* - * If anything has changed in this txg, push the deferred frees - * from the previous txg. If not, leave them alone so that we - * don't generate work on an otherwise idle system. - */ - if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || - !txg_list_empty(&dp->dp_dirty_dirs, txg) || - !txg_list_empty(&dp->dp_sync_tasks, txg)) - spa_sync_deferred_frees(spa, txg); - - /* - * Iterate to convergence. - */ - do { - spa->spa_sync_pass++; - - spa_sync_config_object(spa, tx); - spa_sync_spares(spa, tx); - spa_errlog_sync(spa, txg); - dsl_pool_sync(dp, txg); - - dirty_vdevs = 0; - while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { - vdev_sync(vd, txg); - dirty_vdevs++; - } - - bplist_sync(bpl, tx); - } while (dirty_vdevs); - - bplist_close(bpl); - - dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); - - /* - * Rewrite the vdev configuration (which includes the uberblock) - * to commit the transaction group. - * - * If there are any dirty vdevs, sync the uberblock to all vdevs. - * Otherwise, pick a random top-level vdev that's known to be - * visible in the config cache (see spa_vdev_add() for details). - * If the write fails, try the next vdev until we're tried them all. - */ - if (!list_is_empty(&spa->spa_dirty_list)) { - VERIFY(vdev_config_sync(rvd, txg) == 0); - } else { - int children = rvd->vdev_children; - int c0 = spa_get_random(children); - int c; - - for (c = 0; c < children; c++) { - vd = rvd->vdev_child[(c0 + c) % children]; - if (vd->vdev_ms_array == 0) - continue; - if (vdev_config_sync(vd, txg) == 0) - break; - } - if (c == children) - VERIFY(vdev_config_sync(rvd, txg) == 0); - } - - dmu_tx_commit(tx); - - /* - * Clear the dirty config list. - */ - while ((vd = list_head(&spa->spa_dirty_list)) != NULL) - vdev_config_clean(vd); - - /* - * Now that the new config has synced transactionally, - * let it become visible to the config cache. - */ - if (spa->spa_config_syncing != NULL) { - spa_config_set(spa, spa->spa_config_syncing); - spa->spa_config_txg = txg; - spa->spa_config_syncing = NULL; - } - - /* - * Make a stable copy of the fully synced uberblock. - * We use this as the root for pool traversals. - */ - spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ - - spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ - - rw_enter(&spa->spa_traverse_lock, RW_WRITER); - spa->spa_traverse_wanted = 0; - spa->spa_ubsync = spa->spa_uberblock; - rw_exit(&spa->spa_traverse_lock); - - spa_scrub_resume(spa); /* resume scrub with new ubsync */ - - /* - * Clean up the ZIL records for the synced txg. - */ - dsl_pool_zil_clean(dp); - - /* - * Update usable space statistics. - */ - while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) - vdev_sync_done(vd, txg); - - /* - * It had better be the case that we didn't dirty anything - * since vdev_config_sync(). 
- */ - ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); - ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); - ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); - ASSERT(bpl->bpl_queue == NULL); - - spa_config_exit(spa, FTAG); - - /* - * If any async tasks have been requested, kick them off. - */ - spa_async_dispatch(spa); -} - -/* - * Sync all pools. We don't want to hold the namespace lock across these - * operations, so we take a reference on the spa_t and drop the lock during the - * sync. - */ -void -spa_sync_allpools(void) -{ - spa_t *spa = NULL; - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (spa_state(spa) != POOL_STATE_ACTIVE) - continue; - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - txg_wait_synced(spa_get_dsl(spa), 0); - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); - } - mutex_exit(&spa_namespace_lock); -} - -/* - * ========================================================================== - * Miscellaneous routines - * ========================================================================== - */ - -/* - * Remove all pools in the system. - */ -void -spa_evict_all(void) -{ - spa_t *spa; - - /* - * Remove all cached state. All pools should be closed now, - * so every spa in the AVL tree should be unreferenced. - */ - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(NULL)) != NULL) { - /* - * Stop async tasks. The async thread may need to detach - * a device that's been replaced, which requires grabbing - * spa_namespace_lock, so we must drop it here. - */ - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - spa_async_suspend(spa); - VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); - - if (spa->spa_state != POOL_STATE_UNINITIALIZED) { - spa_unload(spa); - spa_deactivate(spa); - } - spa_remove(spa); - } - mutex_exit(&spa_namespace_lock); -} - -vdev_t * -spa_lookup_by_guid(spa_t *spa, uint64_t guid) -{ - return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); -} - -void -spa_upgrade(spa_t *spa) -{ - spa_config_enter(spa, RW_WRITER, FTAG); - - /* - * This should only be called for a non-faulted pool, and since a - * future version would result in an unopenable pool, this shouldn't be - * possible. 
- */ - ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); - - spa->spa_uberblock.ub_version = ZFS_VERSION; - vdev_config_dirty(spa->spa_root_vdev); - - spa_config_exit(spa, FTAG); - - txg_wait_synced(spa_get_dsl(spa), 0); -} - -boolean_t -spa_has_spare(spa_t *spa, uint64_t guid) -{ - int i; - uint64_t spareguid; - - for (i = 0; i < spa->spa_nspares; i++) - if (spa->spa_spares[i]->vdev_guid == guid) - return (B_TRUE); - - for (i = 0; i < spa->spa_pending_nspares; i++) { - if (nvlist_lookup_uint64(spa->spa_pending_spares[i], - ZPOOL_CONFIG_GUID, &spareguid) == 0 && - spareguid == guid) - return (B_TRUE); - } - - return (B_FALSE); -} - -int -spa_set_props(spa_t *spa, nvlist_t *nvp) -{ - return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, - spa, nvp, 3)); -} - -int -spa_get_props(spa_t *spa, nvlist_t **nvp) -{ - zap_cursor_t zc; - zap_attribute_t za; - objset_t *mos = spa->spa_meta_objset; - zfs_source_t src; - zfs_prop_t prop; - nvlist_t *propval; - uint64_t value; - int err; - - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - mutex_enter(&spa->spa_props_lock); - /* If no props object, then just return empty nvlist */ - if (spa->spa_pool_props_object == 0) { - mutex_exit(&spa->spa_props_lock); - return (0); - } - - for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); - (err = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - - if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) - continue; - - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - switch (za.za_integer_length) { - case 8: - if (zfs_prop_default_numeric(prop) == - za.za_first_integer) - src = ZFS_SRC_DEFAULT; - else - src = ZFS_SRC_LOCAL; - value = za.za_first_integer; - - if (prop == ZFS_PROP_BOOTFS) { - dsl_pool_t *dp; - dsl_dataset_t *ds = NULL; - char strval[MAXPATHLEN]; - - dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); - if ((err = dsl_dataset_open_obj(dp, - za.za_first_integer, NULL, DS_MODE_NONE, - FTAG, &ds)) != 0) { - rw_exit(&dp->dp_config_rwlock); - break; - } - dsl_dataset_name(ds, strval); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); - rw_exit(&dp->dp_config_rwlock); - - VERIFY(nvlist_add_uint64(propval, - ZFS_PROP_SOURCE, src) == 0); - VERIFY(nvlist_add_string(propval, - ZFS_PROP_VALUE, strval) == 0); - } else { - VERIFY(nvlist_add_uint64(propval, - ZFS_PROP_SOURCE, src) == 0); - VERIFY(nvlist_add_uint64(propval, - ZFS_PROP_VALUE, value) == 0); - } - VERIFY(nvlist_add_nvlist(*nvp, za.za_name, - propval) == 0); - break; - } - nvlist_free(propval); - } - zap_cursor_fini(&zc); - mutex_exit(&spa->spa_props_lock); - if (err && err != ENOENT) { - nvlist_free(*nvp); - return (err); - } - - return (0); -} - -/* - * If the bootfs property value is dsobj, clear it. - */ -void -spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) -{ - if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { - VERIFY(zap_remove(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); - spa->spa_bootfs = 0; - } -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c deleted file mode 100644 index 9e8bcf3..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c +++ /dev/null @@ -1,375 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/spa_impl.h> -#include <sys/nvpair.h> -#include <sys/uio.h> -#include <sys/fs/zfs.h> -#include <sys/vdev_impl.h> -#include <sys/zfs_ioctl.h> -#include <sys/utsname.h> -#include <sys/sunddi.h> -#ifdef _KERNEL -#include <sys/kobj.h> -#endif - -/* - * Pool configuration repository. - * - * The configuration for all pools, in addition to being stored on disk, is - * stored in /etc/zfs/zpool.cache as a packed nvlist. The kernel maintains - * this list as pools are created, destroyed, or modified. - * - * We have a single nvlist which holds all the configuration information. When - * the module loads, we read this information from the cache and populate the - * SPA namespace. This namespace is maintained independently in spa.c. - * Whenever the namespace is modified, or the configuration of a pool is - * changed, we call spa_config_sync(), which walks through all the active pools - * and writes the configuration to disk. - */ - -static uint64_t spa_config_generation = 1; - -/* - * This can be overridden in userland to preserve an alternate namespace for - * userland pools when doing testing. - */ -const char *spa_config_dir = ZPOOL_CACHE_DIR; - -/* - * Called when the module is first loaded, this routine loads the configuration - * file into the SPA namespace. It does not actually open or load the pools; it - * only populates the namespace. - */ -void -spa_config_load(void) -{ - void *buf = NULL; - nvlist_t *nvlist, *child; - nvpair_t *nvpair; - spa_t *spa; - char pathname[128]; - struct _buf *file; - uint64_t fsize; - - /* - * Open the configuration file. - */ - (void) snprintf(pathname, sizeof (pathname), "%s/%s", - spa_config_dir, ZPOOL_CACHE_FILE); - - file = kobj_open_file(pathname); - if (file == (struct _buf *)-1) { - ZFS_LOG(1, "Cannot open %s.", pathname); - return; - } - - if (kobj_get_filesize(file, &fsize) != 0) { - ZFS_LOG(1, "Cannot get size of %s.", pathname); - goto out; - } - - buf = kmem_alloc(fsize, KM_SLEEP); - - /* - * Read the nvlist from the file. - */ - if (kobj_read_file(file, buf, fsize, 0) < 0) { - ZFS_LOG(1, "Cannot read %s.", pathname); - goto out; - } - - /* - * Unpack the nvlist. - */ - if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) - goto out; - - ZFS_LOG(1, "File %s loaded.", pathname); - - /* - * Iterate over all elements in the nvlist, creating a new spa_t for - * each one with the specified configuration. 
- */ - mutex_enter(&spa_namespace_lock); - nvpair = NULL; - while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { - - if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) - continue; - - VERIFY(nvpair_value_nvlist(nvpair, &child) == 0); - - if (spa_lookup(nvpair_name(nvpair)) != NULL) - continue; - spa = spa_add(nvpair_name(nvpair), NULL); - - /* - * We blindly duplicate the configuration here. If it's - * invalid, we will catch it when the pool is first opened. - */ - VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0); - } - mutex_exit(&spa_namespace_lock); - - nvlist_free(nvlist); - -out: - if (buf != NULL) - kmem_free(buf, fsize); - - kobj_close_file(file); -} - -/* - * Synchronize all pools to disk. This must be called with the namespace lock - * held. - */ -void -spa_config_sync(void) -{ - spa_t *spa = NULL; - nvlist_t *config; - size_t buflen; - char *buf; - vnode_t *vp; - int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; - char pathname[128]; - char pathname2[128]; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - /* - * Add all known pools to the configuration list, ignoring those with - * alternate root paths. - */ - spa = NULL; - while ((spa = spa_next(spa)) != NULL) { - mutex_enter(&spa->spa_config_cache_lock); - if (spa->spa_config && spa->spa_name && spa->spa_root == NULL) - VERIFY(nvlist_add_nvlist(config, spa->spa_name, - spa->spa_config) == 0); - mutex_exit(&spa->spa_config_cache_lock); - } - - /* - * Pack the configuration into a buffer. - */ - VERIFY(nvlist_size(config, &buflen, NV_ENCODE_XDR) == 0); - - buf = kmem_alloc(buflen, KM_SLEEP); - - VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR, - KM_SLEEP) == 0); - - /* - * Write the configuration to disk. We need to do the traditional - * 'write to temporary file, sync, move over original' to make sure we - * always have a consistent view of the data. - */ - (void) snprintf(pathname, sizeof (pathname), "%s/%s", spa_config_dir, - ZPOOL_CACHE_TMP); - - if (vn_open(pathname, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) != 0) - goto out; - - if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, NULL) == 0 && - VOP_FSYNC(vp, FSYNC, kcred) == 0) { - (void) snprintf(pathname2, sizeof (pathname2), "%s/%s", - spa_config_dir, ZPOOL_CACHE_FILE); - (void) vn_rename(pathname, pathname2, UIO_SYSSPACE); - } - - (void) VOP_CLOSE(vp, oflags, 1, 0, kcred); - VN_RELE(vp); - -out: - (void) vn_remove(pathname, UIO_SYSSPACE, RMFILE); - spa_config_generation++; - - kmem_free(buf, buflen); - nvlist_free(config); -} - -/* - * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, - * and we don't want to allow the local zone to see all the pools anyway. - * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration - * information for all pool visible within the zone. 
- */ -nvlist_t * -spa_all_configs(uint64_t *generation) -{ - nvlist_t *pools; - spa_t *spa; - - if (*generation == spa_config_generation) - return (NULL); - - VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - spa = NULL; - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (INGLOBALZONE(curproc) || - zone_dataset_visible(spa_name(spa), NULL)) { - mutex_enter(&spa->spa_config_cache_lock); - VERIFY(nvlist_add_nvlist(pools, spa_name(spa), - spa->spa_config) == 0); - mutex_exit(&spa->spa_config_cache_lock); - } - } - mutex_exit(&spa_namespace_lock); - - *generation = spa_config_generation; - - return (pools); -} - -void -spa_config_set(spa_t *spa, nvlist_t *config) -{ - mutex_enter(&spa->spa_config_cache_lock); - if (spa->spa_config != NULL) - nvlist_free(spa->spa_config); - spa->spa_config = config; - mutex_exit(&spa->spa_config_cache_lock); -} - -/* - * Generate the pool's configuration based on the current in-core state. - * We infer whether to generate a complete config or just one top-level config - * based on whether vd is the root vdev. - */ -nvlist_t * -spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) -{ - nvlist_t *config, *nvroot; - vdev_t *rvd = spa->spa_root_vdev; - unsigned long hostid = 0; - - ASSERT(spa_config_held(spa, RW_READER)); - - if (vd == NULL) - vd = rvd; - - /* - * If txg is -1, report the current value of spa->spa_config_txg. - */ - if (txg == -1ULL) - txg = spa->spa_config_txg; - - VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, - spa_version(spa)) == 0); - VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, - spa_name(spa)) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, - spa_state(spa)) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, - txg) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, - spa_guid(spa)) == 0); - (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, - hostid) == 0); - VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, - utsname.nodename) == 0); - - if (vd != rvd) { - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, - vd->vdev_top->vdev_guid) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); - if (vd->vdev_isspare) - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, - 1ULL) == 0); - vd = vd->vdev_top; /* label contains top config */ - } - - nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE); - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); - nvlist_free(nvroot); - - return (config); -} - -/* - * Update all disk labels, generate a fresh config based on the current - * in-core state, and sync the global config cache. - */ -void -spa_config_update(spa_t *spa, int what) -{ - vdev_t *rvd = spa->spa_root_vdev; - uint64_t txg; - int c; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - spa_config_enter(spa, RW_WRITER, FTAG); - txg = spa_last_synced_txg(spa) + 1; - if (what == SPA_CONFIG_UPDATE_POOL) { - vdev_config_dirty(rvd); - } else { - /* - * If we have top-level vdevs that were added but have - * not yet been prepared for allocation, do that now. - * (It's safe now because the config cache is up to date, - * so it will be able to translate the new DVAs.) - * See comments in spa_vdev_add() for full details. 
- */ - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - if (tvd->vdev_ms_array == 0) { - vdev_init(tvd, txg); - vdev_config_dirty(tvd); - } - } - } - spa_config_exit(spa, FTAG); - - /* - * Wait for the mosconfig to be regenerated and synced. - */ - txg_wait_synced(spa->spa_dsl_pool, txg); - - /* - * Update the global config cache to reflect the new mosconfig. - */ - spa_config_sync(); - - if (what == SPA_CONFIG_UPDATE_POOL) - spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c deleted file mode 100644 index c52acaf..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c +++ /dev/null @@ -1,440 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * Routines to manage the on-disk persistent error log. - * - * Each pool stores a log of all logical data errors seen during normal - * operation. This is actually the union of two distinct logs: the last log, - * and the current log. All errors seen are logged to the current log. When a - * scrub completes, the current log becomes the last log, the last log is thrown - * out, and the current log is reinitialized. This way, if an error is somehow - * corrected, a new scrub will show that it no longer exists, and will be - * deleted from the log when the scrub completes. - * - * The log is stored using a ZAP object whose key is a string form of the - * zbookmark tuple (objset, object, level, blkid), and whose contents is an - * optional 'objset:object' human-readable string describing the data. When an - * error is first logged, this string will be empty, indicating that no name is - * known. This prevents us from having to issue a potentially large amount of - * I/O to discover the object name during an error path. Instead, we do the - * calculation when the data is requested, storing the result so future queries - * will be faster. - * - * This log is then shipped into an nvlist where the key is the dataset name and - * the value is the object name. Userland is then responsible for uniquifying - * this list and displaying it to the user. - */ - -#include <sys/dmu_tx.h> -#include <sys/spa.h> -#include <sys/spa_impl.h> -#include <sys/zap.h> -#include <sys/zio.h> - -/* - * This is a stripped-down version of strtoull, suitable only for converting - * lowercase hexadecimal numbers that don't overflow.
- */ -#ifdef _KERNEL -static uint64_t -_strtonum(char *str, char **nptr) -{ - uint64_t val = 0; - char c; - int digit; - - while ((c = *str) != '\0') { - if (c >= '0' && c <= '9') - digit = c - '0'; - else if (c >= 'a' && c <= 'f') - digit = 10 + c - 'a'; - else - break; - - val *= 16; - val += digit; - - str++; - } - - *nptr = str; - - return (val); -} -#endif - -/* - * Convert a bookmark to a string. - */ -static void -bookmark_to_name(zbookmark_t *zb, char *buf, size_t len) -{ - (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", - (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, - (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid); -} - -/* - * Convert a string to a bookmark - */ -#ifdef _KERNEL -static void -name_to_bookmark(char *buf, zbookmark_t *zb) -{ - zb->zb_objset = _strtonum(buf, &buf); - ASSERT(*buf == ':'); - zb->zb_object = _strtonum(buf + 1, &buf); - ASSERT(*buf == ':'); - zb->zb_level = (int)_strtonum(buf + 1, &buf); - ASSERT(*buf == ':'); - zb->zb_blkid = _strtonum(buf + 1, &buf); - ASSERT(*buf == '\0'); -} -#endif - -/* - * Log an uncorrectable error to the persistent error log. We add it to the - * spa's list of pending errors. The changes are actually synced out to disk - * during spa_errlog_sync(). - */ -void -spa_log_error(spa_t *spa, zio_t *zio) -{ - zbookmark_t *zb = &zio->io_logical->io_bookmark; - spa_error_entry_t search; - spa_error_entry_t *new; - avl_tree_t *tree; - avl_index_t where; - - /* - * If we are trying to import a pool, ignore any errors, as we won't be - * writing to the pool any time soon. - */ - if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) - return; - - mutex_enter(&spa->spa_errlist_lock); - - /* - * If we have had a request to rotate the log, log it to the next list - * instead of the current one. - */ - if (spa->spa_scrub_active || spa->spa_scrub_finished) - tree = &spa->spa_errlist_scrub; - else - tree = &spa->spa_errlist_last; - - search.se_bookmark = *zb; - if (avl_find(tree, &search, &where) != NULL) { - mutex_exit(&spa->spa_errlist_lock); - return; - } - - new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); - new->se_bookmark = *zb; - avl_insert(tree, new, where); - - mutex_exit(&spa->spa_errlist_lock); -} - -/* - * Return the number of errors currently in the error log. This is actually the - * sum of both the last log and the current log, since we don't know the union - * of these logs until we reach userland. 
- */ -uint64_t -spa_get_errlog_size(spa_t *spa) -{ - uint64_t total = 0, count; - - mutex_enter(&spa->spa_errlog_lock); - if (spa->spa_errlog_scrub != 0 && - zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, - &count) == 0) - total += count; - - if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && - zap_count(spa->spa_meta_objset, spa->spa_errlog_last, - &count) == 0) - total += count; - mutex_exit(&spa->spa_errlog_lock); - - mutex_enter(&spa->spa_errlist_lock); - total += avl_numnodes(&spa->spa_errlist_last); - total += avl_numnodes(&spa->spa_errlist_scrub); - mutex_exit(&spa->spa_errlist_lock); - - return (total); -} - -#ifdef _KERNEL -static int -process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) -{ - zap_cursor_t zc; - zap_attribute_t za; - zbookmark_t zb; - - if (obj == 0) - return (0); - - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - - if (*count == 0) { - zap_cursor_fini(&zc); - return (ENOMEM); - } - - name_to_bookmark(za.za_name, &zb); - - if (copyout(&zb, (char *)addr + - (*count - 1) * sizeof (zbookmark_t), - sizeof (zbookmark_t)) != 0) - return (EFAULT); - - *count -= 1; - } - - zap_cursor_fini(&zc); - - return (0); -} - -static int -process_error_list(avl_tree_t *list, void *addr, size_t *count) -{ - spa_error_entry_t *se; - - for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { - - if (*count == 0) - return (ENOMEM); - - if (copyout(&se->se_bookmark, (char *)addr + - (*count - 1) * sizeof (zbookmark_t), - sizeof (zbookmark_t)) != 0) - return (EFAULT); - - *count -= 1; - } - - return (0); -} -#endif - -/* - * Copy all known errors to userland as an array of bookmarks. This is - * actually a union of the on-disk last log and current log, as well as any - * pending error requests. - * - * Because the act of reading the on-disk log could cause errors to be - * generated, we have two separate locks: one for the error log and one for the - * in-core error lists. We only need the error list lock to log an error, so - * we grab the error log lock while we read the on-disk logs, and only pick up - * the error list lock when we are finished. - */ -int -spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) -{ - int ret = 0; - -#ifdef _KERNEL - mutex_enter(&spa->spa_errlog_lock); - - ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); - - if (!ret && !spa->spa_scrub_finished) - ret = process_error_log(spa, spa->spa_errlog_last, uaddr, - count); - - mutex_enter(&spa->spa_errlist_lock); - if (!ret) - ret = process_error_list(&spa->spa_errlist_scrub, uaddr, - count); - if (!ret) - ret = process_error_list(&spa->spa_errlist_last, uaddr, - count); - mutex_exit(&spa->spa_errlist_lock); - - mutex_exit(&spa->spa_errlog_lock); -#endif - - return (ret); -} - -/* - * Called when a scrub completes. This simply sets a bit which tells which AVL - * tree to add new errors. spa_errlog_sync() is responsible for actually - * syncing the changes to the underlying objects. - */ -void -spa_errlog_rotate(spa_t *spa) -{ - mutex_enter(&spa->spa_errlist_lock); - - ASSERT(!spa->spa_scrub_finished); - spa->spa_scrub_finished = B_TRUE; - - mutex_exit(&spa->spa_errlist_lock); -} - -/* - * Discard any pending errors from the spa_t. Called when unloading a faulted - * pool, as the errors encountered during the open cannot be synced to disk.
- */ -void -spa_errlog_drain(spa_t *spa) -{ - spa_error_entry_t *se; - void *cookie; - - mutex_enter(&spa->spa_errlist_lock); - - cookie = NULL; - while ((se = avl_destroy_nodes(&spa->spa_errlist_last, - &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); - cookie = NULL; - while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, - &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); - - mutex_exit(&spa->spa_errlist_lock); -} - -/* - * Process a list of errors into the current on-disk log. - */ -static void -sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) -{ - spa_error_entry_t *se; - char buf[64]; - void *cookie; - - if (avl_numnodes(t) != 0) { - /* create log if necessary */ - if (*obj == 0) - *obj = zap_create(spa->spa_meta_objset, - DMU_OT_ERROR_LOG, DMU_OT_NONE, - 0, tx); - - /* add errors to the current log */ - for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { - char *name = se->se_name ? se->se_name : ""; - - bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); - - (void) zap_update(spa->spa_meta_objset, - *obj, buf, 1, strlen(name) + 1, name, tx); - } - - /* purge the error list */ - cookie = NULL; - while ((se = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); - } -} - -/* - * Sync the error log out to disk. This is a little tricky because the act of - * writing the error log requires the spa_errlist_lock. So, we need to lock the - * error lists, take a copy of the lists, and then reinitialize them. Then, we - * drop the error list lock and take the error log lock, at which point we - * do the errlog processing. Then, if we encounter an I/O error during this - * process, we can successfully add the error to the list. Note that this will - * result in the perpetual recycling of errors, but it is an unlikely situation - * and not a performance critical operation. - */ -void -spa_errlog_sync(spa_t *spa, uint64_t txg) -{ - dmu_tx_t *tx; - avl_tree_t scrub, last; - int scrub_finished; - - mutex_enter(&spa->spa_errlist_lock); - - /* - * Bail out early under normal circumstances. - */ - if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && - avl_numnodes(&spa->spa_errlist_last) == 0 && - !spa->spa_scrub_finished) { - mutex_exit(&spa->spa_errlist_lock); - return; - } - - spa_get_errlists(spa, &last, &scrub); - scrub_finished = spa->spa_scrub_finished; - spa->spa_scrub_finished = B_FALSE; - - mutex_exit(&spa->spa_errlist_lock); - mutex_enter(&spa->spa_errlog_lock); - - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - - /* - * Sync out the current list of errors. - */ - sync_error_list(spa, &last, &spa->spa_errlog_last, tx); - - /* - * Rotate the log if necessary. - */ - if (scrub_finished) { - if (spa->spa_errlog_last != 0) - VERIFY(dmu_object_free(spa->spa_meta_objset, - spa->spa_errlog_last, tx) == 0); - spa->spa_errlog_last = spa->spa_errlog_scrub; - spa->spa_errlog_scrub = 0; - - sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); - } - - /* - * Sync out any pending scrub errors. - */ - sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); - - /* - * Update the MOS to reflect the new values. 
- */ - (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, - &spa->spa_errlog_last, tx); - (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, - &spa->spa_errlog_scrub, tx); - - dmu_tx_commit(tx); - - mutex_exit(&spa->spa_errlog_lock); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c deleted file mode 100644 index 6642801..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c +++ /dev/null @@ -1,354 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/spa_impl.h> -#include <sys/zap.h> -#include <sys/dsl_synctask.h> - -/* - * Routines to manage the on-disk history log. - * - * The history log is stored as a dmu object containing - * <packed record length, record nvlist> tuples. - * - * Where "record nvlist" is a nvlist containing uint64_ts and strings, and - * "packed record length" is the packed length of the "record nvlist" stored - * as a little endian uint64_t. - * - * The log is implemented as a ring buffer, though the original creation - * of the pool ('zpool create') is never overwritten. - * - * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer - * of 'spa_history' stores the offsets for logging/retrieving history as - * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of - * where the 'zpool create' record is stored. This allows us to never - * overwrite the original creation of the pool. 'sh_phys_max_off' is the - * physical ending offset in bytes of the log. This tells you the length of - * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record - * is added, 'sh_eof' is incremented by the size of the record. - * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes). - * This is where the consumer should start reading from after reading in - * the 'zpool create' portion of the log. - * - * 'sh_records_lost' keeps track of how many records have been overwritten - * and permanently lost.
- */ - -typedef enum history_log_type { - LOG_CMD_CREATE, - LOG_CMD_NO_CREATE -} history_log_type_t; - -typedef struct history_arg { - const char *ha_history_str; - history_log_type_t ha_log_type; -} history_arg_t; - -/* convert a logical offset to physical */ -static uint64_t -spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp) -{ - uint64_t phys_len; - - phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len; - return ((log_off - shpp->sh_pool_create_len) % phys_len - + shpp->sh_pool_create_len); -} - -void -spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) -{ - dmu_buf_t *dbp; - spa_history_phys_t *shpp; - objset_t *mos = spa->spa_meta_objset; - - ASSERT(spa->spa_history == 0); - spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY, - SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, - sizeof (spa_history_phys_t), tx); - - VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_HISTORY, sizeof (uint64_t), 1, - &spa->spa_history, tx) == 0); - - VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); - ASSERT(dbp->db_size >= sizeof (spa_history_phys_t)); - - shpp = dbp->db_data; - dmu_buf_will_dirty(dbp, tx); - - /* - * Figure out maximum size of history log. We set it at - * 1% of pool size, with a max of 32MB and min of 128KB. - */ - shpp->sh_phys_max_off = spa_get_dspace(spa) / 100; - shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20); - shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); - - dmu_buf_rele(dbp, FTAG); -} - -/* - * Change 'sh_bof' to the beginning of the next record. - */ -static int -spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) -{ - objset_t *mos = spa->spa_meta_objset; - uint64_t firstread, reclen, phys_bof; - char buf[sizeof (reclen)]; - int err; - - phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp); - firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); - - if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, - buf)) != 0) - return (err); - if (firstread != sizeof (reclen)) { - if ((err = dmu_read(mos, spa->spa_history, - shpp->sh_pool_create_len, sizeof (reclen) - firstread, - buf + firstread)) != 0) - return (err); - } - - reclen = LE_64(*((uint64_t *)buf)); - shpp->sh_bof += reclen + sizeof (reclen); - shpp->sh_records_lost++; - return (0); -} - -static int -spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, - dmu_tx_t *tx) -{ - uint64_t firstwrite, phys_eof; - objset_t *mos = spa->spa_meta_objset; - int err; - - ASSERT(MUTEX_HELD(&spa->spa_history_lock)); - - /* see if we need to reset logical BOF */ - while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - - (shpp->sh_eof - shpp->sh_bof) <= len) { - if ((err = spa_history_advance_bof(spa, shpp)) != 0) - return (err); - } - - phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); - firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); - shpp->sh_eof += len; - dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); - - len -= firstwrite; - if (len > 0) { - /* write out the rest at the beginning of physical file */ - dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, - len, (char *)buf + firstwrite, tx); - } - - return (0); -} - -/* - * Write out a history event. 
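 *
 * For illustration (an editorial summary derived from the code below, not
 * part of the original comment): each event is appended as an 8-byte
 * little-endian packed length followed by an XDR-encoded nvlist carrying
 * ZPOOL_HIST_TIME (the current time in seconds) and ZPOOL_HIST_CMD (the
 * command string).  For a LOG_CMD_CREATE event, sh_pool_create_len and
 * sh_bof are then advanced past the record so it is never recycled.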
- */ -void -spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - spa_t *spa = arg1; - history_arg_t *hap = arg2; - const char *history_str = hap->ha_history_str; - objset_t *mos = spa->spa_meta_objset; - dmu_buf_t *dbp; - spa_history_phys_t *shpp; - size_t reclen; - uint64_t le_len; - nvlist_t *nvrecord; - char *record_packed = NULL; - int ret; - - if (history_str == NULL) - return; - - /* - * If we have an older pool that doesn't have a command - * history object, create it now. - */ - mutex_enter(&spa->spa_history_lock); - if (!spa->spa_history) - spa_history_create_obj(spa, tx); - mutex_exit(&spa->spa_history_lock); - - /* - * Get the offset of where we need to write via the bonus buffer. - * Update the offset when the write completes. - */ - VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); - shpp = dbp->db_data; - - dmu_buf_will_dirty(dbp, tx); - -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(dbp, &doi); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); - } -#endif - - /* construct a nvlist of the current time and cmd string */ - VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME, - gethrestime_sec()) == 0); - VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, history_str) == 0); - VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen, - NV_ENCODE_XDR, KM_SLEEP) == 0); - - mutex_enter(&spa->spa_history_lock); - if (hap->ha_log_type == LOG_CMD_CREATE) - VERIFY(shpp->sh_eof == shpp->sh_pool_create_len); - - /* write out the packed length as little endian */ - le_len = LE_64((uint64_t)reclen); - ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx); - if (!ret) - ret = spa_history_write(spa, record_packed, reclen, shpp, tx); - - if (!ret && hap->ha_log_type == LOG_CMD_CREATE) { - shpp->sh_pool_create_len += sizeof (le_len) + reclen; - shpp->sh_bof = shpp->sh_pool_create_len; - } - - mutex_exit(&spa->spa_history_lock); - nvlist_free(nvrecord); - kmem_free(record_packed, reclen); - dmu_buf_rele(dbp, FTAG); -} - -/* - * Write out a history event. - */ -int -spa_history_log(spa_t *spa, const char *history_str, uint64_t pool_create) -{ - history_arg_t ha; - - ha.ha_history_str = history_str; - ha.ha_log_type = pool_create ? LOG_CMD_CREATE : LOG_CMD_NO_CREATE; - return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync, - spa, &ha, 0)); -} - -/* - * Read out the command history. - */ -int -spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) -{ - objset_t *mos = spa->spa_meta_objset; - dmu_buf_t *dbp; - uint64_t read_len, phys_read_off, phys_eof; - uint64_t leftover = 0; - spa_history_phys_t *shpp; - int err; - - /* - * If the command history doesn't exist (older pool), - * that's ok, just return ENOENT. - */ - if (!spa->spa_history) - return (ENOENT); - - if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) - return (err); - shpp = dbp->db_data; - -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(dbp, &doi); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); - } -#endif - - mutex_enter(&spa->spa_history_lock); - phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); - - if (*offp < shpp->sh_pool_create_len) { - /* read in just the zpool create history */ - phys_read_off = *offp; - read_len = MIN(*len, shpp->sh_pool_create_len - - phys_read_off); - } else { - /* - * Need to reset passed in offset to BOF if the passed in - * offset has since been overwritten. 
- */ - *offp = MAX(*offp, shpp->sh_bof); - phys_read_off = spa_history_log_to_phys(*offp, shpp); - - /* - * Read up to the minimum of what the user passed down or - * the EOF (physical or logical). If we hit physical EOF, - * use 'leftover' to read from the physical BOF. - */ - if (phys_read_off <= phys_eof) { - read_len = MIN(*len, phys_eof - phys_read_off); - } else { - read_len = MIN(*len, - shpp->sh_phys_max_off - phys_read_off); - if (phys_read_off + *len > shpp->sh_phys_max_off) { - leftover = MIN(*len - read_len, - phys_eof - shpp->sh_pool_create_len); - } - } - } - - /* offset for consumer to use next */ - *offp += read_len + leftover; - - /* tell the consumer how much you actually read */ - *len = read_len + leftover; - - if (read_len == 0) { - mutex_exit(&spa->spa_history_lock); - dmu_buf_rele(dbp, FTAG); - return (0); - } - - err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf); - if (leftover && err == 0) { - err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, - leftover, buf + read_len); - } - mutex_exit(&spa->spa_history_lock); - - dmu_buf_rele(dbp, FTAG); - return (err); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c deleted file mode 100644 index 5da1f96..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ /dev/null @@ -1,1130 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa_impl.h> -#include <sys/zio.h> -#include <sys/zio_checksum.h> -#include <sys/zio_compress.h> -#include <sys/dmu.h> -#include <sys/dmu_tx.h> -#include <sys/zap.h> -#include <sys/zil.h> -#include <sys/vdev_impl.h> -#include <sys/metaslab.h> -#include <sys/uberblock_impl.h> -#include <sys/txg.h> -#include <sys/avl.h> -#include <sys/unique.h> -#include <sys/dsl_pool.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_prop.h> -#include <sys/fs/zfs.h> - -/* - * SPA locking - * - * There are four basic locks for managing spa_t structures: - * - * spa_namespace_lock (global mutex) - * - * This lock must be acquired to do any of the following: - * - * - Lookup a spa_t by name - * - Add or remove a spa_t from the namespace - * - Increase spa_refcount from non-zero - * - Check if spa_refcount is zero - * - Rename a spa_t - * - add/remove/attach/detach devices - * - Held for the duration of create/destroy/import/export - * - * It does not need to handle recursion. 
A create or destroy may - * reference objects (files or zvols) in other pools, but by - * definition they must have an existing reference, and will never need - * to lookup a spa_t by name. - * - * spa_refcount (per-spa refcount_t protected by mutex) - * - * This reference count keep track of any active users of the spa_t. The - * spa_t cannot be destroyed or freed while this is non-zero. Internally, - * the refcount is never really 'zero' - opening a pool implicitly keeps - * some references in the DMU. Internally we check against SPA_MINREF, but - * present the image of a zero/non-zero value to consumers. - * - * spa_config_lock (per-spa crazy rwlock) - * - * This SPA special is a recursive rwlock, capable of being acquired from - * asynchronous threads. It has protects the spa_t from config changes, - * and must be held in the following circumstances: - * - * - RW_READER to perform I/O to the spa - * - RW_WRITER to change the vdev config - * - * spa_config_cache_lock (per-spa mutex) - * - * This mutex prevents the spa_config nvlist from being updated. No - * other locks are required to obtain this lock, although implicitly you - * must have the namespace lock or non-zero refcount to have any kind - * of spa_t pointer at all. - * - * The locking order is fairly straightforward: - * - * spa_namespace_lock -> spa_refcount - * - * The namespace lock must be acquired to increase the refcount from 0 - * or to check if it is zero. - * - * spa_refcount -> spa_config_lock - * - * There must be at least one valid reference on the spa_t to acquire - * the config lock. - * - * spa_namespace_lock -> spa_config_lock - * - * The namespace lock must always be taken before the config lock. - * - * - * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and - * are globally visible. - * - * The namespace is manipulated using the following functions, all which require - * the spa_namespace_lock to be held. - * - * spa_lookup() Lookup a spa_t by name. - * - * spa_add() Create a new spa_t in the namespace. - * - * spa_remove() Remove a spa_t from the namespace. This also - * frees up any memory associated with the spa_t. - * - * spa_next() Returns the next spa_t in the system, or the - * first if NULL is passed. - * - * spa_evict_all() Shutdown and remove all spa_t structures in - * the system. - * - * spa_guid_exists() Determine whether a pool/device guid exists. - * - * The spa_refcount is manipulated using the following functions: - * - * spa_open_ref() Adds a reference to the given spa_t. Must be - * called with spa_namespace_lock held if the - * refcount is currently zero. - * - * spa_close() Remove a reference from the spa_t. This will - * not free the spa_t or remove it from the - * namespace. No locking is required. - * - * spa_refcount_zero() Returns true if the refcount is currently - * zero. Must be called with spa_namespace_lock - * held. - * - * The spa_config_lock is manipulated using the following functions: - * - * spa_config_enter() Acquire the config lock as RW_READER or - * RW_WRITER. At least one reference on the spa_t - * must exist. - * - * spa_config_exit() Release the config lock. - * - * spa_config_held() Returns true if the config lock is currently - * held in the given state. - * - * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). - * - * spa_vdev_enter() Acquire the namespace lock and the config lock - * for writing. 
- * - * spa_vdev_exit() Release the config lock, wait for all I/O - * to complete, sync the updated configs to the - * cache, and release the namespace lock. - * - * The spa_name() function also requires either the spa_namespace_lock - * or the spa_config_lock, as both are needed to do a rename. spa_rename() is - * also implemented within this file since is requires manipulation of the - * namespace. - */ - -static avl_tree_t spa_namespace_avl; -kmutex_t spa_namespace_lock; -static kcondvar_t spa_namespace_cv; -static int spa_active_count; -int spa_max_replication_override = SPA_DVAS_PER_BP; - -static kmutex_t spa_spare_lock; -static avl_tree_t spa_spare_avl; - -kmem_cache_t *spa_buffer_pool; -int spa_mode; - -#ifdef ZFS_DEBUG -int zfs_flags = ~0; -#else -int zfs_flags = 0; -#endif - -/* - * zfs_recover can be set to nonzero to attempt to recover from - * otherwise-fatal errors, typically caused by on-disk corruption. When - * set, calls to zfs_panic_recover() will turn into warning messages. - */ -int zfs_recover = 0; -SYSCTL_DECL(_vfs_zfs); -TUNABLE_INT("vfs.zfs.recover", &zfs_recover); -SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0, - "Try to recover from otherwise-fatal errors."); - -#define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */ - -/* - * ========================================================================== - * SPA namespace functions - * ========================================================================== - */ - -/* - * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. - * Returns NULL if no matching spa_t is found. - */ -spa_t * -spa_lookup(const char *name) -{ - spa_t search, *spa; - avl_index_t where; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - search.spa_name = (char *)name; - spa = avl_find(&spa_namespace_avl, &search, &where); - - return (spa); -} - -/* - * Create an uninitialized spa_t with the given name. Requires - * spa_namespace_lock. The caller must ensure that the spa_t doesn't already - * exist by calling spa_lookup() first. - */ -spa_t * -spa_add(const char *name, const char *altroot) -{ - spa_t *spa; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); - - spa->spa_name = spa_strdup(name); - spa->spa_state = POOL_STATE_UNINITIALIZED; - spa->spa_freeze_txg = UINT64_MAX; - spa->spa_final_txg = UINT64_MAX; - - mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); - - cv_init(&spa->spa_scrub_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); - - refcount_create(&spa->spa_refcount); - refcount_create(&spa->spa_config_lock.scl_count); - - avl_add(&spa_namespace_avl, spa); - - /* - * Set the alternate root, if there is one. - */ - if (altroot) { - spa->spa_root = spa_strdup(altroot); - spa_active_count++; - } - - return (spa); -} - -/* - * Removes a spa_t from the namespace, freeing up any memory used. Requires - * spa_namespace_lock. This is called only after the spa_t has been closed and - * deactivated. 
- */ -void -spa_remove(spa_t *spa) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); - ASSERT(spa->spa_scrub_thread == NULL); - - avl_remove(&spa_namespace_avl, spa); - cv_broadcast(&spa_namespace_cv); - - if (spa->spa_root) { - spa_strfree(spa->spa_root); - spa_active_count--; - } - - if (spa->spa_name) - spa_strfree(spa->spa_name); - - spa_config_set(spa, NULL); - - refcount_destroy(&spa->spa_refcount); - refcount_destroy(&spa->spa_config_lock.scl_count); - - cv_destroy(&spa->spa_async_cv); - cv_destroy(&spa->spa_scrub_io_cv); - cv_destroy(&spa->spa_scrub_cv); - - mutex_destroy(&spa->spa_scrub_lock); - mutex_destroy(&spa->spa_async_lock); - mutex_destroy(&spa->spa_config_cache_lock); - - kmem_free(spa, sizeof (spa_t)); -} - -/* - * Given a pool, return the next pool in the namespace, or NULL if there is - * none. If 'prev' is NULL, return the first pool. - */ -spa_t * -spa_next(spa_t *prev) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - if (prev) - return (AVL_NEXT(&spa_namespace_avl, prev)); - else - return (avl_first(&spa_namespace_avl)); -} - -/* - * ========================================================================== - * SPA refcount functions - * ========================================================================== - */ - -/* - * Add a reference to the given spa_t. Must have at least one reference, or - * have the namespace lock held. - */ -void -spa_open_ref(spa_t *spa, void *tag) -{ - ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF || - MUTEX_HELD(&spa_namespace_lock)); - - (void) refcount_add(&spa->spa_refcount, tag); -} - -/* - * Remove a reference to the given spa_t. Must have at least one reference, or - * have the namespace lock held. - */ -void -spa_close(spa_t *spa, void *tag) -{ - ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF || - MUTEX_HELD(&spa_namespace_lock)); - - (void) refcount_remove(&spa->spa_refcount, tag); -} - -/* - * Check to see if the spa refcount is zero. Must be called with - * spa_namespace_lock held. We really compare against SPA_MINREF, which is the - * number of references acquired when opening a pool - */ -boolean_t -spa_refcount_zero(spa_t *spa) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - return (refcount_count(&spa->spa_refcount) == SPA_MINREF); -} - -/* - * ========================================================================== - * SPA spare tracking - * ========================================================================== - */ - -/* - * Spares are tracked globally due to the following constraints: - * - * - A spare may be part of multiple pools. - * - A spare may be added to a pool even if it's actively in use within - * another pool. - * - A spare in use in any pool can only be the source of a replacement if - * the target is a spare in the same pool. - * - * We keep track of all spares on the system through the use of a reference - * counted AVL tree. When a vdev is added as a spare, or used as a replacement - * spare, then we bump the reference count in the AVL tree. In addition, we set - * the 'vdev_isspare' member to indicate that the device is a spare (active or - * inactive). When a spare is made active (used to replace a device in the - * pool), we also keep track of which pool its been made a part of. - * - * The 'spa_spare_lock' protects the AVL tree. These functions are normally - * called under the spa_namespace lock as part of vdev reconfiguration. 
The - * separate spare lock exists for the status query path, which does not need to - * be completely consistent with respect to other vdev configuration changes. - */ - -typedef struct spa_spare { - uint64_t spare_guid; - uint64_t spare_pool; - avl_node_t spare_avl; - int spare_count; -} spa_spare_t; - -static int -spa_spare_compare(const void *a, const void *b) -{ - const spa_spare_t *sa = a; - const spa_spare_t *sb = b; - - if (sa->spare_guid < sb->spare_guid) - return (-1); - else if (sa->spare_guid > sb->spare_guid) - return (1); - else - return (0); -} - -void -spa_spare_add(vdev_t *vd) -{ - avl_index_t where; - spa_spare_t search; - spa_spare_t *spare; - - mutex_enter(&spa_spare_lock); - ASSERT(!vd->vdev_isspare); - - search.spare_guid = vd->vdev_guid; - if ((spare = avl_find(&spa_spare_avl, &search, &where)) != NULL) { - spare->spare_count++; - } else { - spare = kmem_zalloc(sizeof (spa_spare_t), KM_SLEEP); - spare->spare_guid = vd->vdev_guid; - spare->spare_count = 1; - avl_insert(&spa_spare_avl, spare, where); - } - vd->vdev_isspare = B_TRUE; - - mutex_exit(&spa_spare_lock); -} - -void -spa_spare_remove(vdev_t *vd) -{ - spa_spare_t search; - spa_spare_t *spare; - avl_index_t where; - - mutex_enter(&spa_spare_lock); - - search.spare_guid = vd->vdev_guid; - spare = avl_find(&spa_spare_avl, &search, &where); - - ASSERT(vd->vdev_isspare); - ASSERT(spare != NULL); - - if (--spare->spare_count == 0) { - avl_remove(&spa_spare_avl, spare); - kmem_free(spare, sizeof (spa_spare_t)); - } else if (spare->spare_pool == spa_guid(vd->vdev_spa)) { - spare->spare_pool = 0ULL; - } - - vd->vdev_isspare = B_FALSE; - mutex_exit(&spa_spare_lock); -} - -boolean_t -spa_spare_exists(uint64_t guid, uint64_t *pool) -{ - spa_spare_t search, *found; - avl_index_t where; - - mutex_enter(&spa_spare_lock); - - search.spare_guid = guid; - found = avl_find(&spa_spare_avl, &search, &where); - - if (pool) { - if (found) - *pool = found->spare_pool; - else - *pool = 0ULL; - } - - mutex_exit(&spa_spare_lock); - - return (found != NULL); -} - -void -spa_spare_activate(vdev_t *vd) -{ - spa_spare_t search, *found; - avl_index_t where; - - mutex_enter(&spa_spare_lock); - ASSERT(vd->vdev_isspare); - - search.spare_guid = vd->vdev_guid; - found = avl_find(&spa_spare_avl, &search, &where); - ASSERT(found != NULL); - ASSERT(found->spare_pool == 0ULL); - - found->spare_pool = spa_guid(vd->vdev_spa); - mutex_exit(&spa_spare_lock); -} - -/* - * ========================================================================== - * SPA config locking - * ========================================================================== - */ - -/* - * Acquire the config lock. The config lock is a special rwlock that allows for - * recursive enters. Because these enters come from the same thread as well as - * asynchronous threads working on behalf of the owner, we must unilaterally - * allow all reads access as long at least one reader is held (even if a write - * is requested). This has the side effect of write starvation, but write locks - * are extremely rare, and a solution to this problem would be significantly - * more complex (if even possible). - * - * We would like to assert that the namespace lock isn't held, but this is a - * valid use during create. 
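 *
 * Usage sketch (an editorial illustration, not part of the original file;
 * FTAG stands in for whatever tag the caller uses):
 *
 *     spa_config_enter(spa, RW_READER, FTAG);
 *     ... issue I/O against the current vdev configuration ...
 *     spa_config_exit(spa, FTAG);
 *
 * A configuration change takes RW_WRITER instead.  The thread that holds
 * the lock as writer may re-enter it recursively, and new readers are
 * admitted whenever at least one reader already holds the lock, even if a
 * writer is waiting (hence the write starvation noted above).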
- */ -void -spa_config_enter(spa_t *spa, krw_t rw, void *tag) -{ - spa_config_lock_t *scl = &spa->spa_config_lock; - - mutex_enter(&scl->scl_lock); - - if (scl->scl_writer != curthread) { - if (rw == RW_READER) { - while (scl->scl_writer != NULL) - cv_wait(&scl->scl_cv, &scl->scl_lock); - } else { - while (scl->scl_writer != NULL || - !refcount_is_zero(&scl->scl_count)) - cv_wait(&scl->scl_cv, &scl->scl_lock); - scl->scl_writer = curthread; - } - } - - (void) refcount_add(&scl->scl_count, tag); - - mutex_exit(&scl->scl_lock); -} - -/* - * Release the spa config lock, notifying any waiters in the process. - */ -void -spa_config_exit(spa_t *spa, void *tag) -{ - spa_config_lock_t *scl = &spa->spa_config_lock; - - mutex_enter(&scl->scl_lock); - - ASSERT(!refcount_is_zero(&scl->scl_count)); - if (refcount_remove(&scl->scl_count, tag) == 0) { - cv_broadcast(&scl->scl_cv); - scl->scl_writer = NULL; /* OK in either case */ - } - - mutex_exit(&scl->scl_lock); -} - -/* - * Returns true if the config lock is held in the given manner. - */ -boolean_t -spa_config_held(spa_t *spa, krw_t rw) -{ - spa_config_lock_t *scl = &spa->spa_config_lock; - boolean_t held; - - mutex_enter(&scl->scl_lock); - if (rw == RW_WRITER) - held = (scl->scl_writer == curthread); - else - held = !refcount_is_zero(&scl->scl_count); - mutex_exit(&scl->scl_lock); - - return (held); -} - -/* - * ========================================================================== - * SPA vdev locking - * ========================================================================== - */ - -/* - * Lock the given spa_t for the purpose of adding or removing a vdev. - * Grabs the global spa_namespace_lock plus the spa config lock for writing. - * It returns the next transaction group for the spa_t. - */ -uint64_t -spa_vdev_enter(spa_t *spa) -{ - /* - * Suspend scrub activity while we mess with the config. - */ - spa_scrub_suspend(spa); - - mutex_enter(&spa_namespace_lock); - - spa_config_enter(spa, RW_WRITER, spa); - - return (spa_last_synced_txg(spa) + 1); -} - -/* - * Unlock the spa_t after adding or removing a vdev. Besides undoing the - * locking of spa_vdev_enter(), we also want make sure the transactions have - * synced to disk, and then update the global configuration cache with the new - * information. - */ -int -spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) -{ - int config_changed = B_FALSE; - - ASSERT(txg > spa_last_synced_txg(spa)); - - /* - * Reassess the DTLs. - */ - vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); - - /* - * If the config changed, notify the scrub thread that it must restart. - */ - if (error == 0 && !list_is_empty(&spa->spa_dirty_list)) { - config_changed = B_TRUE; - spa_scrub_restart(spa, txg); - } - - spa_config_exit(spa, spa); - - /* - * Allow scrubbing to resume. - */ - spa_scrub_resume(spa); - - /* - * Note: this txg_wait_synced() is important because it ensures - * that there won't be more than one config change per txg. - * This allows us to use the txg as the generation number. - */ - if (error == 0) - txg_wait_synced(spa->spa_dsl_pool, txg); - - if (vd != NULL) { - ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0); - vdev_free(vd); - } - - /* - * If the config changed, update the config cache. 
- */ - if (config_changed) - spa_config_sync(); - - mutex_exit(&spa_namespace_lock); - - return (error); -} - -/* - * ========================================================================== - * Miscellaneous functions - * ========================================================================== - */ - -/* - * Rename a spa_t. - */ -int -spa_rename(const char *name, const char *newname) -{ - spa_t *spa; - int err; - - /* - * Lookup the spa_t and grab the config lock for writing. We need to - * actually open the pool so that we can sync out the necessary labels. - * It's OK to call spa_open() with the namespace lock held because we - * allow recursive calls for other reasons. - */ - mutex_enter(&spa_namespace_lock); - if ((err = spa_open(name, &spa, FTAG)) != 0) { - mutex_exit(&spa_namespace_lock); - return (err); - } - - spa_config_enter(spa, RW_WRITER, FTAG); - - avl_remove(&spa_namespace_avl, spa); - spa_strfree(spa->spa_name); - spa->spa_name = spa_strdup(newname); - avl_add(&spa_namespace_avl, spa); - - /* - * Sync all labels to disk with the new names by marking the root vdev - * dirty and waiting for it to sync. It will pick up the new pool name - * during the sync. - */ - vdev_config_dirty(spa->spa_root_vdev); - - spa_config_exit(spa, FTAG); - - txg_wait_synced(spa->spa_dsl_pool, 0); - - /* - * Sync the updated config cache. - */ - spa_config_sync(); - - spa_close(spa, FTAG); - - mutex_exit(&spa_namespace_lock); - - return (0); -} - - -/* - * Determine whether a pool with given pool_guid exists. If device_guid is - * non-zero, determine whether the pool exists *and* contains a device with the - * specified device_guid. - */ -boolean_t -spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) -{ - spa_t *spa; - avl_tree_t *t = &spa_namespace_avl; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { - if (spa->spa_state == POOL_STATE_UNINITIALIZED) - continue; - if (spa->spa_root_vdev == NULL) - continue; - if (spa_guid(spa) == pool_guid) { - if (device_guid == 0) - break; - - if (vdev_lookup_by_guid(spa->spa_root_vdev, - device_guid) != NULL) - break; - - /* - * Check any devices we may in the process of adding. 
- */ - if (spa->spa_pending_vdev) { - if (vdev_lookup_by_guid(spa->spa_pending_vdev, - device_guid) != NULL) - break; - } - } - } - - return (spa != NULL); -} - -char * -spa_strdup(const char *s) -{ - size_t len; - char *new; - - len = strlen(s); - new = kmem_alloc(len + 1, KM_SLEEP); - bcopy(s, new, len); - new[len] = '\0'; - - return (new); -} - -void -spa_strfree(char *s) -{ - kmem_free(s, strlen(s) + 1); -} - -uint64_t -spa_get_random(uint64_t range) -{ - uint64_t r; - - ASSERT(range != 0); - - (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); - - return (r % range); -} - -void -sprintf_blkptr(char *buf, int len, const blkptr_t *bp) -{ - int d; - - if (bp == NULL) { - (void) snprintf(buf, len, "<NULL>"); - return; - } - - if (BP_IS_HOLE(bp)) { - (void) snprintf(buf, len, "<hole>"); - return; - } - - (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ", - (u_longlong_t)BP_GET_LEVEL(bp), - dmu_ot[BP_GET_TYPE(bp)].ot_name, - (u_longlong_t)BP_GET_LSIZE(bp), - (u_longlong_t)BP_GET_PSIZE(bp)); - - for (d = 0; d < BP_GET_NDVAS(bp); d++) { - const dva_t *dva = &bp->blk_dva[d]; - (void) snprintf(buf + strlen(buf), len - strlen(buf), - "DVA[%d]=<%llu:%llx:%llx> ", d, - (u_longlong_t)DVA_GET_VDEV(dva), - (u_longlong_t)DVA_GET_OFFSET(dva), - (u_longlong_t)DVA_GET_ASIZE(dva)); - } - - (void) snprintf(buf + strlen(buf), len - strlen(buf), - "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx", - zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name, - zio_compress_table[BP_GET_COMPRESS(bp)].ci_name, - BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", - BP_IS_GANG(bp) ? "gang" : "contiguous", - (u_longlong_t)bp->blk_birth, - (u_longlong_t)bp->blk_fill, - (u_longlong_t)bp->blk_cksum.zc_word[0], - (u_longlong_t)bp->blk_cksum.zc_word[1], - (u_longlong_t)bp->blk_cksum.zc_word[2], - (u_longlong_t)bp->blk_cksum.zc_word[3]); -} - -void -spa_freeze(spa_t *spa) -{ - uint64_t freeze_txg = 0; - - spa_config_enter(spa, RW_WRITER, FTAG); - if (spa->spa_freeze_txg == UINT64_MAX) { - freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; - spa->spa_freeze_txg = freeze_txg; - } - spa_config_exit(spa, FTAG); - if (freeze_txg != 0) - txg_wait_synced(spa_get_dsl(spa), freeze_txg); -} - -void -zfs_panic_recover(const char *fmt, ...) -{ - va_list adx; - - va_start(adx, fmt); - vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); - va_end(adx); -} - -/* - * ========================================================================== - * Accessor functions - * ========================================================================== - */ - -krwlock_t * -spa_traverse_rwlock(spa_t *spa) -{ - return (&spa->spa_traverse_lock); -} - -int -spa_traverse_wanted(spa_t *spa) -{ - return (spa->spa_traverse_wanted); -} - -dsl_pool_t * -spa_get_dsl(spa_t *spa) -{ - return (spa->spa_dsl_pool); -} - -blkptr_t * -spa_get_rootblkptr(spa_t *spa) -{ - return (&spa->spa_ubsync.ub_rootbp); -} - -void -spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) -{ - spa->spa_uberblock.ub_rootbp = *bp; -} - -void -spa_altroot(spa_t *spa, char *buf, size_t buflen) -{ - if (spa->spa_root == NULL) - buf[0] = '\0'; - else - (void) strncpy(buf, spa->spa_root, buflen); -} - -int -spa_sync_pass(spa_t *spa) -{ - return (spa->spa_sync_pass); -} - -char * -spa_name(spa_t *spa) -{ - /* - * Accessing the name requires holding either the namespace lock or the - * config lock, both of which are required to do a rename. 
- */ - ASSERT(MUTEX_HELD(&spa_namespace_lock) || - spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER)); - - return (spa->spa_name); -} - -uint64_t -spa_guid(spa_t *spa) -{ - /* - * If we fail to parse the config during spa_load(), we can go through - * the error path (which posts an ereport) and end up here with no root - * vdev. We stash the original pool guid in 'spa_load_guid' to handle - * this case. - */ - if (spa->spa_root_vdev != NULL) - return (spa->spa_root_vdev->vdev_guid); - else - return (spa->spa_load_guid); -} - -uint64_t -spa_last_synced_txg(spa_t *spa) -{ - return (spa->spa_ubsync.ub_txg); -} - -uint64_t -spa_first_txg(spa_t *spa) -{ - return (spa->spa_first_txg); -} - -int -spa_state(spa_t *spa) -{ - return (spa->spa_state); -} - -uint64_t -spa_freeze_txg(spa_t *spa) -{ - return (spa->spa_freeze_txg); -} - -/* - * In the future, this may select among different metaslab classes - * depending on the zdp. For now, there's no such distinction. - */ -metaslab_class_t * -spa_metaslab_class_select(spa_t *spa) -{ - return (spa->spa_normal_class); -} - -/* - * Return how much space is allocated in the pool (ie. sum of all asize) - */ -uint64_t -spa_get_alloc(spa_t *spa) -{ - return (spa->spa_root_vdev->vdev_stat.vs_alloc); -} - -/* - * Return how much (raid-z inflated) space there is in the pool. - */ -uint64_t -spa_get_space(spa_t *spa) -{ - return (spa->spa_root_vdev->vdev_stat.vs_space); -} - -/* - * Return the amount of raid-z-deflated space in the pool. - */ -uint64_t -spa_get_dspace(spa_t *spa) -{ - if (spa->spa_deflate) - return (spa->spa_root_vdev->vdev_stat.vs_dspace); - else - return (spa->spa_root_vdev->vdev_stat.vs_space); -} - -/* ARGSUSED */ -uint64_t -spa_get_asize(spa_t *spa, uint64_t lsize) -{ - /* - * For now, the worst case is 512-byte RAID-Z blocks, in which - * case the space requirement is exactly 2x; so just assume that. - * Add to this the fact that we can have up to 3 DVAs per bp, and - * we have to multiply by a total of 6x. - */ - return (lsize * 6); -} - -uint64_t -spa_version(spa_t *spa) -{ - return (spa->spa_ubsync.ub_version); -} - -int -spa_max_replication(spa_t *spa) -{ - /* - * As of ZFS_VERSION == ZFS_VERSION_DITTO_BLOCKS, we are able to - * handle BPs with more than one DVA allocated. Set our max - * replication level accordingly. 
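 *
 * Concrete example (added for clarity): with the default
 * spa_max_replication_override of SPA_DVAS_PER_BP (up to 3 DVAs per
 * block pointer, per the spa_get_asize() comment above), a pool at or
 * beyond ZFS_VERSION_DITTO_BLOCKS reports a maximum replication of 3,
 * while an older pool is always limited to a single copy.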
- */ - if (spa_version(spa) < ZFS_VERSION_DITTO_BLOCKS) - return (1); - return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); -} - -uint64_t -bp_get_dasize(spa_t *spa, const blkptr_t *bp) -{ - int sz = 0, i; - - if (!spa->spa_deflate) - return (BP_GET_ASIZE(bp)); - - for (i = 0; i < SPA_DVAS_PER_BP; i++) { - vdev_t *vd = - vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i])); - sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> SPA_MINBLOCKSHIFT) * - vd->vdev_deflate_ratio; - } - return (sz); -} - -/* - * ========================================================================== - * Initialization and Termination - * ========================================================================== - */ - -static int -spa_name_compare(const void *a1, const void *a2) -{ - const spa_t *s1 = a1; - const spa_t *s2 = a2; - int s; - - s = strcmp(s1->spa_name, s2->spa_name); - if (s > 0) - return (1); - if (s < 0) - return (-1); - return (0); -} - -int -spa_busy(void) -{ - return (spa_active_count); -} - -void -spa_init(int mode) -{ - mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); - - avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), - offsetof(spa_t, spa_avl)); - - mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); - - avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_spare_t), - offsetof(spa_spare_t, spare_avl)); - - spa_mode = mode; - - refcount_init(); - unique_init(); - zio_init(); - dmu_init(); - zil_init(); - spa_config_load(); -} - -void -spa_fini(void) -{ - spa_evict_all(); - - zil_fini(); - dmu_fini(); - zio_fini(); - refcount_fini(); - - avl_destroy(&spa_namespace_avl); - avl_destroy(&spa_spare_avl); - - cv_destroy(&spa_namespace_cv); - mutex_destroy(&spa_namespace_lock); - mutex_destroy(&spa_spare_lock); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c deleted file mode 100644 index 23313a9..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c +++ /dev/null @@ -1,501 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/dmu.h> -#include <sys/zio.h> -#include <sys/space_map.h> - -/* - * Space map routines. - * NOTE: caller is responsible for all locking. 
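 *
 * Usage sketch (an editorial illustration, not from the original file;
 * start and size are hypothetical, properly aligned offsets):
 *
 *     mutex_enter(sm->sm_lock);
 *     space_map_add(sm, start, size);
 *     space_map_remove(sm, start, size);
 *     mutex_exit(sm->sm_lock);
 *
 * Most entry points below assert MUTEX_HELD(sm->sm_lock) rather than
 * taking the per-map lock themselves.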
- */ -static int -space_map_seg_compare(const void *x1, const void *x2) -{ - const space_seg_t *s1 = x1; - const space_seg_t *s2 = x2; - - if (s1->ss_start < s2->ss_start) { - if (s1->ss_end > s2->ss_start) - return (0); - return (-1); - } - if (s1->ss_start > s2->ss_start) { - if (s1->ss_start < s2->ss_end) - return (0); - return (1); - } - return (0); -} - -void -space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift, - kmutex_t *lp) -{ - bzero(sm, sizeof (*sm)); - - cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL); - avl_create(&sm->sm_root, space_map_seg_compare, - sizeof (space_seg_t), offsetof(struct space_seg, ss_node)); - - sm->sm_start = start; - sm->sm_size = size; - sm->sm_shift = shift; - sm->sm_lock = lp; -} - -void -space_map_destroy(space_map_t *sm) -{ - ASSERT(!sm->sm_loaded && !sm->sm_loading); - VERIFY3U(sm->sm_space, ==, 0); - avl_destroy(&sm->sm_root); - cv_destroy(&sm->sm_load_cv); -} - -void -space_map_add(space_map_t *sm, uint64_t start, uint64_t size) -{ - avl_index_t where; - space_seg_t ssearch, *ss_before, *ss_after, *ss; - uint64_t end = start + size; - int merge_before, merge_after; - - ASSERT(MUTEX_HELD(sm->sm_lock)); - VERIFY(size != 0); - VERIFY3U(start, >=, sm->sm_start); - VERIFY3U(end, <=, sm->sm_start + sm->sm_size); - VERIFY(sm->sm_space + size <= sm->sm_size); - VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); - VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); - - ssearch.ss_start = start; - ssearch.ss_end = end; - ss = avl_find(&sm->sm_root, &ssearch, &where); - - if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) { - zfs_panic_recover("zfs: allocating allocated segment" - "(offset=%llu size=%llu)\n", - (longlong_t)start, (longlong_t)size); - return; - } - - /* Make sure we don't overlap with either of our neighbors */ - VERIFY(ss == NULL); - - ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE); - ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER); - - merge_before = (ss_before != NULL && ss_before->ss_end == start); - merge_after = (ss_after != NULL && ss_after->ss_start == end); - - if (merge_before && merge_after) { - avl_remove(&sm->sm_root, ss_before); - ss_after->ss_start = ss_before->ss_start; - kmem_free(ss_before, sizeof (*ss_before)); - } else if (merge_before) { - ss_before->ss_end = end; - } else if (merge_after) { - ss_after->ss_start = start; - } else { - ss = kmem_alloc(sizeof (*ss), KM_SLEEP); - ss->ss_start = start; - ss->ss_end = end; - avl_insert(&sm->sm_root, ss, where); - } - - sm->sm_space += size; -} - -void -space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) -{ - avl_index_t where; - space_seg_t ssearch, *ss, *newseg; - uint64_t end = start + size; - int left_over, right_over; - - ASSERT(MUTEX_HELD(sm->sm_lock)); - VERIFY(size != 0); - VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); - VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); - - ssearch.ss_start = start; - ssearch.ss_end = end; - ss = avl_find(&sm->sm_root, &ssearch, &where); - - /* Make sure we completely overlap with someone */ - if (ss == NULL) { - zfs_panic_recover("zfs: freeing free segment " - "(offset=%llu size=%llu)", - (longlong_t)start, (longlong_t)size); - return; - } - VERIFY3U(ss->ss_start, <=, start); - VERIFY3U(ss->ss_end, >=, end); - VERIFY(sm->sm_space - size <= sm->sm_size); - - left_over = (ss->ss_start != start); - right_over = (ss->ss_end != end); - - if (left_over && right_over) { - newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP); - newseg->ss_start = end; - newseg->ss_end = 
ss->ss_end; - ss->ss_end = start; - avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER); - } else if (left_over) { - ss->ss_end = start; - } else if (right_over) { - ss->ss_start = end; - } else { - avl_remove(&sm->sm_root, ss); - kmem_free(ss, sizeof (*ss)); - } - - sm->sm_space -= size; -} - -int -space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) -{ - avl_index_t where; - space_seg_t ssearch, *ss; - uint64_t end = start + size; - - ASSERT(MUTEX_HELD(sm->sm_lock)); - VERIFY(size != 0); - VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); - VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); - - ssearch.ss_start = start; - ssearch.ss_end = end; - ss = avl_find(&sm->sm_root, &ssearch, &where); - - return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end); -} - -void -space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) -{ - space_seg_t *ss; - void *cookie = NULL; - - ASSERT(MUTEX_HELD(sm->sm_lock)); - - while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { - if (func != NULL) - func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); - kmem_free(ss, sizeof (*ss)); - } - sm->sm_space = 0; -} - -void -space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) -{ - space_seg_t *ss; - - for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) - func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); -} - -void -space_map_excise(space_map_t *sm, uint64_t start, uint64_t size) -{ - avl_tree_t *t = &sm->sm_root; - avl_index_t where; - space_seg_t *ss, search; - uint64_t end = start + size; - uint64_t rm_start, rm_end; - - ASSERT(MUTEX_HELD(sm->sm_lock)); - - search.ss_start = start; - search.ss_end = start; - - for (;;) { - ss = avl_find(t, &search, &where); - - if (ss == NULL) - ss = avl_nearest(t, where, AVL_AFTER); - - if (ss == NULL || ss->ss_start >= end) - break; - - rm_start = MAX(ss->ss_start, start); - rm_end = MIN(ss->ss_end, end); - - space_map_remove(sm, rm_start, rm_end - rm_start); - } -} - -/* - * Replace smd with the union of smd and sms. - */ -void -space_map_union(space_map_t *smd, space_map_t *sms) -{ - avl_tree_t *t = &sms->sm_root; - space_seg_t *ss; - - ASSERT(MUTEX_HELD(smd->sm_lock)); - - /* - * For each source segment, remove any intersections with the - * destination, then add the source segment to the destination. - */ - for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) { - space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start); - space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start); - } -} - -/* - * Wait for any in-progress space_map_load() to complete. - */ -void -space_map_load_wait(space_map_t *sm) -{ - ASSERT(MUTEX_HELD(sm->sm_lock)); - - while (sm->sm_loading) - cv_wait(&sm->sm_load_cv, sm->sm_lock); -} - -/* - * Note: space_map_load() will drop sm_lock across dmu_read() calls. - * The caller must be OK with this. 
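 *
 * Illustrative caller pattern (an editorial sketch; ops, smo, os and err
 * are the caller's own state, hypothetical here):
 *
 *     mutex_enter(sm->sm_lock);
 *     space_map_load_wait(sm);
 *     if (!sm->sm_loaded)
 *         err = space_map_load(sm, ops, SM_FREE, smo, os);
 *     mutex_exit(sm->sm_lock);
 *
 * Because sm_lock is dropped and reacquired around each dmu_read(), any
 * space-map state observed before the call must be revalidated afterwards.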
- */ -int -space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, - space_map_obj_t *smo, objset_t *os) -{ - uint64_t *entry, *entry_map, *entry_map_end; - uint64_t bufsize, size, offset, end, space; - uint64_t mapstart = sm->sm_start; - - ASSERT(MUTEX_HELD(sm->sm_lock)); - - space_map_load_wait(sm); - - if (sm->sm_loaded) - return (0); - - sm->sm_loading = B_TRUE; - end = smo->smo_objsize; - space = smo->smo_alloc; - - ASSERT(sm->sm_ops == NULL); - VERIFY3U(sm->sm_space, ==, 0); - - if (maptype == SM_FREE) { - space_map_add(sm, sm->sm_start, sm->sm_size); - space = sm->sm_size - space; - } - - bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT; - entry_map = zio_buf_alloc(bufsize); - - mutex_exit(sm->sm_lock); - if (end > bufsize) - dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize); - mutex_enter(sm->sm_lock); - - for (offset = 0; offset < end; offset += bufsize) { - size = MIN(end - offset, bufsize); - VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); - VERIFY(size != 0); - - dprintf("object=%llu offset=%llx size=%llx\n", - smo->smo_object, offset, size); - - mutex_exit(sm->sm_lock); - VERIFY3U(dmu_read(os, smo->smo_object, offset, size, - entry_map), ==, 0); - mutex_enter(sm->sm_lock); - - entry_map_end = entry_map + (size / sizeof (uint64_t)); - for (entry = entry_map; entry < entry_map_end; entry++) { - uint64_t e = *entry; - - if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ - continue; - - (SM_TYPE_DECODE(e) == maptype ? - space_map_add : space_map_remove)(sm, - (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart, - SM_RUN_DECODE(e) << sm->sm_shift); - } - } - VERIFY3U(sm->sm_space, ==, space); - - zio_buf_free(entry_map, bufsize); - - sm->sm_loading = B_FALSE; - sm->sm_loaded = B_TRUE; - sm->sm_ops = ops; - - cv_broadcast(&sm->sm_load_cv); - - if (ops != NULL) - ops->smop_load(sm); - - return (0); -} - -void -space_map_unload(space_map_t *sm) -{ - ASSERT(MUTEX_HELD(sm->sm_lock)); - - if (sm->sm_loaded && sm->sm_ops != NULL) - sm->sm_ops->smop_unload(sm); - - sm->sm_loaded = B_FALSE; - sm->sm_ops = NULL; - - space_map_vacate(sm, NULL, NULL); -} - -uint64_t -space_map_alloc(space_map_t *sm, uint64_t size) -{ - uint64_t start; - - start = sm->sm_ops->smop_alloc(sm, size); - if (start != -1ULL) - space_map_remove(sm, start, size); - return (start); -} - -void -space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) -{ - sm->sm_ops->smop_claim(sm, start, size); - space_map_remove(sm, start, size); -} - -void -space_map_free(space_map_t *sm, uint64_t start, uint64_t size) -{ - space_map_add(sm, start, size); - sm->sm_ops->smop_free(sm, start, size); -} - -/* - * Note: space_map_sync() will drop sm_lock across dmu_write() calls. - */ -void -space_map_sync(space_map_t *sm, uint8_t maptype, - space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) -{ - spa_t *spa = dmu_objset_spa(os); - void *cookie = NULL; - space_seg_t *ss; - uint64_t bufsize, start, size, run_len; - uint64_t *entry, *entry_map, *entry_map_end; - - ASSERT(MUTEX_HELD(sm->sm_lock)); - - if (sm->sm_space == 0) - return; - - dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n", - smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa), - maptype == SM_ALLOC ? 
'A' : 'F', avl_numnodes(&sm->sm_root), - sm->sm_space); - - if (maptype == SM_ALLOC) - smo->smo_alloc += sm->sm_space; - else - smo->smo_alloc -= sm->sm_space; - - bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t); - bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT); - entry_map = zio_buf_alloc(bufsize); - entry_map_end = entry_map + (bufsize / sizeof (uint64_t)); - entry = entry_map; - - *entry++ = SM_DEBUG_ENCODE(1) | - SM_DEBUG_ACTION_ENCODE(maptype) | - SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) | - SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); - - while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { - size = ss->ss_end - ss->ss_start; - start = (ss->ss_start - sm->sm_start) >> sm->sm_shift; - - sm->sm_space -= size; - size >>= sm->sm_shift; - - while (size) { - run_len = MIN(size, SM_RUN_MAX); - - if (entry == entry_map_end) { - mutex_exit(sm->sm_lock); - dmu_write(os, smo->smo_object, smo->smo_objsize, - bufsize, entry_map, tx); - mutex_enter(sm->sm_lock); - smo->smo_objsize += bufsize; - entry = entry_map; - } - - *entry++ = SM_OFFSET_ENCODE(start) | - SM_TYPE_ENCODE(maptype) | - SM_RUN_ENCODE(run_len); - - start += run_len; - size -= run_len; - } - kmem_free(ss, sizeof (*ss)); - } - - if (entry != entry_map) { - size = (entry - entry_map) * sizeof (uint64_t); - mutex_exit(sm->sm_lock); - dmu_write(os, smo->smo_object, smo->smo_objsize, - size, entry_map, tx); - mutex_enter(sm->sm_lock); - smo->smo_objsize += size; - } - - zio_buf_free(entry_map, bufsize); - - VERIFY3U(sm->sm_space, ==, 0); -} - -void -space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) -{ - VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0); - - smo->smo_objsize = 0; - smo->smo_alloc = 0; -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h deleted file mode 100644 index f58ffc0..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_ARC_H -#define _SYS_ARC_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> - -#ifdef __cplusplus -extern "C" { -#endif - -#include <sys/zio.h> - -typedef struct arc_buf_hdr arc_buf_hdr_t; -typedef struct arc_buf arc_buf_t; -typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); -typedef void arc_byteswap_func_t(void *buf, size_t size); -typedef int arc_evict_func_t(void *private); - -/* generic arc_done_func_t's which you can use */ -arc_done_func_t arc_bcopy_func; -arc_done_func_t arc_getbuf_func; - -struct arc_buf { - arc_buf_hdr_t *b_hdr; - arc_buf_t *b_next; - void *b_data; - arc_evict_func_t *b_efunc; - void *b_private; -}; - -typedef enum arc_buf_contents { - ARC_BUFC_UNDEF, /* buffer contents undefined */ - ARC_BUFC_DATA, /* buffer contains data */ - ARC_BUFC_METADATA /* buffer contains metadata */ -} arc_buf_contents_t; -/* - * These are the flags we pass into calls to the arc - */ -#define ARC_WAIT (1 << 1) /* perform I/O synchronously */ -#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */ -#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ -#define ARC_CACHED (1 << 4) /* I/O was already in cache */ - -arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, - arc_buf_contents_t type); -void arc_buf_add_ref(arc_buf_t *buf, void *tag); -int arc_buf_remove_ref(arc_buf_t *buf, void *tag); -int arc_buf_size(arc_buf_t *buf); -void arc_release(arc_buf_t *buf, void *tag); -int arc_released(arc_buf_t *buf); -int arc_has_callback(arc_buf_t *buf); -void arc_buf_freeze(arc_buf_t *buf); -void arc_buf_thaw(arc_buf_t *buf); -#ifdef ZFS_DEBUG -int arc_referenced(arc_buf_t *buf); -#endif - -int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, - arc_done_func_t *done, void *private, int priority, int flags, - uint32_t *arc_flags, zbookmark_t *zb); -zio_t *arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, - int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb); -int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, uint32_t arc_flags); -int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); - -void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); -int arc_buf_evict(arc_buf_t *buf); - -void arc_flush(void); -void arc_tempreserve_clear(uint64_t tempreserve); -int arc_tempreserve_space(uint64_t tempreserve); - -void arc_init(void); -void arc_fini(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ARC_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h deleted file mode 100644 index b4c8376..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_BPLIST_H -#define _SYS_BPLIST_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu.h> -#include <sys/spa.h> -#include <sys/txg.h> -#include <sys/zfs_context.h> - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct bplist_phys { - /* - * This is the bonus buffer for the dead lists. The object's - * contents is an array of bpl_entries blkptr_t's, representing - * a total of bpl_bytes physical space. - */ - uint64_t bpl_entries; - uint64_t bpl_bytes; - uint64_t bpl_comp; - uint64_t bpl_uncomp; -} bplist_phys_t; - -#define BPLIST_SIZE_V0 (2 * sizeof (uint64_t)) - -typedef struct bplist_q { - blkptr_t bpq_blk; - void *bpq_next; -} bplist_q_t; - -typedef struct bplist { - kmutex_t bpl_lock; - objset_t *bpl_mos; - uint64_t bpl_object; - uint8_t bpl_blockshift; - uint8_t bpl_bpshift; - uint8_t bpl_havecomp; - bplist_q_t *bpl_queue; - bplist_phys_t *bpl_phys; - dmu_buf_t *bpl_dbuf; - dmu_buf_t *bpl_cached_dbuf; -} bplist_t; - -extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx); -extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx); -extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); -extern void bplist_close(bplist_t *bpl); -extern boolean_t bplist_empty(bplist_t *bpl); -extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp); -extern int bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx); -extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp); -extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx); -extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); -extern int bplist_space(bplist_t *bpl, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_BPLIST_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h deleted file mode 100644 index d33657b..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ /dev/null @@ -1,334 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_DBUF_H -#define _SYS_DBUF_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu.h> -#include <sys/spa.h> -#include <sys/txg.h> -#include <sys/zio.h> -#include <sys/arc.h> -#include <sys/zfs_context.h> -#include <sys/refcount.h> - -#ifdef __cplusplus -extern "C" { -#endif - -#define DB_BONUS_BLKID (-1ULL) -#define IN_DMU_SYNC 2 - -/* - * define flags for dbuf_read - */ - -#define DB_RF_MUST_SUCCEED (1 << 0) -#define DB_RF_CANFAIL (1 << 1) -#define DB_RF_HAVESTRUCT (1 << 2) -#define DB_RF_NOPREFETCH (1 << 3) -#define DB_RF_NEVERWAIT (1 << 4) -#define DB_RF_CACHED (1 << 5) - -/* - * The state transition diagram for dbufs looks like: - * - * +----> READ ----+ - * | | - * | V - * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) - * | ^ - * | | - * +----> FILL ----+ - */ -typedef enum dbuf_states { - DB_UNCACHED, - DB_FILL, - DB_READ, - DB_CACHED, - DB_EVICTING -} dbuf_states_t; - -struct objset_impl; -struct dnode; -struct dmu_tx; - -/* - * level = 0 means the user data - * level = 1 means the single indirect block - * etc. - */ - -#define LIST_LINK_INACTIVE(link) \ - ((link)->list_next == NULL && (link)->list_prev == NULL) - -struct dmu_buf_impl; - -typedef enum override_states { - DR_NOT_OVERRIDDEN, - DR_IN_DMU_SYNC, - DR_OVERRIDDEN -} override_states_t; - -typedef struct dbuf_dirty_record { - /* link on our parents dirty list */ - list_node_t dr_dirty_node; - - /* transaction group this data will sync in */ - uint64_t dr_txg; - - /* zio of outstanding write IO */ - zio_t *dr_zio; - - /* pointer back to our dbuf */ - struct dmu_buf_impl *dr_dbuf; - - /* pointer to next dirty record */ - struct dbuf_dirty_record *dr_next; - - /* pointer to parent dirty record */ - struct dbuf_dirty_record *dr_parent; - - union dirty_types { - struct dirty_indirect { - - /* protect access to list */ - kmutex_t dr_mtx; - - /* Our list of dirty children */ - list_t dr_children; - } di; - struct dirty_leaf { - - /* - * dr_data is set when we dirty the buffer - * so that we can retain the pointer even if it - * gets COW'd in a subsequent transaction group. - */ - arc_buf_t *dr_data; - blkptr_t dr_overridden_by; - override_states_t dr_override_state; - } dl; - } dt; -} dbuf_dirty_record_t; - -typedef struct dmu_buf_impl { - /* - * The following members are immutable, with the exception of - * db.db_data, which is protected by db_mtx. - */ - - /* the publicly visible structure */ - dmu_buf_t db; - - /* the objset we belong to */ - struct objset_impl *db_objset; - - /* - * the dnode we belong to (NULL when evicted) - */ - struct dnode *db_dnode; - - /* - * our parent buffer; if the dnode points to us directly, - * db_parent == db_dnode->dn_dbuf - * only accessed by sync thread ??? - * (NULL when evicted) - */ - struct dmu_buf_impl *db_parent; - - /* - * link for hash table of all dmu_buf_impl_t's - */ - struct dmu_buf_impl *db_hash_next; - - /* our block number */ - uint64_t db_blkid; - - /* - * Pointer to the blkptr_t which points to us. May be NULL if we - * don't have one yet. (NULL when evicted) - */ - blkptr_t *db_blkptr; - - /* - * Our indirection level. Data buffers have db_level==0. - * Indirect buffers which point to data buffers have - * db_level==1. etc. Buffers which contain dnodes have - * db_level==0, since the dnodes are stored in a file. - */ - uint8_t db_level; - - /* db_mtx protects the members below */ - kmutex_t db_mtx; - - /* - * Current state of the buffer - */ - dbuf_states_t db_state; - - /* - * Refcount accessed by dmu_buf_{hold,rele}. 
- * If nonzero, the buffer can't be destroyed. - * Protected by db_mtx. - */ - refcount_t db_holds; - - /* buffer holding our data */ - arc_buf_t *db_buf; - - kcondvar_t db_changed; - dbuf_dirty_record_t *db_data_pending; - - /* pointer to most recent dirty record for this buffer */ - dbuf_dirty_record_t *db_last_dirty; - - /* - * Our link on the owner dnodes's dn_dbufs list. - * Protected by its dn_dbufs_mtx. - */ - list_node_t db_link; - - /* Data which is unique to data (leaf) blocks: */ - - /* stuff we store for the user (see dmu_buf_set_user) */ - void *db_user_ptr; - void **db_user_data_ptr_ptr; - dmu_buf_evict_func_t *db_evict_func; - - uint8_t db_immediate_evict; - uint8_t db_freed_in_flight; - - uint8_t db_dirtycnt; -} dmu_buf_impl_t; - -/* Note: the dbuf hash table is exposed only for the mdb module */ -#define DBUF_MUTEXES 256 -#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) -typedef struct dbuf_hash_table { - uint64_t hash_table_mask; - dmu_buf_impl_t **hash_table; - kmutex_t hash_mutexes[DBUF_MUTEXES]; -} dbuf_hash_table_t; - - -uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); - -dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); -dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn); - -dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); -dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, - void *tag); -int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, - void *tag, dmu_buf_impl_t **dbp); - -void dbuf_prefetch(struct dnode *dn, uint64_t blkid); - -void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); -uint64_t dbuf_refcount(dmu_buf_impl_t *db); - -void dbuf_rele(dmu_buf_impl_t *db, void *tag); - -dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid); - -int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); -void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); -void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); -void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); -void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); -dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); - -void dbuf_clear(dmu_buf_impl_t *db); -void dbuf_evict(dmu_buf_impl_t *db); - -void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -void dbuf_unoverride(dbuf_dirty_record_t *dr); -void dbuf_sync_list(list_t *list, dmu_tx_t *tx); - -void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks, - struct dmu_tx *); - -void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); - -void dbuf_init(void); -void dbuf_fini(void); - -#define DBUF_GET_BUFC_TYPE(db) \ - ((((db)->db_level > 0) || \ - (dmu_ot[(db)->db_dnode->dn_type].ot_metadata)) ? \ - ARC_BUFC_METADATA : ARC_BUFC_DATA); - -#ifdef ZFS_DEBUG - -/* - * There should be a ## between the string literal and fmt, to make it - * clear that we're joining two strings together, but gcc does not - * support that preprocessor token. - */ -#define dprintf_dbuf(dbuf, fmt, ...) 
do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char __db_buf[32]; \ - uint64_t __db_obj = (dbuf)->db.db_object; \ - if (__db_obj == DMU_META_DNODE_OBJECT) \ - (void) strcpy(__db_buf, "mdn"); \ - else \ - (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ - (u_longlong_t)__db_obj); \ - dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \ - "obj=%s lvl=%u blkid=%lld " fmt, \ - __db_buf, (dbuf)->db_level, \ - (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ - } \ -_NOTE(CONSTCOND) } while (0) - -#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ - dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ - kmem_free(__blkbuf, BP_SPRINTF_LEN); \ - } \ -_NOTE(CONSTCOND) } while (0) - -#define DBUF_VERIFY(db) dbuf_verify(db) - -#else - -#define dprintf_dbuf(db, fmt, ...) -#define dprintf_dbuf_bp(db, bp, fmt, ...) -#define DBUF_VERIFY(db) - -#endif - - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DBUF_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h deleted file mode 100644 index 8c2a1fd..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ /dev/null @@ -1,587 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_DMU_H -#define _SYS_DMU_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * This file describes the interface that the DMU provides for its - * consumers. - * - * The DMU also interacts with the SPA. That interface is described in - * dmu_spa.h. 
- */ - -#include <sys/types.h> -#include <sys/param.h> - -#ifdef __cplusplus -extern "C" { -#endif - -struct uio; -struct page; -struct vnode; -struct spa; -struct zilog; -struct zio; -struct blkptr; -struct zap_cursor; -struct dsl_dataset; -struct dsl_pool; -struct dnode; -struct drr_begin; -struct drr_end; -struct zbookmark; -struct spa; -struct nvlist; -struct objset_impl; -struct file; - -typedef struct objset objset_t; -typedef struct dmu_tx dmu_tx_t; -typedef struct dsl_dir dsl_dir_t; - -typedef enum dmu_object_type { - DMU_OT_NONE, - /* general: */ - DMU_OT_OBJECT_DIRECTORY, /* ZAP */ - DMU_OT_OBJECT_ARRAY, /* UINT64 */ - DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ - DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ - DMU_OT_BPLIST, /* UINT64 */ - DMU_OT_BPLIST_HDR, /* UINT64 */ - /* spa: */ - DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ - DMU_OT_SPACE_MAP, /* UINT64 */ - /* zil: */ - DMU_OT_INTENT_LOG, /* UINT64 */ - /* dmu: */ - DMU_OT_DNODE, /* DNODE */ - DMU_OT_OBJSET, /* OBJSET */ - /* dsl: */ - DMU_OT_DSL_DIR, /* UINT64 */ - DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ - DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ - DMU_OT_DSL_PROPS, /* ZAP */ - DMU_OT_DSL_DATASET, /* UINT64 */ - /* zpl: */ - DMU_OT_ZNODE, /* ZNODE */ - DMU_OT_ACL, /* ACL */ - DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ - DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ - DMU_OT_MASTER_NODE, /* ZAP */ - DMU_OT_UNLINKED_SET, /* ZAP */ - /* zvol: */ - DMU_OT_ZVOL, /* UINT8 */ - DMU_OT_ZVOL_PROP, /* ZAP */ - /* other; for testing only! */ - DMU_OT_PLAIN_OTHER, /* UINT8 */ - DMU_OT_UINT64_OTHER, /* UINT64 */ - DMU_OT_ZAP_OTHER, /* ZAP */ - /* new object types: */ - DMU_OT_ERROR_LOG, /* ZAP */ - DMU_OT_SPA_HISTORY, /* UINT8 */ - DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ - DMU_OT_POOL_PROPS, /* ZAP */ - - DMU_OT_NUMTYPES -} dmu_object_type_t; - -typedef enum dmu_objset_type { - DMU_OST_NONE, - DMU_OST_META, - DMU_OST_ZFS, - DMU_OST_ZVOL, - DMU_OST_OTHER, /* For testing only! */ - DMU_OST_ANY, /* Be careful! */ - DMU_OST_NUMTYPES -} dmu_objset_type_t; - -void byteswap_uint64_array(void *buf, size_t size); -void byteswap_uint32_array(void *buf, size_t size); -void byteswap_uint16_array(void *buf, size_t size); -void byteswap_uint8_array(void *buf, size_t size); -void zap_byteswap(void *buf, size_t size); -void zfs_acl_byteswap(void *buf, size_t size); -void zfs_znode_byteswap(void *buf, size_t size); - -#define DS_MODE_NONE 0 /* invalid, to aid debugging */ -#define DS_MODE_STANDARD 1 /* normal access, no special needs */ -#define DS_MODE_PRIMARY 2 /* the "main" access, e.g. a mount */ -#define DS_MODE_EXCLUSIVE 3 /* exclusive access, e.g. to destroy */ -#define DS_MODE_LEVELS 4 -#define DS_MODE_LEVEL(x) ((x) & (DS_MODE_LEVELS - 1)) -#define DS_MODE_READONLY 0x8 -#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY) -#define DS_MODE_INCONSISTENT 0x10 -#define DS_MODE_IS_INCONSISTENT(x) ((x) & DS_MODE_INCONSISTENT) - -#define DS_FIND_SNAPSHOTS (1<<0) -#define DS_FIND_CHILDREN (1<<1) - -/* - * The maximum number of bytes that can be accessed as part of one - * operation, including metadata. - */ -#define DMU_MAX_ACCESS (10<<20) /* 10MB */ - -/* - * Public routines to create, destroy, open, and close objsets. 
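The DS_MODE_* definitions above pack an access level into the low bits and keep the read-only and inconsistent flags as independent bits. A small, hypothetical illustration of that layout (not part of the original header; the helper name is invented):

#include <sys/dmu.h>

static void
example_ds_mode(void)
{
	int mode = DS_MODE_PRIMARY | DS_MODE_READONLY;

	/* the low DS_MODE_LEVELS bits carry the access level */
	ASSERT(DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY);
	/* the flag bits are orthogonal to the level */
	ASSERT(DS_MODE_IS_READONLY(mode));
	ASSERT(!DS_MODE_IS_INCONSISTENT(mode));
}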
- */ -int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp); -void dmu_objset_close(objset_t *os); -int dmu_objset_evict_dbufs(objset_t *os, int try); -int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, - void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg); -int dmu_objset_destroy(const char *name); -int dmu_snapshots_destroy(char *fsname, char *snapname); -int dmu_objset_rollback(const char *name); -int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); -int dmu_objset_rename(const char *name, const char *newname, - boolean_t recursive); -int dmu_objset_find(char *name, int func(char *, void *), void *arg, - int flags); -void dmu_objset_byteswap(void *buf, size_t size); - -typedef struct dmu_buf { - uint64_t db_object; /* object that this buffer is part of */ - uint64_t db_offset; /* byte offset in this object */ - uint64_t db_size; /* size of buffer in bytes */ - void *db_data; /* data in buffer */ -} dmu_buf_t; - -typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); - -/* - * Callback function to perform byte swapping on a block. - */ -typedef void dmu_byteswap_func_t(void *buf, size_t size); - -/* - * The names of zap entries in the DIRECTORY_OBJECT of the MOS. - */ -#define DMU_POOL_DIRECTORY_OBJECT 1 -#define DMU_POOL_CONFIG "config" -#define DMU_POOL_ROOT_DATASET "root_dataset" -#define DMU_POOL_SYNC_BPLIST "sync_bplist" -#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" -#define DMU_POOL_ERRLOG_LAST "errlog_last" -#define DMU_POOL_SPARES "spares" -#define DMU_POOL_DEFLATE "deflate" -#define DMU_POOL_HISTORY "history" -#define DMU_POOL_PROPS "pool_props" - -/* - * Allocate an object from this objset. The range of object numbers - * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. - * - * The transaction must be assigned to a txg. The newly allocated - * object will be "held" in the transaction (ie. you can modify the - * newly allocated object in this transaction). - * - * dmu_object_alloc() chooses an object and returns it in *objectp. - * - * dmu_object_claim() allocates a specific object number. If that - * number is already allocated, it fails and returns EEXIST. - * - * Return 0 on success, or ENOSPC or EEXIST as specified above. - */ -uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); -int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); -int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); - -/* - * Free an object from this objset. - * - * The object's data will be freed as well (ie. you don't need to call - * dmu_free(object, 0, -1, tx)). - * - * The object need not be held in the transaction. - * - * If there are any holds on this object's buffers (via dmu_buf_hold()), - * or tx holds on the object (via dmu_tx_hold_object()), you can not - * free it; it fails and returns EBUSY. - * - * If the object is not allocated, it fails and returns ENOENT. - * - * Return 0 on success, or EBUSY or ENOENT as specified above. - */ -int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); - -/* - * Find the next allocated or free object. - * - * The objectp parameter is in-out. It will be updated to be the next - * object which is allocated. 
Ignore objects which have not been - * modified since txg. - * - * XXX Can only be called on a objset with no dirty data. - * - * Returns 0 on success, or ENOENT if there are no more objects. - */ -int dmu_object_next(objset_t *os, uint64_t *objectp, - boolean_t hole, uint64_t txg); - -/* - * Set the data blocksize for an object. - * - * The object cannot have any blocks allcated beyond the first. If - * the first block is allocated already, the new size must be greater - * than the current block size. If these conditions are not met, - * ENOTSUP will be returned. - * - * Returns 0 on success, or EBUSY if there are any holds on the object - * contents, or ENOTSUP as described above. - */ -int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, - int ibs, dmu_tx_t *tx); - -/* - * Set the checksum property on a dnode. The new checksum algorithm will - * apply to all newly written blocks; existing blocks will not be affected. - */ -void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, - dmu_tx_t *tx); - -/* - * Set the compress property on a dnode. The new compression algorithm will - * apply to all newly written blocks; existing blocks will not be affected. - */ -void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, - dmu_tx_t *tx); - -/* - * Decide how many copies of a given block we should make. Can be from - * 1 to SPA_DVAS_PER_BP. - */ -int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb, - dmu_object_type_t ot); -/* - * The bonus data is accessed more or less like a regular buffer. - * You must dmu_bonus_hold() to get the buffer, which will give you a - * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus - * data. As with any normal buffer, you must call dmu_buf_read() to - * read db_data, dmu_buf_will_dirty() before modifying it, and the - * object must be held in an assigned transaction before calling - * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus - * buffer as well. You must release your hold with dmu_buf_rele(). - */ -int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); -int dmu_bonus_max(void); - -/* - * Obtain the DMU buffer from the specified object which contains the - * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so - * that it will remain in memory. You must release the hold with - * dmu_buf_rele(). You musn't access the dmu_buf_t after releasing your - * hold. You must have a hold on any dmu_buf_t* you pass to the DMU. - * - * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill - * on the returned buffer before reading or writing the buffer's - * db_data. The comments for those routines describe what particular - * operations are valid after calling them. - * - * The object number must be a valid, allocated object number. - */ -int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **); -void dmu_buf_add_ref(dmu_buf_t *db, void* tag); -void dmu_buf_rele(dmu_buf_t *db, void *tag); -uint64_t dmu_buf_refcount(dmu_buf_t *db); - -/* - * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a - * range of an object. A pointer to an array of dmu_buf_t*'s is - * returned (in *dbpp). - * - * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and - * frees the array. The hold on the array of buffers MUST be released - * with dmu_buf_rele_array. You can NOT release the hold on each buffer - * individually with dmu_buf_rele. 
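A minimal, hypothetical sketch of the hold/release discipline described above for dmu_buf_hold() and dmu_buf_rele() (the helper name is illustrative and error handling is reduced to the essentials):

#include <sys/dmu.h>

static int
example_buf_hold(objset_t *os, uint64_t object, uint64_t offset)
{
	void *tag = (void *)example_buf_hold;	/* pairs this hold with its rele */
	dmu_buf_t *db;
	int err;

	err = dmu_buf_hold(os, object, offset, tag, &db);
	if (err != 0)
		return (err);

	/*
	 * Per the comment above, db->db_data may only be examined after
	 * dmu_buf_read(), or after a will_dirty/will_fill call made with
	 * an assigned transaction.
	 */

	dmu_buf_rele(db, tag);			/* every hold gets a matching rele */
	return (0);
}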
- */ -int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); -void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); - -/* - * Returns NULL on success, or the existing user ptr if it's already - * been set. - * - * user_ptr is for use by the user and can be obtained via dmu_buf_get_user(). - * - * user_data_ptr_ptr should be NULL, or a pointer to a pointer which - * will be set to db->db_data when you are allowed to access it. Note - * that db->db_data (the pointer) can change when you do dmu_buf_read(), - * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill(). - * *user_data_ptr_ptr will be set to the new value when it changes. - * - * If non-NULL, pageout func will be called when this buffer is being - * excised from the cache, so that you can clean up the data structure - * pointed to by user_ptr. - * - * dmu_evict_user() will call the pageout func for all buffers in a - * objset with a given pageout func. - */ -void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *pageout_func); -/* - * set_user_ie is the same as set_user, but request immediate eviction - * when hold count goes to zero. - */ -void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, - void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func); -void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, - void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *pageout_func); -void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); - -/* - * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set. - */ -void *dmu_buf_get_user(dmu_buf_t *db); - -/* - * Indicate that you are going to modify the buffer's data (db_data). - * - * The transaction (tx) must be assigned to a txg (ie. you've called - * dmu_tx_assign()). The buffer's object must be held in the tx - * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). - */ -void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); - -/* - * You must create a transaction, then hold the objects which you will - * (or might) modify as part of this transaction. Then you must assign - * the transaction to a transaction group. Once the transaction has - * been assigned, you can modify buffers which belong to held objects as - * part of this transaction. You can't modify buffers before the - * transaction has been assigned; you can't modify buffers which don't - * belong to objects which this transaction holds; you can't hold - * objects once the transaction has been assigned. You may hold an - * object which you are going to free (with dmu_object_free()), but you - * don't have to. - * - * You can abort the transaction before it has been assigned. - * - * Note that you may hold buffers (with dmu_buf_hold) at any time, - * regardless of transaction state. - */ - -#define DMU_NEW_OBJECT (-1ULL) -#define DMU_OBJECT_END (-1ULL) - -dmu_tx_t *dmu_tx_create(objset_t *os); -void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); -void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, - uint64_t len); -void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name); -void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); -void dmu_tx_abort(dmu_tx_t *tx); -int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); -void dmu_tx_wait(dmu_tx_t *tx); -void dmu_tx_commit(dmu_tx_t *tx); - -/* - * Free up the data blocks for a defined range of a file. 
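A hypothetical sketch of the transaction lifecycle the comment above prescribes: create, hold, assign, modify, commit. TXG_WAIT is declared in sys/txg.h (not shown in this diff), dmu_write() is declared a little further down in this header, and the helper name is illustrative:

#include <sys/dmu.h>

static int
example_write(objset_t *os, uint64_t object, uint64_t off, int len,
    const void *data)
{
	dmu_tx_t *tx;
	int err;

	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, off, len);	/* declare intent before assign */
	err = dmu_tx_assign(tx, TXG_WAIT);		/* wait for an open txg */
	if (err != 0) {
		dmu_tx_abort(tx);			/* unassigned tx must be aborted */
		return (err);
	}
	dmu_write(os, object, off, len, data, tx);	/* modify only after assignment */
	dmu_tx_commit(tx);
	return (0);
}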
If size is - * zero, the range from offset to end-of-file is freed. - */ -int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size, dmu_tx_t *tx); - -/* - * Convenience functions. - * - * Canfail routines will return 0 on success, or an errno if there is a - * nonrecoverable I/O error. - */ -int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf); -void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx); -int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); -int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, - dmu_tx_t *tx); -int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size, struct page *pp, dmu_tx_t *tx); - -extern int zfs_prefetch_disable; - -/* - * Asynchronously try to read in the data. - */ -void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, - uint64_t len); - -typedef struct dmu_object_info { - /* All sizes are in bytes. */ - uint32_t doi_data_block_size; - uint32_t doi_metadata_block_size; - uint64_t doi_bonus_size; - dmu_object_type_t doi_type; - dmu_object_type_t doi_bonus_type; - uint8_t doi_indirection; /* 2 = dnode->indirect->data */ - uint8_t doi_checksum; - uint8_t doi_compress; - uint8_t doi_pad[5]; - /* Values below are number of 512-byte blocks. */ - uint64_t doi_physical_blks; /* data + metadata */ - uint64_t doi_max_block_offset; -} dmu_object_info_t; - -typedef struct dmu_object_type_info { - dmu_byteswap_func_t *ot_byteswap; - boolean_t ot_metadata; - char *ot_name; -} dmu_object_type_info_t; - -extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; - -/* - * Get information on a DMU object. - * - * Return 0 on success or ENOENT if object is not allocated. - * - * If doi is NULL, just indicates whether the object exists. - */ -int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); -void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); -void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); -void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, - u_longlong_t *nblk512); - -typedef struct dmu_objset_stats { - uint64_t dds_num_clones; /* number of clones of this */ - uint64_t dds_creation_txg; - dmu_objset_type_t dds_type; - uint8_t dds_is_snapshot; - uint8_t dds_inconsistent; - char dds_clone_of[MAXNAMELEN]; -} dmu_objset_stats_t; - -/* - * Get stats on a dataset. - */ -void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); - -/* - * Add entries to the nvlist for all the objset's properties. See - * zfs_prop_table[] and zfs(1m) for details on the properties. - */ -void dmu_objset_stats(objset_t *os, struct nvlist *nv); - -/* - * Get the space usage statistics for statvfs(). - * - * refdbytes is the amount of space "referenced" by this objset. - * availbytes is the amount of space available to this objset, taking - * into account quotas & reservations, assuming that no other objsets - * use the space first. These values correspond to the 'referenced' and - * 'available' properties, described in the zfs(1m) manpage. - * - * usedobjs and availobjs are the number of objects currently allocated, - * and available. - */ -void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp); - -/* - * The fsid_guid is a 56-bit ID that can change to avoid collisions. 
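As a hypothetical example of the dmu_object_info() query interface described above (the helper name is illustrative):

#include <sys/dmu.h>

static int
example_object_blocksize(objset_t *os, uint64_t object, uint32_t *blkszp)
{
	dmu_object_info_t doi;
	int err;

	err = dmu_object_info(os, object, &doi);	/* ENOENT if not allocated */
	if (err != 0)
		return (err);
	*blkszp = doi.doi_data_block_size;		/* data block size, in bytes */
	return (0);
}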
- * (Contrast with the ds_guid which is a 64-bit ID that will never - * change, so there is a small probability that it will collide.) - */ -uint64_t dmu_objset_fsid_guid(objset_t *os); - -int dmu_objset_is_snapshot(objset_t *os); - -extern struct spa *dmu_objset_spa(objset_t *os); -extern struct zilog *dmu_objset_zil(objset_t *os); -extern struct dsl_pool *dmu_objset_pool(objset_t *os); -extern struct dsl_dataset *dmu_objset_ds(objset_t *os); -extern void dmu_objset_name(objset_t *os, char *buf); -extern dmu_objset_type_t dmu_objset_type(objset_t *os); -extern uint64_t dmu_objset_id(objset_t *os); -extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - uint64_t *id, uint64_t *offp); -extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, - uint64_t *idp, uint64_t *offp); - -/* - * Return the txg number for the given assigned transaction. - */ -uint64_t dmu_tx_get_txg(dmu_tx_t *tx); - -/* - * Synchronous write. - * If a parent zio is provided this function initiates a write on the - * provided buffer as a child of the parent zio. - * In the absense of a parent zio, the write is completed synchronously. - * At write completion, blk is filled with the bp of the written block. - * Note that while the data covered by this function will be on stable - * storage when the write completes this new data does not become a - * permanent part of the file until the associated transaction commits. - */ -typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg); -int dmu_sync(struct zio *zio, dmu_buf_t *db, - struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg); - -/* - * Find the next hole or data block in file starting at *off - * Return found offset in *off. Return ESRCH for end of file. - */ -int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, - uint64_t *off); - -/* - * Initial setup and final teardown. - */ -extern void dmu_init(void); -extern void dmu_fini(void); - -typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, - uint64_t object, uint64_t offset, int len); -void dmu_traverse_objset(objset_t *os, uint64_t txg_start, - dmu_traverse_cb_t cb, void *arg); - -int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp); -int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, - boolean_t force, struct file *fp, uint64_t voffset); - -/* CRC64 table */ -#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ -extern uint64_t zfs_crc64_table[256]; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h deleted file mode 100644 index 807011e..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h +++ /dev/null @@ -1,237 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_DMU_IMPL_H -#define _SYS_DMU_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/txg_impl.h> -#include <sys/zio.h> -#include <sys/dnode.h> -#include <sys/zfs_context.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * This is the locking strategy for the DMU. Numbers in parenthesis are - * cases that use that lock order, referenced below: - * - * ARC is self-contained - * bplist is self-contained - * refcount is self-contained - * txg is self-contained (hopefully!) - * zst_lock - * zf_rwlock - * - * XXX try to improve evicting path? - * - * dp_config_rwlock > os_obj_lock > dn_struct_rwlock > - * dn_dbufs_mtx > hash_mutexes > db_mtx > leafs - * - * dp_config_rwlock - * must be held before: everything - * protects dd namespace changes - * protects property changes globally - * held from: - * dsl_dir_open/r: - * dsl_dir_create_sync/w: - * dsl_dir_sync_destroy/w: - * dsl_dir_rename_sync/w: - * dsl_prop_changed_notify/r: - * - * os_obj_lock - * must be held before: - * everything except dp_config_rwlock - * protects os_obj_next - * held from: - * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock - * - * dn_struct_rwlock - * must be held before: - * everything except dp_config_rwlock and os_obj_lock - * protects structure of dnode (eg. nlevels) - * db_blkptr can change when syncing out change to nlevels - * dn_maxblkid - * dn_nlevels - * dn_*blksz* - * phys nlevels, maxblkid, physical blkptr_t's (?) - * held from: - * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch - * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) - * dmu_tx_count_free: - * dbuf_read_impl: db_mtx, dmu_zfetch() - * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() - * dbuf_new_size: db_mtx - * dbuf_dirty: db_mtx - * dbuf_findbp: (callers, phys? - the real need) - * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?) - * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx - * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp() - * dnode_sync/w (increase_indirection): db_mtx (phys) - * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*) - * dnode_new_blkid/w: (dn_maxblkid) - * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid) - * dnode_next_offset: (phys) - * - * dn_dbufs_mtx - * must be held before: - * db_mtx, hash_mutexes - * protects: - * dn_dbufs - * dn_evicted - * held from: - * dmu_evict_user: db_mtx (dn_dbufs) - * dbuf_free_range: db_mtx (dn_dbufs) - * dbuf_remove_ref: db_mtx, callees: - * dbuf_hash_remove: hash_mutexes, db_mtx - * dbuf_create: hash_mutexes, db_mtx (dn_dbufs) - * dnode_set_blksz: (dn_dbufs) - * - * hash_mutexes (global) - * must be held before: - * db_mtx - * protects dbuf_hash_table (global) and db_hash_next - * held from: - * dbuf_find: db_mtx - * dbuf_hash_insert: db_mtx - * dbuf_hash_remove: db_mtx - * - * db_mtx (meta-leaf) - * must be held before: - * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes) - * protects: - * db_state - * db_holds - * db_buf - * db_changed - * db_data_pending - * db_dirtied - * db_link - * db_dirty_node (??) 
- * db_dirtycnt - * db_d.* - * db.* - * held from: - * dbuf_dirty: dn_mtx, dn_dirty_mtx - * dbuf_dirty->dsl_dir_willuse_space: dd_lock - * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock - * dbuf_undirty: dn_dirty_mtx (db_d) - * dbuf_write_done: dn_dirty_mtx (db_state) - * dbuf_* - * dmu_buf_update_user: none (db_d) - * dmu_evict_user: none (db_d) (maybe can eliminate) - * dbuf_find: none (db_holds) - * dbuf_hash_insert: none (db_holds) - * dmu_buf_read_array_impl: none (db_state, db_changed) - * dmu_sync: none (db_dirty_node, db_d) - * dnode_reallocate: none (db) - * - * dn_mtx (leaf) - * protects: - * dn_dirty_dbufs - * dn_ranges - * phys accounting - * dn_allocated_txg - * dn_free_txg - * dn_assigned_txg - * dd_assigned_tx - * dn_notxholds - * dn_dirtyctx - * dn_dirtyctx_firstset - * (dn_phys copy fields?) - * (dn_phys contents?) - * held from: - * dnode_* - * dbuf_dirty: none - * dbuf_sync: none (phys accounting) - * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs) - * dbuf_write_done: none (phys accounting) - * dmu_object_info_from_dnode: none (accounting) - * dmu_tx_commit: none - * dmu_tx_hold_object_impl: none - * dmu_tx_try_assign: dn_notxholds(cv) - * dmu_tx_unassign: none - * - * dd_lock (leaf) - * protects: - * dd_prop_cbs - * dd_sync_* - * dd_used_bytes - * dd_tempreserved - * dd_space_towrite - * dd_myname - * dd_phys accounting? - * held from: - * dsl_dir_* - * dsl_prop_changed_notify: none (dd_prop_cbs) - * dsl_prop_register: none (dd_prop_cbs) - * dsl_prop_unregister: none (dd_prop_cbs) - * dsl_dataset_block_freeable: none (dd_sync_*) - * - * os_lock (leaf) - * protects: - * os_dirty_dnodes - * os_free_dnodes - * os_dnodes - * os_downgraded_dbufs - * dn_dirtyblksz - * dn_dirty_link - * held from: - * dnode_create: none (os_dnodes) - * dnode_destroy: none (os_dnodes) - * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes) - * dnode_free: none (dn_dirtyblksz, os_*_dnodes) - * - * ds_lock (leaf) - * protects: - * ds_user_ptr - * ds_user_evice_func - * ds_open_refcount - * ds_snapname - * ds_phys accounting - * held from: - * dsl_dataset_* - * - * dr_mtx (leaf) - * protects: - * dr_children - * held from: - * dbuf_dirty - * dbuf_undirty - * dbuf_sync_indirect - * dnode_new_blkid - */ - -struct objset; -struct dmu_pool; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h deleted file mode 100644 index 8293a3b..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
- * Use is subject to license terms. - */ - -#ifndef _SYS_DMU_OBJSET_H -#define _SYS_DMU_OBJSET_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/spa.h> -#include <sys/arc.h> -#include <sys/txg.h> -#include <sys/zfs_context.h> -#include <sys/dnode.h> -#include <sys/zio.h> -#include <sys/zil.h> - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_dataset; -struct dmu_tx; -struct objset_impl; - -typedef struct objset_phys { - dnode_phys_t os_meta_dnode; - zil_header_t os_zil_header; - uint64_t os_type; - char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) - - sizeof (uint64_t)]; -} objset_phys_t; - -struct objset { - struct objset_impl *os; - int os_mode; -}; - -typedef struct objset_impl { - /* Immutable: */ - struct dsl_dataset *os_dsl_dataset; - spa_t *os_spa; - arc_buf_t *os_phys_buf; - objset_phys_t *os_phys; - dnode_t *os_meta_dnode; - zilog_t *os_zil; - objset_t os; - uint8_t os_checksum; /* can change, under dsl_dir's locks */ - uint8_t os_compress; /* can change, under dsl_dir's locks */ - uint8_t os_copies; /* can change, under dsl_dir's locks */ - uint8_t os_md_checksum; - uint8_t os_md_compress; - - /* no lock needed: */ - struct dmu_tx *os_synctx; /* XXX sketchy */ - blkptr_t *os_rootbp; - - /* Protected by os_obj_lock */ - kmutex_t os_obj_lock; - uint64_t os_obj_next; - - /* Protected by os_lock */ - kmutex_t os_lock; - list_t os_dirty_dnodes[TXG_SIZE]; - list_t os_free_dnodes[TXG_SIZE]; - list_t os_dnodes; - list_t os_downgraded_dbufs; -} objset_impl_t; - -#define DMU_META_DNODE_OBJECT 0 - -/* called from zpl */ -int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp); -void dmu_objset_close(objset_t *os); -int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, - void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg); -int dmu_objset_destroy(const char *name); -int dmu_objset_rollback(const char *name); -int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); -void dmu_objset_stats(objset_t *os, nvlist_t *nv); -void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); -void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp); -uint64_t dmu_objset_fsid_guid(objset_t *os); -int dmu_objset_find(char *name, int func(char *, void *), void *arg, - int flags); -void dmu_objset_byteswap(void *buf, size_t size); -int dmu_objset_evict_dbufs(objset_t *os, int try); - -/* called from dsl */ -void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx); -objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, - blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); -int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, - objset_impl_t **osip); -void dmu_objset_evict(struct dsl_dataset *ds, void *arg); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_OBJSET_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h deleted file mode 100644 index ea9fa6c..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
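As a hypothetical sketch of the objset open/close entry points declared above (DMU_OST_ANY and the DS_MODE_* flags come from dmu.h earlier in this diff; the helper name is illustrative):

#include <sys/dmu.h>

static int
example_objset_space(const char *name, uint64_t *usedobjsp)
{
	objset_t *os;
	uint64_t refd, avail, usedobjs, availobjs;
	int err;

	err = dmu_objset_open(name, DMU_OST_ANY,
	    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
	if (err != 0)
		return (err);
	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
	*usedobjsp = usedobjs;
	dmu_objset_close(os);				/* drop the open reference */
	return (0);
}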
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_DMU_TRAVERSE_H -#define _SYS_DMU_TRAVERSE_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/zio.h> -#include <sys/dmu.h> -#include <sys/dnode.h> -#include <sys/arc.h> - -#ifdef __cplusplus -extern "C" { -#endif - -#define ADVANCE_POST 0 /* post-order traversal */ -#define ADVANCE_PRE 0x01 /* pre-order traversal */ -#define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */ -#define ADVANCE_DATA 0x04 /* read user data blocks */ -#define ADVANCE_HOLES 0x08 /* visit holes */ -#define ADVANCE_ZIL 0x10 /* visit intent log blocks */ -#define ADVANCE_NOLOCK 0x20 /* Don't grab SPA sync lock */ - -#define ZB_NO_LEVEL -2 -#define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */ -#define ZB_MAXBLKID (1ULL << 62) -#define ZB_MAXOBJSET (1ULL << 62) -#define ZB_MAXOBJECT (1ULL << 62) - -#define ZB_MOS_CACHE 0 -#define ZB_MDN_CACHE 1 -#define ZB_DN_CACHE 2 -#define ZB_DEPTH 3 - -typedef struct zseg { - uint64_t seg_mintxg; - uint64_t seg_maxtxg; - zbookmark_t seg_start; - zbookmark_t seg_end; - list_node_t seg_node; -} zseg_t; - -typedef struct traverse_blk_cache { - zbookmark_t bc_bookmark; - blkptr_t bc_blkptr; - void *bc_data; - dnode_phys_t *bc_dnode; - int bc_errno; - int bc_pad1; - uint64_t bc_pad2; -} traverse_blk_cache_t; - -typedef int (blkptr_cb_t)(traverse_blk_cache_t *bc, spa_t *spa, void *arg); - -struct traverse_handle { - spa_t *th_spa; - blkptr_cb_t *th_func; - void *th_arg; - uint16_t th_advance; - uint16_t th_locked; - int th_zio_flags; - list_t th_seglist; - traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL]; - traverse_blk_cache_t th_zil_cache; - uint64_t th_hits; - uint64_t th_arc_hits; - uint64_t th_reads; - uint64_t th_callbacks; - uint64_t th_syncs; - uint64_t th_restarts; - zbookmark_t th_noread; - zbookmark_t th_lastcb; -}; - -int traverse_dsl_dataset(struct dsl_dataset *ds, uint64_t txg_start, - int advance, blkptr_cb_t func, void *arg); - -traverse_handle_t *traverse_init(spa_t *spa, blkptr_cb_t *func, void *arg, - int advance, int zio_flags); -void traverse_fini(traverse_handle_t *th); - -void traverse_add_dnode(traverse_handle_t *th, - uint64_t mintxg, uint64_t maxtxg, uint64_t objset, uint64_t object); -void traverse_add_objset(traverse_handle_t *th, - uint64_t mintxg, uint64_t maxtxg, uint64_t objset); -void traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg); - -int traverse_more(traverse_handle_t *th); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_TRAVERSE_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h deleted file mode 100644 index 89f4799..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - * CDDL HEADER 
START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_DMU_TX_H -#define _SYS_DMU_TX_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu.h> -#include <sys/txg.h> -#include <sys/refcount.h> - -#ifdef __cplusplus -extern "C" { -#endif - -struct dmu_buf_impl; -struct dmu_tx_hold; -struct dnode_link; -struct dsl_pool; -struct dnode; -struct dsl_dir; - -struct dmu_tx { - /* - * No synchronization is needed because a tx can only be handled - * by one thread. - */ - list_t tx_holds; /* list of dmu_tx_hold_t */ - objset_t *tx_objset; - struct dsl_dir *tx_dir; - struct dsl_pool *tx_pool; - uint64_t tx_txg; - uint64_t tx_lastsnap_txg; - uint64_t tx_lasttried_txg; - txg_handle_t tx_txgh; - void *tx_tempreserve_cookie; - struct dmu_tx_hold *tx_needassign_txh; - uint8_t tx_anyobj; - int tx_err; -#ifdef ZFS_DEBUG - uint64_t tx_space_towrite; - uint64_t tx_space_tofree; - uint64_t tx_space_tooverwrite; - refcount_t tx_space_written; - refcount_t tx_space_freed; -#endif -}; - -enum dmu_tx_hold_type { - THT_NEWOBJECT, - THT_WRITE, - THT_BONUS, - THT_FREE, - THT_ZAP, - THT_SPACE, - THT_NUMTYPES -}; - -typedef struct dmu_tx_hold { - dmu_tx_t *txh_tx; - list_node_t txh_node; - struct dnode *txh_dnode; - uint64_t txh_space_towrite; - uint64_t txh_space_tofree; - uint64_t txh_space_tooverwrite; -#ifdef ZFS_DEBUG - enum dmu_tx_hold_type txh_type; - uint64_t txh_arg1; - uint64_t txh_arg2; -#endif -} dmu_tx_hold_t; - - -/* - * These routines are defined in dmu.h, and are called by the user. - */ -dmu_tx_t *dmu_tx_create(objset_t *dd); -int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); -void dmu_tx_commit(dmu_tx_t *tx); -void dmu_tx_abort(dmu_tx_t *tx); -uint64_t dmu_tx_get_txg(dmu_tx_t *tx); -void dmu_tx_wait(dmu_tx_t *tx); - -/* - * These routines are defined in dmu_spa.h, and are called by the SPA. - */ -extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg); - -/* - * These routines are only called by the DMU. 
- */ -dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd); -int dmu_tx_is_syncing(dmu_tx_t *tx); -int dmu_tx_private_ok(dmu_tx_t *tx); -void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object); -void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta); -void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); -int dmu_tx_holds(dmu_tx_t *tx, uint64_t object); -void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); - -#ifdef ZFS_DEBUG -#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db) -#else -#define DMU_TX_DIRTY_BUF(tx, db) -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_TX_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h deleted file mode 100644 index c94bced..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _DFETCH_H -#define _DFETCH_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> - -#ifdef __cplusplus -extern "C" { -#endif - -extern uint64_t zfetch_array_rd_sz; - -struct dnode; /* so we can reference dnode */ - -typedef enum zfetch_dirn { - ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */ - ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */ -} zfetch_dirn_t; - -typedef struct zstream { - uint64_t zst_offset; /* offset of starting block in range */ - uint64_t zst_len; /* length of range, in blocks */ - zfetch_dirn_t zst_direction; /* direction of prefetch */ - uint64_t zst_stride; /* length of stride, in blocks */ - uint64_t zst_ph_offset; /* prefetch offset, in blocks */ - uint64_t zst_cap; /* prefetch limit (cap), in blocks */ - kmutex_t zst_lock; /* protects stream */ - clock_t zst_last; /* lbolt of last prefetch */ - avl_node_t zst_node; /* embed avl node here */ -} zstream_t; - -typedef struct zfetch { - krwlock_t zf_rwlock; /* protects zfetch structure */ - list_t zf_stream; /* AVL tree of zstream_t's */ - struct dnode *zf_dnode; /* dnode that owns this zfetch */ - uint32_t zf_stream_cnt; /* # of active streams */ - uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */ -} zfetch_t; - -void dmu_zfetch_init(zfetch_t *, struct dnode *); -void dmu_zfetch_rele(zfetch_t *); -void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int); - - -#ifdef __cplusplus -} -#endif - -#endif /* _DFETCH_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h deleted file mode 100644 index 327e538..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h +++ /dev/null @@ -1,267 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_DNODE_H -#define _SYS_DNODE_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/avl.h> -#include <sys/spa.h> -#include <sys/txg.h> -#include <sys/zio.h> -#include <sys/refcount.h> -#include <sys/dmu_zfetch.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Flags. - */ -#define DNODE_MUST_BE_ALLOCATED 1 -#define DNODE_MUST_BE_FREE 2 - -/* - * Fixed constants. - */ -#define DNODE_SHIFT 9 /* 512 bytes */ -#define DN_MIN_INDBLKSHIFT 10 /* 1k */ -#define DN_MAX_INDBLKSHIFT 14 /* 16k */ -#define DNODE_BLOCK_SHIFT 14 /* 16k */ -#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ -#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ -#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ - -/* - * Derived constants. 
- */ -#define DNODE_SIZE (1 << DNODE_SHIFT) -#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) -#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) -#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) - -#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) -#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) -#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) - -/* The +2 here is a cheesy way to round up */ -#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ - (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) - -#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ - (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) - -#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ - (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) - -#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) - -struct dmu_buf_impl; -struct objset_impl; -struct zio; - -enum dnode_dirtycontext { - DN_UNDIRTIED, - DN_DIRTY_OPEN, - DN_DIRTY_SYNC -}; - -/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ -#define DNODE_FLAG_USED_BYTES (1<<0) - -typedef struct dnode_phys { - uint8_t dn_type; /* dmu_object_type_t */ - uint8_t dn_indblkshift; /* ln2(indirect block size) */ - uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */ - uint8_t dn_nblkptr; /* length of dn_blkptr */ - uint8_t dn_bonustype; /* type of data in bonus buffer */ - uint8_t dn_checksum; /* ZIO_CHECKSUM type */ - uint8_t dn_compress; /* ZIO_COMPRESS type */ - uint8_t dn_flags; /* DNODE_FLAG_* */ - uint16_t dn_datablkszsec; /* data block size in 512b sectors */ - uint16_t dn_bonuslen; /* length of dn_bonus */ - uint8_t dn_pad2[4]; - - /* accounting is protected by dn_dirty_mtx */ - uint64_t dn_maxblkid; /* largest allocated block ID */ - uint64_t dn_used; /* bytes (or sectors) of disk space */ - - uint64_t dn_pad3[4]; - - blkptr_t dn_blkptr[1]; - uint8_t dn_bonus[DN_MAX_BONUSLEN]; -} dnode_phys_t; - -typedef struct dnode { - /* - * dn_struct_rwlock protects the structure of the dnode, - * including the number of levels of indirection (dn_nlevels), - * dn_maxblkid, and dn_next_* - */ - krwlock_t dn_struct_rwlock; - - /* - * Our link on dataset's dd_dnodes list. - * Protected by dd_accounting_mtx. - */ - list_node_t dn_link; - - /* immutable: */ - struct objset_impl *dn_objset; - uint64_t dn_object; - struct dmu_buf_impl *dn_dbuf; - dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ - - /* - * Copies of stuff in dn_phys. They're valid in the open - * context (eg. even before the dnode is first synced). - * Where necessary, these are protected by dn_struct_rwlock. - */ - dmu_object_type_t dn_type; /* object type */ - uint16_t dn_bonuslen; /* bonus length */ - uint8_t dn_bonustype; /* bonus type */ - uint8_t dn_nblkptr; /* number of blkptrs (immutable) */ - uint8_t dn_checksum; /* ZIO_CHECKSUM type */ - uint8_t dn_compress; /* ZIO_COMPRESS type */ - uint8_t dn_nlevels; - uint8_t dn_indblkshift; - uint8_t dn_datablkshift; /* zero if blksz not power of 2! 
*/ - uint16_t dn_datablkszsec; /* in 512b sectors */ - uint32_t dn_datablksz; /* in bytes */ - uint64_t dn_maxblkid; - uint8_t dn_next_nlevels[TXG_SIZE]; - uint8_t dn_next_indblkshift[TXG_SIZE]; - uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ - - /* protected by os_lock: */ - list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ - - /* protected by dn_mtx: */ - kmutex_t dn_mtx; - list_t dn_dirty_records[TXG_SIZE]; - avl_tree_t dn_ranges[TXG_SIZE]; - uint64_t dn_allocated_txg; - uint64_t dn_free_txg; - uint64_t dn_assigned_txg; - kcondvar_t dn_notxholds; - enum dnode_dirtycontext dn_dirtyctx; - uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ - - /* protected by own devices */ - refcount_t dn_tx_holds; - refcount_t dn_holds; - - kmutex_t dn_dbufs_mtx; - list_t dn_dbufs; /* linked list of descendent dbuf_t's */ - struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ - - /* parent IO for current sync write */ - zio_t *dn_zio; - - /* holds prefetch structure */ - struct zfetch dn_zfetch; -} dnode_t; - -typedef struct free_range { - avl_node_t fr_node; - uint64_t fr_blkid; - uint64_t fr_nblks; -} free_range_t; - -dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp, - uint64_t object); -void dnode_special_close(dnode_t *dn); - -int dnode_hold(struct objset_impl *dd, uint64_t object, - void *ref, dnode_t **dnp); -int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, - void *ref, dnode_t **dnp); -void dnode_add_ref(dnode_t *dn, void *ref); -void dnode_rele(dnode_t *dn, void *ref); -void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); -void dnode_sync(dnode_t *dn, dmu_tx_t *tx); -void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -void dnode_free(dnode_t *dn, dmu_tx_t *tx); -void dnode_byteswap(dnode_phys_t *dnp); -void dnode_buf_byteswap(void *buf, size_t size); -void dnode_verify(dnode_t *dn); -int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); -uint64_t dnode_current_max_length(dnode_t *dn); -void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); -void dnode_clear_range(dnode_t *dn, uint64_t blkid, - uint64_t nblks, dmu_tx_t *tx); -void dnode_diduse_space(dnode_t *dn, int64_t space); -void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx); -void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx); -uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); -void dnode_init(void); -void dnode_fini(void); -int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl, - uint64_t blkfill, uint64_t txg); -int dnode_evict_dbufs(dnode_t *dn, int try); - -#ifdef ZFS_DEBUG - -/* - * There should be a ## between the string literal and fmt, to make it - * clear that we're joining two strings together, but that piece of shit - * gcc doesn't support that preprocessor token. - */ -#define dprintf_dnode(dn, fmt, ...) 
do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char __db_buf[32]; \ - uint64_t __db_obj = (dn)->dn_object; \ - if (__db_obj == DMU_META_DNODE_OBJECT) \ - (void) strcpy(__db_buf, "mdn"); \ - else \ - (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ - (u_longlong_t)__db_obj);\ - dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \ - __db_buf, __VA_ARGS__); \ - } \ -_NOTE(CONSTCOND) } while (0) - -#define DNODE_VERIFY(dn) dnode_verify(dn) -#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx) - -#else - -#define dprintf_dnode(db, fmt, ...) -#define DNODE_VERIFY(dn) -#define FREE_VERIFY(db, start, end, tx) - -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DNODE_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h deleted file mode 100644 index 8cfc1dc..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ /dev/null @@ -1,185 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_DSL_DATASET_H -#define _SYS_DSL_DATASET_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu.h> -#include <sys/spa.h> -#include <sys/txg.h> -#include <sys/zio.h> -#include <sys/bplist.h> -#include <sys/dsl_synctask.h> -#include <sys/zfs_context.h> - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_dataset; -struct dsl_dir; -struct dsl_pool; - -typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); - -#define DS_FLAG_INCONSISTENT (1ULL<<0) -/* - * NB: nopromote can not yet be set, but we want support for it in this - * on-disk version, so that we don't need to upgrade for it later. It - * will be needed when we implement 'zfs split' (where the split off - * clone should not be promoted). - */ -#define DS_FLAG_NOPROMOTE (1ULL<<1) - -typedef struct dsl_dataset_phys { - uint64_t ds_dir_obj; - uint64_t ds_prev_snap_obj; - uint64_t ds_prev_snap_txg; - uint64_t ds_next_snap_obj; - uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */ - uint64_t ds_num_children; /* clone/snap children; ==0 for head */ - uint64_t ds_creation_time; /* seconds since 1970 */ - uint64_t ds_creation_txg; - uint64_t ds_deadlist_obj; - uint64_t ds_used_bytes; - uint64_t ds_compressed_bytes; - uint64_t ds_uncompressed_bytes; - uint64_t ds_unique_bytes; /* only relevant to snapshots */ - /* - * The ds_fsid_guid is a 56-bit ID that can change to avoid - * collisions. The ds_guid is a 64-bit ID that will never - * change, so there is a small probability that it will collide. 
- */ - uint64_t ds_fsid_guid; - uint64_t ds_guid; - uint64_t ds_flags; - blkptr_t ds_bp; - uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */ -} dsl_dataset_phys_t; - -typedef struct dsl_dataset { - /* Immutable: */ - struct dsl_dir *ds_dir; - dsl_dataset_phys_t *ds_phys; - dmu_buf_t *ds_dbuf; - uint64_t ds_object; - - /* only used in syncing context: */ - struct dsl_dataset *ds_prev; /* only valid for non-snapshots */ - - /* has internal locking: */ - bplist_t ds_deadlist; - - /* protected by lock on pool's dp_dirty_datasets list */ - txg_node_t ds_dirty_link; - list_node_t ds_synced_link; - - /* - * ds_phys->ds_<accounting> is also protected by ds_lock. - * Protected by ds_lock: - */ - kmutex_t ds_lock; - void *ds_user_ptr; - dsl_dataset_evict_func_t *ds_user_evict_func; - uint64_t ds_open_refcount; - - /* no locking; only for making guesses */ - uint64_t ds_trysnap_txg; - - /* Protected by ds_lock; keep at end of struct for better locality */ - char ds_snapname[MAXNAMELEN]; -} dsl_dataset_t; - -#define dsl_dataset_is_snapshot(ds) \ - ((ds)->ds_phys->ds_num_children != 0) - -int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, - void *tag, dsl_dataset_t **dsp); -int dsl_dataset_open(const char *name, int mode, void *tag, - dsl_dataset_t **dsp); -int dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj, - const char *tail, int mode, void *tag, dsl_dataset_t **); -void dsl_dataset_name(dsl_dataset_t *ds, char *name); -void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag); -uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, - const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx); -int dsl_dataset_destroy(const char *name); -int dsl_snapshots_destroy(char *fsname, char *snapname); -dsl_checkfunc_t dsl_dataset_snapshot_check; -dsl_syncfunc_t dsl_dataset_snapshot_sync; -int dsl_dataset_rollback(dsl_dataset_t *ds); -int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); -int dsl_dataset_promote(const char *name); - -void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds, - void *p, dsl_dataset_evict_func_t func); -void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds); - -blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); -void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); - -spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); - -void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); - -void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); -void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, - dmu_tx_t *tx); -int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); -uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); - -void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); -void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv); -void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat); -void dsl_dataset_space(dsl_dataset_t *ds, - uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp); -uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); - -void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp, - dmu_tx_t *tx); - -int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); - -#ifdef ZFS_DEBUG -#define dprintf_ds(ds, fmt, ...) 
do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \ - dsl_dataset_name(ds, __ds_name); \ - dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \ - kmem_free(__ds_name, MAXNAMELEN); \ - } \ -_NOTE(CONSTCOND) } while (0) -#else -#define dprintf_ds(dd, fmt, ...) -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_DATASET_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h deleted file mode 100644 index e0595d3..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_DSL_DIR_H -#define _SYS_DSL_DIR_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu.h> -#include <sys/dsl_pool.h> -#include <sys/dsl_synctask.h> -#include <sys/refcount.h> -#include <sys/zfs_context.h> - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_dataset; - -typedef struct dsl_dir_phys { - uint64_t dd_creation_time; /* not actually used */ - uint64_t dd_head_dataset_obj; - uint64_t dd_parent_obj; - uint64_t dd_clone_parent_obj; - uint64_t dd_child_dir_zapobj; - /* - * how much space our children are accounting for; for leaf - * datasets, == physical space used by fs + snaps - */ - uint64_t dd_used_bytes; - uint64_t dd_compressed_bytes; - uint64_t dd_uncompressed_bytes; - /* Administrative quota setting */ - uint64_t dd_quota; - /* Administrative reservation setting */ - uint64_t dd_reserved; - uint64_t dd_props_zapobj; - uint64_t dd_pad[21]; /* pad out to 256 bytes for good measure */ -} dsl_dir_phys_t; - -struct dsl_dir { - /* These are immutable; no lock needed: */ - uint64_t dd_object; - dsl_dir_phys_t *dd_phys; - dmu_buf_t *dd_dbuf; - dsl_pool_t *dd_pool; - - /* protected by lock on pool's dp_dirty_dirs list */ - txg_node_t dd_dirty_link; - - /* protected by dp_config_rwlock */ - dsl_dir_t *dd_parent; - - /* Protected by dd_lock */ - kmutex_t dd_lock; - list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ - - /* Accounting */ - /* reflects any changes to dd_phys->dd_used_bytes made this syncing */ - int64_t dd_used_bytes; - /* gross estimate of space used by in-flight tx's */ - uint64_t dd_tempreserved[TXG_SIZE]; - /* amount of space we expect to write; == amount of dirty data */ - int64_t dd_space_towrite[TXG_SIZE]; - - /* protected by dd_lock; keep at end of struct for better locality */ - char dd_myname[MAXNAMELEN]; -}; - -void dsl_dir_close(dsl_dir_t *dd, void *tag); -int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail); -int 
dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **, - const char **tailp); -int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag, dsl_dir_t **); -void dsl_dir_name(dsl_dir_t *dd, char *buf); -int dsl_dir_namelen(dsl_dir_t *dd); -int dsl_dir_is_private(dsl_dir_t *dd); -uint64_t dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx); -void dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx); -dsl_checkfunc_t dsl_dir_destroy_check; -dsl_syncfunc_t dsl_dir_destroy_sync; -void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); -uint64_t dsl_dir_space_available(dsl_dir_t *dd, - dsl_dir_t *ancestor, int64_t delta, int ondiskonly); -void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); -void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); -int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, - uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx); -void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); -void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); -void dsl_dir_diduse_space(dsl_dir_t *dd, - int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); -int dsl_dir_set_quota(const char *ddname, uint64_t quota); -int dsl_dir_set_reservation(const char *ddname, uint64_t reservation); -int dsl_dir_rename(dsl_dir_t *dd, const char *newname); -int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); - -/* internal reserved dir name */ -#define MOS_DIR_NAME "$MOS" - -#ifdef ZFS_DEBUG -#define dprintf_dd(dd, fmt, ...) do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \ - KM_SLEEP); \ - dsl_dir_name(dd, __ds_name); \ - dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \ - kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); \ - } \ -_NOTE(CONSTCOND) } while (0) -#else -#define dprintf_dd(dd, fmt, ...) -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_DIR_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h deleted file mode 100644 index f7ec67a..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_DSL_POOL_H -#define _SYS_DSL_POOL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/spa.h> -#include <sys/txg.h> -#include <sys/txg_impl.h> -#include <sys/zfs_context.h> - -#ifdef __cplusplus -extern "C" { -#endif - -struct objset; -struct dsl_dir; - -typedef struct dsl_pool { - /* Immutable */ - spa_t *dp_spa; - struct objset *dp_meta_objset; - struct dsl_dir *dp_root_dir; - struct dsl_dir *dp_mos_dir; - uint64_t dp_root_dir_obj; - - /* No lock needed - sync context only */ - blkptr_t dp_meta_rootbp; - list_t dp_synced_objsets; - - /* Has its own locking */ - tx_state_t dp_tx; - txg_list_t dp_dirty_datasets; - txg_list_t dp_dirty_dirs; - txg_list_t dp_sync_tasks; - - /* - * Protects administrative changes (properties, namespace) - * It is only held for write in syncing context. Therefore - * syncing context does not need to ever have it for read, since - * nobody else could possibly have it for write. - */ - krwlock_t dp_config_rwlock; -} dsl_pool_t; - -int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); -void dsl_pool_close(dsl_pool_t *dp); -dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg); -void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); -void dsl_pool_zil_clean(dsl_pool_t *dp); -int dsl_pool_sync_context(dsl_pool_t *dp); -uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_POOL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h deleted file mode 100644 index d2debff..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_DSL_PROP_H -#define _SYS_DSL_PROP_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu.h> -#include <sys/dsl_pool.h> -#include <sys/zfs_context.h> - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_dataset; - -/* The callback func may not call into the DMU or DSL! 
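- * Editorial sketch, not part of the original header; the callback,
- * dataset, argument, and property names below are illustrative only.
- * A typical consumer registers a callback for a named property and is
- * invoked with the new value whenever that property changes (subject
- * to the restriction above):
- *
- *    static void blksz_changed_cb(void *arg, uint64_t newval);
- *    ...
- *    err = dsl_prop_register(ds, "recordsize", blksz_changed_cb, arg);
- *    ...
- *    err = dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, arg);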
*/ -typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval); - -typedef struct dsl_prop_cb_record { - list_node_t cbr_node; /* link on dd_prop_cbs */ - struct dsl_dataset *cbr_ds; - const char *cbr_propname; - dsl_prop_changed_cb_t *cbr_func; - void *cbr_arg; -} dsl_prop_cb_record_t; - -int dsl_prop_register(struct dsl_dataset *ds, const char *propname, - dsl_prop_changed_cb_t *callback, void *cbarg); -int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname, - dsl_prop_changed_cb_t *callback, void *cbarg); -int dsl_prop_numcb(struct dsl_dataset *ds); - -int dsl_prop_get(const char *ddname, const char *propname, - int intsz, int numints, void *buf, char *setpoint); -int dsl_prop_get_integer(const char *ddname, const char *propname, - uint64_t *valuep, char *setpoint); -int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); - -int dsl_prop_set(const char *ddname, const char *propname, - int intsz, int numints, const void *buf); -int dsl_prop_set_dd(dsl_dir_t *dd, const char *propname, - int intsz, int numints, const void *buf); - -void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); -void dsl_prop_nvlist_add_string(nvlist_t *nv, - zfs_prop_t prop, const char *value); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_PROP_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h deleted file mode 100644 index e695b18..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_DSL_SYNCTASK_H -#define _SYS_DSL_SYNCTASK_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/txg.h> -#include <sys/zfs_context.h> - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_pool; - -typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); -typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *); - -typedef struct dsl_sync_task { - list_node_t dst_node; - dsl_checkfunc_t *dst_checkfunc; - dsl_syncfunc_t *dst_syncfunc; - void *dst_arg1; - void *dst_arg2; - int dst_err; -} dsl_sync_task_t; - -typedef struct dsl_sync_task_group { - txg_node_t dstg_node; - list_t dstg_tasks; - struct dsl_pool *dstg_pool; - uint64_t dstg_txg; - int dstg_err; - int dstg_space; -} dsl_sync_task_group_t; - -dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp); -void dsl_sync_task_create(dsl_sync_task_group_t *dstg, - dsl_checkfunc_t *, dsl_syncfunc_t *, - void *arg1, void *arg2, int blocks_modified); -int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg); -void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg); -void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); - -int dsl_sync_task_do(struct dsl_pool *dp, - dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, - void *arg1, void *arg2, int blocks_modified); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_SYNCTASK_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h deleted file mode 100644 index 095dd3c..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_METASLAB_H -#define _SYS_METASLAB_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/spa.h> -#include <sys/space_map.h> -#include <sys/txg.h> -#include <sys/zio.h> -#include <sys/avl.h> - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct metaslab_class metaslab_class_t; -typedef struct metaslab_group metaslab_group_t; - -extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, - uint64_t start, uint64_t size, uint64_t txg); -extern void metaslab_fini(metaslab_t *msp); -extern void metaslab_sync(metaslab_t *msp, uint64_t txg); -extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); - -extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, - int ncopies, uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid); -extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, - boolean_t now); -extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); - -extern metaslab_class_t *metaslab_class_create(void); -extern void metaslab_class_destroy(metaslab_class_t *mc); -extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg); -extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg); - -extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, - vdev_t *vd); -extern void metaslab_group_destroy(metaslab_group_t *mg); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_METASLAB_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h deleted file mode 100644 index 5980cbc..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_METASLAB_IMPL_H -#define _SYS_METASLAB_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/metaslab.h> -#include <sys/space_map.h> -#include <sys/vdev.h> -#include <sys/txg.h> -#include <sys/avl.h> - -#ifdef __cplusplus -extern "C" { -#endif - -struct metaslab_class { - metaslab_group_t *mc_rotor; - uint64_t mc_allocated; -}; - -struct metaslab_group { - kmutex_t mg_lock; - avl_tree_t mg_metaslab_tree; - uint64_t mg_aliquot; - int64_t mg_bias; - metaslab_class_t *mg_class; - vdev_t *mg_vd; - metaslab_group_t *mg_prev; - metaslab_group_t *mg_next; -}; - -/* - * Each metaslab's free space is tracked in space map object in the MOS, - * which is only updated in syncing context. Each time we sync a txg, - * we append the allocs and frees from that txg to the space map object. 
- * When the txg is done syncing, metaslab_sync_done() updates ms_smo - * to ms_smo_syncing. Everything in ms_smo is always safe to allocate. - */ -struct metaslab { - kmutex_t ms_lock; /* metaslab lock */ - space_map_obj_t ms_smo; /* synced space map object */ - space_map_obj_t ms_smo_syncing; /* syncing space map object */ - space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */ - space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */ - space_map_t ms_map; /* in-core free space map */ - uint64_t ms_weight; /* weight vs. others in group */ - metaslab_group_t *ms_group; /* metaslab group */ - avl_node_t ms_group_node; /* node in metaslab group tree */ - txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ -}; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_METASLAB_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h deleted file mode 100644 index 4de1cae..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_REFCOUNT_H -#define _SYS_REFCOUNT_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/list.h> -#include <sys/zfs_context.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * If the reference is held only by the calling function and not any - * particular object, use FTAG (which is a string) for the holder_tag. - * Otherwise, use the object that holds the reference. - */ -#define FTAG ((char *)__func__) - -#if defined(DEBUG) || !defined(_KERNEL) -typedef struct reference { - list_node_t ref_link; - void *ref_holder; - uint64_t ref_number; - uint8_t *ref_removed; -} reference_t; - -typedef struct refcount { - kmutex_t rc_mtx; - list_t rc_list; - list_t rc_removed; - int64_t rc_count; - int64_t rc_removed_count; -} refcount_t; - -/* Note: refcount_t should be initialized to zero before use. 
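- * A minimal usage sketch (editorial illustration, not part of the
- * original header), using the FTAG convention described above for a
- * reference held only by the calling function:
- *
- *    refcount_t rc;
- *
- *    refcount_create(&rc);
- *    (void) refcount_add(&rc, FTAG);
- *    ...
- *    (void) refcount_remove(&rc, FTAG);
- *    refcount_destroy(&rc);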
*/ - -void refcount_create(refcount_t *rc); -void refcount_destroy(refcount_t *rc); -void refcount_destroy_many(refcount_t *rc, uint64_t number); -int refcount_is_zero(refcount_t *rc); -int64_t refcount_count(refcount_t *rc); -int64_t refcount_add(refcount_t *rc, void *holder_tag); -int64_t refcount_remove(refcount_t *rc, void *holder_tag); -int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); -int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); - -void refcount_init(void); -void refcount_fini(void); - -#else /* DEBUG */ - -typedef struct refcount { - uint64_t rc_count; -} refcount_t; - -#define refcount_create(rc) ((rc)->rc_count = 0) -#define refcount_destroy(rc) ((rc)->rc_count = 0) -#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0) -#define refcount_is_zero(rc) ((rc)->rc_count == 0) -#define refcount_count(rc) ((rc)->rc_count) -#define refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1) -#define refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1) -#define refcount_add_many(rc, number, holder) \ - atomic_add_64_nv(&(rc)->rc_count, number) -#define refcount_remove_many(rc, number, holder) \ - atomic_add_64_nv(&(rc)->rc_count, -number) - -#define refcount_init() -#define refcount_fini() - -#endif /* DEBUG */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_REFCOUNT_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h deleted file mode 100644 index f0eb2e1..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ /dev/null @@ -1,491 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_SPA_H -#define _SYS_SPA_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/avl.h> -#include <sys/zfs_context.h> -#include <sys/nvpair.h> -#include <sys/sysmacros.h> -#include <sys/types.h> -#include <sys/fs/zfs.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Forward references that lots of things need. - */ -typedef struct spa spa_t; -typedef struct vdev vdev_t; -typedef struct metaslab metaslab_t; -typedef struct zilog zilog_t; -typedef struct traverse_handle traverse_handle_t; -struct dsl_pool; - -/* - * General-purpose 32-bit and 64-bit bitfield encodings. 
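- *
- * Editorial worked example, not part of the original header:
- * BF64_ENCODE(val, low, len) places "val" into bits [low, low + len)
- * and BF64_DECODE() extracts it again, so
- *
- *    BF64_DECODE(BF64_ENCODE(5ULL, 8, 4), 8, 4) == 5
- *
- * The _SB ("shift/bias") variants additionally apply a shift and a
- * bias on the way in and out; the DVA and blkptr macros further below
- * use them to store sizes as 512-byte sector counts.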
- */ -#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) -#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) -#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) -#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) - -#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) -#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) - -#define BF32_SET(x, low, len, val) \ - ((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len)) -#define BF64_SET(x, low, len, val) \ - ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)) - -#define BF32_GET_SB(x, low, len, shift, bias) \ - ((BF32_GET(x, low, len) + (bias)) << (shift)) -#define BF64_GET_SB(x, low, len, shift, bias) \ - ((BF64_GET(x, low, len) + (bias)) << (shift)) - -#define BF32_SET_SB(x, low, len, shift, bias, val) \ - BF32_SET(x, low, len, ((val) >> (shift)) - (bias)) -#define BF64_SET_SB(x, low, len, shift, bias, val) \ - BF64_SET(x, low, len, ((val) >> (shift)) - (bias)) - -/* - * We currently support nine block sizes, from 512 bytes to 128K. - * We could go higher, but the benefits are near-zero and the cost - * of COWing a giant block to modify one byte would become excessive. - */ -#define SPA_MINBLOCKSHIFT 9 -#define SPA_MAXBLOCKSHIFT 17 -#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) -#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) - -#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1) - -/* - * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. - * The ASIZE encoding should be at least 64 times larger (6 more bits) - * to support up to 4-way RAID-Z mirror mode with worst-case gang block - * overhead, three DVAs per bp, plus one more bit in case we do anything - * else that expands the ASIZE. - */ -#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ -#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ -#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ - -/* - * All SPA data is represented by 128-bit data virtual addresses (DVAs). - * The members of the dva_t should be considered opaque outside the SPA. - */ -typedef struct dva { - uint64_t dva_word[2]; -} dva_t; - -/* - * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. - */ -typedef struct zio_cksum { - uint64_t zc_word[4]; -} zio_cksum_t; - -/* - * Each block is described by its DVAs, time of birth, checksum, etc. 
- * The word-by-word, bit-by-bit layout of the blkptr is as follows: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | vdev1 | GRID | ASIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 1 |G| offset1 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | vdev2 | GRID | ASIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 3 |G| offset2 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 4 | vdev3 | GRID | ASIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 5 |G| offset3 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 7 | padding | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 8 | padding | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 9 | padding | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * a | birth txg | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * b | fill count | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * c | checksum[0] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * d | checksum[1] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * e | checksum[2] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * f | checksum[3] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * Legend: - * - * vdev virtual device ID - * offset offset into virtual device - * LSIZE logical size - * PSIZE physical size (after compression) - * ASIZE allocated size (including RAID-Z parity and gang block headers) - * GRID RAID-Z layout information (reserved for future use) - * cksum checksum function - * comp compression function - * G gang block indicator - * E endianness - * type DMU object type - * lvl level of indirection - * birth txg transaction group in which the block was born - * fill count number of non-zero blocks under this bp - * checksum[4] 256-bit checksum of the data this bp describes - */ -typedef struct blkptr { - dva_t blk_dva[3]; /* 128-bit Data Virtual Address */ - uint64_t blk_prop; /* size, compression, type, etc */ - uint64_t blk_pad[3]; /* Extra space for the future */ - uint64_t blk_birth; /* transaction group at birth */ - uint64_t blk_fill; /* fill count */ - zio_cksum_t blk_cksum; /* 256-bit checksum */ -} blkptr_t; - -#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ -#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ - -/* - * Macros to get and set fields in a bp or DVA. 
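- *
- * Editorial note with a worked example, not part of the original
- * header: the LSIZE and PSIZE fields are stored biased by one 512-byte
- * sector, so a stored value of 0 decodes to 512 bytes and a stored
- * value of 0xff decodes to (0xff + 1) << 9 == 128K, the largest
- * supported block size; BP_GET_LSIZE() and BP_SET_LSIZE() below apply
- * this via BF64_GET_SB()/BF64_SET_SB() with SPA_MINBLOCKSHIFT and a
- * bias of 1.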
- */ -#define DVA_GET_ASIZE(dva) \ - BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0) -#define DVA_SET_ASIZE(dva, x) \ - BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x) - -#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) -#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) - -#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32) -#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x) - -#define DVA_GET_OFFSET(dva) \ - BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) -#define DVA_SET_OFFSET(dva, x) \ - BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) - -#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) -#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) - -#define BP_GET_LSIZE(bp) \ - (BP_IS_HOLE(bp) ? 0 : \ - BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)) -#define BP_SET_LSIZE(bp, x) \ - BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) - -#define BP_GET_PSIZE(bp) \ - BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) -#define BP_SET_PSIZE(bp, x) \ - BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) - -#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) -#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) - -#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) -#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) - -#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) -#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) - -#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) -#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) - -#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) -#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) - -#define BP_GET_ASIZE(bp) \ - (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ - DVA_GET_ASIZE(&(bp)->blk_dva[2])) - -#define BP_GET_UCSIZE(bp) \ - ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? 
\ - BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)); - -#define BP_GET_NDVAS(bp) \ - (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ - !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ - !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) - -#define BP_COUNT_GANG(bp) \ - (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ - DVA_GET_GANG(&(bp)->blk_dva[1]) + \ - DVA_GET_GANG(&(bp)->blk_dva[2])) - -#define DVA_EQUAL(dva1, dva2) \ - ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ - (dva1)->dva_word[0] == (dva2)->dva_word[0]) - -#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ - (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ - ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ - ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ - ((zc1).zc_word[3] - (zc2).zc_word[3]))) - - -#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) - -#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ -{ \ - (zcp)->zc_word[0] = w0; \ - (zcp)->zc_word[1] = w1; \ - (zcp)->zc_word[2] = w2; \ - (zcp)->zc_word[3] = w3; \ -} - -#define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) -#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) -#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) -#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg)) - -#define BP_ZERO(bp) \ -{ \ - (bp)->blk_dva[0].dva_word[0] = 0; \ - (bp)->blk_dva[0].dva_word[1] = 0; \ - (bp)->blk_dva[1].dva_word[0] = 0; \ - (bp)->blk_dva[1].dva_word[1] = 0; \ - (bp)->blk_dva[2].dva_word[0] = 0; \ - (bp)->blk_dva[2].dva_word[1] = 0; \ - (bp)->blk_prop = 0; \ - (bp)->blk_pad[0] = 0; \ - (bp)->blk_pad[1] = 0; \ - (bp)->blk_pad[2] = 0; \ - (bp)->blk_birth = 0; \ - (bp)->blk_fill = 0; \ - ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ -} - -/* - * Note: the byteorder is either 0 or -1, both of which are palindromes. - * This simplifies the endianness handling a bit. - */ -#if BYTE_ORDER == _BIG_ENDIAN -#define ZFS_HOST_BYTEORDER (0ULL) -#else -#define ZFS_HOST_BYTEORDER (-1ULL) -#endif - -#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) - -#define BP_SPRINTF_LEN 320 - -#include <sys/dmu.h> - -#define BP_GET_BUFC_TYPE(bp) \ - (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? 
\ - ARC_BUFC_METADATA : ARC_BUFC_DATA); -/* - * Routines found in spa.c - */ - -/* state manipulation functions */ -extern int spa_open(const char *pool, spa_t **, void *tag); -extern int spa_get_stats(const char *pool, nvlist_t **config, - char *altroot, size_t buflen); -extern int spa_create(const char *pool, nvlist_t *config, const char *altroot); -extern int spa_import(const char *pool, nvlist_t *config, const char *altroot); -extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); -extern int spa_destroy(char *pool); -extern int spa_export(char *pool, nvlist_t **oldconfig); -extern int spa_reset(char *pool); -extern void spa_async_request(spa_t *spa, int flag); -extern void spa_async_suspend(spa_t *spa); -extern void spa_async_resume(spa_t *spa); -extern spa_t *spa_inject_addref(char *pool); -extern void spa_inject_delref(spa_t *spa); - -#define SPA_ASYNC_REOPEN 0x01 -#define SPA_ASYNC_REPLACE_DONE 0x02 -#define SPA_ASYNC_SCRUB 0x04 -#define SPA_ASYNC_RESILVER 0x08 -#define SPA_ASYNC_CONFIG_UPDATE 0x10 - -/* device manipulation */ -extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); -extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, - int replacing); -extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done); -extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); -extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); - -/* spare state (which is global across all pools) */ -extern void spa_spare_add(vdev_t *vd); -extern void spa_spare_remove(vdev_t *vd); -extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool); -extern void spa_spare_activate(vdev_t *vd); - -/* scrubbing */ -extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force); -extern void spa_scrub_suspend(spa_t *spa); -extern void spa_scrub_resume(spa_t *spa); -extern void spa_scrub_restart(spa_t *spa, uint64_t txg); - -/* spa syncing */ -extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ -extern void spa_sync_allpools(void); - -/* - * SPA configuration functions in spa_config.c - */ - -#define SPA_CONFIG_UPDATE_POOL 0 -#define SPA_CONFIG_UPDATE_VDEVS 1 - -extern void spa_config_sync(void); -extern void spa_config_load(void); -extern nvlist_t *spa_all_configs(uint64_t *); -extern void spa_config_set(spa_t *spa, nvlist_t *config); -extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, - int getstats); -extern void spa_config_update(spa_t *spa, int what); - -/* - * Miscellaneous SPA routines in spa_misc.c - */ - -/* Namespace manipulation */ -extern spa_t *spa_lookup(const char *name); -extern spa_t *spa_add(const char *name, const char *altroot); -extern void spa_remove(spa_t *spa); -extern spa_t *spa_next(spa_t *prev); - -/* Refcount functions */ -extern void spa_open_ref(spa_t *spa, void *tag); -extern void spa_close(spa_t *spa, void *tag); -extern boolean_t spa_refcount_zero(spa_t *spa); - -/* Pool configuration lock */ -extern void spa_config_enter(spa_t *spa, krw_t rw, void *tag); -extern void spa_config_exit(spa_t *spa, void *tag); -extern boolean_t spa_config_held(spa_t *spa, krw_t rw); - -/* Pool vdev add/remove lock */ -extern uint64_t spa_vdev_enter(spa_t *spa); -extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); - -/* Accessor functions */ -extern krwlock_t *spa_traverse_rwlock(spa_t *spa); -extern int spa_traverse_wanted(spa_t *spa); -extern struct dsl_pool *spa_get_dsl(spa_t *spa); -extern blkptr_t *spa_get_rootblkptr(spa_t *spa); -extern void 
spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); -extern void spa_altroot(spa_t *, char *, size_t); -extern int spa_sync_pass(spa_t *spa); -extern char *spa_name(spa_t *spa); -extern uint64_t spa_guid(spa_t *spa); -extern uint64_t spa_last_synced_txg(spa_t *spa); -extern uint64_t spa_first_txg(spa_t *spa); -extern uint64_t spa_version(spa_t *spa); -extern int spa_state(spa_t *spa); -extern uint64_t spa_freeze_txg(spa_t *spa); -struct metaslab_class; -extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa); -extern uint64_t spa_get_alloc(spa_t *spa); -extern uint64_t spa_get_space(spa_t *spa); -extern uint64_t spa_get_dspace(spa_t *spa); -extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); -extern uint64_t spa_version(spa_t *spa); -extern int spa_max_replication(spa_t *spa); -extern int spa_busy(void); - -/* Miscellaneous support routines */ -extern int spa_rename(const char *oldname, const char *newname); -extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); -extern char *spa_strdup(const char *); -extern void spa_strfree(char *); -extern uint64_t spa_get_random(uint64_t range); -extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp); -extern void spa_freeze(spa_t *spa); -extern void spa_upgrade(spa_t *spa); -extern void spa_evict_all(void); -extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid); -extern boolean_t spa_has_spare(spa_t *, uint64_t guid); -extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp); - -/* history logging */ -extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); -extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, - char *his_buf); -extern int spa_history_log(spa_t *spa, const char *his_buf, - uint64_t pool_create); - -/* error handling */ -struct zbookmark; -struct zio; -extern void spa_log_error(spa_t *spa, struct zio *zio); -extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, - struct zio *zio, uint64_t stateoroffset, uint64_t length); -extern void zfs_post_ok(spa_t *spa, vdev_t *vd); -extern uint64_t spa_get_errlog_size(spa_t *spa); -extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); -extern void spa_errlog_rotate(spa_t *spa); -extern void spa_errlog_drain(spa_t *spa); -extern void spa_errlog_sync(spa_t *spa, uint64_t txg); -extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); - -/* Initialization and termination */ -extern void spa_init(int flags); -extern void spa_fini(void); - -/* properties */ -extern int spa_set_props(spa_t *spa, nvlist_t *nvp); -extern int spa_get_props(spa_t *spa, nvlist_t **nvp); -extern void spa_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); -extern boolean_t spa_has_bootfs(spa_t *spa); - -#ifdef ZFS_DEBUG -#define dprintf_bp(bp, fmt, ...) do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ - dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ - kmem_free(__blkbuf, BP_SPRINTF_LEN); \ - } \ -_NOTE(CONSTCOND) } while (0) -#else -#define dprintf_bp(bp, fmt, ...) -#endif - -extern int spa_mode; /* mode, e.g. 
FREAD | FWRITE */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPA_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h deleted file mode 100644 index 8c57123..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ /dev/null @@ -1,168 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_SPA_IMPL_H -#define _SYS_SPA_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/spa.h> -#include <sys/vdev.h> -#include <sys/metaslab.h> -#include <sys/dmu.h> -#include <sys/dsl_pool.h> -#include <sys/uberblock_impl.h> -#include <sys/zfs_context.h> -#include <sys/avl.h> -#include <sys/refcount.h> -#include <sys/bplist.h> - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct spa_config_lock { - kmutex_t scl_lock; - refcount_t scl_count; - kthread_t *scl_writer; - kcondvar_t scl_cv; -} spa_config_lock_t; - -typedef struct spa_error_entry { - zbookmark_t se_bookmark; - char *se_name; - avl_node_t se_avl; -} spa_error_entry_t; - -typedef struct spa_history_phys { - uint64_t sh_pool_create_len; /* ending offset of zpool create */ - uint64_t sh_phys_max_off; /* physical EOF */ - uint64_t sh_bof; /* logical BOF */ - uint64_t sh_eof; /* logical EOF */ - uint64_t sh_records_lost; /* num of records overwritten */ -} spa_history_phys_t; - -typedef struct spa_props { - nvlist_t *spa_props_nvp; - list_node_t spa_list_node; -} spa_props_t; - -struct spa { - /* - * Fields protected by spa_namespace_lock. 
- */ - char *spa_name; /* pool name */ - avl_node_t spa_avl; /* node in spa_namespace_avl */ - nvlist_t *spa_config; /* last synced config */ - nvlist_t *spa_config_syncing; /* currently syncing config */ - uint64_t spa_config_txg; /* txg of last config change */ - kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */ - int spa_sync_pass; /* iterate-to-convergence */ - int spa_state; /* pool state */ - int spa_inject_ref; /* injection references */ - uint8_t spa_traverse_wanted; /* traverse lock wanted */ - uint8_t spa_sync_on; /* sync threads are running */ - spa_load_state_t spa_load_state; /* current load operation */ - taskq_t *spa_zio_issue_taskq[ZIO_TYPES]; - taskq_t *spa_zio_intr_taskq[ZIO_TYPES]; - dsl_pool_t *spa_dsl_pool; - metaslab_class_t *spa_normal_class; /* normal data class */ - uint64_t spa_first_txg; /* first txg after spa_open() */ - uint64_t spa_final_txg; /* txg of export/destroy */ - uint64_t spa_freeze_txg; /* freeze pool at this txg */ - objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ - txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ - vdev_t *spa_root_vdev; /* top-level vdev container */ - uint64_t spa_load_guid; /* initial guid for spa_load */ - list_t spa_dirty_list; /* vdevs with dirty labels */ - uint64_t spa_spares_object; /* MOS object for spare list */ - nvlist_t *spa_sparelist; /* cached spare config */ - vdev_t **spa_spares; /* available hot spares */ - int spa_nspares; /* number of hot spares */ - boolean_t spa_sync_spares; /* sync the spares list */ - uint64_t spa_config_object; /* MOS object for pool config */ - uint64_t spa_syncing_txg; /* txg currently syncing */ - uint64_t spa_sync_bplist_obj; /* object for deferred frees */ - bplist_t spa_sync_bplist; /* deferred-free bplist */ - krwlock_t spa_traverse_lock; /* traverse vs. spa_sync() */ - uberblock_t spa_ubsync; /* last synced uberblock */ - uberblock_t spa_uberblock; /* current uberblock */ - kmutex_t spa_scrub_lock; /* resilver/scrub lock */ - kthread_t *spa_scrub_thread; /* scrub/resilver thread */ - traverse_handle_t *spa_scrub_th; /* scrub traverse handle */ - uint64_t spa_scrub_restart_txg; /* need to restart */ - uint64_t spa_scrub_mintxg; /* min txg we'll scrub */ - uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */ - uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ - uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */ - uint64_t spa_scrub_errors; /* scrub I/O error count */ - int spa_scrub_suspended; /* tell scrubber to suspend */ - kcondvar_t spa_scrub_cv; /* scrub thread state change */ - kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ - uint8_t spa_scrub_stop; /* tell scrubber to stop */ - uint8_t spa_scrub_active; /* active or suspended? 
*/ - uint8_t spa_scrub_type; /* type of scrub we're doing */ - uint8_t spa_scrub_finished; /* indicator to rotate logs */ - kmutex_t spa_async_lock; /* protect async state */ - kthread_t *spa_async_thread; /* thread doing async task */ - int spa_async_suspended; /* async tasks suspended */ - kcondvar_t spa_async_cv; /* wait for thread_exit() */ - uint16_t spa_async_tasks; /* async task mask */ - char *spa_root; /* alternate root directory */ - kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */ - uint64_t spa_ena; /* spa-wide ereport ENA */ - boolean_t spa_last_open_failed; /* true if last open failed */ - kmutex_t spa_errlog_lock; /* error log lock */ - uint64_t spa_errlog_last; /* last error log object */ - uint64_t spa_errlog_scrub; /* scrub error log object */ - kmutex_t spa_errlist_lock; /* error list/ereport lock */ - avl_tree_t spa_errlist_last; /* last error list */ - avl_tree_t spa_errlist_scrub; /* scrub error list */ - uint64_t spa_deflate; /* should we deflate? */ - uint64_t spa_history; /* history object */ - kmutex_t spa_history_lock; /* history lock */ - vdev_t *spa_pending_vdev; /* pending vdev additions */ - nvlist_t **spa_pending_spares; /* pending spare additions */ - uint_t spa_pending_nspares; /* # pending spares */ - kmutex_t spa_props_lock; /* property lock */ - uint64_t spa_pool_props_object; /* object for properties */ - uint64_t spa_bootfs; /* default boot filesystem */ - /* - * spa_refcnt must be the last element because it changes size based on - * compilation options. In order for the MDB module to function - * correctly, the other fields must remain in the same location. - */ - spa_config_lock_t spa_config_lock; /* configuration changes */ - refcount_t spa_refcount; /* number of opens */ -}; - -extern const char *spa_config_dir; -extern kmutex_t spa_namespace_lock; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPA_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h deleted file mode 100644 index db9daef..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h +++ /dev/null @@ -1,162 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_SPACE_MAP_H -#define _SYS_SPACE_MAP_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/avl.h> -#include <sys/dmu.h> - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct space_map_ops space_map_ops_t; - -typedef struct space_map { - avl_tree_t sm_root; /* AVL tree of map segments */ - uint64_t sm_space; /* sum of all segments in the map */ - uint64_t sm_start; /* start of map */ - uint64_t sm_size; /* size of map */ - uint8_t sm_shift; /* unit shift */ - uint8_t sm_pad[3]; /* unused */ - uint8_t sm_loaded; /* map loaded? */ - uint8_t sm_loading; /* map loading? */ - kcondvar_t sm_load_cv; /* map load completion */ - space_map_ops_t *sm_ops; /* space map block picker ops vector */ - void *sm_ppd; /* picker-private data */ - kmutex_t *sm_lock; /* pointer to lock that protects map */ -} space_map_t; - -typedef struct space_seg { - avl_node_t ss_node; /* AVL node */ - uint64_t ss_start; /* starting offset of this segment */ - uint64_t ss_end; /* ending offset (non-inclusive) */ -} space_seg_t; - -typedef struct space_map_obj { - uint64_t smo_object; /* on-disk space map object */ - uint64_t smo_objsize; /* size of the object */ - uint64_t smo_alloc; /* space allocated from the map */ -} space_map_obj_t; - -struct space_map_ops { - void (*smop_load)(space_map_t *sm); - void (*smop_unload)(space_map_t *sm); - uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size); - void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size); - void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size); -}; - -/* - * debug entry - * - * 1 3 10 50 - * ,---+--------+------------+---------------------------------. - * | 1 | action | syncpass | txg (lower bits) | - * `---+--------+------------+---------------------------------' - * 63 62 60 59 50 49 0 - * - * - * - * non-debug entry - * - * 1 47 1 15 - * ,-----------------------------------------------------------. - * | 0 | offset (sm_shift units) | type | run | - * `-----------------------------------------------------------' - * 63 62 17 16 15 0 - */ - -/* All this stuff takes and returns bytes */ -#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1) -#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15) -#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) -#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) -#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47) -#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47) -#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1) -#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1) - -#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3) -#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3) - -#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) -#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) - -#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) -#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) - -#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) - -#define SM_ALLOC 0x0 -#define SM_FREE 0x1 - -/* - * The data for a given space map can be kept on blocks of any size. - * Larger blocks entail fewer i/o operations, but they also cause the - * DMU to keep more data in-core, and also to waste more i/o bandwidth - * when only a few blocks have changed since the last transaction group. - * This could use a lot more research, but for now, set the freelist - * block size to 4k (2^12). 
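- *
- * Editorial worked example for the entry encodings above, not part of
- * the original header: a non-debug entry recording an allocation of 8
- * units starting at offset 100 (both in sm_shift units) could be built
- * as
- *
- *    SM_OFFSET_ENCODE(100) | SM_TYPE_ENCODE(SM_ALLOC) | SM_RUN_ENCODE(8)
- *
- * SM_RUN_ENCODE() stores the run length minus one and SM_RUN_DECODE()
- * adds it back, so runs of 1 through SM_RUN_MAX (32768) units fit in
- * the 15-bit run field.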
- */ -#define SPACE_MAP_BLOCKSHIFT 12 - -typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size); - -extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size, - uint8_t shift, kmutex_t *lp); -extern void space_map_destroy(space_map_t *sm); -extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); -extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); -extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size); -extern void space_map_vacate(space_map_t *sm, - space_map_func_t *func, space_map_t *mdest); -extern void space_map_walk(space_map_t *sm, - space_map_func_t *func, space_map_t *mdest); -extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size); -extern void space_map_union(space_map_t *smd, space_map_t *sms); - -extern void space_map_load_wait(space_map_t *sm); -extern int space_map_load(space_map_t *sm, space_map_ops_t *ops, - uint8_t maptype, space_map_obj_t *smo, objset_t *os); -extern void space_map_unload(space_map_t *sm); - -extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size); -extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size); -extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size); - -extern void space_map_sync(space_map_t *sm, uint8_t maptype, - space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); -extern void space_map_truncate(space_map_obj_t *smo, - objset_t *os, dmu_tx_t *tx); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPACE_MAP_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h deleted file mode 100644 index dae129c..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_TXG_H -#define _SYS_TXG_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/spa.h> -#include <sys/zfs_context.h> - -#ifdef __cplusplus -extern "C" { -#endif - -#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */ -#define TXG_SIZE 4 /* next power of 2 */ -#define TXG_MASK (TXG_SIZE - 1) /* mask for size */ -#define TXG_INITIAL TXG_SIZE /* initial txg */ -#define TXG_IDX (txg & TXG_MASK) - -#define TXG_WAIT 1ULL -#define TXG_NOWAIT 2ULL - -typedef struct tx_cpu tx_cpu_t; - -typedef struct txg_handle { - tx_cpu_t *th_cpu; - uint64_t th_txg; -} txg_handle_t; - -typedef struct txg_node { - struct txg_node *tn_next[TXG_SIZE]; - uint8_t tn_member[TXG_SIZE]; -} txg_node_t; - -typedef struct txg_list { - kmutex_t tl_lock; - size_t tl_offset; - txg_node_t *tl_head[TXG_SIZE]; -} txg_list_t; - -struct dsl_pool; - -extern void txg_init(struct dsl_pool *dp, uint64_t txg); -extern void txg_fini(struct dsl_pool *dp); -extern void txg_sync_start(struct dsl_pool *dp); -extern void txg_sync_stop(struct dsl_pool *dp); -extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); -extern void txg_rele_to_quiesce(txg_handle_t *txghp); -extern void txg_rele_to_sync(txg_handle_t *txghp); -extern void txg_suspend(struct dsl_pool *dp); -extern void txg_resume(struct dsl_pool *dp); - -/* - * Wait until the given transaction group has finished syncing. - * Try to make this happen as soon as possible (eg. kick off any - * necessary syncs immediately). If txg==0, wait for the currently open - * txg to finish syncing. - */ -extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); - -/* - * Wait until the given transaction group, or one after it, is - * the open transaction group. Try to make this happen as soon - * as possible (eg. kick off any necessary syncs immediately). - * If txg == 0, wait for the next open txg. - */ -extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg); - -/* - * Returns TRUE if we are "backed up" waiting for the syncing - * transaction to complete; otherwise returns FALSE. - */ -extern int txg_stalled(struct dsl_pool *dp); - -/* - * Per-txg object lists. - */ - -#define TXG_CLEAN(txg) ((txg) - 1) - -extern void txg_list_create(txg_list_t *tl, size_t offset); -extern void txg_list_destroy(txg_list_t *tl); -extern int txg_list_empty(txg_list_t *tl, uint64_t txg); -extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg); -extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); -extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); -extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg); -extern void *txg_list_head(txg_list_t *tl, uint64_t txg); -extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_TXG_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h deleted file mode 100644 index 45a138a..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_TXG_IMPL_H -#define _SYS_TXG_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/spa.h> -#include <sys/txg.h> - -#ifdef __cplusplus -extern "C" { -#endif - -struct tx_cpu { - kmutex_t tc_lock; - kcondvar_t tc_cv[TXG_SIZE]; - uint64_t tc_count[TXG_SIZE]; - char tc_pad[16]; -}; - -typedef struct tx_state { - tx_cpu_t *tx_cpu; /* protects right to enter txg */ - kmutex_t tx_sync_lock; /* protects tx_state_t */ - krwlock_t tx_suspend; - uint64_t tx_open_txg; /* currently open txg id */ - uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ - uint64_t tx_syncing_txg; /* currently syncing txg id */ - uint64_t tx_synced_txg; /* last synced txg id */ - - uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */ - uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */ - - kcondvar_t tx_sync_more_cv; - kcondvar_t tx_sync_done_cv; - kcondvar_t tx_quiesce_more_cv; - kcondvar_t tx_quiesce_done_cv; - kcondvar_t tx_timeout_exit_cv; - kcondvar_t tx_exit_cv; /* wait for all threads to exit */ - - uint8_t tx_threads; /* number of threads */ - uint8_t tx_exiting; /* set when we're exiting */ - - kthread_t *tx_sync_thread; - kthread_t *tx_quiesce_thread; - kthread_t *tx_timelimit_thread; -} tx_state_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_TXG_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h deleted file mode 100644 index 93d936a..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_UBERBLOCK_H -#define _SYS_UBERBLOCK_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/spa.h> -#include <sys/vdev.h> -#include <sys/zio.h> -#include <sys/zio_checksum.h> - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct uberblock uberblock_t; - -extern int uberblock_verify(uberblock_t *ub); -extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_UBERBLOCK_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h deleted file mode 100644 index ab0f2dc..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_UBERBLOCK_IMPL_H -#define _SYS_UBERBLOCK_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/uberblock.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The uberblock version is incremented whenever an incompatible on-disk - * format change is made to the SPA, DMU, or ZAP. - * - * Note: the first two fields should never be moved. When a storage pool - * is opened, the uberblock must be read off the disk before the version - * can be checked. If the ub_version field is moved, we may not detect - * version mismatch. If the ub_magic field is moved, applications that - * expect the magic number in the first word won't work. - */ -#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ -#define UBERBLOCK_SHIFT 10 /* up to 1K */ - -struct uberblock { - uint64_t ub_magic; /* UBERBLOCK_MAGIC */ - uint64_t ub_version; /* ZFS_VERSION */ - uint64_t ub_txg; /* txg of last sync */ - uint64_t ub_guid_sum; /* sum of all vdev guids */ - uint64_t ub_timestamp; /* UTC time of last sync */ - blkptr_t ub_rootbp; /* MOS objset_phys_t */ -}; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_UBERBLOCK_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h deleted file mode 100644 index c8c177e..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_UNIQUE_H -#define _SYS_UNIQUE_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* The number of significant bits in each unique value. */ -#define UNIQUE_BITS 56 - -void unique_init(void); - -/* Return a new unique value. */ -uint64_t unique_create(void); - -/* Return a unique value, which equals the one passed in if possible. */ -uint64_t unique_insert(uint64_t value); - -/* Indicate that this value no longer needs to be uniquified against. */ -void unique_remove(uint64_t value); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_UNIQUE_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h deleted file mode 100644 index 3120811..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_VDEV_H -#define _SYS_VDEV_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/spa.h> -#include <sys/zio.h> -#include <sys/dmu.h> -#include <sys/space_map.h> -#include <sys/fs/zfs.h> - -#ifdef __cplusplus -extern "C" { -#endif - -extern boolean_t zfs_nocacheflush; - -/* - * Fault injection modes. 
- */ -#define VDEV_FAULT_NONE 0 -#define VDEV_FAULT_RANDOM 1 -#define VDEV_FAULT_COUNT 2 - -extern int vdev_open(vdev_t *); -extern int vdev_validate(vdev_t *); -extern void vdev_close(vdev_t *); -extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); -extern void vdev_init(vdev_t *, uint64_t txg); -extern void vdev_reopen(vdev_t *); -extern int vdev_validate_spare(vdev_t *); - -extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); -extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); -extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size); -extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size); -extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, - int scrub_done); - -extern const char *vdev_description(vdev_t *vd); - -extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); -extern void vdev_metaslab_fini(vdev_t *vd); - -extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); -extern void vdev_stat_update(zio_t *zio); -extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, - boolean_t complete); -extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec); -extern void vdev_propagate_state(vdev_t *vd); -extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, - vdev_aux_t aux); - -extern void vdev_space_update(vdev_t *vd, int64_t space_delta, - int64_t alloc_delta); - -extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); - -extern void vdev_io_start(zio_t *zio); -extern void vdev_io_done(zio_t *zio); - -extern int vdev_online(spa_t *spa, uint64_t guid); -extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp); -extern void vdev_clear(spa_t *spa, vdev_t *vd); - -extern int vdev_error_inject(vdev_t *vd, zio_t *zio); -extern int vdev_is_dead(vdev_t *vd); - -extern void vdev_cache_init(vdev_t *vd); -extern void vdev_cache_fini(vdev_t *vd); -extern int vdev_cache_read(zio_t *zio); -extern void vdev_cache_write(zio_t *zio); - -extern void vdev_queue_init(vdev_t *vd); -extern void vdev_queue_fini(vdev_t *vd); -extern zio_t *vdev_queue_io(zio_t *zio); -extern void vdev_queue_io_done(zio_t *zio); - -extern void vdev_config_dirty(vdev_t *vd); -extern void vdev_config_clean(vdev_t *vd); -extern int vdev_config_sync(vdev_t *vd, uint64_t txg); - -extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, - boolean_t getstats, boolean_t isspare); - -/* - * Label routines - */ -struct uberblock; -extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); -extern nvlist_t *vdev_label_read_config(vdev_t *vd); -extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub); - -typedef enum { - VDEV_LABEL_CREATE, /* create/add a new device */ - VDEV_LABEL_REPLACE, /* replace an existing device */ - VDEV_LABEL_SPARE, /* add a new hot spare */ - VDEV_LABEL_REMOVE /* remove an existing device */ -} vdev_labeltype_t; - -extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h deleted file mode 100644 index 95536a7..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). 
You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_VDEV_DISK_H -#define _SYS_VDEV_DISK_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/vdev.h> -#ifdef _KERNEL -#include <sys/sunldi.h> -#include <sys/sunddi.h> -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct vdev_disk { - ddi_devid_t vd_devid; - char *vd_minor; - ldi_handle_t vd_lh; -} vdev_disk_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_DISK_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h deleted file mode 100644 index cd49673..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_VDEV_FILE_H -#define _SYS_VDEV_FILE_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/vdev.h> - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct vdev_file { - vnode_t *vf_vnode; -} vdev_file_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_FILE_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h deleted file mode 100644 index aba7567..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ /dev/null @@ -1,298 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_VDEV_IMPL_H -#define _SYS_VDEV_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/avl.h> -#include <sys/dmu.h> -#include <sys/metaslab.h> -#include <sys/nvpair.h> -#include <sys/space_map.h> -#include <sys/vdev.h> -#include <sys/dkio.h> -#include <sys/uberblock_impl.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Virtual device descriptors. - * - * All storage pool operations go through the virtual device framework, - * which provides data replication and I/O scheduling. - */ - -/* - * Forward declarations that lots of things need. - */ -typedef struct vdev_queue vdev_queue_t; -typedef struct vdev_cache vdev_cache_t; -typedef struct vdev_cache_entry vdev_cache_entry_t; - -/* - * Virtual device operations - */ -typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift); -typedef void vdev_close_func_t(vdev_t *vd); -typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); -typedef void vdev_io_start_func_t(zio_t *zio); -typedef void vdev_io_done_func_t(zio_t *zio); -typedef void vdev_state_change_func_t(vdev_t *vd, int, int); - -typedef struct vdev_ops { - vdev_open_func_t *vdev_op_open; - vdev_close_func_t *vdev_op_close; - vdev_asize_func_t *vdev_op_asize; - vdev_io_start_func_t *vdev_op_io_start; - vdev_io_done_func_t *vdev_op_io_done; - vdev_state_change_func_t *vdev_op_state_change; - char vdev_op_type[16]; - boolean_t vdev_op_leaf; -} vdev_ops_t; - -/* - * Virtual device properties - */ -struct vdev_cache_entry { - char *ve_data; - uint64_t ve_offset; - uint64_t ve_lastused; - avl_node_t ve_offset_node; - avl_node_t ve_lastused_node; - uint32_t ve_hits; - uint16_t ve_missed_update; - zio_t *ve_fill_io; -}; - -struct vdev_cache { - avl_tree_t vc_offset_tree; - avl_tree_t vc_lastused_tree; - kmutex_t vc_lock; -}; - -struct vdev_queue { - avl_tree_t vq_deadline_tree; - avl_tree_t vq_read_tree; - avl_tree_t vq_write_tree; - avl_tree_t vq_pending_tree; - kmutex_t vq_lock; -}; - -/* - * Virtual device descriptor - */ -struct vdev { - /* - * Common to all vdev types. - */ - uint64_t vdev_id; /* child number in vdev parent */ - uint64_t vdev_guid; /* unique ID for this vdev */ - uint64_t vdev_guid_sum; /* self guid + all child guids */ - uint64_t vdev_asize; /* allocatable device capacity */ - uint64_t vdev_ashift; /* block alignment shift */ - uint64_t vdev_state; /* see VDEV_STATE_* #defines */ - uint64_t vdev_prevstate; /* used when reopening a vdev */ - vdev_ops_t *vdev_ops; /* vdev operations */ - spa_t *vdev_spa; /* spa for this vdev */ - void *vdev_tsd; /* type-specific data */ - vdev_t *vdev_top; /* top-level vdev */ - vdev_t *vdev_parent; /* parent vdev */ - vdev_t **vdev_child; /* array of children */ - uint64_t vdev_children; /* number of children */ - space_map_t vdev_dtl_map; /* dirty time log in-core state */ - space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */ - vdev_stat_t vdev_stat; /* virtual device statistics */ - - /* - * Top-level vdev state. 
- */ - uint64_t vdev_ms_array; /* metaslab array object */ - uint64_t vdev_ms_shift; /* metaslab size shift */ - uint64_t vdev_ms_count; /* number of metaslabs */ - metaslab_group_t *vdev_mg; /* metaslab group */ - metaslab_t **vdev_ms; /* metaslab array */ - txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ - txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ - txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ - uint8_t vdev_reopen_wanted; /* async reopen wanted? */ - list_node_t vdev_dirty_node; /* config dirty list */ - uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ - - /* - * Leaf vdev state. - */ - uint64_t vdev_psize; /* physical device capacity */ - space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ - txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ - uint64_t vdev_wholedisk; /* true if this is a whole disk */ - uint64_t vdev_offline; /* device taken offline? */ - uint64_t vdev_nparity; /* number of parity devices for raidz */ - char *vdev_path; /* vdev path (if any) */ - char *vdev_devid; /* vdev devid (if any) */ - uint64_t vdev_fault_arg; /* fault injection parameter */ - int vdev_fault_mask; /* zio types to fault */ - uint8_t vdev_fault_mode; /* fault injection mode */ - uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */ - uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ - uint8_t vdev_detached; /* device detached? */ - uint64_t vdev_isspare; /* was a hot spare */ - vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ - vdev_cache_t vdev_cache; /* physical block cache */ - uint64_t vdev_not_present; /* not present during import */ - hrtime_t vdev_last_try; /* last reopen time */ - boolean_t vdev_nowritecache; /* true if flushwritecache failed */ - - /* - * For DTrace to work in userland (libzpool) context, these fields must - * remain at the end of the structure. DTrace will use the kernel's - * CTF definition for 'struct vdev', and since the size of a kmutex_t is - * larger in userland, the offsets for the rest of the fields would be - * incorrect. 
- */ - kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ - kmutex_t vdev_stat_lock; /* vdev_stat */ -}; - -#define VDEV_SKIP_SIZE (8 << 10) -#define VDEV_BOOT_HEADER_SIZE (8 << 10) -#define VDEV_PHYS_SIZE (112 << 10) -#define VDEV_UBERBLOCK_RING (128 << 10) - -#define VDEV_UBERBLOCK_SHIFT(vd) \ - MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT) -#define VDEV_UBERBLOCK_COUNT(vd) \ - (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) -#define VDEV_UBERBLOCK_OFFSET(vd, n) \ - offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) -#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) - -/* ZFS boot block */ -#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL -#define VDEV_BOOT_VERSION 1 /* version number */ - -typedef struct vdev_boot_header { - uint64_t vb_magic; /* VDEV_BOOT_MAGIC */ - uint64_t vb_version; /* VDEV_BOOT_VERSION */ - uint64_t vb_offset; /* start offset (bytes) */ - uint64_t vb_size; /* size (bytes) */ - char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)]; -} vdev_boot_header_t; - -typedef struct vdev_phys { - char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)]; - zio_block_tail_t vp_zbt; -} vdev_phys_t; - -typedef struct vdev_label { - char vl_pad[VDEV_SKIP_SIZE]; /* 8K */ - vdev_boot_header_t vl_boot_header; /* 8K */ - vdev_phys_t vl_vdev_phys; /* 112K */ - char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ -} vdev_label_t; /* 256K total */ - -/* - * vdev_dirty() flags - */ -#define VDD_METASLAB 0x01 -#define VDD_DTL 0x02 - -/* - * Size and offset of embedded boot loader region on each label. - * The total size of the first two labels plus the boot area is 4MB. - */ -#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) -#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ - -/* - * Size of label regions at the start and end of each leaf device. - */ -#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) -#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) -#define VDEV_LABELS 4 - -#define VDEV_ALLOC_LOAD 0 -#define VDEV_ALLOC_ADD 1 -#define VDEV_ALLOC_SPARE 2 - -/* - * Allocate or free a vdev - */ -extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, - vdev_t *parent, uint_t id, int alloctype); -extern void vdev_free(vdev_t *vd); - -/* - * Add or remove children and parents - */ -extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); -extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); -extern void vdev_compact_children(vdev_t *pvd); -extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); -extern void vdev_remove_parent(vdev_t *cvd); - -/* - * vdev sync load and sync - */ -extern void vdev_load(vdev_t *vd); -extern void vdev_sync(vdev_t *vd, uint64_t txg); -extern void vdev_sync_done(vdev_t *vd, uint64_t txg); -extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); - -/* - * Available vdev types. - */ -extern vdev_ops_t vdev_root_ops; -extern vdev_ops_t vdev_mirror_ops; -extern vdev_ops_t vdev_replacing_ops; -extern vdev_ops_t vdev_raidz_ops; -#ifdef _KERNEL -extern vdev_ops_t vdev_geom_ops; -#else -extern vdev_ops_t vdev_disk_ops; -extern vdev_ops_t vdev_file_ops; -#endif -extern vdev_ops_t vdev_missing_ops; -extern vdev_ops_t vdev_spare_ops; - -/* - * Common size functions - */ -extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); -extern uint64_t vdev_get_rsize(vdev_t *vd); - -/* - * zdb uses this tunable, so it must be declared here to make lint happy. 
- */ -extern int zfs_vdev_cache_size; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h deleted file mode 100644 index f89d938..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h +++ /dev/null @@ -1,359 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_ZAP_H -#define _SYS_ZAP_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * ZAP - ZFS Attribute Processor - * - * The ZAP is a module which sits on top of the DMU (Data Management - * Unit) and implements a higher-level storage primitive using DMU - * objects. Its primary consumer is the ZPL (ZFS Posix Layer). - * - * A "zapobj" is a DMU object which the ZAP uses to store attributes. - * Users should use only zap routines to access a zapobj - they should - * not access the DMU object directly using DMU routines. - * - * The attributes stored in a zapobj are name-value pairs. The name is - * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including - * terminating NULL). The value is an array of integers, which may be - * 1, 2, 4, or 8 bytes long. The total space used by the array (number - * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes. - * Note that an 8-byte integer value can be used to store the location - * (object number) of another dmu object (which may itself be a zapobj). - * Note that you can use a zero-length attribute to store a single bit - * of information - the attribute is present or not. - * - * The ZAP routines are thread-safe. However, you must observe the - * DMU's restriction that a transaction may not be operated on - * concurrently. - * - * Any of the routines that return an int may return an I/O error (EIO - * or ECHECKSUM). - * - * - * Implementation / Performance Notes: - * - * The ZAP is intended to operate most efficiently on attributes with - * short (49 bytes or less) names and single 8-byte values, for which - * the microzap will be used. The ZAP should be efficient enough so - * that the user does not need to cache these attributes. - * - * The ZAP's locking scheme makes its routines thread-safe. Operations - * on different zapobjs will be processed concurrently. Operations on - * the same zapobj which only read data will be processed concurrently. - * Operations on the same zapobj which modify data will be processed - * concurrently when there are many attributes in the zapobj (because - * the ZAP uses per-block locking - more than 128 * (number of cpus) - * small attributes will suffice). 
- */ - -/* - * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C - * strings) for the names of attributes, rather than a byte string - * bounded by an explicit length. If some day we want to support names - * in character sets which have embedded zeros (eg. UTF-16, UTF-32), - * we'll have to add routines for using length-bounded strings. - */ - -#include <sys/dmu.h> - -#ifdef __cplusplus -extern "C" { -#endif - -#define ZAP_MAXNAMELEN 256 -#define ZAP_MAXVALUELEN 1024 - -/* - * Create a new zapobj with no attributes and return its object number. - */ -uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); - -/* - * Create a new zapobj with no attributes from the given (unallocated) - * object number. - */ -int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); - -/* - * The zapobj passed in must be a valid ZAP object for all of the - * following routines. - */ - -/* - * Destroy this zapobj and all its attributes. - * - * Frees the object number using dmu_object_free. - */ -int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); - -/* - * Manipulate attributes. - * - * 'integer_size' is in bytes, and must be 1, 2, 4, or 8. - */ - -/* - * Retrieve the contents of the attribute with the given name. - * - * If the requested attribute does not exist, the call will fail and - * return ENOENT. - * - * If 'integer_size' is smaller than the attribute's integer size, the - * call will fail and return EINVAL. - * - * If 'integer_size' is equal to or larger than the attribute's integer - * size, the call will succeed and return 0. * When converting to a - * larger integer size, the integers will be treated as unsigned (ie. no - * sign-extension will be performed). - * - * 'num_integers' is the length (in integers) of 'buf'. - * - * If the attribute is longer than the buffer, as many integers as will - * fit will be transferred to 'buf'. If the entire attribute was not - * transferred, the call will return EOVERFLOW. - */ -int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf); - -/* - * Create an attribute with the given name and value. - * - * If an attribute with the given name already exists, the call will - * fail and return EEXIST. - */ -int zap_add(objset_t *ds, uint64_t zapobj, const char *name, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx); - -/* - * Set the attribute with the given name to the given value. If an - * attribute with the given name does not exist, it will be created. If - * an attribute with the given name already exists, the previous value - * will be overwritten. The integer_size may be different from the - * existing attribute's integer size, in which case the attribute's - * integer size will be updated to the new value. - */ -int zap_update(objset_t *ds, uint64_t zapobj, const char *name, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); - -/* - * Get the length (in integers) and the integer size of the specified - * attribute. - * - * If the requested attribute does not exist, the call will fail and - * return ENOENT. - */ -int zap_length(objset_t *ds, uint64_t zapobj, const char *name, - uint64_t *integer_size, uint64_t *num_integers); - -/* - * Remove the specified attribute. - * - * If the specified attribute does not exist, the call will fail and - * return ENOENT. 
- */ -int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); - -/* - * Returns (in *count) the number of attributes in the specified zap - * object. - */ -int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); - - -/* - * Returns (in name) the name of the entry whose value - * (za_first_integer) is value, or ENOENT if not found. The string - * pointed to by name must be at least 256 bytes long. - */ -int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name); - -struct zap; -struct zap_leaf; -typedef struct zap_cursor { - /* This structure is opaque! */ - objset_t *zc_objset; - struct zap *zc_zap; - struct zap_leaf *zc_leaf; - uint64_t zc_zapobj; - uint64_t zc_hash; - uint32_t zc_cd; -} zap_cursor_t; - -typedef struct { - int za_integer_length; - uint64_t za_num_integers; - uint64_t za_first_integer; /* no sign extension for <8byte ints */ - char za_name[MAXNAMELEN]; -} zap_attribute_t; - -/* - * The interface for listing all the attributes of a zapobj can be - * thought of as a cursor moving down a list of the attributes one by - * one. The cookie returned by the zap_cursor_serialize routine is - * persistent across system calls (and across reboot, even). - */ - -/* - * Initialize a zap cursor, pointing to the "first" attribute of the - * zapobj. You must _fini the cursor when you are done with it. - */ -void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj); -void zap_cursor_fini(zap_cursor_t *zc); - -/* - * Get the attribute currently pointed to by the cursor. Returns - * ENOENT if at the end of the attributes. - */ -int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za); - -/* - * Advance the cursor to the next attribute. - */ -void zap_cursor_advance(zap_cursor_t *zc); - -/* - * Get a persistent cookie pointing to the current position of the zap - * cursor. The low 4 bits in the cookie are always zero, and thus can - * be used to differentiate a serialized cookie from a different type - * of value. The cookie will be less than 2^32 as long as there are - * fewer than 2^22 (4.2 million) entries in the zap object. - */ -uint64_t zap_cursor_serialize(zap_cursor_t *zc); - -/* - * Initialize a zap cursor pointing to the position recorded by - * zap_cursor_serialize (in the "serialized" argument). You can also - * use a "serialized" argument of 0 to start at the beginning of the - * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to - * zap_cursor_init(...).) - */ -void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds, - uint64_t zapobj, uint64_t serialized); - - -#define ZAP_HISTOGRAM_SIZE 10 - -typedef struct zap_stats { - /* - * Size of the pointer table (in number of entries). - * This is always a power of 2, or zero if it's a microzap. - * In general, it should be considerably greater than zs_num_leafs. - */ - uint64_t zs_ptrtbl_len; - - uint64_t zs_blocksize; /* size of zap blocks */ - - /* - * The number of blocks used. Note that some blocks may be - * wasted because old ptrtbl's and large name/value blocks are - * not reused. (Although their space is reclaimed, we don't - * reuse those offsets in the object.) 
- */ - uint64_t zs_num_blocks; - - /* - * Pointer table values from zap_ptrtbl in the zap_phys_t - */ - uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */ - uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */ - uint64_t zs_ptrtbl_zt_blk; /* starting block number */ - uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */ - uint64_t zs_ptrtbl_zt_shift; /* bits to index it */ - - /* - * Values of the other members of the zap_phys_t - */ - uint64_t zs_block_type; /* ZBT_HEADER */ - uint64_t zs_magic; /* ZAP_MAGIC */ - uint64_t zs_num_leafs; /* The number of leaf blocks */ - uint64_t zs_num_entries; /* The number of zap entries */ - uint64_t zs_salt; /* salt to stir into hash function */ - - /* - * Histograms. For all histograms, the last index - * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater - * than what can be represented. For example - * zs_leafs_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number - * of leafs with more than 45 entries. - */ - - /* - * zs_leafs_with_n_pointers[n] is the number of leafs with - * 2^n pointers to it. - */ - uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE]; - - /* - * zs_leafs_with_n_entries[n] is the number of leafs with - * [n*5, (n+1)*5) entries. In the current implementation, there - * can be at most 55 entries in any block, but there may be - * fewer if the name or value is large, or the block is not - * completely full. - */ - uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE]; - - /* - * zs_leafs_n_tenths_full[n] is the number of leafs whose - * fullness is in the range [n/10, (n+1)/10). - */ - uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE]; - - /* - * zs_entries_using_n_chunks[n] is the number of entries which - * consume n 24-byte chunks. (Note, large names/values only use - * one chunk, but contribute to zs_num_blocks_large.) - */ - uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE]; - - /* - * zs_buckets_with_n_entries[n] is the number of buckets (each - * leaf has 64 buckets) with n entries. - * zs_buckets_with_n_entries[1] should be very close to - * zs_num_entries. - */ - uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE]; -} zap_stats_t; - -/* - * Get statistics about a ZAP object. Note: you need to be aware of the - * internal implementation of the ZAP to correctly interpret some of the - * statistics. This interface shouldn't be relied on unless you really - * know what you're doing. - */ -int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZAP_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h deleted file mode 100644 index 4e43f4a..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h +++ /dev/null @@ -1,204 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_ZAP_IMPL_H -#define _SYS_ZAP_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zap.h> -#include <sys/zfs_context.h> -#include <sys/avl.h> - -#ifdef __cplusplus -extern "C" { -#endif - -extern int fzap_default_block_shift; - -#define ZAP_MAGIC 0x2F52AB2ABULL - -#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) - -#define ZAP_MAXCD (uint32_t)(-1) -#define ZAP_HASHBITS 28 -#define MZAP_ENT_LEN 64 -#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) -#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT -#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT) - -typedef struct mzap_ent_phys { - uint64_t mze_value; - uint32_t mze_cd; - uint16_t mze_pad; /* in case we want to chain them someday */ - char mze_name[MZAP_NAME_LEN]; -} mzap_ent_phys_t; - -typedef struct mzap_phys { - uint64_t mz_block_type; /* ZBT_MICRO */ - uint64_t mz_salt; - uint64_t mz_pad[6]; - mzap_ent_phys_t mz_chunk[1]; - /* actually variable size depending on block size */ -} mzap_phys_t; - -typedef struct mzap_ent { - avl_node_t mze_node; - int mze_chunkid; - uint64_t mze_hash; - mzap_ent_phys_t mze_phys; -} mzap_ent_t; - - -/* - * The (fat) zap is stored in one object. It is an array of - * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of: - * - * ptrtbl fits in first block: - * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ... - * - * ptrtbl too big for first block: - * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ... - * - */ - -struct dmu_buf; -struct zap_leaf; - -#define ZBT_LEAF ((1ULL << 63) + 0) -#define ZBT_HEADER ((1ULL << 63) + 1) -#define ZBT_MICRO ((1ULL << 63) + 3) -/* any other values are ptrtbl blocks */ - -/* - * the embedded pointer table takes up half a block: - * block size / entry size (2^3) / 2 - */ -#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1) - -/* - * The embedded pointer table starts half-way through the block. Since - * the pointer table itself is half the block, it starts at (64-bit) - * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)). - */ -#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \ - ((uint64_t *)(zap)->zap_f.zap_phys) \ - [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))] - -/* - * TAKE NOTE: - * If zap_phys_t is modified, zap_byteswap() must be modified. - */ -typedef struct zap_phys { - uint64_t zap_block_type; /* ZBT_HEADER */ - uint64_t zap_magic; /* ZAP_MAGIC */ - - struct zap_table_phys { - uint64_t zt_blk; /* starting block number */ - uint64_t zt_numblks; /* number of blocks */ - uint64_t zt_shift; /* bits to index it */ - uint64_t zt_nextblk; /* next (larger) copy start block */ - uint64_t zt_blks_copied; /* number source blocks copied */ - } zap_ptrtbl; - - uint64_t zap_freeblk; /* the next free block */ - uint64_t zap_num_leafs; /* number of leafs */ - uint64_t zap_num_entries; /* number of entries */ - uint64_t zap_salt; /* salt to stir into hash function */ - /* - * This structure is followed by padding, and then the embedded - * pointer table. The embedded pointer table takes up second - * half of the block. It is accessed using the - * ZAP_EMBEDDED_PTRTBL_ENT() macro. 
- */ -} zap_phys_t; - -typedef struct zap_table_phys zap_table_phys_t; - -typedef struct zap { - objset_t *zap_objset; - uint64_t zap_object; - struct dmu_buf *zap_dbuf; - krwlock_t zap_rwlock; - int zap_ismicro; - uint64_t zap_salt; - union { - struct { - zap_phys_t *zap_phys; - - /* - * zap_num_entries_mtx protects - * zap_num_entries - */ - kmutex_t zap_num_entries_mtx; - int zap_block_shift; - } zap_fat; - struct { - mzap_phys_t *zap_phys; - int16_t zap_num_entries; - int16_t zap_num_chunks; - int16_t zap_alloc_next; - avl_tree_t zap_avl; - } zap_micro; - } zap_u; -} zap_t; - -#define zap_f zap_u.zap_fat -#define zap_m zap_u.zap_micro - -uint64_t zap_hash(zap_t *zap, const char *name); -int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, int fatreader, zap_t **zapp); -void zap_unlockdir(zap_t *zap); -void zap_evict(dmu_buf_t *db, void *vmzap); - -#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) - -void fzap_byteswap(void *buf, size_t size); -int fzap_count(zap_t *zap, uint64_t *count); -int fzap_lookup(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf); -int fzap_add(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx); -int fzap_update(zap_t *zap, const char *name, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); -int fzap_length(zap_t *zap, const char *name, - uint64_t *integer_size, uint64_t *num_integers); -int fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx); -int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); -void fzap_get_stats(zap_t *zap, zap_stats_t *zs); -void zap_put_leaf(struct zap_leaf *l); - -int fzap_add_cd(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, dmu_tx_t *tx); -void fzap_upgrade(zap_t *zap, dmu_tx_t *tx); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZAP_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h deleted file mode 100644 index 147fb72..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h +++ /dev/null @@ -1,234 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_ZAP_LEAF_H -#define _SYS_ZAP_LEAF_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef __cplusplus -extern "C" { -#endif - -struct zap; - -#define ZAP_LEAF_MAGIC 0x2AB1EAF - -/* chunk size = 24 bytes */ -#define ZAP_LEAF_CHUNKSIZE 24 - -/* - * The amount of space available for chunks is: - * block size (1<<l->l_bs) - hash entry size (2) * number of hash - * entries - header space (2*chunksize) - */ -#define ZAP_LEAF_NUMCHUNKS(l) \ - (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \ - ZAP_LEAF_CHUNKSIZE - 2) - -/* - * The amount of space within the chunk available for the array is: - * chunk size - space for type (1) - space for next pointer (2) - */ -#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3) - -#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \ - (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES) - -/* - * Low water mark: when there are only this many chunks free, start - * growing the ptrtbl. Ideally, this should be larger than a - * "reasonably-sized" entry. 20 chunks is more than enough for the - * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value), - * while still being only around 3% for 16k blocks. - */ -#define ZAP_LEAF_LOW_WATER (20) - -/* - * The leaf hash table has block size / 2^5 (32) number of entries, - * which should be more than enough for the maximum number of entries, - * which is less than block size / CHUNKSIZE (24) / minimum number of - * chunks per entry (3). - */ -#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5) -#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l)) - -/* - * The chunks start immediately after the hash table. The end of the - * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a - * chunk_t. - */ -#define ZAP_LEAF_CHUNK(l, idx) \ - ((zap_leaf_chunk_t *) \ - ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx] -#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry) - -typedef enum zap_chunk_type { - ZAP_CHUNK_FREE = 253, - ZAP_CHUNK_ENTRY = 252, - ZAP_CHUNK_ARRAY = 251, - ZAP_CHUNK_TYPE_MAX = 250 -} zap_chunk_type_t; - -/* - * TAKE NOTE: - * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. - */ -typedef struct zap_leaf_phys { - struct zap_leaf_header { - uint64_t lh_block_type; /* ZBT_LEAF */ - uint64_t lh_pad1; - uint64_t lh_prefix; /* hash prefix of this leaf */ - uint32_t lh_magic; /* ZAP_LEAF_MAGIC */ - uint16_t lh_nfree; /* number free chunks */ - uint16_t lh_nentries; /* number of entries */ - uint16_t lh_prefix_len; /* num bits used to id this */ - -/* above is accessible to zap, below is zap_leaf private */ - - uint16_t lh_freelist; /* chunk head of free list */ - uint8_t lh_pad2[12]; - } l_hdr; /* 2 24-byte chunks */ - - /* - * The header is followed by a hash table with - * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is - * followed by an array of ZAP_LEAF_NUMCHUNKS(zap) - * zap_leaf_chunk structures. These structures are accessed - * with the ZAP_LEAF_CHUNK() macro. 
- */ - - uint16_t l_hash[1]; -} zap_leaf_phys_t; - -typedef union zap_leaf_chunk { - struct zap_leaf_entry { - uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ - uint8_t le_int_size; /* size of ints */ - uint16_t le_next; /* next entry in hash chain */ - uint16_t le_name_chunk; /* first chunk of the name */ - uint16_t le_name_length; /* bytes in name, incl null */ - uint16_t le_value_chunk; /* first chunk of the value */ - uint16_t le_value_length; /* value length in ints */ - uint32_t le_cd; /* collision differentiator */ - uint64_t le_hash; /* hash value of the name */ - } l_entry; - struct zap_leaf_array { - uint8_t la_type; /* always ZAP_CHUNK_ARRAY */ - uint8_t la_array[ZAP_LEAF_ARRAY_BYTES]; - uint16_t la_next; /* next blk or CHAIN_END */ - } l_array; - struct zap_leaf_free { - uint8_t lf_type; /* always ZAP_CHUNK_FREE */ - uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES]; - uint16_t lf_next; /* next in free list, or CHAIN_END */ - } l_free; -} zap_leaf_chunk_t; - -typedef struct zap_leaf { - krwlock_t l_rwlock; /* only used on head of chain */ - uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */ - int l_bs; /* block size shift */ - dmu_buf_t *l_dbuf; - zap_leaf_phys_t *l_phys; -} zap_leaf_t; - - -typedef struct zap_entry_handle { - /* below is set by zap_leaf.c and is public to zap.c */ - uint64_t zeh_num_integers; - uint64_t zeh_hash; - uint32_t zeh_cd; - uint8_t zeh_integer_size; - - /* below is private to zap_leaf.c */ - uint16_t zeh_fakechunk; - uint16_t *zeh_chunkp; - zap_leaf_t *zeh_leaf; -} zap_entry_handle_t; - -/* - * Return a handle to the named entry, or ENOENT if not found. The hash - * value must equal zap_hash(name). - */ -extern int zap_leaf_lookup(zap_leaf_t *l, - const char *name, uint64_t h, zap_entry_handle_t *zeh); - -/* - * Return a handle to the entry with this hash+cd, or the entry with the - * next closest hash+cd. - */ -extern int zap_leaf_lookup_closest(zap_leaf_t *l, - uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh); - -/* - * Read the first num_integers in the attribute. Integer size - * conversion will be done without sign extension. Return EINVAL if - * integer_size is too small. Return EOVERFLOW if there are more than - * num_integers in the attribute. - */ -extern int zap_entry_read(const zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, void *buf); - -extern int zap_entry_read_name(const zap_entry_handle_t *zeh, - uint16_t buflen, char *buf); - -/* - * Replace the value of an existing entry. - * - * zap_entry_update may fail if it runs out of space (ENOSPC). - */ -extern int zap_entry_update(zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, const void *buf); - -/* - * Remove an entry. - */ -extern void zap_entry_remove(zap_entry_handle_t *zeh); - -/* - * Create an entry. An equal entry must not exist, and this entry must - * belong in this leaf (according to its hash value). Fills in the - * entry handle on success. Returns 0 on success or ENOSPC on failure. - */ -extern int zap_entry_create(zap_leaf_t *l, - const char *name, uint64_t h, uint32_t cd, - uint8_t integer_size, uint64_t num_integers, const void *buf, - zap_entry_handle_t *zeh); - -/* - * Other stuff. 
- */ - -extern void zap_leaf_init(zap_leaf_t *l); -extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); -extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl); -extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZAP_LEAF_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h deleted file mode 100644 index 3250b76..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_FS_ZFS_ACL_H -#define _SYS_FS_ZFS_ACL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef _KERNEL -#include <sys/cred.h> -#endif -#include <sys/acl.h> -#include <sys/dmu.h> - -#ifdef __cplusplus -extern "C" { -#endif - -struct znode_phys; - -#define ACCESS_UNDETERMINED -1 - -#define ACE_SLOT_CNT 6 - -typedef struct zfs_znode_acl { - uint64_t z_acl_extern_obj; /* ext acl pieces */ - uint32_t z_acl_count; /* Number of ACEs */ - uint16_t z_acl_version; /* acl version */ - uint16_t z_acl_pad; /* pad */ - ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ -} zfs_znode_acl_t; - -#define ACL_DATA_ALLOCED 0x1 - -/* - * Max ACL size is prepended deny for all entries + the - * canonical six tacked on * the end. - */ -#define MAX_ACL_SIZE (MAX_ACL_ENTRIES * 2 + 6) - -typedef struct zfs_acl { - int z_slots; /* number of allocated slots for ACEs */ - int z_acl_count; - uint_t z_state; - ace_t *z_acl; -} zfs_acl_t; - -#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) - -/* - * Property values for acl_mode and acl_inherit. - * - * acl_mode can take discard, noallow, groupmask and passthrough. - * whereas acl_inherit has secure instead of groupmask. 
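The legal value sets described above can be captured in two small predicates; this is a hypothetical sketch (helper names invented) using the ZFS_ACL_* constants defined just below, not code from the deleted header.

static boolean_t
zfs_aclmode_valid(uint64_t v)
{
        /* acl_mode: discard, noallow, groupmask or passthrough */
        return (v == ZFS_ACL_DISCARD || v == ZFS_ACL_NOALLOW ||
            v == ZFS_ACL_GROUPMASK || v == ZFS_ACL_PASSTHROUGH ?
            B_TRUE : B_FALSE);
}

static boolean_t
zfs_aclinherit_valid(uint64_t v)
{
        /* acl_inherit: secure takes the place of groupmask */
        return (v == ZFS_ACL_DISCARD || v == ZFS_ACL_NOALLOW ||
            v == ZFS_ACL_SECURE || v == ZFS_ACL_PASSTHROUGH ?
            B_TRUE : B_FALSE);
}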
- */ - -#define ZFS_ACL_DISCARD 0 -#define ZFS_ACL_NOALLOW 1 -#define ZFS_ACL_GROUPMASK 2 -#define ZFS_ACL_PASSTHROUGH 3 -#define ZFS_ACL_SECURE 4 - -struct znode; - -#ifdef _KERNEL -void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *, - dmu_tx_t *, cred_t *); -#ifdef TODO -int zfs_getacl(struct znode *, vsecattr_t *, cred_t *); -#endif -int zfs_mode_update(struct znode *, uint64_t, dmu_tx_t *); -#ifdef TODO -int zfs_setacl(struct znode *, vsecattr_t *, cred_t *); -#endif -void zfs_acl_rele(void *); -void zfs_ace_byteswap(ace_t *, int); -extern int zfs_zaccess(struct znode *, int, cred_t *); -extern int zfs_zaccess_rwx(struct znode *, mode_t, cred_t *); -extern int zfs_acl_access(struct znode *, int, cred_t *); -int zfs_acl_chmod_setattr(struct znode *, uint64_t, dmu_tx_t *); -int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); -int zfs_zaccess_rename(struct znode *, struct znode *, - struct znode *, struct znode *, cred_t *cr); -int zfs_zaccess_v4_perm(struct znode *, int, cred_t *); -void zfs_acl_free(zfs_acl_t *); - -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* !ZFS_NO_ACL */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h deleted file mode 100644 index 4deeb3c..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_ZFS_CONTEXT_H -#define _SYS_ZFS_CONTEXT_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef __cplusplus -extern "C" { -#endif - -#include <sys/param.h> -#include <sys/stdint.h> -#include <sys/note.h> -#include <sys/kernel.h> -#include <sys/debug.h> -#include <sys/systm.h> -#include <sys/proc.h> -#include <sys/sysmacros.h> -#include <sys/bitmap.h> -#include <sys/cmn_err.h> -#include <sys/kmem.h> -#include <sys/taskq.h> -#include <sys/systm.h> -#include <sys/conf.h> -#include <sys/mutex.h> -#include <sys/rwlock.h> -#include <sys/random.h> -#include <sys/byteorder.h> -#include <sys/systm.h> -#include <sys/list.h> -#include <sys/uio.h> -#include <sys/dirent.h> -#include <sys/time.h> -#include <sys/uio.h> -#include <sys/fcntl.h> -#include <sys/limits.h> -#include <sys/string.h> -#include <sys/bio.h> -#include <sys/buf.h> -#include <sys/cred.h> -#include <sys/sdt.h> -#include <sys/file.h> -#include <sys/vfs.h> -#include <sys/sysctl.h> -#include <sys/sbuf.h> -#include <sys/priv.h> -#include <sys/kdb.h> -#include <sys/ktr.h> -#include <sys/stack.h> -#include <sys/lockf.h> -#include <sys/policy.h> -#include <sys/zone.h> -#include <sys/eventhandler.h> -#include <sys/misc.h> -#include <sys/zfs_debug.h> - -#include <machine/stdarg.h> - -#include <vm/vm.h> -#include <vm/vm_page.h> -#include <vm/vm_object.h> -#include <vm/vm_pager.h> -#include <vm/vm_kern.h> -#include <vm/vm_map.h> -/* There is clash. vm_map.h defines the two below and vdev_cache.c use them. */ -#ifdef min_offset -#undef min_offset -#endif -#ifdef max_offset -#undef max_offset -#endif -#include <vm/vm_extern.h> -#include <vm/vnode_pager.h> - -#define CPU_SEQID (curcpu) - -#ifdef __cplusplus -} -#endif - -extern int zfs_debug_level; -extern struct mtx zfs_debug_mtx; -#define ZFS_LOG(lvl, ...) do { \ - if (((lvl) & 0xff) <= zfs_debug_level) { \ - mtx_lock(&zfs_debug_mtx); \ - printf("%s:%u[%d]: ", __func__, __LINE__, (lvl)); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - if ((lvl) & 0x100) \ - kdb_backtrace(); \ - mtx_unlock(&zfs_debug_mtx); \ - } \ -} while (0) - -#endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h deleted file mode 100644 index a676533..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
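Referring back to the ZFS_LOG macro in zfs_context.h above: the low byte of the level is compared against zfs_debug_level, and or'ing in 0x100 additionally requests a kdb_backtrace(). A hypothetical call site (the function and messages are invented for illustration):

static void
zfs_log_example(uint64_t txg, int error)
{
        /* printed only when zfs_debug_level is at least 2 */
        ZFS_LOG(2, "txg %ju finished with error %d", (uintmax_t)txg, error);

        /* level 1 message that also dumps a kernel backtrace */
        ZFS_LOG(0x100 | 1, "unexpected state, backtrace follows");
}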
- */ - -#ifndef _ZFS_CTLDIR_H -#define _ZFS_CTLDIR_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/vnode.h> -#include <sys/zfs_vfsops.h> -#include <sys/zfs_znode.h> - -#ifdef __cplusplus -extern "C" { -#endif - -#define ZFS_CTLDIR_NAME ".zfs" - -#define zfs_has_ctldir(zdp) \ - ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \ - ((zdp)->z_zfsvfs->z_ctldir != NULL)) -#define zfs_show_ctldir(zdp) \ - (zfs_has_ctldir(zdp) && \ - ((zdp)->z_zfsvfs->z_show_ctldir)) - -void zfsctl_create(zfsvfs_t *); -void zfsctl_destroy(zfsvfs_t *); -vnode_t *zfsctl_root(znode_t *); -void zfsctl_init(void); -void zfsctl_fini(void); - -int zfsctl_rename_snapshot(const char *from, const char *to); -int zfsctl_destroy_snapshot(const char *snapname, int force); -int zfsctl_umount_snapshots(vfs_t *, int, cred_t *); - -int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, - int flags, vnode_t *rdir, cred_t *cr); - -int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); - -#define ZFSCTL_INO_ROOT 0x1 -#define ZFSCTL_INO_SNAPDIR 0x2 - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFS_CTLDIR_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h deleted file mode 100644 index 450ac1c..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_ZFS_DEBUG_H -#define _SYS_ZFS_DEBUG_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef TRUE -#define TRUE 1 -#endif - -#ifndef FALSE -#define FALSE 0 -#endif - -/* - * ZFS debugging - */ - -#if defined(DEBUG) || !defined(_KERNEL) -#define ZFS_DEBUG -#endif - -extern int zfs_flags; - -#define ZFS_DEBUG_DPRINTF 0x0001 -#define ZFS_DEBUG_DBUF_VERIFY 0x0002 -#define ZFS_DEBUG_DNODE_VERIFY 0x0004 -#define ZFS_DEBUG_SNAPNAMES 0x0008 -#define ZFS_DEBUG_MODIFY 0x0010 - -#ifdef ZFS_DEBUG -extern void __dprintf(const char *file, const char *func, - int line, const char *fmt, ...); -#define dprintf(...) \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) \ - __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__) -#else -#define dprintf(...) 
((void)0) -#endif /* ZFS_DEBUG */ - -extern void zfs_panic_recover(const char *fmt, ...); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFS_DEBUG_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h deleted file mode 100644 index f60d614..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_FS_ZFS_DIR_H -#define _SYS_FS_ZFS_DIR_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/dmu.h> -#include <sys/zfs_znode.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* zfs_dirent_lock() flags */ -#define ZNEW 0x0001 /* entry should not exist */ -#define ZEXISTS 0x0002 /* entry should exist */ -#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ -#define ZXATTR 0x0008 /* we want the xattr dir */ -#define ZRENAMING 0x0010 /* znode is being renamed */ - -/* mknode flags */ -#define IS_ROOT_NODE 0x01 /* create a root node */ -#define IS_XATTR 0x02 /* create an extended attribute node */ -#define IS_REPLAY 0x04 /* we are replaying intent log */ - -extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, - int); -extern void zfs_dirent_unlock(zfs_dirlock_t *); -extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); -extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, - boolean_t *); -extern int zfs_dirlook(znode_t *, char *, vnode_t **); -extern void zfs_mknode(znode_t *, vattr_t *, uint64_t *, - dmu_tx_t *, cred_t *, uint_t, znode_t **, int); -extern void zfs_rmnode(znode_t *); -extern boolean_t zfs_dirempty(znode_t *); -extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); -extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); -extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); -extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *, int); -extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_DIR_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h deleted file mode 100644 index 61a0a9e..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
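Referring back to the zfs_dir.h declarations above, the typical create-side sequence is: take the directory entry lock with ZNEW (the entry must not already exist), link the new znode in under an open transaction, then drop the lock. The sketch below is hypothetical; the helper name is invented and transaction setup is assumed to be done by the caller.

static int
zfs_dir_add_entry(znode_t *dzp, char *name, znode_t *zp, dmu_tx_t *tx)
{
        zfs_dirlock_t *dl;
        znode_t *xzp;
        int error;

        error = zfs_dirent_lock(&dl, dzp, name, &xzp, ZNEW);
        if (error != 0)
                return (error);

        error = zfs_link_create(dl, zp, tx, 0);
        zfs_dirent_unlock(dl);
        return (error);
}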
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_ZFS_IOCTL_H -#define _SYS_ZFS_IOCTL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/cred.h> -#include <sys/dmu.h> -#include <sys/zio.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Property values for snapdir - */ -#define ZFS_SNAPDIR_HIDDEN 0 -#define ZFS_SNAPDIR_VISIBLE 1 - -#define DMU_BACKUP_VERSION (1ULL) -#define DMU_BACKUP_MAGIC 0x2F5bacbacULL - -/* - * zfs ioctl command structure - */ -typedef struct dmu_replay_record { - enum { - DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, - DRR_WRITE, DRR_FREE, DRR_END, - } drr_type; - uint32_t drr_pad; - union { - struct drr_begin { - uint64_t drr_magic; - uint64_t drr_version; - uint64_t drr_creation_time; - dmu_objset_type_t drr_type; - uint32_t drr_pad; - uint64_t drr_toguid; - uint64_t drr_fromguid; - char drr_toname[MAXNAMELEN]; - } drr_begin; - struct drr_end { - zio_cksum_t drr_checksum; - } drr_end; - struct drr_object { - uint64_t drr_object; - dmu_object_type_t drr_type; - dmu_object_type_t drr_bonustype; - uint32_t drr_blksz; - uint32_t drr_bonuslen; - uint8_t drr_checksum; - uint8_t drr_compress; - uint8_t drr_pad[6]; - /* bonus content follows */ - } drr_object; - struct drr_freeobjects { - uint64_t drr_firstobj; - uint64_t drr_numobjs; - } drr_freeobjects; - struct drr_write { - uint64_t drr_object; - dmu_object_type_t drr_type; - uint32_t drr_pad; - uint64_t drr_offset; - uint64_t drr_length; - /* content follows */ - } drr_write; - struct drr_free { - uint64_t drr_object; - uint64_t drr_offset; - uint64_t drr_length; - } drr_free; - } drr_u; -} dmu_replay_record_t; - -typedef struct zinject_record { - uint64_t zi_objset; - uint64_t zi_object; - uint64_t zi_start; - uint64_t zi_end; - uint64_t zi_guid; - uint32_t zi_level; - uint32_t zi_error; - uint64_t zi_type; - uint32_t zi_freq; -} zinject_record_t; - -#define ZINJECT_NULL 0x1 -#define ZINJECT_FLUSH_ARC 0x2 -#define ZINJECT_UNLOAD_SPA 0x4 - -typedef struct zfs_cmd { - char zc_name[MAXPATHLEN]; - char zc_value[MAXPATHLEN * 2]; - uint64_t zc_guid; - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - uint64_t zc_cookie; - uint64_t zc_cred; - uint64_t zc_dev; - uint64_t zc_objset_type; - uint64_t zc_history; /* really (char *) */ - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - struct drr_begin zc_begin_record; - zinject_record_t zc_inject_record; -} zfs_cmd_t; - -#ifdef _KERNEL -typedef struct zfs_create_data { - cred_t *zc_cred; - dev_t zc_dev; - nvlist_t *zc_props; -} zfs_create_data_t; -#endif - -#define ZVOL_MAX_MINOR (1 << 16) -#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) - -#ifdef _KERNEL - -extern int zfs_secpolicy_write(const char *dataset, 
cred_t *cr); -extern int zfs_busy(void); -extern int zfs_unmount_snap(char *, void *); - -#endif /* _KERNEL */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFS_IOCTL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h deleted file mode 100644 index f302b66..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_FS_ZFS_RLOCK_H -#define _SYS_FS_ZFS_RLOCK_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _KERNEL - -#include <sys/zfs_znode.h> - -typedef enum { - RL_READER, - RL_WRITER, - RL_APPEND -} rl_type_t; - -typedef struct rl { - znode_t *r_zp; /* znode this lock applies to */ - avl_node_t r_node; /* avl node link */ - uint64_t r_off; /* file range offset */ - uint64_t r_len; /* file range length */ - uint_t r_cnt; /* range reference count in tree */ - rl_type_t r_type; /* range type */ - kcondvar_t r_wr_cv; /* cv for waiting writers */ - kcondvar_t r_rd_cv; /* cv for waiting readers */ - uint8_t r_proxy; /* acting for original range */ - uint8_t r_write_wanted; /* writer wants to lock this range */ - uint8_t r_read_wanted; /* reader wants to lock this range */ -} rl_t; - -/* - * Lock a range (offset, length) as either shared (READER) - * or exclusive (WRITER or APPEND). APPEND is a special type that - * is converted to WRITER that specified to lock from the start of the - * end of file. zfs_range_lock() returns the range lock structure. - */ -rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type); - -/* - * Unlock range and destroy range lock structure. - */ -void zfs_range_unlock(rl_t *rl); - -/* - * Reduce range locked as RW_WRITER from whole file to specified range. - * Asserts the whole file was previously locked. - */ -void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len); - -/* - * AVL comparison function used to compare range locks - */ -int zfs_range_compare(const void *arg1, const void *arg2); - -#endif /* _KERNEL */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_RLOCK_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h deleted file mode 100644 index aa82cc1..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
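Referring back to the zfs_rlock.h interface above, a write to part of a file takes an RL_WRITER lock over just the affected range and drops it once the update has been handed to the DMU. A hypothetical caller (name invented for illustration):

static void
zfs_modify_range(znode_t *zp, uint64_t off, uint64_t len)
{
        rl_t *rl;

        /* exclusive over just [off, off + len); readers elsewhere proceed */
        rl = zfs_range_lock(zp, off, len, RL_WRITER);

        /* ... dirty or free the pages backing the range here ... */

        zfs_range_unlock(rl);
}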
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_FS_ZFS_VFSOPS_H -#define _SYS_FS_ZFS_VFSOPS_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/list.h> -#include <sys/vfs.h> -#include <sys/zil.h> - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct zfsvfs zfsvfs_t; - -struct zfsvfs { - vfs_t *z_vfs; /* generic fs struct */ - zfsvfs_t *z_parent; /* parent fs */ - objset_t *z_os; /* objset reference */ - uint64_t z_root; /* id of root znode */ - uint64_t z_unlinkedobj; /* id of unlinked zapobj */ - uint64_t z_max_blksz; /* maximum block size for files */ - uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */ - zilog_t *z_log; /* intent log pointer */ - uint_t z_acl_mode; /* acl chmod/mode behavior */ - uint_t z_acl_inherit; /* acl inheritance behavior */ - boolean_t z_atime; /* enable atimes mount option */ - boolean_t z_unmounted1; /* unmounted phase 1 */ - boolean_t z_unmounted2; /* unmounted phase 2 */ - uint32_t z_op_cnt; /* vnode/vfs operations ref count */ - krwlock_t z_um_lock; /* rw lock for umount phase 2 */ - list_t z_all_znodes; /* all vnodes in the fs */ - kmutex_t z_znodes_lock; /* lock for z_all_znodes */ - vnode_t *z_ctldir; /* .zfs directory pointer */ - boolean_t z_show_ctldir; /* expose .zfs in the root dir */ - boolean_t z_issnap; /* true if this is a snapshot */ -#define ZFS_OBJ_MTX_SZ 64 - kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ -}; - -/* - * The total file ID size is limited to 12 bytes (including the length - * field) in the NFSv2 protocol. For historical reasons, this same limit - * is currently being imposed by the Solaris NFSv3 implementation... - * although the protocol actually permits a maximum of 64 bytes. It will - * not be possible to expand beyond 12 bytes without abandoning support - * of NFSv2 and making some changes to the Solaris NFSv3 implementation. - * - * For the time being, we will partition up the available space as follows: - * 2 bytes fid length (required) - * 6 bytes object number (48 bits) - * 4 bytes generation number (32 bits) - * We reserve only 48 bits for the object number, as this is the limit - * currently defined and imposed by the DMU. 
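The 2 + 6 + 4 byte split described above packs the object and generation numbers least-significant byte first; a hypothetical packing helper (name invented) over the zfid_short_t defined just below:

static void
zfs_fid_pack_short(zfid_short_t *zfid, uint64_t object, uint32_t gen)
{
        int i;

        zfid->zf_len = SHORT_FID_LEN;           /* defined below */
        for (i = 0; i < 6; i++)                 /* 48-bit object number */
                zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
        for (i = 0; i < 4; i++)                 /* 32-bit generation */
                zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
}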
- */ -typedef struct zfid_short { - uint16_t zf_len; - uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ - uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ -} zfid_short_t; - -typedef struct zfid_long { - zfid_short_t z_fid; - uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ - uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */ -} zfid_long_t; - -#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) -#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h deleted file mode 100644 index c9c317e..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h +++ /dev/null @@ -1,298 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_FS_ZFS_ZNODE_H -#define _SYS_FS_ZFS_ZNODE_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef _KERNEL -#include <sys/list.h> -#include <sys/dmu.h> -#include <sys/zfs_vfsops.h> -#endif -#include <sys/zfs_acl.h> -#include <sys/zil.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Define special zfs pflags - */ -#define ZFS_XATTR 0x1 /* is an extended attribute */ -#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ -#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ - -#define MASTER_NODE_OBJ 1 - -/* - * special attributes for master node. - */ - -#define ZFS_FSID "FSID" -#define ZFS_UNLINKED_SET "DELETE_QUEUE" -#define ZFS_ROOT_OBJ "ROOT" -#define ZPL_VERSION_OBJ "VERSION" -#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE" -#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS" - -#define ZFS_FLAG_BLOCKPERPAGE 0x1 -#define ZFS_FLAG_NOGROWBLOCKS 0x2 - -/* - * ZPL version - rev'd whenever an incompatible on-disk format change - * occurs. Independent of SPA/DMU/ZAP versioning. - */ - -#define ZPL_VERSION 1ULL - -#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) - -/* Path component length */ -/* - * The generic fs code uses MAXNAMELEN to represent - * what the largest component length is. Unfortunately, - * this length includes the terminating NULL. ZFS needs - * to tell the users via pathconf() and statvfs() what the - * true maximum length of a component is, excluding the NULL. - */ -#define ZFS_MAXNAMELEN (MAXNAMELEN - 1) - -/* - * The directory entry has the type (currently unused on Solaris) in the - * top 4 bits, and the object number in the low 48 bits. The "middle" - * 12 bits are unused. 
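A standalone, hypothetical worked example of that directory entry encoding (type in bits 60..63, object number in bits 0..47, the middle 12 bits zero); the values are arbitrary:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint64_t obj = 0x123456789abcULL;       /* any 48-bit object number */
        uint64_t type = 0x8;                    /* arbitrary 4-bit type tag */
        uint64_t de = (type << 60) | obj;       /* what ZFS_DIRENT_MAKE() builds */

        printf("entry  = %#llx\n", (unsigned long long)de);
        printf("type   = %llu\n", (unsigned long long)(de >> 60));
        printf("object = %#llx\n",
            (unsigned long long)(de & ((1ULL << 48) - 1)));
        return (0);
}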
- */ -#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) -#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) -#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)type << 60) | obj) - - -/* - * This is the persistent portion of the znode. It is stored - * in the "bonus buffer" of the file. Short symbolic links - * are also stored in the bonus buffer. - */ -typedef struct znode_phys { - uint64_t zp_atime[2]; /* 0 - last file access time */ - uint64_t zp_mtime[2]; /* 16 - last file modification time */ - uint64_t zp_ctime[2]; /* 32 - last file change time */ - uint64_t zp_crtime[2]; /* 48 - creation time */ - uint64_t zp_gen; /* 64 - generation (txg of creation) */ - uint64_t zp_mode; /* 72 - file mode bits */ - uint64_t zp_size; /* 80 - size of file */ - uint64_t zp_parent; /* 88 - directory parent (`..') */ - uint64_t zp_links; /* 96 - number of links to file */ - uint64_t zp_xattr; /* 104 - DMU object for xattrs */ - uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ - uint64_t zp_flags; /* 120 - persistent flags */ - uint64_t zp_uid; /* 128 - file owner */ - uint64_t zp_gid; /* 136 - owning group */ - uint64_t zp_pad[4]; /* 144 - future */ - zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */ - /* - * Data may pad out any remaining bytes in the znode buffer, eg: - * - * |<---------------------- dnode_phys (512) ------------------------>| - * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| - * |<---- znode (264) ---->|<---- data (56) ---->| - * - * At present, we only use this space to store symbolic links. - */ -} znode_phys_t; - -/* - * Directory entry locks control access to directory entries. - * They are used to protect creates, deletes, and renames. - * Each directory znode has a mutex and a list of locked names. - */ -#ifdef _KERNEL -typedef struct zfs_dirlock { - char *dl_name; /* directory entry being locked */ - uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ - uint16_t dl_namesize; /* set if dl_name was allocated */ - kcondvar_t dl_cv; /* wait for entry to be unlocked */ - struct znode *dl_dzp; /* directory znode */ - struct zfs_dirlock *dl_next; /* next in z_dirlocks list */ -} zfs_dirlock_t; - -typedef struct znode { - struct zfsvfs *z_zfsvfs; - vnode_t *z_vnode; - uint64_t z_id; /* object ID for this znode */ - kmutex_t z_lock; /* znode modification lock */ - krwlock_t z_map_lock; /* page map lock */ - krwlock_t z_parent_lock; /* parent lock for directories */ - krwlock_t z_name_lock; /* "master" lock for dirent locks */ - zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ - kmutex_t z_range_lock; /* protects changes to z_range_avl */ - avl_tree_t z_range_avl; /* avl tree of file range locks */ - uint8_t z_unlinked; /* file has been unlinked */ - uint8_t z_atime_dirty; /* atime needs to be synced */ - uint8_t z_dbuf_held; /* Is z_dbuf already held? */ - uint8_t z_zn_prefetch; /* Prefetch znodes? */ - uint_t z_blksz; /* block size in bytes */ - uint_t z_seq; /* modification sequence number */ - uint64_t z_mapcnt; /* number of pages mapped to file */ - uint64_t z_last_itx; /* last ZIL itx on this znode */ - uint32_t z_sync_cnt; /* synchronous open count */ - kmutex_t z_acl_lock; /* acl data lock */ - list_node_t z_link_node; /* all znodes in fs link */ - struct lockf *z_lockf; /* Head of byte-level lock list. */ - /* - * These are dmu managed fields. - */ - znode_phys_t *z_phys; /* pointer to persistent znode */ - dmu_buf_t *z_dbuf; /* buffer containing the z_phys */ -} znode_t; - - -/* - * Range locking rules - * -------------------- - * 1. 
When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole - * file range needs to be locked as RL_WRITER. Only then can the pages be - * freed etc and zp_size reset. zp_size must be set within range lock. - * 2. For writes and punching holes (zfs_write & zfs_space) just the range - * being written or freed needs to be locked as RL_WRITER. - * Multiple writes at the end of the file must coordinate zp_size updates - * to ensure data isn't lost. A compare and swap loop is currently used - * to ensure the file size is at least the offset last written. - * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being - * read needs to be locked as RL_READER. A check against zp_size can then - * be made for reading beyond end of file. - */ - -/* - * Convert between znode pointers and vnode pointers - */ -#define ZTOV(ZP) ((ZP)->z_vnode) -#define VTOZ(VP) ((znode_t *)(VP)->v_data) - -/* - * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation. - * ZFS_EXIT() must be called before exitting the vop. - */ -#define ZFS_ENTER(zfsvfs) \ - { \ - atomic_add_32(&(zfsvfs)->z_op_cnt, 1); \ - if ((zfsvfs)->z_unmounted1) { \ - ZFS_EXIT(zfsvfs); \ - return (EIO); \ - } \ - } -#define ZFS_EXIT(zfsvfs) atomic_add_32(&(zfsvfs)->z_op_cnt, -1) - -/* - * Macros for dealing with dmu_buf_hold - */ -#define ZFS_OBJ_HASH(obj_num) (obj_num & (ZFS_OBJ_MTX_SZ - 1)) -#define ZFS_OBJ_MUTEX(zp) \ - (&zp->z_zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(zp->z_id)]) -#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ - mutex_enter(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]); - -#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ - mutex_exit(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) - -/* - * Macros to encode/decode ZFS stored time values from/to struct timespec - */ -#define ZFS_TIME_ENCODE(tp, stmp) \ -{ \ - stmp[0] = (uint64_t)(tp)->tv_sec; \ - stmp[1] = (uint64_t)(tp)->tv_nsec; \ -} - -#define ZFS_TIME_DECODE(tp, stmp) \ -{ \ - (tp)->tv_sec = (time_t)stmp[0]; \ - (tp)->tv_nsec = (long)stmp[1]; \ -} - -/* - * Timestamp defines - */ -#define ACCESSED (AT_ATIME) -#define STATE_CHANGED (AT_CTIME) -#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) - -#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ - if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ - zfs_time_stamper(zp, ACCESSED, NULL) - -extern int zfs_init_fs(zfsvfs_t *, znode_t **, cred_t *); -extern void zfs_set_dataprop(objset_t *); -extern void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx); -extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *); -extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *); -extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); -extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); -extern void zfs_znode_init(void); -extern void zfs_znode_fini(void); -extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); -extern void zfs_zinactive(znode_t *); -extern void zfs_znode_delete(znode_t *, dmu_tx_t *); -extern void zfs_znode_free(znode_t *); -extern void zfs_remove_op_tables(); -extern int zfs_create_op_tables(); -extern dev_t zfs_cmpldev(uint64_t); - -extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *dzp, znode_t *zp, char *name); -extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *dzp, char *name); -extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *dzp, znode_t *zp, char *name); -extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *dzp, znode_t *zp, char 
*name, char *link); -extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); -extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t len, int ioflag); -extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, uint64_t off, uint64_t len); -extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, vattr_t *vap, uint_t mask_applied); -#ifndef ZFS_NO_ACL -extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, int aclcnt, ace_t *z_ace); -#endif - -extern zil_get_data_t zfs_get_data; -extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; -extern int zfsfstype; - -#endif /* _KERNEL */ - -extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_ZNODE_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h deleted file mode 100644 index 947ba9f..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h +++ /dev/null @@ -1,276 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_ZIL_H -#define _SYS_ZIL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <sys/spa.h> -#include <sys/zio.h> -#include <sys/dmu.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Intent log format: - * - * Each objset has its own intent log. The log header (zil_header_t) - * for objset N's intent log is kept in the Nth object of the SPA's - * intent_log objset. The log header points to a chain of log blocks, - * each of which contains log records (i.e., transactions) followed by - * a log block trailer (zil_trailer_t). The format of a log record - * depends on the record (or transaction) type, but all records begin - * with a common structure that defines the type, length, and txg. - */ - -/* - * Intent log header - this on disk structure holds fields to manage - * the log. All fields are 64 bit to easily handle cross architectures. - */ -typedef struct zil_header { - uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ - uint64_t zh_replay_seq; /* highest replayed sequence number */ - blkptr_t zh_log; /* log chain */ - uint64_t zh_claim_seq; /* highest claimed sequence number */ - uint64_t zh_pad[5]; -} zil_header_t; - -/* - * Log block trailer - structure at the end of the header and each log block - * - * The zit_bt contains a zbt_cksum which for the intent log is - * the sequence number of this log block. 
A seq of 0 is invalid. - * The zbt_cksum is checked by the SPA against the sequence - * number passed in the blk_cksum field of the blkptr_t - */ -typedef struct zil_trailer { - uint64_t zit_pad; - blkptr_t zit_next_blk; /* next block in chain */ - uint64_t zit_nused; /* bytes in log block used */ - zio_block_tail_t zit_bt; /* block trailer */ -} zil_trailer_t; - -#define ZIL_MIN_BLKSZ 4096ULL -#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE -#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t)) - -/* - * The words of a log block checksum. - */ -#define ZIL_ZC_GUID_0 0 -#define ZIL_ZC_GUID_1 1 -#define ZIL_ZC_OBJSET 2 -#define ZIL_ZC_SEQ 3 - -/* - * Intent log transaction types and record structures - */ -#define TX_CREATE 1 /* Create file */ -#define TX_MKDIR 2 /* Make directory */ -#define TX_MKXATTR 3 /* Make XATTR directory */ -#define TX_SYMLINK 4 /* Create symbolic link to a file */ -#define TX_REMOVE 5 /* Remove file */ -#define TX_RMDIR 6 /* Remove directory */ -#define TX_LINK 7 /* Create hard link to a file */ -#define TX_RENAME 8 /* Rename a file */ -#define TX_WRITE 9 /* File write */ -#define TX_TRUNCATE 10 /* Truncate a file */ -#define TX_SETATTR 11 /* Set file attributes */ -#define TX_ACL 12 /* Set acl */ -#define TX_MAX_TYPE 13 /* Max transaction type */ - -/* - * Format of log records. - * The fields are carefully defined to allow them to be aligned - * and sized the same on sparc & intel architectures. - * Each log record has a common structure at the beginning. - * - * Note, lrc_seq holds two different sequence numbers. Whilst in memory - * it contains the transaction sequence number. The log record on - * disk holds the sequence number of all log records which is used to - * ensure we don't replay the same record. The two sequence numbers are - * different because the transactions can now be pushed out of order. 
- */ -typedef struct { /* common log record header */ - uint64_t lrc_txtype; /* intent log transaction type */ - uint64_t lrc_reclen; /* transaction record length */ - uint64_t lrc_txg; /* dmu transaction group number */ - uint64_t lrc_seq; /* see comment above */ -} lr_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_doid; /* object id of directory */ - uint64_t lr_foid; /* object id of created file object */ - uint64_t lr_mode; /* mode of object */ - uint64_t lr_uid; /* uid of object */ - uint64_t lr_gid; /* gid of object */ - uint64_t lr_gen; /* generation (txg of creation) */ - uint64_t lr_crtime[2]; /* creation time */ - uint64_t lr_rdev; /* rdev of object to create */ - /* name of object to create follows this */ - /* for symlinks, link content follows name */ -} lr_create_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_doid; /* obj id of directory */ - /* name of object to remove follows this */ -} lr_remove_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_doid; /* obj id of directory */ - uint64_t lr_link_obj; /* obj id of link */ - /* name of object to link follows this */ -} lr_link_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_sdoid; /* obj id of source directory */ - uint64_t lr_tdoid; /* obj id of target directory */ - /* 2 strings: names of source and destination follow this */ -} lr_rename_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* file object to write */ - uint64_t lr_offset; /* offset to write to */ - uint64_t lr_length; /* user data length to write */ - uint64_t lr_blkoff; /* offset represented by lr_blkptr */ - blkptr_t lr_blkptr; /* spa block pointer for replay */ - /* write data will follow for small writes */ -} lr_write_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* object id of file to truncate */ - uint64_t lr_offset; /* offset to truncate from */ - uint64_t lr_length; /* length to truncate */ -} lr_truncate_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* file object to change attributes */ - uint64_t lr_mask; /* mask of attributes to set */ - uint64_t lr_mode; /* mode to set */ - uint64_t lr_uid; /* uid to set */ - uint64_t lr_gid; /* gid to set */ - uint64_t lr_size; /* size to set */ - uint64_t lr_atime[2]; /* access time */ - uint64_t lr_mtime[2]; /* modification time */ -} lr_setattr_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* obj id of file */ - uint64_t lr_aclcnt; /* number of acl entries */ - /* lr_aclcnt number of ace_t entries follow this */ -} lr_acl_t; - -/* - * ZIL structure definitions, interface function prototype and globals. 
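A hypothetical sketch of how a consumer can walk the records packed into one log block using only the common header above: lrc_reclen gives the total record length (including any immediate data), and lrc_txtype selects which lr_*_t structure follows. The function name and bounds checks are invented for illustration.

static void
zil_walk_records(const char *buf, uint64_t nused)
{
        const char *p = buf;
        const char *end = buf + nused;

        while (p + sizeof (lr_t) <= end) {
                const lr_t *lr = (const lr_t *)(const void *)p;

                if (lr->lrc_reclen < sizeof (lr_t) ||
                    p + lr->lrc_reclen > end)
                        break;

                switch (lr->lrc_txtype) {
                case TX_WRITE:
                        /* lr_write_t; small writes carry data after it */
                        break;
                case TX_CREATE:
                case TX_MKDIR:
                case TX_SYMLINK:
                        /* lr_create_t; name (and link target) follow it */
                        break;
                default:
                        break;
                }
                p += lr->lrc_reclen;
        }
}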
- */ - -/* - * ZFS intent log transaction structure - */ -typedef enum { - WR_INDIRECT, /* indirect - a large write (dmu_sync() data */ - /* and put blkptr in log, rather than actual data) */ - WR_COPIED, /* immediate - data is copied into lr_write_t */ - WR_NEED_COPY, /* immediate - data needs to be copied if pushed */ -} itx_wr_state_t; - -typedef struct itx { - list_node_t itx_node; /* linkage on zl_itx_list */ - void *itx_private; /* type-specific opaque data */ - itx_wr_state_t itx_wr_state; /* write state */ - uint8_t itx_sync; /* synchronous transaction */ - lr_t itx_lr; /* common part of log record */ - /* followed by type-specific part of lr_xx_t and its immediate data */ -} itx_t; - - -/* - * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done() - * to handle the cleanup of the dmu_sync() buffer write - */ -typedef struct { - zilog_t *zgd_zilog; /* zilog */ - blkptr_t *zgd_bp; /* block pointer */ - struct rl *zgd_rl; /* range lock */ -} zgd_t; - - -typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, - uint64_t txg); -typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, - uint64_t txg); -typedef int zil_replay_func_t(); -typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); - -extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, - zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); - -extern void zil_init(void); -extern void zil_fini(void); - -extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys); -extern void zil_free(zilog_t *zilog); - -extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); -extern void zil_close(zilog_t *zilog); - -extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE]); -extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); - -extern itx_t *zil_itx_create(int txtype, size_t lrsize); -extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); - -extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid); - -extern int zil_claim(char *osname, void *txarg); -extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); -extern void zil_clean(zilog_t *zilog); -extern int zil_is_committed(zilog_t *zilog); - -extern int zil_suspend(zilog_t *zilog); -extern void zil_resume(zilog_t *zilog); - -extern void zil_add_vdev(zilog_t *zilog, uint64_t vdev); - -extern int zil_disable; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZIL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h deleted file mode 100644 index 3ecf4e4..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
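Referring back to the zil.h interfaces above, the in-memory life cycle of a log record is: allocate an itx sized for its record, fill in the type-specific lr_*_t that starts at itx_lr, assign it to the open transaction, and later force it to stable storage with zil_commit(). A hypothetical truncate logger (name invented; the real zfs_log_truncate() is declared in zfs_znode.h above):

static uint64_t
log_truncate_sketch(zilog_t *zilog, dmu_tx_t *tx, uint64_t foid,
    uint64_t off, uint64_t len)
{
        itx_t *itx;
        lr_truncate_t *lr;

        itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
        lr = (lr_truncate_t *)&itx->itx_lr;
        lr->lr_foid = foid;
        lr->lr_offset = off;
        lr->lr_length = len;

        /* returns the itx sequence number to pass to zil_commit() */
        return (zil_itx_assign(zilog, itx, tx));
}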
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_ZIL_IMPL_H -#define _SYS_ZIL_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zil.h> -#include <sys/dmu_objset.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Log write buffer. - */ -typedef struct lwb { - zilog_t *lwb_zilog; /* back pointer to log struct */ - blkptr_t lwb_blk; /* on disk address of this log blk */ - int lwb_nused; /* # used bytes in buffer */ - int lwb_sz; /* size of block and buffer */ - char *lwb_buf; /* log write buffer */ - zio_t *lwb_zio; /* zio for this buffer */ - uint64_t lwb_max_txg; /* highest txg in this lwb */ - txg_handle_t lwb_txgh; /* txg handle for txg_exit() */ - list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ -} lwb_t; - -/* - * Vdev flushing: We use a bit map of size ZIL_VDEV_BMAP bytes. - * Any vdev numbers beyond that use a linked list of zil_vdev_t structures. - */ - -#define ZIL_VDEV_BMSZ 16 /* 16 * 8 = 128 vdevs */ -typedef struct zil_vdev { - uint64_t vdev; /* device written */ - list_node_t vdev_seq_node; /* zilog->zl_vdev_list linkage */ -} zil_vdev_t; - -/* - * Stable storage intent log management structure. One per dataset. - */ -struct zilog { - kmutex_t zl_lock; /* protects most zilog_t fields */ - struct dsl_pool *zl_dmu_pool; /* DSL pool */ - spa_t *zl_spa; /* handle for read/write log */ - const zil_header_t *zl_header; /* log header buffer */ - objset_t *zl_os; /* object set we're logging */ - zil_get_data_t *zl_get_data; /* callback to get object content */ - zio_t *zl_root_zio; /* log writer root zio */ - uint64_t zl_itx_seq; /* next itx sequence number */ - uint64_t zl_commit_seq; /* committed upto this number */ - uint64_t zl_lr_seq; /* log record sequence number */ - uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ - uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */ - uint32_t zl_suspend; /* log suspend count */ - kcondvar_t zl_cv_writer; /* log writer thread completion */ - kcondvar_t zl_cv_suspend; /* log suspend completion */ - uint8_t zl_suspending; /* log is currently suspending */ - uint8_t zl_keep_first; /* keep first log block in destroy */ - uint8_t zl_stop_replay; /* don't replay any further */ - uint8_t zl_stop_sync; /* for debugging */ - uint8_t zl_writer; /* boolean: write setup in progress */ - uint8_t zl_log_error; /* boolean: log write error */ - list_t zl_itx_list; /* in-memory itx list */ - uint64_t zl_itx_list_sz; /* total size of records on list */ - uint64_t zl_cur_used; /* current commit log size used */ - uint64_t zl_prev_used; /* previous commit log size used */ - list_t zl_lwb_list; /* in-flight log write list */ - list_t zl_vdev_list; /* list of [vdev, seq] pairs */ - uint8_t zl_vdev_bmap[ZIL_VDEV_BMSZ]; /* bitmap of vdevs */ - taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */ - avl_tree_t zl_dva_tree; /* track DVAs during log parse */ - clock_t zl_replay_time; /* lbolt of when replay started */ - uint64_t zl_replay_blks; /* number of log blocks replayed */ -}; - -typedef struct zil_dva_node { - dva_t zn_dva; - avl_node_t zn_node; -} zil_dva_node_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZIL_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h 
b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h deleted file mode 100644 index b026ae6..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ /dev/null @@ -1,366 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _ZIO_H -#define _ZIO_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/txg.h> -#include <sys/avl.h> -#include <sys/dkio.h> -#include <sys/fs/zfs.h> -#include <sys/zio_impl.h> - -#ifdef __cplusplus -extern "C" { -#endif - -#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */ - -typedef struct zio_block_tail { - uint64_t zbt_magic; /* for validation, endianness */ - zio_cksum_t zbt_cksum; /* 256-bit checksum */ -} zio_block_tail_t; - -/* - * Gang block headers are self-checksumming and contain an array - * of block pointers. - */ -#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE -#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_block_tail_t)) / sizeof (blkptr_t)) -#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_block_tail_t) - \ - (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ - sizeof (uint64_t)) - -#define ZIO_GET_IOSIZE(zio) \ - (BP_IS_GANG((zio)->io_bp) ? 
\ - SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp)) - -typedef struct zio_gbh { - blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; - uint64_t zg_filler[SPA_GBH_FILLER]; - zio_block_tail_t zg_tail; -} zio_gbh_phys_t; - -enum zio_checksum { - ZIO_CHECKSUM_INHERIT = 0, - ZIO_CHECKSUM_ON, - ZIO_CHECKSUM_OFF, - ZIO_CHECKSUM_LABEL, - ZIO_CHECKSUM_GANG_HEADER, - ZIO_CHECKSUM_ZILOG, - ZIO_CHECKSUM_FLETCHER_2, - ZIO_CHECKSUM_FLETCHER_4, - ZIO_CHECKSUM_SHA256, - ZIO_CHECKSUM_FUNCTIONS -}; - -#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2 -#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON - -enum zio_compress { - ZIO_COMPRESS_INHERIT = 0, - ZIO_COMPRESS_ON, - ZIO_COMPRESS_OFF, - ZIO_COMPRESS_LZJB, - ZIO_COMPRESS_EMPTY, - ZIO_COMPRESS_GZIP_1, - ZIO_COMPRESS_GZIP_2, - ZIO_COMPRESS_GZIP_3, - ZIO_COMPRESS_GZIP_4, - ZIO_COMPRESS_GZIP_5, - ZIO_COMPRESS_GZIP_6, - ZIO_COMPRESS_GZIP_7, - ZIO_COMPRESS_GZIP_8, - ZIO_COMPRESS_GZIP_9, - ZIO_COMPRESS_FUNCTIONS -}; - -#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB -#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF - -#define ZIO_PRIORITY_NOW (zio_priority_table[0]) -#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) -#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) -#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3]) -#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4]) -#define ZIO_PRIORITY_FREE (zio_priority_table[5]) -#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6]) -#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7]) -#define ZIO_PRIORITY_RESILVER (zio_priority_table[8]) -#define ZIO_PRIORITY_SCRUB (zio_priority_table[9]) -#define ZIO_PRIORITY_TABLE_SIZE 10 - -#define ZIO_FLAG_MUSTSUCCEED 0x00000 -#define ZIO_FLAG_CANFAIL 0x00001 -#define ZIO_FLAG_FAILFAST 0x00002 -#define ZIO_FLAG_CONFIG_HELD 0x00004 -#define ZIO_FLAG_CONFIG_GRABBED 0x00008 - -#define ZIO_FLAG_DONT_CACHE 0x00010 -#define ZIO_FLAG_DONT_QUEUE 0x00020 -#define ZIO_FLAG_DONT_PROPAGATE 0x00040 -#define ZIO_FLAG_DONT_RETRY 0x00080 - -#define ZIO_FLAG_PHYSICAL 0x00100 -#define ZIO_FLAG_IO_BYPASS 0x00200 -#define ZIO_FLAG_IO_REPAIR 0x00400 -#define ZIO_FLAG_SPECULATIVE 0x00800 - -#define ZIO_FLAG_RESILVER 0x01000 -#define ZIO_FLAG_SCRUB 0x02000 -#define ZIO_FLAG_SCRUB_THREAD 0x04000 -#define ZIO_FLAG_SUBBLOCK 0x08000 - -#define ZIO_FLAG_NOBOOKMARK 0x10000 -#define ZIO_FLAG_USER 0x20000 - -#define ZIO_FLAG_METADATA 0x40000 - -#define ZIO_FLAG_GANG_INHERIT \ - (ZIO_FLAG_CANFAIL | \ - ZIO_FLAG_FAILFAST | \ - ZIO_FLAG_CONFIG_HELD | \ - ZIO_FLAG_DONT_RETRY | \ - ZIO_FLAG_IO_REPAIR | \ - ZIO_FLAG_SPECULATIVE | \ - ZIO_FLAG_RESILVER | \ - ZIO_FLAG_SCRUB | \ - ZIO_FLAG_SCRUB_THREAD) - -#define ZIO_FLAG_VDEV_INHERIT \ - (ZIO_FLAG_GANG_INHERIT | \ - ZIO_FLAG_DONT_CACHE | \ - ZIO_FLAG_PHYSICAL) - -/* - * We'll take the EILSEQ (Illegal byte sequence) errno - * to indicate checksum errors. - */ -#define ECKSUM EILSEQ - -typedef struct zio zio_t; -typedef void zio_done_func_t(zio_t *zio); - -extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; -extern char *zio_type_name[ZIO_TYPES]; - -/* - * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely - * identifies any block in the pool. By convention, the meta-objset (MOS) - * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is - * level -1 of the meta-dnode, and intent log blocks (which are chained - * off the root block) have blkid == sequence number. 
In summary: - * - * mos is objset 0 - * meta-dnode is object 0 - * root block is <objset, 0, -1, 0> - * intent log is <objset, 0, -1, ZIL sequence number> - * - * Note: this structure is called a bookmark because its first purpose was - * to remember where to resume a pool-wide traverse. The absolute ordering - * for block visitation during traversal is defined in compare_bookmark(). - * - * Note: this structure is passed between userland and the kernel. - * Therefore it must not change size or alignment between 32/64 bit - * compilation options. - */ -typedef struct zbookmark { - uint64_t zb_objset; - uint64_t zb_object; - int64_t zb_level; - uint64_t zb_blkid; -} zbookmark_t; - -struct zio { - /* Core information about this I/O */ - zio_t *io_parent; - zio_t *io_root; - spa_t *io_spa; - zbookmark_t io_bookmark; - enum zio_checksum io_checksum; - enum zio_compress io_compress; - int io_ndvas; - uint64_t io_txg; - blkptr_t *io_bp; - blkptr_t io_bp_copy; - zio_t *io_child; - zio_t *io_sibling_prev; - zio_t *io_sibling_next; - zio_transform_t *io_transform_stack; - zio_t *io_logical; - - /* Callback info */ - zio_done_func_t *io_ready; - zio_done_func_t *io_done; - void *io_private; - blkptr_t io_bp_orig; - - /* Data represented by this I/O */ - void *io_data; - uint64_t io_size; - - /* Stuff for the vdev stack */ - vdev_t *io_vd; - void *io_vsd; - uint64_t io_offset; - uint64_t io_deadline; - uint64_t io_timestamp; - avl_node_t io_offset_node; - avl_node_t io_deadline_node; - avl_tree_t *io_vdev_tree; - zio_t *io_delegate_list; - zio_t *io_delegate_next; - - /* Internal pipeline state */ - int io_flags; - enum zio_type io_type; - enum zio_stage io_stage; - uint8_t io_stalled; - uint8_t io_priority; - struct dk_callback io_dk_callback; - int io_cmd; - int io_retries; - int io_error; - uint32_t io_numerrors; - uint32_t io_pipeline; - uint32_t io_async_stages; - uint64_t io_children_notready; - uint64_t io_children_notdone; - void *io_waiter; - kmutex_t io_lock; - kcondvar_t io_cv; - - /* FMA state */ - uint64_t io_ena; -}; - -extern zio_t *zio_null(zio_t *pio, spa_t *spa, - zio_done_func_t *done, void *private, int flags); - -extern zio_t *zio_root(spa_t *spa, - zio_done_func_t *done, void *private, int flags); - -extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, - uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, zbookmark_t *zb); - -extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, - int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb); - -extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum, - uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags, - zbookmark_t *zb); - -extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private); - -extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private); - -extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, int priority, int flags); - -extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags); - -extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int 
checksum, - zio_done_func_t *done, void *private, int priority, int flags); - -extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, - blkptr_t *old_bp, uint64_t txg); -extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg); - -extern int zio_wait(zio_t *zio); -extern void zio_nowait(zio_t *zio); - -extern void *zio_buf_alloc(size_t size); -extern void zio_buf_free(void *buf, size_t size); -extern void *zio_data_buf_alloc(size_t size); -extern void zio_data_buf_free(void *buf, size_t size); - -/* - * Move an I/O to the next stage of the pipeline and execute that stage. - * There's no locking on io_stage because there's no legitimate way for - * multiple threads to be attempting to process the same I/O. - */ -extern void zio_next_stage(zio_t *zio); -extern void zio_next_stage_async(zio_t *zio); -extern void zio_wait_children_done(zio_t *zio); - -/* - * Delegate I/O to a child vdev. - */ -extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, - uint64_t offset, void *data, uint64_t size, int type, int priority, - int flags, zio_done_func_t *done, void *private); - -extern void zio_vdev_io_bypass(zio_t *zio); -extern void zio_vdev_io_reissue(zio_t *zio); -extern void zio_vdev_io_redone(zio_t *zio); - -extern void zio_checksum_verified(zio_t *zio); -extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp); - -extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); -extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); - -boolean_t zio_should_retry(zio_t *zio); - -/* - * Initial setup and teardown. - */ -extern void zio_init(void); -extern void zio_fini(void); - -/* - * Fault injection - */ -struct zinject_record; -extern uint32_t zio_injection_enabled; -extern int zio_inject_fault(char *name, int flags, int *id, - struct zinject_record *record); -extern int zio_inject_list_next(int *id, char *name, size_t buflen, - struct zinject_record *record); -extern int zio_clear_fault(int id); -extern int zio_handle_fault_injection(zio_t *zio, int error); -extern int zio_handle_device_injection(vdev_t *vd, int error); - -#ifdef __cplusplus -} -#endif - -#endif /* _ZIO_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h deleted file mode 100644 index bb7bd41..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_ZIO_CHECKSUM_H -#define _SYS_ZIO_CHECKSUM_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zio.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Signature for checksum functions. - */ -typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); - -/* - * Information about each checksum function. - */ -typedef struct zio_checksum_info { - zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ - int ci_correctable; /* number of correctable bits */ - int ci_zbt; /* uses zio block tail? */ - char *ci_name; /* descriptive name */ -} zio_checksum_info_t; - -extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; - -/* - * Checksum routines. - */ -extern zio_checksum_t fletcher_2_native; -extern zio_checksum_t fletcher_4_native; -extern zio_checksum_t fletcher_4_incremental_native; - -extern zio_checksum_t fletcher_2_byteswap; -extern zio_checksum_t fletcher_4_byteswap; -extern zio_checksum_t fletcher_4_incremental_byteswap; - -extern zio_checksum_t zio_checksum_SHA256; - -extern void zio_checksum(uint_t checksum, zio_cksum_t *zcp, - void *data, uint64_t size); -extern int zio_checksum_error(zio_t *zio); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZIO_CHECKSUM_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h deleted file mode 100644 index 66ee8d4..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_ZIO_COMPRESS_H -#define _SYS_ZIO_COMPRESS_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zio.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Common signature for all zio compress/decompress functions. - */ -typedef size_t zio_compress_func_t(void *src, void *dst, - size_t s_len, size_t d_len, int); -typedef int zio_decompress_func_t(void *src, void *dst, - size_t s_len, size_t d_len, int); - -/* - * Information about each compression function. - */ -typedef struct zio_compress_info { - zio_compress_func_t *ci_compress; /* compression function */ - zio_decompress_func_t *ci_decompress; /* decompression function */ - int ci_level; /* level parameter */ - char *ci_name; /* algorithm name */ -} zio_compress_info_t; - -extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; - -/* - * Compression routines. 
- */ -extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); - -/* - * Compress and decompress data if necessary. - */ -extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize, - void **destp, uint64_t *destsizep, uint64_t *destbufsizep); -extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, - void *dest, uint64_t destsize); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZIO_COMPRESS_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h deleted file mode 100644 index d2ddbc3..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _ZIO_IMPL_H -#define _ZIO_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/zio.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * I/O Groups: pipeline stage definitions. - */ - -typedef enum zio_stage { - ZIO_STAGE_OPEN = 0, /* RWFCI */ - ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */ - - ZIO_STAGE_WRITE_COMPRESS, /* -W--- */ - ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ - - ZIO_STAGE_GANG_PIPELINE, /* -WFC- */ - - ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */ - ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */ - ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */ - ZIO_STAGE_CLAIM_GANG_MEMBERS, /* ---C- */ - - ZIO_STAGE_DVA_ALLOCATE, /* -W--- */ - ZIO_STAGE_DVA_FREE, /* --F-- */ - ZIO_STAGE_DVA_CLAIM, /* ---C- */ - - ZIO_STAGE_GANG_CHECKSUM_GENERATE, /* -W--- */ - - ZIO_STAGE_READY, /* RWFCI */ - - ZIO_STAGE_VDEV_IO_START, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ - ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ - - ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */ - - ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */ - ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */ - ZIO_STAGE_READ_DECOMPRESS, /* R---- */ - - ZIO_STAGE_DONE /* RWFCI */ -} zio_stage_t; - -/* - * The stages for which there's some performance value in going async. - * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well. 
- */ -#define ZIO_ASYNC_PIPELINE_STAGES \ - ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_VDEV_IO_DONE) | \ - (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ - (1U << ZIO_STAGE_READ_DECOMPRESS)) - -#define ZIO_VDEV_IO_PIPELINE \ - ((1U << ZIO_STAGE_VDEV_IO_START) | \ - (1U << ZIO_STAGE_VDEV_IO_DONE) | \ - (1U << ZIO_STAGE_VDEV_IO_ASSESS)) - -#define ZIO_READ_PHYS_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ - (1U << ZIO_STAGE_DONE)) - -#define ZIO_READ_PIPELINE \ - ZIO_READ_PHYS_PIPELINE - -#define ZIO_WRITE_PHYS_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) - -#define ZIO_WRITE_COMMON_PIPELINE \ - ZIO_WRITE_PHYS_PIPELINE - -#define ZIO_WRITE_PIPELINE \ - ((1U << ZIO_STAGE_WRITE_COMPRESS) | \ - ZIO_WRITE_COMMON_PIPELINE) - -#define ZIO_GANG_STAGES \ - ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_READ_GANG_MEMBERS)) - -#define ZIO_REWRITE_PIPELINE \ - ((1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \ - ZIO_WRITE_COMMON_PIPELINE) - -#define ZIO_WRITE_ALLOCATE_PIPELINE \ - ((1U << ZIO_STAGE_DVA_ALLOCATE) | \ - ZIO_WRITE_COMMON_PIPELINE) - -#define ZIO_GANG_FREE_STAGES \ - ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_FREE_GANG_MEMBERS)) - -#define ZIO_FREE_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_DVA_FREE) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) - -#define ZIO_CLAIM_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_DVA_CLAIM) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) - -#define ZIO_IOCTL_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) - -#define ZIO_WAIT_FOR_CHILDREN_PIPELINE \ - ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) - -#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \ - ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) - -#define ZIO_VDEV_CHILD_PIPELINE \ - (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \ - ZIO_VDEV_IO_PIPELINE) - -#define ZIO_ERROR_PIPELINE_MASK \ - ZIO_WAIT_FOR_CHILDREN_PIPELINE - -typedef struct zio_transform zio_transform_t; -struct zio_transform { - void *zt_data; - uint64_t zt_size; - uint64_t zt_bufsize; - zio_transform_t *zt_next; -}; - -extern void zio_inject_init(void); -extern void zio_inject_fini(void); - -#ifdef __cplusplus 
-} -#endif - -#endif /* _ZIO_IMPL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h deleted file mode 100644 index df85824..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_ZVOL_H -#define _SYS_ZVOL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _KERNEL -extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); -extern int zvol_check_volblocksize(uint64_t volblocksize); -extern int zvol_get_stats(objset_t *os, nvlist_t *nv); -extern void zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx); -extern int zvol_create_minor(const char *, dev_t); -extern int zvol_remove_minor(const char *); -extern int zvol_set_volsize(const char *, dev_t, uint64_t); -extern int zvol_set_volblocksize(const char *, uint64_t); - -extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); -extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr); -#ifndef __FreeBSD__ -extern int zvol_strategy(buf_t *bp); -extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr); -extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr); -extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr); -extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr); -#endif -extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, - int *rvalp); -extern int zvol_busy(void); -extern void zvol_init(void); -extern void zvol_fini(void); -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZVOL_H */ diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c deleted file mode 100644 index 844beb6..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c +++ /dev/null @@ -1,611 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/txg_impl.h> -#include <sys/dmu_impl.h> -#include <sys/dsl_pool.h> -#include <sys/callb.h> - -/* - * Pool-wide transaction groups. - */ - -static void txg_sync_thread(void *arg); -static void txg_quiesce_thread(void *arg); -static void txg_timelimit_thread(void *arg); - -int txg_time = 5; /* max 5 seconds worth of delta per txg */ - -/* - * Prepare the txg subsystem. - */ -void -txg_init(dsl_pool_t *dp, uint64_t txg) -{ - tx_state_t *tx = &dp->dp_tx; - int c, i; - bzero(tx, sizeof (tx_state_t)); - - tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); - for (c = 0; c < max_ncpus; c++) { - mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); - for (i = 0; i < TXG_SIZE; i++) - cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL); - } - - rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL); - mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_timeout_exit_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); - - tx->tx_open_txg = txg; -} - -/* - * Close down the txg subsystem. - */ -void -txg_fini(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - int c, i; - - ASSERT(tx->tx_threads == 0); - - cv_destroy(&tx->tx_exit_cv); - cv_destroy(&tx->tx_timeout_exit_cv); - cv_destroy(&tx->tx_quiesce_done_cv); - cv_destroy(&tx->tx_quiesce_more_cv); - cv_destroy(&tx->tx_sync_done_cv); - cv_destroy(&tx->tx_sync_more_cv); - rw_destroy(&tx->tx_suspend); - mutex_destroy(&tx->tx_sync_lock); - - for (c = 0; c < max_ncpus; c++) { - for (i = 0; i < TXG_SIZE; i++) - cv_destroy(&tx->tx_cpu[c].tc_cv[i]); - mutex_destroy(&tx->tx_cpu[c].tc_lock); - } - - kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); - - bzero(tx, sizeof (tx_state_t)); -} - -/* - * Start syncing transaction groups. 
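- * (Creates the quiesce, sync and timelimit threads; txg_sync_stop() is
- * the matching teardown.)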
- */ -void -txg_sync_start(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - - mutex_enter(&tx->tx_sync_lock); - - dprintf("pool %p\n", dp); - - ASSERT(tx->tx_threads == 0); - - tx->tx_threads = 3; - - tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, - dp, 0, &p0, TS_RUN, minclsyspri); - - tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread, - dp, 0, &p0, TS_RUN, minclsyspri); - - tx->tx_timelimit_thread = thread_create(NULL, 0, txg_timelimit_thread, - dp, 0, &p0, TS_RUN, minclsyspri); - - mutex_exit(&tx->tx_sync_lock); -} - -static void -txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) -{ - CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); - mutex_enter(&tx->tx_sync_lock); -} - -static void -txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) -{ - ASSERT(*tpp != NULL); - *tpp = NULL; - tx->tx_threads--; - cv_broadcast(&tx->tx_exit_cv); - CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ - thread_exit(); -} - -static void -txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, int secmax) -{ - CALLB_CPR_SAFE_BEGIN(cpr); - - if (secmax) - (void) cv_timedwait(cv, &tx->tx_sync_lock, secmax * hz); - else - cv_wait(cv, &tx->tx_sync_lock); - - CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); -} - -/* - * Stop syncing transaction groups. - */ -void -txg_sync_stop(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - - dprintf("pool %p\n", dp); - /* - * Finish off any work in progress. - */ - ASSERT(tx->tx_threads == 3); - txg_wait_synced(dp, 0); - - /* - * Wake all 3 sync threads (one per state) and wait for them to die. - */ - mutex_enter(&tx->tx_sync_lock); - - ASSERT(tx->tx_threads == 3); - - tx->tx_exiting = 1; - - cv_broadcast(&tx->tx_quiesce_more_cv); - cv_broadcast(&tx->tx_quiesce_done_cv); - cv_broadcast(&tx->tx_sync_more_cv); - cv_broadcast(&tx->tx_timeout_exit_cv); - - while (tx->tx_threads != 0) - cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); - - tx->tx_exiting = 0; - - mutex_exit(&tx->tx_sync_lock); -} - -uint64_t -txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) -{ - tx_state_t *tx = &dp->dp_tx; - tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID]; - uint64_t txg; - - mutex_enter(&tc->tc_lock); - - txg = tx->tx_open_txg; - tc->tc_count[txg & TXG_MASK]++; - - th->th_cpu = tc; - th->th_txg = txg; - - return (txg); -} - -void -txg_rele_to_quiesce(txg_handle_t *th) -{ - tx_cpu_t *tc = th->th_cpu; - - mutex_exit(&tc->tc_lock); -} - -void -txg_rele_to_sync(txg_handle_t *th) -{ - tx_cpu_t *tc = th->th_cpu; - int g = th->th_txg & TXG_MASK; - - mutex_enter(&tc->tc_lock); - ASSERT(tc->tc_count[g] != 0); - if (--tc->tc_count[g] == 0) - cv_broadcast(&tc->tc_cv[g]); - mutex_exit(&tc->tc_lock); - - th->th_cpu = NULL; /* defensive */ -} - -static void -txg_quiesce(dsl_pool_t *dp, uint64_t txg) -{ - tx_state_t *tx = &dp->dp_tx; - int g = txg & TXG_MASK; - int c; - - /* - * Grab all tx_cpu locks so nobody else can get into this txg. - */ - for (c = 0; c < max_ncpus; c++) - mutex_enter(&tx->tx_cpu[c].tc_lock); - - ASSERT(txg == tx->tx_open_txg); - tx->tx_open_txg++; - - /* - * Now that we've incremented tx_open_txg, we can let threads - * enter the next transaction group. - */ - for (c = 0; c < max_ncpus; c++) - mutex_exit(&tx->tx_cpu[c].tc_lock); - - /* - * Quiesce the transaction group by waiting for everyone to txg_exit(). 
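- * (That is, wait for each CPU's tc_count for this txg to drop to zero;
- * new holds cannot arrive because tx_open_txg was advanced above.)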
- */ - for (c = 0; c < max_ncpus; c++) { - tx_cpu_t *tc = &tx->tx_cpu[c]; - mutex_enter(&tc->tc_lock); - while (tc->tc_count[g] != 0) - cv_wait(&tc->tc_cv[g], &tc->tc_lock); - mutex_exit(&tc->tc_lock); - } -} - -static void -txg_sync_thread(void *arg) -{ - dsl_pool_t *dp = arg; - tx_state_t *tx = &dp->dp_tx; - callb_cpr_t cpr; - - txg_thread_enter(tx, &cpr); - - for (;;) { - uint64_t txg; - - /* - * We sync when there's someone waiting on us, or the - * quiesce thread has handed off a txg to us. - */ - while (!tx->tx_exiting && - tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - tx->tx_quiesced_txg == 0) { - dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", - tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); - txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, 0); - } - - /* - * Wait until the quiesce thread hands off a txg to us, - * prompting it to do so if necessary. - */ - while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { - if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) - tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; - cv_broadcast(&tx->tx_quiesce_more_cv); - txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); - } - - if (tx->tx_exiting) - txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); - - rw_enter(&tx->tx_suspend, RW_WRITER); - - /* - * Consume the quiesced txg which has been handed off to - * us. This may cause the quiescing thread to now be - * able to quiesce another txg, so we must signal it. - */ - txg = tx->tx_quiesced_txg; - tx->tx_quiesced_txg = 0; - tx->tx_syncing_txg = txg; - cv_broadcast(&tx->tx_quiesce_more_cv); - rw_exit(&tx->tx_suspend); - - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, - tx->tx_sync_txg_waiting); - mutex_exit(&tx->tx_sync_lock); - spa_sync(dp->dp_spa, txg); - mutex_enter(&tx->tx_sync_lock); - rw_enter(&tx->tx_suspend, RW_WRITER); - tx->tx_synced_txg = txg; - tx->tx_syncing_txg = 0; - rw_exit(&tx->tx_suspend); - cv_broadcast(&tx->tx_sync_done_cv); - } -} - -static void -txg_quiesce_thread(void *arg) -{ - dsl_pool_t *dp = arg; - tx_state_t *tx = &dp->dp_tx; - callb_cpr_t cpr; - - txg_thread_enter(tx, &cpr); - - for (;;) { - uint64_t txg; - - /* - * We quiesce when there's someone waiting on us. - * However, we can only have one txg in "quiescing" or - * "quiesced, waiting to sync" state. So we wait until - * the "quiesced, waiting to sync" txg has been consumed - * by the sync thread. - */ - while (!tx->tx_exiting && - (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || - tx->tx_quiesced_txg != 0)) - txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); - - if (tx->tx_exiting) - txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); - - txg = tx->tx_open_txg; - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, - tx->tx_sync_txg_waiting); - mutex_exit(&tx->tx_sync_lock); - txg_quiesce(dp, txg); - mutex_enter(&tx->tx_sync_lock); - - /* - * Hand this txg off to the sync thread. 
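- * (Publishing it in tx_quiesced_txg and waking tx_sync_more_cv lets
- * txg_sync_thread() consume it and call spa_sync().)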
- */ - dprintf("quiesce done, handing off txg %llu\n", txg); - tx->tx_quiesced_txg = txg; - cv_broadcast(&tx->tx_sync_more_cv); - cv_broadcast(&tx->tx_quiesce_done_cv); - } -} - -void -txg_wait_synced(dsl_pool_t *dp, uint64_t txg) -{ - tx_state_t *tx = &dp->dp_tx; - - mutex_enter(&tx->tx_sync_lock); - ASSERT(tx->tx_threads == 3); - if (txg == 0) - txg = tx->tx_open_txg; - if (tx->tx_sync_txg_waiting < txg) - tx->tx_sync_txg_waiting = txg; - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); - while (tx->tx_synced_txg < txg) { - dprintf("broadcasting sync more " - "tx_synced=%llu waiting=%llu dp=%p\n", - tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); - cv_broadcast(&tx->tx_sync_more_cv); - cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); - } - mutex_exit(&tx->tx_sync_lock); -} - -void -txg_wait_open(dsl_pool_t *dp, uint64_t txg) -{ - tx_state_t *tx = &dp->dp_tx; - - mutex_enter(&tx->tx_sync_lock); - ASSERT(tx->tx_threads == 3); - if (txg == 0) - txg = tx->tx_open_txg + 1; - if (tx->tx_quiesce_txg_waiting < txg) - tx->tx_quiesce_txg_waiting = txg; - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); - while (tx->tx_open_txg < txg) { - cv_broadcast(&tx->tx_quiesce_more_cv); - cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); - } - mutex_exit(&tx->tx_sync_lock); -} - -static void -txg_timelimit_thread(void *arg) -{ - dsl_pool_t *dp = arg; - tx_state_t *tx = &dp->dp_tx; - callb_cpr_t cpr; - - txg_thread_enter(tx, &cpr); - - while (!tx->tx_exiting) { - uint64_t txg = tx->tx_open_txg + 1; - - txg_thread_wait(tx, &cpr, &tx->tx_timeout_exit_cv, txg_time); - - if (tx->tx_quiesce_txg_waiting < txg) - tx->tx_quiesce_txg_waiting = txg; - - while (!tx->tx_exiting && tx->tx_open_txg < txg) { - dprintf("pushing out %llu\n", txg); - cv_broadcast(&tx->tx_quiesce_more_cv); - txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); - } - } - txg_thread_exit(tx, &cpr, &tx->tx_timelimit_thread); -} - -int -txg_stalled(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); -} - -void -txg_suspend(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - /* XXX some code paths suspend when they are already suspended! */ - rw_enter(&tx->tx_suspend, RW_READER); -} - -void -txg_resume(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - rw_exit(&tx->tx_suspend); -} - -/* - * Per-txg object lists. - */ -void -txg_list_create(txg_list_t *tl, size_t offset) -{ - int t; - - mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); - - tl->tl_offset = offset; - - for (t = 0; t < TXG_SIZE; t++) - tl->tl_head[t] = NULL; -} - -void -txg_list_destroy(txg_list_t *tl) -{ - int t; - - for (t = 0; t < TXG_SIZE; t++) - ASSERT(txg_list_empty(tl, t)); - - mutex_destroy(&tl->tl_lock); -} - -int -txg_list_empty(txg_list_t *tl, uint64_t txg) -{ - return (tl->tl_head[txg & TXG_MASK] == NULL); -} - -/* - * Add an entry to the list. - * Returns 0 if it's a new entry, 1 if it's already there. - */ -int -txg_list_add(txg_list_t *tl, void *p, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - int already_on_list; - - mutex_enter(&tl->tl_lock); - already_on_list = tn->tn_member[t]; - if (!already_on_list) { - tn->tn_member[t] = 1; - tn->tn_next[t] = tl->tl_head[t]; - tl->tl_head[t] = tn; - } - mutex_exit(&tl->tl_lock); - - return (already_on_list); -} - -/* - * Remove the head of the list and return it. 
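- * (Returns NULL when the list for this txg is empty.)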
- */ -void * -txg_list_remove(txg_list_t *tl, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn; - void *p = NULL; - - mutex_enter(&tl->tl_lock); - if ((tn = tl->tl_head[t]) != NULL) { - p = (char *)tn - tl->tl_offset; - tl->tl_head[t] = tn->tn_next[t]; - tn->tn_next[t] = NULL; - tn->tn_member[t] = 0; - } - mutex_exit(&tl->tl_lock); - - return (p); -} - -/* - * Remove a specific item from the list and return it. - */ -void * -txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn, **tp; - - mutex_enter(&tl->tl_lock); - - for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { - if ((char *)tn - tl->tl_offset == p) { - *tp = tn->tn_next[t]; - tn->tn_next[t] = NULL; - tn->tn_member[t] = 0; - mutex_exit(&tl->tl_lock); - return (p); - } - } - - mutex_exit(&tl->tl_lock); - - return (NULL); -} - -int -txg_list_member(txg_list_t *tl, void *p, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - - return (tn->tn_member[t]); -} - -/* - * Walk a txg list -- only safe if you know it's not changing. - */ -void * -txg_list_head(txg_list_t *tl, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn = tl->tl_head[t]; - - return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); -} - -void * -txg_list_next(txg_list_t *tl, void *p, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - - tn = tn->tn_next[t]; - - return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c b/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c deleted file mode 100644 index 34d7e0c..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c +++ /dev/null @@ -1,63 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/uberblock_impl.h> -#include <sys/vdev_impl.h> - -int -uberblock_verify(uberblock_t *ub) -{ - if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) - byteswap_uint64_array(ub, sizeof (uberblock_t)); - - if (ub->ub_magic != UBERBLOCK_MAGIC) - return (EINVAL); - - return (0); -} - -/* - * Update the uberblock and return a boolean value indicating whether - * anything changed in this transaction group. - */ -int -uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg) -{ - ASSERT(ub->ub_txg < txg); - - /* - * We explicitly do not set ub_version here, so that older versions - * continue to be written with the previous uberblock version. 
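- * (Only ub_magic, ub_txg, ub_guid_sum and ub_timestamp are set here.)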
- */ - ub->ub_magic = UBERBLOCK_MAGIC; - ub->ub_txg = txg; - ub->ub_guid_sum = rvd->vdev_guid_sum; - ub->ub_timestamp = gethrestime_sec(); - - return (ub->ub_rootbp.blk_birth == txg); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c deleted file mode 100644 index b52e729..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c +++ /dev/null @@ -1,107 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/avl.h> -#include <sys/unique.h> - -static avl_tree_t unique_avl; -static kmutex_t unique_mtx; /* Lock never initialized. */ -SX_SYSINIT(unique, &unique_mtx, "unique lock"); - -typedef struct unique { - avl_node_t un_link; - uint64_t un_value; -} unique_t; - -#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1) - -static int -unique_compare(const void *a, const void *b) -{ - const unique_t *una = a; - const unique_t *unb = b; - - if (una->un_value < unb->un_value) - return (-1); - if (una->un_value > unb->un_value) - return (+1); - return (0); -} - -void -unique_init(void) -{ - avl_create(&unique_avl, unique_compare, - sizeof (unique_t), offsetof(unique_t, un_link)); -} - -uint64_t -unique_create(void) -{ - return (unique_insert(0)); -} - -uint64_t -unique_insert(uint64_t value) -{ - avl_index_t idx; - unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP); - - un->un_value = value; - - mutex_enter(&unique_mtx); - while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK || - avl_find(&unique_avl, un, &idx)) { - mutex_exit(&unique_mtx); - (void) random_get_pseudo_bytes((void*)&un->un_value, - sizeof (un->un_value)); - un->un_value &= UNIQUE_MASK; - mutex_enter(&unique_mtx); - } - - avl_insert(&unique_avl, un, idx); - mutex_exit(&unique_mtx); - - return (un->un_value); -} - -void -unique_remove(uint64_t value) -{ - unique_t un_tofind; - unique_t *un; - - un_tofind.un_value = value; - mutex_enter(&unique_mtx); - un = avl_find(&unique_avl, &un_tofind, NULL); - if (un != NULL) { - avl_remove(&unique_avl, un); - kmem_free(un, sizeof (unique_t)); - } - mutex_exit(&unique_mtx); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c deleted file mode 100644 index b966099..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ /dev/null @@ -1,1915 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/fm/fs/zfs.h> -#include <sys/spa.h> -#include <sys/spa_impl.h> -#include <sys/dmu.h> -#include <sys/dmu_tx.h> -#include <sys/vdev_impl.h> -#include <sys/uberblock_impl.h> -#include <sys/metaslab.h> -#include <sys/metaslab_impl.h> -#include <sys/space_map.h> -#include <sys/zio.h> -#include <sys/zap.h> -#include <sys/fs/zfs.h> - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); - -/* - * Virtual device management. - */ - -static vdev_ops_t *vdev_ops_table[] = { - &vdev_root_ops, - &vdev_raidz_ops, - &vdev_mirror_ops, - &vdev_replacing_ops, - &vdev_spare_ops, -#ifdef _KERNEL - &vdev_geom_ops, -#else - &vdev_disk_ops, - &vdev_file_ops, -#endif - &vdev_missing_ops, - NULL -}; - -/* maximum scrub/resilver I/O queue */ -int zfs_scrub_limit = 70; - -/* - * Given a vdev type, return the appropriate ops vector. - */ -static vdev_ops_t * -vdev_getops(const char *type) -{ - vdev_ops_t *ops, **opspp; - - for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) - if (strcmp(ops->vdev_op_type, type) == 0) - break; - - return (ops); -} - -/* - * Default asize function: return the MAX of psize with the asize of - * all children. This is what's used by anything other than RAID-Z. - */ -uint64_t -vdev_default_asize(vdev_t *vd, uint64_t psize) -{ - uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); - uint64_t csize; - uint64_t c; - - for (c = 0; c < vd->vdev_children; c++) { - csize = vdev_psize_to_asize(vd->vdev_child[c], psize); - asize = MAX(asize, csize); - } - - return (asize); -} - -/* - * Get the replaceable or attachable device size. - * If the parent is a mirror or raidz, the replaceable size is the minimum - * psize of all its children. For the rest, just return our own psize. - * - * e.g. - * psize rsize - * root - - - * mirror/raidz - - - * disk1 20g 20g - * disk2 40g 20g - * disk3 80g 80g - */ -uint64_t -vdev_get_rsize(vdev_t *vd) -{ - vdev_t *pvd, *cvd; - uint64_t c, rsize; - - pvd = vd->vdev_parent; - - /* - * If our parent is NULL or the root, just return our own psize. 
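- * (That covers the root vdev itself and top-level vdevs directly
- * under it.)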
- */ - if (pvd == NULL || pvd->vdev_parent == NULL) - return (vd->vdev_psize); - - rsize = 0; - - for (c = 0; c < pvd->vdev_children; c++) { - cvd = pvd->vdev_child[c]; - rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1; - } - - return (rsize); -} - -vdev_t * -vdev_lookup_top(spa_t *spa, uint64_t vdev) -{ - vdev_t *rvd = spa->spa_root_vdev; - - if (vdev < rvd->vdev_children) - return (rvd->vdev_child[vdev]); - - return (NULL); -} - -vdev_t * -vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) -{ - int c; - vdev_t *mvd; - - if (vd->vdev_guid == guid) - return (vd); - - for (c = 0; c < vd->vdev_children; c++) - if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != - NULL) - return (mvd); - - return (NULL); -} - -void -vdev_add_child(vdev_t *pvd, vdev_t *cvd) -{ - size_t oldsize, newsize; - uint64_t id = cvd->vdev_id; - vdev_t **newchild; - - ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); - ASSERT(cvd->vdev_parent == NULL); - - cvd->vdev_parent = pvd; - - if (pvd == NULL) - return; - - ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); - - oldsize = pvd->vdev_children * sizeof (vdev_t *); - pvd->vdev_children = MAX(pvd->vdev_children, id + 1); - newsize = pvd->vdev_children * sizeof (vdev_t *); - - newchild = kmem_zalloc(newsize, KM_SLEEP); - if (pvd->vdev_child != NULL) { - bcopy(pvd->vdev_child, newchild, oldsize); - kmem_free(pvd->vdev_child, oldsize); - } - - pvd->vdev_child = newchild; - pvd->vdev_child[id] = cvd; - - cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); - ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); - - /* - * Walk up all ancestors to update guid sum. - */ - for (; pvd != NULL; pvd = pvd->vdev_parent) - pvd->vdev_guid_sum += cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) - cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; -} - -void -vdev_remove_child(vdev_t *pvd, vdev_t *cvd) -{ - int c; - uint_t id = cvd->vdev_id; - - ASSERT(cvd->vdev_parent == pvd); - - if (pvd == NULL) - return; - - ASSERT(id < pvd->vdev_children); - ASSERT(pvd->vdev_child[id] == cvd); - - pvd->vdev_child[id] = NULL; - cvd->vdev_parent = NULL; - - for (c = 0; c < pvd->vdev_children; c++) - if (pvd->vdev_child[c]) - break; - - if (c == pvd->vdev_children) { - kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); - pvd->vdev_child = NULL; - pvd->vdev_children = 0; - } - - /* - * Walk up all ancestors to update guid sum. - */ - for (; pvd != NULL; pvd = pvd->vdev_parent) - pvd->vdev_guid_sum -= cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) - cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; -} - -/* - * Remove any holes in the child array. - */ -void -vdev_compact_children(vdev_t *pvd) -{ - vdev_t **newchild, *cvd; - int oldc = pvd->vdev_children; - int newc, c; - - ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER)); - - for (c = newc = 0; c < oldc; c++) - if (pvd->vdev_child[c]) - newc++; - - newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); - - for (c = newc = 0; c < oldc; c++) { - if ((cvd = pvd->vdev_child[c]) != NULL) { - newchild[newc] = cvd; - cvd->vdev_id = newc++; - } - } - - kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); - pvd->vdev_child = newchild; - pvd->vdev_children = newc; -} - -/* - * Allocate and minimally initialize a vdev_t. 
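- * (Called from vdev_alloc() and vdev_add_parent(); a guid of zero means
- * generate a fresh one.)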
- */ -static vdev_t * -vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) -{ - vdev_t *vd; - - vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); - - if (spa->spa_root_vdev == NULL) { - ASSERT(ops == &vdev_root_ops); - spa->spa_root_vdev = vd; - } - - if (guid == 0) { - if (spa->spa_root_vdev == vd) { - /* - * The root vdev's guid will also be the pool guid, - * which must be unique among all pools. - */ - while (guid == 0 || spa_guid_exists(guid, 0)) - guid = spa_get_random(-1ULL); - } else { - /* - * Any other vdev's guid must be unique within the pool. - */ - while (guid == 0 || - spa_guid_exists(spa_guid(spa), guid)) - guid = spa_get_random(-1ULL); - } - ASSERT(!spa_guid_exists(spa_guid(spa), guid)); - } - - vd->vdev_spa = spa; - vd->vdev_id = id; - vd->vdev_guid = guid; - vd->vdev_guid_sum = guid; - vd->vdev_ops = ops; - vd->vdev_state = VDEV_STATE_CLOSED; - - mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); - space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); - space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); - txg_list_create(&vd->vdev_ms_list, - offsetof(struct metaslab, ms_txg_node)); - txg_list_create(&vd->vdev_dtl_list, - offsetof(struct vdev, vdev_dtl_node)); - vd->vdev_stat.vs_timestamp = gethrtime(); - - return (vd); -} - -/* - * Free a vdev_t that has been removed from service. - */ -static void -vdev_free_common(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - if (vd->vdev_path) - spa_strfree(vd->vdev_path); - if (vd->vdev_devid) - spa_strfree(vd->vdev_devid); - - if (vd->vdev_isspare) - spa_spare_remove(vd); - - txg_list_destroy(&vd->vdev_ms_list); - txg_list_destroy(&vd->vdev_dtl_list); - mutex_enter(&vd->vdev_dtl_lock); - space_map_unload(&vd->vdev_dtl_map); - space_map_destroy(&vd->vdev_dtl_map); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - space_map_destroy(&vd->vdev_dtl_scrub); - mutex_exit(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_stat_lock); - - if (vd == spa->spa_root_vdev) - spa->spa_root_vdev = NULL; - - kmem_free(vd, sizeof (vdev_t)); -} - -/* - * Allocate a new vdev. The 'alloctype' is used to control whether we are - * creating a new vdev or loading an existing one - the behavior is slightly - * different for each case. - */ -int -vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - int alloctype) -{ - vdev_ops_t *ops; - char *type; - uint64_t guid = 0; - vdev_t *vd; - - ASSERT(spa_config_held(spa, RW_WRITER)); - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) - return (EINVAL); - - if ((ops = vdev_getops(type)) == NULL) - return (EINVAL); - - /* - * If this is a load, get the vdev guid from the nvlist. - * Otherwise, vdev_alloc_common() will generate one for us. - */ - if (alloctype == VDEV_ALLOC_LOAD) { - uint64_t label_id; - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || - label_id != id) - return (EINVAL); - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) - return (EINVAL); - } else if (alloctype == VDEV_ALLOC_SPARE) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) - return (EINVAL); - } - - /* - * The first allocated vdev must be of type 'root'. 
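- * (Reject the config if no root vdev exists yet and this request is not
- * for one.)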
- */ - if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) - return (EINVAL); - - vd = vdev_alloc_common(spa, id, guid, ops); - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) - vd->vdev_path = spa_strdup(vd->vdev_path); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) - vd->vdev_devid = spa_strdup(vd->vdev_devid); - - /* - * Set the nparity property for RAID-Z vdevs. - */ - if (ops == &vdev_raidz_ops) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, - &vd->vdev_nparity) == 0) { - /* - * Currently, we can only support 2 parity devices. - */ - if (vd->vdev_nparity > 2) - return (EINVAL); - /* - * Older versions can only support 1 parity device. - */ - if (vd->vdev_nparity == 2 && - spa_version(spa) < ZFS_VERSION_RAID6) - return (ENOTSUP); - - } else { - /* - * We require the parity to be specified for SPAs that - * support multiple parity levels. - */ - if (spa_version(spa) >= ZFS_VERSION_RAID6) - return (EINVAL); - - /* - * Otherwise, we default to 1 parity device for RAID-Z. - */ - vd->vdev_nparity = 1; - } - } else { - vd->vdev_nparity = 0; - } - - /* - * Set the whole_disk property. If it's not specified, leave the value - * as -1. - */ - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - &vd->vdev_wholedisk) != 0) - vd->vdev_wholedisk = -1ULL; - - /* - * Look for the 'not present' flag. This will only be set if the device - * was not present at the time of import. - */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &vd->vdev_not_present); - - /* - * Get the alignment requirement. - */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); - - /* - * If we're a top-level vdev, try to load the allocation parameters. - */ - if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, - &vd->vdev_ms_array); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, - &vd->vdev_ms_shift); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, - &vd->vdev_asize); - } - - /* - * If we're a leaf vdev, try to load the DTL object and offline state. - */ - if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) { - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, - &vd->vdev_dtl.smo_object); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, - &vd->vdev_offline); - } - - /* - * Add ourselves to the parent's list of children. - */ - vdev_add_child(parent, vd); - - *vdp = vd; - - return (0); -} - -void -vdev_free(vdev_t *vd) -{ - int c; - - /* - * vdev_free() implies closing the vdev first. This is simpler than - * trying to ensure complicated semantics for all callers. - */ - vdev_close(vd); - - ASSERT(!list_link_active(&vd->vdev_dirty_node)); - - /* - * Free all children. - */ - for (c = 0; c < vd->vdev_children; c++) - vdev_free(vd->vdev_child[c]); - - ASSERT(vd->vdev_child == NULL); - ASSERT(vd->vdev_guid_sum == vd->vdev_guid); - - /* - * Discard allocation state. - */ - if (vd == vd->vdev_top) - vdev_metaslab_fini(vd); - - ASSERT3U(vd->vdev_stat.vs_space, ==, 0); - ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); - ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); - - /* - * Remove this vdev from its parent's child list. - */ - vdev_remove_child(vd->vdev_parent, vd); - - ASSERT(vd->vdev_parent == NULL); - - vdev_free_common(vd); -} - -/* - * Transfer top-level vdev state from svd to tvd. 
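- * (Used by vdev_add_parent() and vdev_remove_parent() when a different
- * vdev becomes the top-level vdev.)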
- */ -static void -vdev_top_transfer(vdev_t *svd, vdev_t *tvd) -{ - spa_t *spa = svd->vdev_spa; - metaslab_t *msp; - vdev_t *vd; - int t; - - ASSERT(tvd == tvd->vdev_top); - - tvd->vdev_ms_array = svd->vdev_ms_array; - tvd->vdev_ms_shift = svd->vdev_ms_shift; - tvd->vdev_ms_count = svd->vdev_ms_count; - - svd->vdev_ms_array = 0; - svd->vdev_ms_shift = 0; - svd->vdev_ms_count = 0; - - tvd->vdev_mg = svd->vdev_mg; - tvd->vdev_ms = svd->vdev_ms; - - svd->vdev_mg = NULL; - svd->vdev_ms = NULL; - - if (tvd->vdev_mg != NULL) - tvd->vdev_mg->mg_vd = tvd; - - tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; - tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; - tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; - - svd->vdev_stat.vs_alloc = 0; - svd->vdev_stat.vs_space = 0; - svd->vdev_stat.vs_dspace = 0; - - for (t = 0; t < TXG_SIZE; t++) { - while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) - (void) txg_list_add(&tvd->vdev_ms_list, msp, t); - while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) - (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); - if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) - (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); - } - - if (list_link_active(&svd->vdev_dirty_node)) { - vdev_config_clean(svd); - vdev_config_dirty(tvd); - } - - tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted; - svd->vdev_reopen_wanted = 0; - - tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; - svd->vdev_deflate_ratio = 0; -} - -static void -vdev_top_update(vdev_t *tvd, vdev_t *vd) -{ - int c; - - if (vd == NULL) - return; - - vd->vdev_top = tvd; - - for (c = 0; c < vd->vdev_children; c++) - vdev_top_update(tvd, vd->vdev_child[c]); -} - -/* - * Add a mirror/replacing vdev above an existing vdev. - */ -vdev_t * -vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) -{ - spa_t *spa = cvd->vdev_spa; - vdev_t *pvd = cvd->vdev_parent; - vdev_t *mvd; - - ASSERT(spa_config_held(spa, RW_WRITER)); - - mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); - - mvd->vdev_asize = cvd->vdev_asize; - mvd->vdev_ashift = cvd->vdev_ashift; - mvd->vdev_state = cvd->vdev_state; - - vdev_remove_child(pvd, cvd); - vdev_add_child(pvd, mvd); - cvd->vdev_id = mvd->vdev_children; - vdev_add_child(mvd, cvd); - vdev_top_update(cvd->vdev_top, cvd->vdev_top); - - if (mvd == mvd->vdev_top) - vdev_top_transfer(cvd, mvd); - - return (mvd); -} - -/* - * Remove a 1-way mirror/replacing vdev from the tree. - */ -void -vdev_remove_parent(vdev_t *cvd) -{ - vdev_t *mvd = cvd->vdev_parent; - vdev_t *pvd = mvd->vdev_parent; - - ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); - - ASSERT(mvd->vdev_children == 1); - ASSERT(mvd->vdev_ops == &vdev_mirror_ops || - mvd->vdev_ops == &vdev_replacing_ops || - mvd->vdev_ops == &vdev_spare_ops); - cvd->vdev_ashift = mvd->vdev_ashift; - - vdev_remove_child(mvd, cvd); - vdev_remove_child(pvd, mvd); - cvd->vdev_id = mvd->vdev_id; - vdev_add_child(pvd, cvd); - /* - * If we created a new toplevel vdev, then we need to change the child's - * vdev GUID to match the old toplevel vdev. Otherwise, we could have - * detached an offline device, and when we go to import the pool we'll - * think we have two toplevel vdevs, instead of a different version of - * the same toplevel vdev. 
- */ - if (cvd->vdev_top == cvd) { - pvd->vdev_guid_sum -= cvd->vdev_guid; - cvd->vdev_guid_sum -= cvd->vdev_guid; - cvd->vdev_guid = mvd->vdev_guid; - cvd->vdev_guid_sum += mvd->vdev_guid; - pvd->vdev_guid_sum += cvd->vdev_guid; - } - vdev_top_update(cvd->vdev_top, cvd->vdev_top); - - if (cvd == cvd->vdev_top) - vdev_top_transfer(mvd, cvd); - - ASSERT(mvd->vdev_children == 0); - vdev_free(mvd); -} - -int -vdev_metaslab_init(vdev_t *vd, uint64_t txg) -{ - spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - metaslab_class_t *mc = spa_metaslab_class_select(spa); - uint64_t m; - uint64_t oldc = vd->vdev_ms_count; - uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; - metaslab_t **mspp; - int error; - - if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ - return (0); - - dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc); - - ASSERT(oldc <= newc); - - if (vd->vdev_mg == NULL) - vd->vdev_mg = metaslab_group_create(mc, vd); - - mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); - - if (oldc != 0) { - bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); - kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); - } - - vd->vdev_ms = mspp; - vd->vdev_ms_count = newc; - - for (m = oldc; m < newc; m++) { - space_map_obj_t smo = { 0, 0, 0 }; - if (txg == 0) { - uint64_t object = 0; - error = dmu_read(mos, vd->vdev_ms_array, - m * sizeof (uint64_t), sizeof (uint64_t), &object); - if (error) - return (error); - if (object != 0) { - dmu_buf_t *db; - error = dmu_bonus_hold(mos, object, FTAG, &db); - if (error) - return (error); - ASSERT3U(db->db_size, ==, sizeof (smo)); - bcopy(db->db_data, &smo, db->db_size); - ASSERT3U(smo.smo_object, ==, object); - dmu_buf_rele(db, FTAG); - } - } - vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo, - m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); - } - - return (0); -} - -void -vdev_metaslab_fini(vdev_t *vd) -{ - uint64_t m; - uint64_t count = vd->vdev_ms_count; - - if (vd->vdev_ms != NULL) { - for (m = 0; m < count; m++) - if (vd->vdev_ms[m] != NULL) - metaslab_fini(vd->vdev_ms[m]); - kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); - vd->vdev_ms = NULL; - } -} - -/* - * Prepare a virtual device for access. 
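- * (Opens the device through its vdev_op_open routine, checks size and
- * alignment, and sets vdev_state to reflect the result.)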
- */ -int -vdev_open(vdev_t *vd) -{ - int error; - int c; - uint64_t osize = 0; - uint64_t asize, psize; - uint64_t ashift = 0; - - ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || - vd->vdev_state == VDEV_STATE_CANT_OPEN || - vd->vdev_state == VDEV_STATE_OFFLINE); - - if (vd->vdev_fault_mode == VDEV_FAULT_COUNT) - vd->vdev_fault_arg >>= 1; - else - vd->vdev_fault_mode = VDEV_FAULT_NONE; - - vd->vdev_stat.vs_aux = VDEV_AUX_NONE; - - if (vd->vdev_ops->vdev_op_leaf) { - vdev_cache_init(vd); - vdev_queue_init(vd); - vd->vdev_cache_active = B_TRUE; - } - - if (vd->vdev_offline) { - ASSERT(vd->vdev_children == 0); - vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); - return (ENXIO); - } - - error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); - - if (zio_injection_enabled && error == 0) - error = zio_handle_device_injection(vd, ENXIO); - - dprintf("%s = %d, osize %llu, state = %d\n", - vdev_description(vd), error, osize, vd->vdev_state); - - if (error) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - vd->vdev_stat.vs_aux); - return (error); - } - - vd->vdev_state = VDEV_STATE_HEALTHY; - - for (c = 0; c < vd->vdev_children; c++) - if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, - VDEV_AUX_NONE); - break; - } - - osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); - - if (vd->vdev_children == 0) { - if (osize < SPA_MINDEVSIZE) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_TOO_SMALL); - return (EOVERFLOW); - } - psize = osize; - asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); - } else { - if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_TOO_SMALL); - return (EOVERFLOW); - } - psize = 0; - asize = osize; - } - - vd->vdev_psize = psize; - - if (vd->vdev_asize == 0) { - /* - * This is the first-ever open, so use the computed values. - * For testing purposes, a higher ashift can be requested. - */ - vd->vdev_asize = asize; - vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); - } else { - /* - * Make sure the alignment requirement hasn't increased. - */ - if (ashift > vd->vdev_top->vdev_ashift) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (EINVAL); - } - - /* - * Make sure the device hasn't shrunk. - */ - if (asize < vd->vdev_asize) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (EINVAL); - } - - /* - * If all children are healthy and the asize has increased, - * then we've experienced dynamic LUN growth. - */ - if (vd->vdev_state == VDEV_STATE_HEALTHY && - asize > vd->vdev_asize) { - vd->vdev_asize = asize; - } - } - - /* - * If this is a top-level vdev, compute the raidz-deflation - * ratio. Note, we hard-code in 128k (1<<17) because it is the - * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE - * changes, this algorithm must never change, or we will - * inconsistently account for existing bp's. - */ - if (vd->vdev_top == vd) { - vd->vdev_deflate_ratio = (1<<17) / - (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT); - } - - /* - * This allows the ZFS DE to close cases appropriately. If a device - * goes away and later returns, we want to close the associated case. - * But it's not enough to simply post this only when a device goes from - * CANT_OPEN -> HEALTHY. 
If we reboot the system and the device is - * back, we also need to close the case (otherwise we will try to replay - * it). So we have to post this notifier every time. Since this only - * occurs during pool open or error recovery, this should not be an - * issue. - */ - zfs_post_ok(vd->vdev_spa, vd); - - return (0); -} - -/* - * Called once the vdevs are all opened, this routine validates the label - * contents. This needs to be done before vdev_load() so that we don't - * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen() - * won't succeed if the device has been changed underneath. - * - * This function will only return failure if one of the vdevs indicates that it - * has since been destroyed or exported. This is only possible if - * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state - * will be updated but the function will return 0. - */ -int -vdev_validate(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - int c; - nvlist_t *label; - uint64_t guid; - uint64_t state; - - for (c = 0; c < vd->vdev_children; c++) - if (vdev_validate(vd->vdev_child[c]) != 0) - return (EBADF); - - /* - * If the device has already failed, or was marked offline, don't do - * any further validation. Otherwise, label I/O will fail and we will - * overwrite the previous state. - */ - if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) { - - if ((label = vdev_label_read_config(vd)) == NULL) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (0); - } - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, - &guid) != 0 || guid != spa_guid(spa)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - return (0); - } - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, - &guid) != 0 || guid != vd->vdev_guid) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - return (0); - } - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, - &state) != 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - return (0); - } - - nvlist_free(label); - - if (spa->spa_load_state == SPA_LOAD_OPEN && - state != POOL_STATE_ACTIVE) - return (EBADF); - } - - /* - * If we were able to open and validate a vdev that was previously - * marked permanently unavailable, clear that state now. - */ - if (vd->vdev_not_present) - vd->vdev_not_present = 0; - - return (0); -} - -/* - * Close a virtual device. - */ -void -vdev_close(vdev_t *vd) -{ - vd->vdev_ops->vdev_op_close(vd); - - if (vd->vdev_cache_active) { - vdev_cache_fini(vd); - vdev_queue_fini(vd); - vd->vdev_cache_active = B_FALSE; - } - - /* - * We record the previous state before we close it, so that if we are - * doing a reopen(), we don't generate FMA ereports if we notice that - * it's still faulted. - */ - vd->vdev_prevstate = vd->vdev_state; - - if (vd->vdev_offline) - vd->vdev_state = VDEV_STATE_OFFLINE; - else - vd->vdev_state = VDEV_STATE_CLOSED; - vd->vdev_stat.vs_aux = VDEV_AUX_NONE; -} - -void -vdev_reopen(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(spa_config_held(spa, RW_WRITER)); - - vdev_close(vd); - (void) vdev_open(vd); - - /* - * Call vdev_validate() here to make sure we have the same device. - * Otherwise, a device with an invalid label could be successfully - * opened in response to vdev_reopen(). 
- * - * The downside to this is that if the user is simply experimenting by - * overwriting an entire disk, we'll fault the device rather than - * demonstrate self-healing capabilities. On the other hand, with - * proper FMA integration, the series of errors we'd see from the device - * would result in a faulted device anyway. Given that this doesn't - * model any real-world corruption, it's better to catch this here and - * correctly identify that the device has either changed beneath us, or - * is corrupted beyond recognition. - */ - (void) vdev_validate(vd); - - /* - * Reassess root vdev's health. - */ - vdev_propagate_state(spa->spa_root_vdev); -} - -int -vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) -{ - int error; - - /* - * Normally, partial opens (e.g. of a mirror) are allowed. - * For a create, however, we want to fail the request if - * there are any components we can't open. - */ - error = vdev_open(vd); - - if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { - vdev_close(vd); - return (error ? error : ENXIO); - } - - /* - * Recursively initialize all labels. - */ - if ((error = vdev_label_init(vd, txg, isreplacing ? - VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { - vdev_close(vd); - return (error); - } - - return (0); -} - -/* - * The is the latter half of vdev_create(). It is distinct because it - * involves initiating transactions in order to do metaslab creation. - * For creation, we want to try to create all vdevs at once and then undo it - * if anything fails; this is much harder if we have pending transactions. - */ -void -vdev_init(vdev_t *vd, uint64_t txg) -{ - /* - * Aim for roughly 200 metaslabs per vdev. - */ - vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); - vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); - - /* - * Initialize the vdev's metaslabs. This can't fail because - * there's nothing to read when creating all new metaslabs. - */ - VERIFY(vdev_metaslab_init(vd, txg) == 0); -} - -void -vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) -{ - ASSERT(vd == vd->vdev_top); - ASSERT(ISP2(flags)); - - if (flags & VDD_METASLAB) - (void) txg_list_add(&vd->vdev_ms_list, arg, txg); - - if (flags & VDD_DTL) - (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); - - (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); -} - -void -vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size) -{ - mutex_enter(sm->sm_lock); - if (!space_map_contains(sm, txg, size)) - space_map_add(sm, txg, size); - mutex_exit(sm->sm_lock); -} - -int -vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) -{ - int dirty; - - /* - * Quick test without the lock -- covers the common case that - * there are no dirty time segments. - */ - if (sm->sm_space == 0) - return (0); - - mutex_enter(sm->sm_lock); - dirty = space_map_contains(sm, txg, size); - mutex_exit(sm->sm_lock); - - return (dirty); -} - -/* - * Reassess DTLs after a config change or scrub completion. - */ -void -vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) -{ - spa_t *spa = vd->vdev_spa; - int c; - - ASSERT(spa_config_held(spa, RW_WRITER)); - - if (vd->vdev_children == 0) { - mutex_enter(&vd->vdev_dtl_lock); - /* - * We're successfully scrubbed everything up to scrub_txg. - * Therefore, excise all old DTLs up to that point, then - * fold in the DTLs for everything we couldn't scrub. 
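[Editor's illustrative sketch, not part of the deleted source: the DTL functions above treat a space map as a set of transaction-group ranges the vdev missed. The toy below replaces the kernel space map with a plain array of [start, end) txg ranges, purely to show what "dirty" and "contains" mean for resilvering; all names and sizes here are hypothetical.]

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define	MAX_RANGES	16

	typedef struct { uint64_t start, end; } range_t;	/* [start, end) in txgs */

	typedef struct {
		range_t	r[MAX_RANGES];
		int	n;
	} toy_dtl_t;

	static void
	toy_dtl_dirty(toy_dtl_t *dtl, uint64_t txg, uint64_t size)
	{
		/* No coalescing here; the real space map merges adjacent segments. */
		if (dtl->n < MAX_RANGES) {
			dtl->r[dtl->n].start = txg;
			dtl->r[dtl->n].end = txg + size;
			dtl->n++;
		}
	}

	static int
	toy_dtl_contains(const toy_dtl_t *dtl, uint64_t txg, uint64_t size)
	{
		for (int i = 0; i < dtl->n; i++)
			if (txg >= dtl->r[i].start && txg + size <= dtl->r[i].end)
				return (1);
		return (0);
	}

	int
	main(void)
	{
		toy_dtl_t dtl;
		memset(&dtl, 0, sizeof (dtl));

		/* A write to this vdev failed in txg 120, so txg 120 is "dirty". */
		toy_dtl_dirty(&dtl, 120, 1);

		/* A later resilver asks: did this vdev miss txg 120?  txg 200? */
		printf("txg 120 missing: %d\n", toy_dtl_contains(&dtl, 120, 1));
		printf("txg 200 missing: %d\n", toy_dtl_contains(&dtl, 200, 1));
		return (0);
	}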
- */ - if (scrub_txg != 0) { - space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); - space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); - } - if (scrub_done) - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - mutex_exit(&vd->vdev_dtl_lock); - if (txg != 0) - vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - return; - } - - /* - * Make sure the DTLs are always correct under the scrub lock. - */ - if (vd == spa->spa_root_vdev) - mutex_enter(&spa->spa_scrub_lock); - - mutex_enter(&vd->vdev_dtl_lock); - space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - mutex_exit(&vd->vdev_dtl_lock); - - for (c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done); - mutex_enter(&vd->vdev_dtl_lock); - space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map); - space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); - mutex_exit(&vd->vdev_dtl_lock); - } - - if (vd == spa->spa_root_vdev) - mutex_exit(&spa->spa_scrub_lock); -} - -static int -vdev_dtl_load(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - space_map_obj_t *smo = &vd->vdev_dtl; - objset_t *mos = spa->spa_meta_objset; - dmu_buf_t *db; - int error; - - ASSERT(vd->vdev_children == 0); - - if (smo->smo_object == 0) - return (0); - - if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) - return (error); - - ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(db->db_data, smo, db->db_size); - dmu_buf_rele(db, FTAG); - - mutex_enter(&vd->vdev_dtl_lock); - error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos); - mutex_exit(&vd->vdev_dtl_lock); - - return (error); -} - -void -vdev_dtl_sync(vdev_t *vd, uint64_t txg) -{ - spa_t *spa = vd->vdev_spa; - space_map_obj_t *smo = &vd->vdev_dtl; - space_map_t *sm = &vd->vdev_dtl_map; - objset_t *mos = spa->spa_meta_objset; - space_map_t smsync; - kmutex_t smlock; - dmu_buf_t *db; - dmu_tx_t *tx; - - dprintf("%s in txg %llu pass %d\n", - vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); - - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - - if (vd->vdev_detached) { - if (smo->smo_object != 0) { - int err = dmu_object_free(mos, smo->smo_object, tx); - ASSERT3U(err, ==, 0); - smo->smo_object = 0; - } - dmu_tx_commit(tx); - dprintf("detach %s committed in txg %llu\n", - vdev_description(vd), txg); - return; - } - - if (smo->smo_object == 0) { - ASSERT(smo->smo_objsize == 0); - ASSERT(smo->smo_alloc == 0); - smo->smo_object = dmu_object_alloc(mos, - DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, - DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); - ASSERT(smo->smo_object != 0); - vdev_config_dirty(vd->vdev_top); - } - - mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); - - space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift, - &smlock); - - mutex_enter(&smlock); - - mutex_enter(&vd->vdev_dtl_lock); - space_map_walk(sm, space_map_add, &smsync); - mutex_exit(&vd->vdev_dtl_lock); - - space_map_truncate(smo, mos, tx); - space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); - - space_map_destroy(&smsync); - - mutex_exit(&smlock); - mutex_destroy(&smlock); - - VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(smo, db->db_data, db->db_size); - dmu_buf_rele(db, FTAG); - - dmu_tx_commit(tx); -} - -void -vdev_load(vdev_t *vd) -{ - int c; - - /* - * Recursively load all children. 
- */ - for (c = 0; c < vd->vdev_children; c++) - vdev_load(vd->vdev_child[c]); - - /* - * If this is a top-level vdev, initialize its metaslabs. - */ - if (vd == vd->vdev_top && - (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || - vdev_metaslab_init(vd, 0) != 0)) - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - - /* - * If this is a leaf vdev, load its DTL. - */ - if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); -} - -/* - * This special case of vdev_spare() is used for hot spares. It's sole purpose - * it to set the vdev state for the associated vdev. To do this, we make sure - * that we can open the underlying device, then try to read the label, and make - * sure that the label is sane and that it hasn't been repurposed to another - * pool. - */ -int -vdev_validate_spare(vdev_t *vd) -{ - nvlist_t *label; - uint64_t guid, version; - uint64_t state; - - if ((label = vdev_label_read_config(vd)) == NULL) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - return (-1); - } - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || - version > ZFS_VERSION || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || - guid != vd->vdev_guid || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - return (-1); - } - - spa_spare_add(vd); - - /* - * We don't actually check the pool state here. If it's in fact in - * use by another pool, we update this fact on the fly when requested. - */ - nvlist_free(label); - return (0); -} - -void -vdev_sync_done(vdev_t *vd, uint64_t txg) -{ - metaslab_t *msp; - - dprintf("%s txg %llu\n", vdev_description(vd), txg); - - while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) - metaslab_sync_done(msp, txg); -} - -void -vdev_sync(vdev_t *vd, uint64_t txg) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *lvd; - metaslab_t *msp; - dmu_tx_t *tx; - - dprintf("%s txg %llu pass %d\n", - vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); - - if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { - ASSERT(vd == vd->vdev_top); - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); - ASSERT(vd->vdev_ms_array != 0); - vdev_config_dirty(vd); - dmu_tx_commit(tx); - } - - while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { - metaslab_sync(msp, txg); - (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); - } - - while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) - vdev_dtl_sync(lvd, txg); - - (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); -} - -uint64_t -vdev_psize_to_asize(vdev_t *vd, uint64_t psize) -{ - return (vd->vdev_ops->vdev_op_asize(vd, psize)); -} - -void -vdev_io_start(zio_t *zio) -{ - zio->io_vd->vdev_ops->vdev_op_io_start(zio); -} - -void -vdev_io_done(zio_t *zio) -{ - zio->io_vd->vdev_ops->vdev_op_io_done(zio); -} - -const char * -vdev_description(vdev_t *vd) -{ - if (vd == NULL || vd->vdev_ops == NULL) - return ("<unknown>"); - - if (vd->vdev_path != NULL) - return (vd->vdev_path); - - if (vd->vdev_parent == NULL) - return (spa_name(vd->vdev_spa)); - - return (vd->vdev_ops->vdev_op_type); -} - -int -vdev_online(spa_t *spa, uint64_t guid) -{ - vdev_t *rvd, *vd; - uint64_t txg; - - txg = spa_vdev_enter(spa); - 
- rvd = spa->spa_root_vdev; - - if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) - return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - dprintf("ONLINE: %s\n", vdev_description(vd)); - - vd->vdev_offline = B_FALSE; - vd->vdev_tmpoffline = B_FALSE; - vdev_reopen(vd->vdev_top); - - vdev_config_dirty(vd->vdev_top); - - (void) spa_vdev_exit(spa, NULL, txg, 0); - - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); - - return (0); -} - -int -vdev_offline(spa_t *spa, uint64_t guid, int istmp) -{ - vdev_t *rvd, *vd; - uint64_t txg; - - txg = spa_vdev_enter(spa); - - rvd = spa->spa_root_vdev; - - if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) - return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - dprintf("OFFLINE: %s\n", vdev_description(vd)); - - /* - * If the device isn't already offline, try to offline it. - */ - if (!vd->vdev_offline) { - /* - * If this device's top-level vdev has a non-empty DTL, - * don't allow the device to be offlined. - * - * XXX -- make this more precise by allowing the offline - * as long as the remaining devices don't have any DTL holes. - */ - if (vd->vdev_top->vdev_dtl_map.sm_space != 0) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - - /* - * Offline this device and reopen its top-level vdev. - * If this action results in the top-level vdev becoming - * unusable, undo it and fail the request. - */ - vd->vdev_offline = B_TRUE; - vdev_reopen(vd->vdev_top); - if (vdev_is_dead(vd->vdev_top)) { - vd->vdev_offline = B_FALSE; - vdev_reopen(vd->vdev_top); - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - } - } - - vd->vdev_tmpoffline = istmp; - - vdev_config_dirty(vd->vdev_top); - - return (spa_vdev_exit(spa, NULL, txg, 0)); -} - -/* - * Clear the error counts associated with this vdev. Unlike vdev_online() and - * vdev_offline(), we assume the spa config is locked. We also clear all - * children. If 'vd' is NULL, then the user wants to clear all vdevs. - */ -void -vdev_clear(spa_t *spa, vdev_t *vd) -{ - int c; - - if (vd == NULL) - vd = spa->spa_root_vdev; - - vd->vdev_stat.vs_read_errors = 0; - vd->vdev_stat.vs_write_errors = 0; - vd->vdev_stat.vs_checksum_errors = 0; - - for (c = 0; c < vd->vdev_children; c++) - vdev_clear(spa, vd->vdev_child[c]); -} - -int -vdev_is_dead(vdev_t *vd) -{ - return (vd->vdev_state <= VDEV_STATE_CANT_OPEN); -} - -int -vdev_error_inject(vdev_t *vd, zio_t *zio) -{ - int error = 0; - - if (vd->vdev_fault_mode == VDEV_FAULT_NONE) - return (0); - - if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0) - return (0); - - switch (vd->vdev_fault_mode) { - case VDEV_FAULT_RANDOM: - if (spa_get_random(vd->vdev_fault_arg) == 0) - error = EIO; - break; - - case VDEV_FAULT_COUNT: - if ((int64_t)--vd->vdev_fault_arg <= 0) - vd->vdev_fault_mode = VDEV_FAULT_NONE; - error = EIO; - break; - } - - if (error != 0) { - dprintf("returning %d for type %d on %s state %d offset %llx\n", - error, zio->io_type, vdev_description(vd), - vd->vdev_state, zio->io_offset); - } - - return (error); -} - -/* - * Get statistics for the given vdev. 
- */ -void -vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) -{ - vdev_t *rvd = vd->vdev_spa->spa_root_vdev; - int c, t; - - mutex_enter(&vd->vdev_stat_lock); - bcopy(&vd->vdev_stat, vs, sizeof (*vs)); - vs->vs_timestamp = gethrtime() - vs->vs_timestamp; - vs->vs_state = vd->vdev_state; - vs->vs_rsize = vdev_get_rsize(vd); - mutex_exit(&vd->vdev_stat_lock); - - /* - * If we're getting stats on the root vdev, aggregate the I/O counts - * over all top-level vdevs (i.e. the direct children of the root). - */ - if (vd == rvd) { - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *cvd = rvd->vdev_child[c]; - vdev_stat_t *cvs = &cvd->vdev_stat; - - mutex_enter(&vd->vdev_stat_lock); - for (t = 0; t < ZIO_TYPES; t++) { - vs->vs_ops[t] += cvs->vs_ops[t]; - vs->vs_bytes[t] += cvs->vs_bytes[t]; - } - vs->vs_read_errors += cvs->vs_read_errors; - vs->vs_write_errors += cvs->vs_write_errors; - vs->vs_checksum_errors += cvs->vs_checksum_errors; - vs->vs_scrub_examined += cvs->vs_scrub_examined; - vs->vs_scrub_errors += cvs->vs_scrub_errors; - mutex_exit(&vd->vdev_stat_lock); - } - } -} - -void -vdev_stat_update(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_t *pvd; - uint64_t txg = zio->io_txg; - vdev_stat_t *vs = &vd->vdev_stat; - zio_type_t type = zio->io_type; - int flags = zio->io_flags; - - if (zio->io_error == 0) { - if (!(flags & ZIO_FLAG_IO_BYPASS)) { - mutex_enter(&vd->vdev_stat_lock); - vs->vs_ops[type]++; - vs->vs_bytes[type] += zio->io_size; - mutex_exit(&vd->vdev_stat_lock); - } - if ((flags & ZIO_FLAG_IO_REPAIR) && - zio->io_delegate_list == NULL) { - mutex_enter(&vd->vdev_stat_lock); - if (flags & ZIO_FLAG_SCRUB_THREAD) - vs->vs_scrub_repaired += zio->io_size; - else - vs->vs_self_healed += zio->io_size; - mutex_exit(&vd->vdev_stat_lock); - } - return; - } - - if (flags & ZIO_FLAG_SPECULATIVE) - return; - - if (!vdev_is_dead(vd)) { - mutex_enter(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_READ) { - if (zio->io_error == ECKSUM) - vs->vs_checksum_errors++; - else - vs->vs_read_errors++; - } - if (type == ZIO_TYPE_WRITE) - vs->vs_write_errors++; - mutex_exit(&vd->vdev_stat_lock); - } - - if (type == ZIO_TYPE_WRITE) { - if (txg == 0 || vd->vdev_children != 0) - return; - if (flags & ZIO_FLAG_SCRUB_THREAD) { - ASSERT(flags & ZIO_FLAG_IO_REPAIR); - for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) - vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); - } - if (!(flags & ZIO_FLAG_IO_REPAIR)) { - if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1)) - return; - vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) - vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1); - } - } -} - -void -vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) -{ - int c; - vdev_stat_t *vs = &vd->vdev_stat; - - for (c = 0; c < vd->vdev_children; c++) - vdev_scrub_stat_update(vd->vdev_child[c], type, complete); - - mutex_enter(&vd->vdev_stat_lock); - - if (type == POOL_SCRUB_NONE) { - /* - * Update completion and end time. Leave everything else alone - * so we can report what happened during the previous scrub. - */ - vs->vs_scrub_complete = complete; - vs->vs_scrub_end = gethrestime_sec(); - } else { - vs->vs_scrub_type = type; - vs->vs_scrub_complete = 0; - vs->vs_scrub_examined = 0; - vs->vs_scrub_repaired = 0; - vs->vs_scrub_errors = 0; - vs->vs_scrub_start = gethrestime_sec(); - vs->vs_scrub_end = 0; - } - - mutex_exit(&vd->vdev_stat_lock); -} - -/* - * Update the in-core space usage stats for this vdev and the root vdev. 
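[Editor's illustrative sketch, not part of the deleted source: the deflate arithmetic used by vdev_space_update() below, worked through with hypothetical numbers. It assumes a RAID-Z vdev where 128K of data costs 160K of allocated space (25% parity overhead) and uses SPA_MINBLOCKSHIFT = 9 (512-byte minimum blocks); the variable names are illustrative only.]

	#include <stdint.h>
	#include <stdio.h>

	#define	SPA_MINBLOCKSHIFT	9

	int
	main(void)
	{
		uint64_t asize_of_128k = 160ULL * 1024;	/* hypothetical psize->asize of 128K */
		uint64_t deflate_ratio = (1ULL << 17) /
		    (asize_of_128k >> SPA_MINBLOCKSHIFT);	/* 131072 / 320 = 409 */

		int64_t space_delta = 160 * 1024;		/* 160K newly allocated (with parity) */
		int64_t dspace_delta = (space_delta >> SPA_MINBLOCKSHIFT) *
		    (int64_t)deflate_ratio;			/* 320 * 409 = 130880, roughly 128K */

		printf("ratio %llu: %lld allocated bytes reported as %lld deflated bytes\n",
		    (unsigned long long)deflate_ratio,
		    (long long)space_delta, (long long)dspace_delta);
		return (0);
	}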
- */ -void -vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta) -{ - ASSERT(vd == vd->vdev_top); - int64_t dspace_delta = space_delta; - - do { - if (vd->vdev_ms_count) { - /* - * If this is a top-level vdev, apply the - * inverse of its psize-to-asize (ie. RAID-Z) - * space-expansion factor. We must calculate - * this here and not at the root vdev because - * the root vdev's psize-to-asize is simply the - * max of its childrens', thus not accurate - * enough for us. - */ - ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); - dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * - vd->vdev_deflate_ratio; - } - - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_space += space_delta; - vd->vdev_stat.vs_alloc += alloc_delta; - vd->vdev_stat.vs_dspace += dspace_delta; - mutex_exit(&vd->vdev_stat_lock); - } while ((vd = vd->vdev_parent) != NULL); -} - -/* - * Mark a top-level vdev's config as dirty, placing it on the dirty list - * so that it will be written out next time the vdev configuration is synced. - * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. - */ -void -vdev_config_dirty(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - int c; - - /* - * The dirty list is protected by the config lock. The caller must - * either hold the config lock as writer, or must be the sync thread - * (which holds the lock as reader). There's only one sync thread, - * so this is sufficient to ensure mutual exclusion. - */ - ASSERT(spa_config_held(spa, RW_WRITER) || - dsl_pool_sync_context(spa_get_dsl(spa))); - - if (vd == rvd) { - for (c = 0; c < rvd->vdev_children; c++) - vdev_config_dirty(rvd->vdev_child[c]); - } else { - ASSERT(vd == vd->vdev_top); - - if (!list_link_active(&vd->vdev_dirty_node)) - list_insert_head(&spa->spa_dirty_list, vd); - } -} - -void -vdev_config_clean(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(spa_config_held(spa, RW_WRITER) || - dsl_pool_sync_context(spa_get_dsl(spa))); - - ASSERT(list_link_active(&vd->vdev_dirty_node)); - list_remove(&spa->spa_dirty_list, vd); -} - -void -vdev_propagate_state(vdev_t *vd) -{ - vdev_t *rvd = vd->vdev_spa->spa_root_vdev; - int degraded = 0, faulted = 0; - int corrupted = 0; - int c; - vdev_t *child; - - for (c = 0; c < vd->vdev_children; c++) { - child = vd->vdev_child[c]; - if (child->vdev_state <= VDEV_STATE_CANT_OPEN) - faulted++; - else if (child->vdev_state == VDEV_STATE_DEGRADED) - degraded++; - - if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) - corrupted++; - } - - vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); - - /* - * Root special: if there is a toplevel vdev that cannot be - * opened due to corrupted metadata, then propagate the root - * vdev's aux state as 'corrupt' rather than 'insufficient - * replicas'. - */ - if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) - vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); -} - -/* - * Set a vdev's state. If this is during an open, we don't update the parent - * state, because we're in the process of opening children depth-first. - * Otherwise, we propagate the change to the parent. - * - * If this routine places a device in a faulted state, an appropriate ereport is - * generated. 
- */ -void -vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) -{ - uint64_t save_state; - - if (state == vd->vdev_state) { - vd->vdev_stat.vs_aux = aux; - return; - } - - save_state = vd->vdev_state; - - vd->vdev_state = state; - vd->vdev_stat.vs_aux = aux; - - /* - * If we are setting the vdev state to anything but an open state, then - * always close the underlying device. Otherwise, we keep accessible - * but invalid devices open forever. We don't call vdev_close() itself, - * because that implies some extra checks (offline, etc) that we don't - * want here. This is limited to leaf devices, because otherwise - * closing the device will affect other children. - */ - if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) - vd->vdev_ops->vdev_op_close(vd); - - if (state == VDEV_STATE_CANT_OPEN) { - /* - * If we fail to open a vdev during an import, we mark it as - * "not available", which signifies that it was never there to - * begin with. Failure to open such a device is not considered - * an error. - */ - if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT && - vd->vdev_ops->vdev_op_leaf) - vd->vdev_not_present = 1; - - /* - * Post the appropriate ereport. If the 'prevstate' field is - * set to something other than VDEV_STATE_UNKNOWN, it indicates - * that this is part of a vdev_reopen(). In this case, we don't - * want to post the ereport if the device was already in the - * CANT_OPEN state beforehand. - */ - if (vd->vdev_prevstate != state && !vd->vdev_not_present && - vd != vd->vdev_spa->spa_root_vdev) { - const char *class; - - switch (aux) { - case VDEV_AUX_OPEN_FAILED: - class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; - break; - case VDEV_AUX_CORRUPT_DATA: - class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; - break; - case VDEV_AUX_NO_REPLICAS: - class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; - break; - case VDEV_AUX_BAD_GUID_SUM: - class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; - break; - case VDEV_AUX_TOO_SMALL: - class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; - break; - case VDEV_AUX_BAD_LABEL: - class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; - break; - default: - class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; - } - - zfs_ereport_post(class, vd->vdev_spa, - vd, NULL, save_state, 0); - } - } - - if (isopen) - return; - - if (vd->vdev_parent != NULL) - vdev_propagate_state(vd->vdev_parent); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c deleted file mode 100644 index 4e419b6..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c +++ /dev/null @@ -1,394 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/vdev_impl.h> -#include <sys/zio.h> - -/* - * Virtual device read-ahead caching. - * - * This file implements a simple LRU read-ahead cache. When the DMU reads - * a given block, it will often want other, nearby blocks soon thereafter. - * We take advantage of this by reading a larger disk region and caching - * the result. In the best case, this can turn 256 back-to-back 512-byte - * reads into a single 128k read followed by 255 cache hits; this reduces - * latency dramatically. In the worst case, it can turn an isolated 512-byte - * read into a 128k read, which doesn't affect latency all that much but is - * terribly wasteful of bandwidth. A more intelligent version of the cache - * could keep track of access patterns and not do read-ahead unless it sees - * at least two temporally close I/Os to the same region. It could also - * take advantage of semantic information about the I/O. And it could use - * something faster than an AVL tree; that was chosen solely for convenience. - * - * There are five cache operations: allocate, fill, read, write, evict. - * - * (1) Allocate. This reserves a cache entry for the specified region. - * We separate the allocate and fill operations so that multiple threads - * don't generate I/O for the same cache miss. - * - * (2) Fill. When the I/O for a cache miss completes, the fill routine - * places the data in the previously allocated cache entry. - * - * (3) Read. Read data from the cache. - * - * (4) Write. Update cache contents after write completion. - * - * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry - * if the total cache size exceeds zfs_vdev_cache_size. - */ - -/* - * These tunables are for performance analysis. - */ -/* - * All i/os smaller than zfs_vdev_cache_max will be turned into - * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software - * track buffer. At most zfs_vdev_cache_size bytes will be kept in each - * vdev's vdev_cache. - */ -int zfs_vdev_cache_max = 1<<14; -int zfs_vdev_cache_size = 10ULL << 20; -int zfs_vdev_cache_bshift = 16; - -SYSCTL_DECL(_vfs_zfs_vdev); -SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); -TUNABLE_INT("vfs.zfs.vdev.cache.max", &zfs_vdev_cache_max); -SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN, - &zfs_vdev_cache_max, 0, "Maximum I/O request size that increase read size"); -TUNABLE_INT("vfs.zfs.vdev.cache.size", &zfs_vdev_cache_size); -SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN, - &zfs_vdev_cache_size, 0, "Size of VDEV cache"); - -#define VCBS (1 << zfs_vdev_cache_bshift) - -static int -vdev_cache_offset_compare(const void *a1, const void *a2) -{ - const vdev_cache_entry_t *ve1 = a1; - const vdev_cache_entry_t *ve2 = a2; - - if (ve1->ve_offset < ve2->ve_offset) - return (-1); - if (ve1->ve_offset > ve2->ve_offset) - return (1); - return (0); -} - -static int -vdev_cache_lastused_compare(const void *a1, const void *a2) -{ - const vdev_cache_entry_t *ve1 = a1; - const vdev_cache_entry_t *ve2 = a2; - - if (ve1->ve_lastused < ve2->ve_lastused) - return (-1); - if (ve1->ve_lastused > ve2->ve_lastused) - return (1); - - /* - * Among equally old entries, sort by offset to ensure uniqueness. - */ - return (vdev_cache_offset_compare(a1, a2)); -} - -/* - * Evict the specified entry from the cache. 
- */ -static void -vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) -{ - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT(ve->ve_fill_io == NULL); - ASSERT(ve->ve_data != NULL); - - dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n", - vc, ve->ve_offset, ve->ve_lastused, LBOLT - ve->ve_lastused, - ve->ve_hits, ve->ve_missed_update); - - avl_remove(&vc->vc_lastused_tree, ve); - avl_remove(&vc->vc_offset_tree, ve); - zio_buf_free(ve->ve_data, VCBS); - kmem_free(ve, sizeof (vdev_cache_entry_t)); -} - -/* - * Allocate an entry in the cache. At the point we don't have the data, - * we're just creating a placeholder so that multiple threads don't all - * go off and read the same blocks. - */ -static vdev_cache_entry_t * -vdev_cache_allocate(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - uint64_t offset = P2ALIGN(zio->io_offset, VCBS); - vdev_cache_entry_t *ve; - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - - if (zfs_vdev_cache_size == 0) - return (NULL); - - /* - * If adding a new entry would exceed the cache size, - * evict the oldest entry (LRU). - */ - if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > - zfs_vdev_cache_size) { - ve = avl_first(&vc->vc_lastused_tree); - if (ve->ve_fill_io != NULL) { - dprintf("can't evict in %p, still filling\n", vc); - return (NULL); - } - ASSERT(ve->ve_hits != 0); - vdev_cache_evict(vc, ve); - } - - ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); - ve->ve_offset = offset; - ve->ve_lastused = LBOLT; - ve->ve_data = zio_buf_alloc(VCBS); - - avl_add(&vc->vc_offset_tree, ve); - avl_add(&vc->vc_lastused_tree, ve); - - return (ve); -} - -static void -vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) -{ - uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT(ve->ve_fill_io == NULL); - - if (ve->ve_lastused != LBOLT) { - avl_remove(&vc->vc_lastused_tree, ve); - ve->ve_lastused = LBOLT; - avl_add(&vc->vc_lastused_tree, ve); - } - - ve->ve_hits++; - bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); -} - -/* - * Fill a previously allocated cache entry with data. - */ -static void -vdev_cache_fill(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve = zio->io_private; - zio_t *dio; - - ASSERT(zio->io_size == VCBS); - - /* - * Add data to the cache. - */ - mutex_enter(&vc->vc_lock); - - ASSERT(ve->ve_fill_io == zio); - ASSERT(ve->ve_offset == zio->io_offset); - ASSERT(ve->ve_data == zio->io_data); - - ve->ve_fill_io = NULL; - - /* - * Even if this cache line was invalidated by a missed write update, - * any reads that were queued up before the missed update are still - * valid, so we can satisfy them from this line before we evict it. - */ - for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next) - vdev_cache_hit(vc, ve, dio); - - if (zio->io_error || ve->ve_missed_update) - vdev_cache_evict(vc, ve); - - mutex_exit(&vc->vc_lock); - - while ((dio = zio->io_delegate_list) != NULL) { - zio->io_delegate_list = dio->io_delegate_next; - dio->io_delegate_next = NULL; - dio->io_error = zio->io_error; - zio_next_stage(dio); - } -} - -/* - * Read data from the cache. Returns 0 on cache hit, errno on a miss. 
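[Editor's illustrative sketch, not part of the deleted source: how the default tunables above turn a small read into a cache-block-sized read, mirroring the alignment arithmetic that vdev_cache_read() performs. The constants follow the defaults shown earlier (zfs_vdev_cache_max = 16K, zfs_vdev_cache_bshift = 16, so the cache block is 64K); the ALIGN_DOWN/PHASE macros are local stand-ins for the kernel's P2ALIGN/P2PHASE.]

	#include <stdint.h>
	#include <stdio.h>

	#define	CACHE_BSHIFT		16
	#define	VCBS			(1ULL << CACHE_BSHIFT)	/* 64K cache block */
	#define	ALIGN_DOWN(x, a)	((x) & ~((a) - 1))	/* ~ P2ALIGN */
	#define	PHASE(x, a)		((x) & ((a) - 1))	/* ~ P2PHASE */

	int
	main(void)
	{
		uint64_t io_offset = 100 * 1024;	/* hypothetical 8K read at offset 100K */
		uint64_t io_size = 8 * 1024;

		uint64_t cache_offset = ALIGN_DOWN(io_offset, VCBS);	/* 64K */
		uint64_t cache_phase = PHASE(io_offset, VCBS);		/* 36K into the block */

		/* Only I/Os that fit entirely within one cache block are cached. */
		int cacheable = (cache_phase + io_size <= VCBS);

		printf("read [%llu, %llu) -> cache block at %llu, phase %llu, %s\n",
		    (unsigned long long)io_offset,
		    (unsigned long long)(io_offset + io_size),
		    (unsigned long long)cache_offset,
		    (unsigned long long)cache_phase,
		    cacheable ? "cacheable" : "straddles two blocks");
		return (0);
	}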
- */ -int -vdev_cache_read(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, ve_search; - uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); - uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); - zio_t *fio; - - ASSERT(zio->io_type == ZIO_TYPE_READ); - - if (zio->io_flags & ZIO_FLAG_DONT_CACHE) - return (EINVAL); - - if (zio->io_size > zfs_vdev_cache_max) - return (EOVERFLOW); - - /* - * If the I/O straddles two or more cache blocks, don't cache it. - */ - if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS)) - return (EXDEV); - - ASSERT(cache_phase + zio->io_size <= VCBS); - - mutex_enter(&vc->vc_lock); - - ve_search.ve_offset = cache_offset; - ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); - - if (ve != NULL) { - if (ve->ve_missed_update) { - mutex_exit(&vc->vc_lock); - return (ESTALE); - } - - if ((fio = ve->ve_fill_io) != NULL) { - zio->io_delegate_next = fio->io_delegate_list; - fio->io_delegate_list = zio; - zio_vdev_io_bypass(zio); - mutex_exit(&vc->vc_lock); - return (0); - } - - vdev_cache_hit(vc, ve, zio); - zio_vdev_io_bypass(zio); - - mutex_exit(&vc->vc_lock); - zio_next_stage(zio); - return (0); - } - - ve = vdev_cache_allocate(zio); - - if (ve == NULL) { - mutex_exit(&vc->vc_lock); - return (ENOMEM); - } - - fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset, - ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK, - vdev_cache_fill, ve); - - ve->ve_fill_io = fio; - fio->io_delegate_list = zio; - zio_vdev_io_bypass(zio); - - mutex_exit(&vc->vc_lock); - zio_nowait(fio); - - return (0); -} - -/* - * Update cache contents upon write completion. - */ -void -vdev_cache_write(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, ve_search; - uint64_t io_start = zio->io_offset; - uint64_t io_end = io_start + zio->io_size; - uint64_t min_offset = P2ALIGN(io_start, VCBS); - uint64_t max_offset = P2ROUNDUP(io_end, VCBS); - avl_index_t where; - - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - - mutex_enter(&vc->vc_lock); - - ve_search.ve_offset = min_offset; - ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); - - if (ve == NULL) - ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); - - while (ve != NULL && ve->ve_offset < max_offset) { - uint64_t start = MAX(ve->ve_offset, io_start); - uint64_t end = MIN(ve->ve_offset + VCBS, io_end); - - if (ve->ve_fill_io != NULL) { - ve->ve_missed_update = 1; - } else { - bcopy((char *)zio->io_data + start - io_start, - ve->ve_data + start - ve->ve_offset, end - start); - } - ve = AVL_NEXT(&vc->vc_offset_tree, ve); - } - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_init(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); - - avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_offset_node)); - - avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_lastused_node)); -} - -void -vdev_cache_fini(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve; - - mutex_enter(&vc->vc_lock); - while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) - vdev_cache_evict(vc, ve); - mutex_exit(&vc->vc_lock); - - avl_destroy(&vc->vc_offset_tree); - avl_destroy(&vc->vc_lastused_tree); - - mutex_destroy(&vc->vc_lock); -} diff 
--git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c deleted file mode 100644 index b965b1c..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c +++ /dev/null @@ -1,363 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/vdev_disk.h> -#include <sys/vdev_impl.h> -#include <sys/fs/zfs.h> -#include <sys/zio.h> -#include <sys/sunldi.h> - -/* - * Virtual device vector for disks. - */ - -extern ldi_ident_t zfs_li; - -typedef struct vdev_disk_buf { - buf_t vdb_buf; - zio_t *vdb_io; -} vdev_disk_buf_t; - -static int -vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) -{ - vdev_disk_t *dvd; - struct dk_minfo dkm; - int error; - - /* - * We must have a pathname, and it must be absolute. - */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); - } - - dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); - - /* - * When opening a disk device, we want to preserve the user's original - * intent. We always want to open the device by the path the user gave - * us, even if it is one of multiple paths to the save device. But we - * also want to be able to survive disks being removed/recabled. - * Therefore the sequence of opening devices is: - * - * 1. Try opening the device by path. For legacy pools without the - * 'whole_disk' property, attempt to fix the path by appending 's0'. - * - * 2. If the devid of the device matches the stored value, return - * success. - * - * 3. Otherwise, the device may have moved. Try opening the device - * by the devid instead. - * - */ - if (vd->vdev_devid != NULL) { - if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, - &dvd->vd_minor) != 0) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); - } - } - - error = EINVAL; /* presume failure */ - - if (vd->vdev_path != NULL) { - ddi_devid_t devid; - - if (vd->vdev_wholedisk == -1ULL) { - size_t len = strlen(vd->vdev_path) + 3; - char *buf = kmem_alloc(len, KM_SLEEP); - ldi_handle_t lh; - - (void) snprintf(buf, len, "%ss0", vd->vdev_path); - - if (ldi_open_by_name(buf, spa_mode, kcred, - &lh, zfs_li) == 0) { - spa_strfree(vd->vdev_path); - vd->vdev_path = buf; - vd->vdev_wholedisk = 1ULL; - (void) ldi_close(lh, spa_mode, kcred); - } else { - kmem_free(buf, len); - } - } - - error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, - &dvd->vd_lh, zfs_li); - - /* - * Compare the devid to the stored value. 
- */ - if (error == 0 && vd->vdev_devid != NULL && - ldi_get_devid(dvd->vd_lh, &devid) == 0) { - if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { - error = EINVAL; - (void) ldi_close(dvd->vd_lh, spa_mode, kcred); - dvd->vd_lh = NULL; - } - ddi_devid_free(devid); - } - - /* - * If we succeeded in opening the device, but 'vdev_wholedisk' - * is not yet set, then this must be a slice. - */ - if (error == 0 && vd->vdev_wholedisk == -1ULL) - vd->vdev_wholedisk = 0; - } - - /* - * If we were unable to open by path, or the devid check fails, open by - * devid instead. - */ - if (error != 0 && vd->vdev_devid != NULL) - error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, - spa_mode, kcred, &dvd->vd_lh, zfs_li); - - if (error) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (error); - } - - /* - * Determine the actual size of the device. - */ - if (ldi_get_size(dvd->vd_lh, psize) != 0) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (EINVAL); - } - - /* - * If we own the whole disk, try to enable disk write caching. - * We ignore errors because it's OK if we can't do it. - */ - if (vd->vdev_wholedisk == 1) { - int wce = 1; - (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, - FKIOCTL, kcred, NULL); - } - - /* - * Determine the device's minimum transfer size. - * If the ioctl isn't supported, assume DEV_BSIZE. - */ - if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm, - FKIOCTL, kcred, NULL) != 0) - dkm.dki_lbsize = DEV_BSIZE; - - *ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1; - - /* - * Clear the nowritecache bit, so that on a vdev_reopen() we will - * try again. - */ - vd->vdev_nowritecache = B_FALSE; - - return (0); -} - -static void -vdev_disk_close(vdev_t *vd) -{ - vdev_disk_t *dvd = vd->vdev_tsd; - - if (dvd == NULL) - return; - - dprintf("removing disk %s, devid %s\n", - vd->vdev_path ? vd->vdev_path : "<none>", - vd->vdev_devid ? vd->vdev_devid : "<none>"); - - if (dvd->vd_minor != NULL) - ddi_devid_str_free(dvd->vd_minor); - - if (dvd->vd_devid != NULL) - ddi_devid_free(dvd->vd_devid); - - if (dvd->vd_lh != NULL) - (void) ldi_close(dvd->vd_lh, spa_mode, kcred); - - kmem_free(dvd, sizeof (vdev_disk_t)); - vd->vdev_tsd = NULL; -} - -static void -vdev_disk_io_intr(buf_t *bp) -{ - vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp; - zio_t *zio = vdb->vdb_io; - - if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0) - zio->io_error = EIO; - - kmem_free(vdb, sizeof (vdev_disk_buf_t)); - - zio_next_stage_async(zio); -} - -static void -vdev_disk_ioctl_done(void *zio_arg, int error) -{ - zio_t *zio = zio_arg; - - zio->io_error = error; - - zio_next_stage_async(zio); -} - -static void -vdev_disk_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_disk_t *dvd = vd->vdev_tsd; - vdev_disk_buf_t *vdb; - buf_t *bp; - int flags, error; - - if (zio->io_type == ZIO_TYPE_IOCTL) { - zio_vdev_io_bypass(zio); - - /* XXPOLICY */ - if (vdev_is_dead(vd)) { - zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; - } - - switch (zio->io_cmd) { - - case DKIOCFLUSHWRITECACHE: - - if (zfs_nocacheflush) - break; - - if (vd->vdev_nowritecache) { - zio->io_error = ENOTSUP; - break; - } - - zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done; - zio->io_dk_callback.dkc_cookie = zio; - - error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, - (uintptr_t)&zio->io_dk_callback, - FKIOCTL, kcred, NULL); - - if (error == 0) { - /* - * The ioctl will be done asychronously, - * and will call vdev_disk_ioctl_done() - * upon completion. 
- */ - return; - } else if (error == ENOTSUP) { - /* - * If we get ENOTSUP, we know that no future - * attempts will ever succeed. In this case we - * set a persistent bit so that we don't bother - * with the ioctl in the future. - */ - vd->vdev_nowritecache = B_TRUE; - } - zio->io_error = error; - - break; - - default: - zio->io_error = ENOTSUP; - } - - zio_next_stage_async(zio); - return; - } - - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return; - - if ((zio = vdev_queue_io(zio)) == NULL) - return; - - flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); - flags |= B_BUSY | B_NOCACHE; - if (zio->io_flags & ZIO_FLAG_FAILFAST) - flags |= B_FAILFAST; - - vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP); - - vdb->vdb_io = zio; - bp = &vdb->vdb_buf; - - bioinit(bp); - bp->b_flags = flags; - bp->b_bcount = zio->io_size; - bp->b_un.b_addr = zio->io_data; - bp->b_lblkno = lbtodb(zio->io_offset); - bp->b_bufsize = zio->io_size; - bp->b_iodone = (int (*)())vdev_disk_io_intr; - - /* XXPOLICY */ - error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); - if (error) { - zio->io_error = error; - bioerror(bp, error); - bp->b_resid = bp->b_bcount; - bp->b_iodone(bp); - return; - } - - error = ldi_strategy(dvd->vd_lh, bp); - /* ldi_strategy() will return non-zero only on programming errors */ - ASSERT(error == 0); -} - -static void -vdev_disk_io_done(zio_t *zio) -{ - vdev_queue_io_done(zio); - - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); - - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); - - zio_next_stage(zio); -} - -vdev_ops_t vdev_disk_ops = { - vdev_disk_open, - vdev_disk_close, - vdev_default_asize, - vdev_disk_io_start, - vdev_disk_io_done, - NULL, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c deleted file mode 100644 index b8e79f8..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c +++ /dev/null @@ -1,225 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/vdev_file.h> -#include <sys/vdev_impl.h> -#include <sys/zio.h> -#include <sys/fs/zfs.h> - -/* - * Virtual device vector for files. - */ - -static int -vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) -{ - vdev_file_t *vf; - vnode_t *vp; - vattr_t vattr; - int error; - - /* - * We must have a pathname, and it must be absolute. 
- */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); - } - - vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); - - /* - * We always open the files from the root of the global zone, even if - * we're in a local zone. If the user has gotten to this point, the - * administrator has already decided that the pool should be available - * to local zone users, so the underlying devices should be as well. - */ - ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); - error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX, - 0, &vp, 0, 0, rootdir); - - if (error) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (error); - } - - vf->vf_vnode = vp; - -#ifdef _KERNEL - /* - * Make sure it's a regular file. - */ - if (vp->v_type != VREG) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (ENODEV); - } -#endif - - /* - * Determine the physical size of the file. - */ - vattr.va_mask = AT_SIZE; - error = VOP_GETATTR(vp, &vattr, 0, kcred); - if (error) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (error); - } - - *psize = vattr.va_size; - *ashift = SPA_MINBLOCKSHIFT; - - return (0); -} - -static void -vdev_file_close(vdev_t *vd) -{ - vdev_file_t *vf = vd->vdev_tsd; - - if (vf == NULL) - return; - - if (vf->vf_vnode != NULL) { - (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred); - (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred); - VN_RELE(vf->vf_vnode); - } - - kmem_free(vf, sizeof (vdev_file_t)); - vd->vdev_tsd = NULL; -} - -static void -vdev_file_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_file_t *vf = vd->vdev_tsd; - ssize_t resid; - int error; - - if (zio->io_type == ZIO_TYPE_IOCTL) { - zio_vdev_io_bypass(zio); - - /* XXPOLICY */ - if (vdev_is_dead(vd)) { - zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; - } - - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, - kcred); - dprintf("fsync(%s) = %d\n", vdev_description(vd), - zio->io_error); - break; - default: - zio->io_error = ENOTSUP; - } - - zio_next_stage_async(zio); - return; - } - - /* - * In the kernel, don't bother double-caching, but in userland, - * we want to test the vdev_cache code. - */ -#ifndef _KERNEL - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return; -#endif - - if ((zio = vdev_queue_io(zio)) == NULL) - return; - - /* XXPOLICY */ - error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); - if (error) { - zio->io_error = error; - zio_next_stage_async(zio); - return; - } - - zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? - UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, - zio->io_size, zio->io_offset, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, &resid); - - if (resid != 0 && zio->io_error == 0) - zio->io_error = ENOSPC; - - zio_next_stage_async(zio); -} - -static void -vdev_file_io_done(zio_t *zio) -{ - vdev_queue_io_done(zio); - -#ifndef _KERNEL - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); -#endif - - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); - - zio_next_stage(zio); -} - -vdev_ops_t vdev_file_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - VDEV_TYPE_FILE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; - -/* - * From userland we access disks just like files. 
- */ -#ifndef _KERNEL - -vdev_ops_t vdev_disk_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; - -#endif diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c deleted file mode 100644 index eebc911..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ /dev/null @@ -1,583 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org> - * All rights reserved. - */ - -#include <sys/zfs_context.h> -#include <sys/param.h> -#include <sys/kernel.h> -#include <sys/bio.h> -#include <sys/disk.h> -#include <sys/spa.h> -#include <sys/vdev_impl.h> -#include <sys/fs/zfs.h> -#include <sys/zio.h> -#include <geom/geom.h> -#include <geom/geom_int.h> - -/* - * Virtual device vector for GEOM. - */ - -struct g_class zfs_vdev_class = { - .name = "ZFS::VDEV", - .version = G_VERSION, -}; - -DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); - -typedef struct vdev_geom_ctx { - struct g_consumer *gc_consumer; - int gc_state; - struct bio_queue_head gc_queue; - struct mtx gc_queue_mtx; -} vdev_geom_ctx_t; - -static void -vdev_geom_release(vdev_t *vd) -{ - vdev_geom_ctx_t *ctx; - - ctx = vd->vdev_tsd; - vd->vdev_tsd = NULL; - - mtx_lock(&ctx->gc_queue_mtx); - ctx->gc_state = 1; - wakeup_one(&ctx->gc_queue); - while (ctx->gc_state != 2) - msleep(&ctx->gc_state, &ctx->gc_queue_mtx, 0, "vgeom:w", 0); - mtx_unlock(&ctx->gc_queue_mtx); - mtx_destroy(&ctx->gc_queue_mtx); - kmem_free(ctx, sizeof(*ctx)); -} - -static void -vdev_geom_orphan(struct g_consumer *cp) -{ - struct g_geom *gp; - vdev_t *vd; - int error; - - g_topology_assert(); - - vd = cp->private; - gp = cp->geom; - error = cp->provider->error; - - ZFS_LOG(1, "Closing access to %s.", cp->provider->name); - if (cp->acr + cp->acw + cp->ace > 0) - g_access(cp, -cp->acr, -cp->acw, -cp->ace); - ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name); - g_detach(cp); - g_destroy_consumer(cp); - /* Destroy geom if there are no consumers left. */ - if (LIST_EMPTY(&gp->consumer)) { - ZFS_LOG(1, "Destroyed geom %s.", gp->name); - g_wither_geom(gp, error); - } - vdev_geom_release(vd); - /* Both methods below work, but in a bit different way. 
*/ -#if 0 - vd->vdev_reopen_wanted = 1; -#else - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); -#endif -} - -static struct g_consumer * -vdev_geom_attach(struct g_provider *pp, int write) -{ - struct g_geom *gp; - struct g_consumer *cp; - - g_topology_assert(); - - ZFS_LOG(1, "Attaching to %s.", pp->name); - /* Do we have geom already? No? Create one. */ - LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { - if (gp->flags & G_GEOM_WITHER) - continue; - if (strcmp(gp->name, "zfs::vdev") != 0) - continue; - break; - } - if (gp == NULL) { - gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); - gp->orphan = vdev_geom_orphan; - cp = g_new_consumer(gp); - if (g_attach(cp, pp) != 0) { - g_wither_geom(gp, ENXIO); - return (NULL); - } - if (g_access(cp, 1, write, 1) != 0) { - g_wither_geom(gp, ENXIO); - return (NULL); - } - ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); - } else { - /* Check if we are already connected to this provider. */ - LIST_FOREACH(cp, &gp->consumer, consumer) { - if (cp->provider == pp) { - ZFS_LOG(1, "Found consumer for %s.", pp->name); - break; - } - } - if (cp == NULL) { - cp = g_new_consumer(gp); - if (g_attach(cp, pp) != 0) { - g_destroy_consumer(cp); - return (NULL); - } - if (g_access(cp, 1, write, 1) != 0) { - g_detach(cp); - g_destroy_consumer(cp); - return (NULL); - } - ZFS_LOG(1, "Created consumer for %s.", pp->name); - } else { - if (g_access(cp, 1, cp->acw > 0 ? 0 : write, 1) != 0) - return (NULL); - ZFS_LOG(1, "Used existing consumer for %s.", pp->name); - } - } - return (cp); -} - -static void -vdev_geom_detach(void *arg, int flag __unused) -{ - struct g_geom *gp; - struct g_consumer *cp; - - g_topology_assert(); - cp = arg; - gp = cp->geom; - - ZFS_LOG(1, "Closing access to %s.", cp->provider->name); - g_access(cp, -1, 0, -1); - /* Destroy consumer on last close. */ - if (cp->acr == 0 && cp->ace == 0) { - ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name); - if (cp->acw > 0) - g_access(cp, 0, -cp->acw, 0); - g_detach(cp); - g_destroy_consumer(cp); - } - /* Destroy geom if there are no consumers left. */ - if (LIST_EMPTY(&gp->consumer)) { - ZFS_LOG(1, "Destroyed geom %s.", gp->name); - g_wither_geom(gp, ENXIO); - } -} - -static void -vdev_geom_worker(void *arg) -{ - vdev_geom_ctx_t *ctx; - zio_t *zio; - struct bio *bp; - - ctx = arg; - for (;;) { - mtx_lock(&ctx->gc_queue_mtx); - bp = bioq_takefirst(&ctx->gc_queue); - if (bp == NULL) { - if (ctx->gc_state == 1) { - ctx->gc_state = 2; - wakeup_one(&ctx->gc_state); - mtx_unlock(&ctx->gc_queue_mtx); - kproc_exit(0); - } - msleep(&ctx->gc_queue, &ctx->gc_queue_mtx, - PRIBIO | PDROP, "vgeom:io", 0); - continue; - } - mtx_unlock(&ctx->gc_queue_mtx); - zio = bp->bio_caller1; - zio->io_error = bp->bio_error; - if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == ENOTSUP) { - vdev_t *vd; - - /* - * If we get ENOTSUP, we know that no future - * attempts will ever succeed. In this case we - * set a persistent bit so that we don't bother - * with the ioctl in the future. 
- */ - vd = zio->io_vd; - vd->vdev_nowritecache = B_TRUE; - } - g_destroy_bio(bp); - zio_next_stage_async(zio); - } -} - -static char * -vdev_geom_get_id(struct g_consumer *cp) -{ - char *id; - int len; - - g_topology_assert_not(); - len = DISK_IDENT_SIZE; - id = kmem_zalloc(len, KM_SLEEP); - if (g_io_getattr("GEOM::ident", cp, &len, id) != 0) { - kmem_free(id, DISK_IDENT_SIZE); - return (NULL); - } - return (id); -} - -static void -vdev_geom_free_id(char *id) -{ - - if (id != NULL) - kmem_free(id, DISK_IDENT_SIZE); -} - -struct vdev_geom_find { - const char *id; - int write; - struct g_consumer *cp; -}; - -static void -vdev_geom_taste_orphan(struct g_consumer *cp) -{ - - KASSERT(1 == 0, ("%s called while tasting %s.", __func__, - cp->provider->name)); -} - -static void -vdev_geom_attach_by_id_event(void *arg, int flags __unused) -{ - struct vdev_geom_find *ap; - struct g_class *mp; - struct g_geom *gp, *zgp; - struct g_provider *pp; - struct g_consumer *zcp; - char *id; - - g_topology_assert(); - - ap = arg; - - zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste"); - /* This orphan function should be never called. */ - zgp->orphan = vdev_geom_taste_orphan; - zcp = g_new_consumer(zgp); - - LIST_FOREACH(mp, &g_classes, class) { - if (mp == &zfs_vdev_class) - continue; - LIST_FOREACH(gp, &mp->geom, geom) { - if (gp->flags & G_GEOM_WITHER) - continue; - LIST_FOREACH(pp, &gp->provider, provider) { - if (pp->flags & G_PF_WITHER) - continue; - g_attach(zcp, pp); - if (g_access(zcp, 1, 0, 0) != 0) { - g_detach(zcp); - continue; - } - g_topology_unlock(); - id = vdev_geom_get_id(zcp); - g_topology_lock(); - g_access(zcp, -1, 0, 0); - g_detach(zcp); - if (id == NULL || strcmp(id, ap->id) != 0) { - vdev_geom_free_id(id); - continue; - } - vdev_geom_free_id(id); - ap->cp = vdev_geom_attach(pp, ap->write); - if (ap->cp == NULL) { - printf("ZFS WARNING: Cannot open %s " - "for writting.\n", pp->name); - continue; - } - goto end; - } - } - } - ap->cp = NULL; -end: - g_destroy_consumer(zcp); - g_destroy_geom(zgp); -} - -static struct g_consumer * -vdev_geom_attach_by_id(const char *id, int write) -{ - struct vdev_geom_find *ap; - struct g_consumer *cp; - - ap = kmem_zalloc(sizeof(*ap), KM_SLEEP); - ap->id = id; - ap->write = write; - g_waitfor_event(vdev_geom_attach_by_id_event, ap, M_WAITOK, NULL); - cp = ap->cp; - kmem_free(ap, sizeof(*ap)); - return (cp); -} - -static int -vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) -{ - vdev_geom_ctx_t *ctx; - struct g_provider *pp; - struct g_consumer *cp; - char *id = NULL; - int owned; - - /* - * We must have a pathname, and it must be absolute. 
- */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); - } - - if ((owned = mtx_owned(&Giant))) - mtx_unlock(&Giant); - cp = NULL; - g_topology_lock(); - pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1); - if (pp != NULL) { - ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); - cp = vdev_geom_attach(pp, !!(spa_mode & FWRITE)); - if (cp != NULL && vd->vdev_devid != NULL) { - g_topology_unlock(); - id = vdev_geom_get_id(cp); - g_topology_lock(); - if (id == NULL || strcmp(id, vd->vdev_devid) != 0) { - vdev_geom_detach(cp, 0); - cp = NULL; - ZFS_LOG(1, "ID mismatch for provider %s: " - "[%s]!=[%s].", vd->vdev_path, - vd->vdev_devid, id); - goto next; - } - ZFS_LOG(1, "ID match for provider %s.", vd->vdev_path); - } - } -next: - g_topology_unlock(); - vdev_geom_free_id(id); - if (cp == NULL && vd->vdev_devid != NULL) { - ZFS_LOG(1, "Searching by ID [%s].", vd->vdev_devid); - cp = vdev_geom_attach_by_id(vd->vdev_devid, - !!(spa_mode & FWRITE)); - if (cp != NULL) { - size_t len = strlen(cp->provider->name) + 6; /* 6 == strlen("/dev/") + 1 */ - char *buf = kmem_alloc(len, KM_SLEEP); - - snprintf(buf, len, "/dev/%s", cp->provider->name); - spa_strfree(vd->vdev_path); - vd->vdev_path = buf; - - ZFS_LOG(1, "Attach by ID [%s] succeeded, provider %s.", - vd->vdev_devid, vd->vdev_path); - } - } - if (owned) - mtx_lock(&Giant); - if (cp == NULL) { - ZFS_LOG(1, "Provider %s (id=[%s]) not found.", vd->vdev_path, - vd->vdev_devid); - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (EACCES); - } - pp = cp->provider; - - /* - * Determine the actual size of the device. - */ - *psize = pp->mediasize; - - /* - * Determine the device's minimum transfer size. - */ - *ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; - - /* - * Clear the nowritecache bit, so that on a vdev_reopen() we will - * try again. 
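As a side note on the ashift computation just above: a minimal, userland-only sketch, assuming SPA_MINBLOCKSIZE is 512 and power-of-two sector sizes, of how highbit() turns a sector size into an ashift. The helper names here are invented for the illustration.

#include <stdio.h>
#include <stdint.h>

#define SKETCH_SPA_MINBLOCKSIZE 512ULL

/* 1-based index of the highest set bit, like the kernel's highbit(). */
static int
sketch_highbit(uint64_t x)
{
        int h = 0;

        while (x != 0) {
                h++;
                x >>= 1;
        }
        return (h);
}

int
main(void)
{
        uint64_t sizes[] = { 512, 2048, 4096 };
        int i;

        for (i = 0; i < 3; i++) {
                uint64_t s = sizes[i] > SKETCH_SPA_MINBLOCKSIZE ?
                    sizes[i] : SKETCH_SPA_MINBLOCKSIZE;
                /* 512 -> 9, 2048 -> 11, 4096 -> 12 */
                printf("sectorsize %llu -> ashift %d\n",
                    (unsigned long long)sizes[i], sketch_highbit(s) - 1);
        }
        return (0);
}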
- */ - vd->vdev_nowritecache = B_FALSE; - - cp->private = vd; - - ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP); - bioq_init(&ctx->gc_queue); - mtx_init(&ctx->gc_queue_mtx, "zfs:vdev:geom:queue", NULL, MTX_DEF); - ctx->gc_consumer = cp; - ctx->gc_state = 0; - - vd->vdev_tsd = ctx; - - kproc_create(vdev_geom_worker, ctx, NULL, 0, 0, "vdev:worker %s", - pp->name); - - return (0); -} - -static void -vdev_geom_close(vdev_t *vd) -{ - vdev_geom_ctx_t *ctx; - struct g_consumer *cp; - - if ((ctx = vd->vdev_tsd) == NULL) - return; - if ((cp = ctx->gc_consumer) == NULL) - return; - vdev_geom_release(vd); - g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL); -} - -static void -vdev_geom_io_intr(struct bio *bp) -{ - vdev_geom_ctx_t *ctx; - zio_t *zio; - - zio = bp->bio_caller1; - ctx = zio->io_vd->vdev_tsd; - - mtx_lock(&ctx->gc_queue_mtx); - bioq_insert_tail(&ctx->gc_queue, bp); - wakeup_one(&ctx->gc_queue); - mtx_unlock(&ctx->gc_queue_mtx); -} - -static void -vdev_geom_io_start(zio_t *zio) -{ - vdev_t *vd; - vdev_geom_ctx_t *ctx; - struct g_consumer *cp; - struct bio *bp; - int error; - - cp = NULL; - - vd = zio->io_vd; - ctx = vd->vdev_tsd; - if (ctx != NULL) - cp = ctx->gc_consumer; - - if (zio->io_type == ZIO_TYPE_IOCTL) { - zio_vdev_io_bypass(zio); - - /* XXPOLICY */ - if (vdev_is_dead(vd)) { - zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; - } - - switch (zio->io_cmd) { - - case DKIOCFLUSHWRITECACHE: - if (vd->vdev_nowritecache) { - zio->io_error = ENOTSUP; - break; - } - - goto sendreq; - default: - zio->io_error = ENOTSUP; - } - - zio_next_stage_async(zio); - return; - } - - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return; - - if ((zio = vdev_queue_io(zio)) == NULL) - return; - -sendreq: - - error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); - if (error == 0 && cp == NULL) - error = ENXIO; - if (error) { - zio->io_error = error; - zio_next_stage_async(zio); - return; - } - - bp = g_alloc_bio(); - bp->bio_caller1 = zio; - switch (zio->io_type) { - case ZIO_TYPE_READ: - case ZIO_TYPE_WRITE: - bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE; - bp->bio_data = zio->io_data; - bp->bio_offset = zio->io_offset; - bp->bio_length = zio->io_size; - break; - case ZIO_TYPE_IOCTL: - bp->bio_cmd = BIO_FLUSH; - bp->bio_data = NULL; - bp->bio_offset = cp->provider->mediasize; - bp->bio_length = 0; - break; - } - bp->bio_done = vdev_geom_io_intr; - - g_io_request(bp, cp); -} - -static void -vdev_geom_io_done(zio_t *zio) -{ - vdev_queue_io_done(zio); - - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); - - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); - - zio_next_stage(zio); -} - -vdev_ops_t vdev_geom_ops = { - vdev_geom_open, - vdev_geom_close, - vdev_default_asize, - vdev_geom_io_start, - vdev_geom_io_done, - NULL, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c deleted file mode 100644 index 9d9f555..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ /dev/null @@ -1,1011 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * Virtual Device Labels - * --------------------- - * - * The vdev label serves several distinct purposes: - * - * 1. Uniquely identify this device as part of a ZFS pool and confirm its - * identity within the pool. - * - * 2. Verify that all the devices given in a configuration are present - * within the pool. - * - * 3. Determine the uberblock for the pool. - * - * 4. In case of an import operation, determine the configuration of the - * toplevel vdev of which it is a part. - * - * 5. If an import operation cannot find all the devices in the pool, - * provide enough information to the administrator to determine which - * devices are missing. - * - * It is important to note that while the kernel is responsible for writing the - * label, it only consumes the information in the first three cases. The - * latter information is only consumed in userland when determining the - * configuration to import a pool. - * - * - * Label Organization - * ------------------ - * - * Before describing the contents of the label, it's important to understand how - * the labels are written and updated with respect to the uberblock. - * - * When the pool configuration is altered, either because it was newly created - * or a device was added, we want to update all the labels such that we can deal - * with fatal failure at any point. To this end, each disk has two labels which - * are updated before and after the uberblock is synced. Assuming we have - * labels and an uberblock with the following transacation groups: - * - * L1 UB L2 - * +------+ +------+ +------+ - * | | | | | | - * | t10 | | t10 | | t10 | - * | | | | | | - * +------+ +------+ +------+ - * - * In this stable state, the labels and the uberblock were all updated within - * the same transaction group (10). Each label is mirrored and checksummed, so - * that we can detect when we fail partway through writing the label. - * - * In order to identify which labels are valid, the labels are written in the - * following manner: - * - * 1. For each vdev, update 'L1' to the new label - * 2. Update the uberblock - * 3. For each vdev, update 'L2' to the new label - * - * Given arbitrary failure, we can determine the correct label to use based on - * the transaction group. If we fail after updating L1 but before updating the - * UB, we will notice that L1's transaction group is greater than the uberblock, - * so L2 must be valid. If we fail after writing the uberblock but before - * writing L2, we will notice that L2's transaction group is less than L1, and - * therefore L1 is valid. - * - * Another added complexity is that not every label is updated when the config - * is synced. If we add a single device, we do not want to have to re-write - * every label for every device in the pool. 
This means that both L1 and L2 may - * be older than the pool uberblock, because the necessary information is stored - * on another vdev. - * - * - * On-disk Format - * -------------- - * - * The vdev label consists of two distinct parts, and is wrapped within the - * vdev_label_t structure. The label includes 8k of padding to permit legacy - * VTOC disk labels, but is otherwise ignored. - * - * The first half of the label is a packed nvlist which contains pool wide - * properties, per-vdev properties, and configuration information. It is - * described in more detail below. - * - * The latter half of the label consists of a redundant array of uberblocks. - * These uberblocks are updated whenever a transaction group is committed, - * or when the configuration is updated. When a pool is loaded, we scan each - * vdev for the 'best' uberblock. - * - * - * Configuration Information - * ------------------------- - * - * The nvlist describing the pool and vdev contains the following elements: - * - * version ZFS on-disk version - * name Pool name - * state Pool state - * txg Transaction group in which this label was written - * pool_guid Unique identifier for this pool - * vdev_tree An nvlist describing vdev tree. - * - * Each leaf device label also contains the following: - * - * top_guid Unique ID for top-level vdev in which this is contained - * guid Unique ID for the leaf vdev - * - * The 'vs' configuration follows the format described in 'spa_config.c'. - */ - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/spa_impl.h> -#include <sys/dmu.h> -#include <sys/zap.h> -#include <sys/vdev.h> -#include <sys/vdev_impl.h> -#include <sys/uberblock_impl.h> -#include <sys/metaslab.h> -#include <sys/zio.h> -#include <sys/fs/zfs.h> - -/* - * Basic routines to read and write from a vdev label. - * Used throughout the rest of this file. - */ -uint64_t -vdev_label_offset(uint64_t psize, int l, uint64_t offset) -{ - ASSERT(offset < sizeof (vdev_label_t)); - - return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? - 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); -} - -static void -vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private) -{ - ASSERT(vd->vdev_children == 0); - - zio_nowait(zio_read_phys(zio, vd, - vdev_label_offset(vd->vdev_psize, l, offset), - size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE)); -} - -static void -vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private) -{ - ASSERT(vd->vdev_children == 0); - - zio_nowait(zio_write_phys(zio, vd, - vdev_label_offset(vd->vdev_psize, l, offset), - size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL)); -} - -/* - * Generate the nvlist representing this vdev's config. 
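To make the arithmetic in vdev_label_offset() above concrete, here is a small standalone sketch. It assumes the usual on-disk constants of four labels of 256 KB each (VDEV_LABELS and sizeof (vdev_label_t) in the real headers) and shows that labels 0 and 1 sit at the front of the device while labels 2 and 3 sit at the end.

#include <stdio.h>
#include <stdint.h>

#define SKETCH_VDEV_LABELS      4
#define SKETCH_LABEL_SIZE       (256ULL * 1024)

static uint64_t
sketch_label_offset(uint64_t psize, int l, uint64_t offset)
{
        return (offset + l * SKETCH_LABEL_SIZE + (l < SKETCH_VDEV_LABELS / 2 ?
            0 : psize - SKETCH_VDEV_LABELS * SKETCH_LABEL_SIZE));
}

int
main(void)
{
        uint64_t psize = 1ULL << 30;    /* a hypothetical 1 GB device */

        for (int l = 0; l < SKETCH_VDEV_LABELS; l++) {
                /* L0 at 0, L1 at 256K, L2 at psize-512K, L3 at psize-256K */
                printf("L%d starts at byte %llu\n", l,
                    (unsigned long long)sketch_label_offset(psize, l, 0));
        }
        return (0);
}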
- */ -nvlist_t * -vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - boolean_t isspare) -{ - nvlist_t *nv = NULL; - - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, - vd->vdev_ops->vdev_op_type) == 0); - if (!isspare) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) - == 0); - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); - - if (vd->vdev_path != NULL) - VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, - vd->vdev_path) == 0); - - if (vd->vdev_devid != NULL) - VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, - vd->vdev_devid) == 0); - - if (vd->vdev_nparity != 0) { - ASSERT(strcmp(vd->vdev_ops->vdev_op_type, - VDEV_TYPE_RAIDZ) == 0); - - /* - * Make sure someone hasn't managed to sneak a fancy new vdev - * into a crufty old storage pool. - */ - ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity == 2 && - spa_version(spa) >= ZFS_VERSION_RAID6)); - - /* - * Note that we'll add the nparity tag even on storage pools - * that only support a single parity device -- older software - * will just ignore it. - */ - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, - vd->vdev_nparity) == 0); - } - - if (vd->vdev_wholedisk != -1ULL) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - vd->vdev_wholedisk) == 0); - - if (vd->vdev_not_present) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0); - - if (vd->vdev_isspare) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0); - - if (!isspare && vd == vd->vdev_top) { - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, - vd->vdev_ms_array) == 0); - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, - vd->vdev_ms_shift) == 0); - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, - vd->vdev_ashift) == 0); - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, - vd->vdev_asize) == 0); - } - - if (vd->vdev_dtl.smo_object != 0) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, - vd->vdev_dtl.smo_object) == 0); - - if (getstats) { - vdev_stat_t vs; - vdev_get_stats(vd, &vs); - VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS, - (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0); - } - - if (!vd->vdev_ops->vdev_op_leaf) { - nvlist_t **child; - int c; - - child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), - KM_SLEEP); - - for (c = 0; c < vd->vdev_children; c++) - child[c] = vdev_config_generate(spa, vd->vdev_child[c], - getstats, isspare); - - VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - child, vd->vdev_children) == 0); - - for (c = 0; c < vd->vdev_children; c++) - nvlist_free(child[c]); - - kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); - - } else { - if (vd->vdev_offline && !vd->vdev_tmpoffline) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, - B_TRUE) == 0); - else - (void) nvlist_remove(nv, ZPOOL_CONFIG_OFFLINE, - DATA_TYPE_UINT64); - } - - return (nv); -} - -nvlist_t * -vdev_label_read_config(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - nvlist_t *config = NULL; - vdev_phys_t *vp; - zio_t *zio; - int l; - - ASSERT(spa_config_held(spa, RW_READER)); - - if (vdev_is_dead(vd)) - return (NULL); - - vp = zio_buf_alloc(sizeof (vdev_phys_t)); - - for (l = 0; l < VDEV_LABELS; l++) { - - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CONFIG_HELD); - - vdev_label_read(zio, vd, l, vp, - offsetof(vdev_label_t, vl_vdev_phys), - sizeof (vdev_phys_t), NULL, NULL); - - if (zio_wait(zio) == 0 && - nvlist_unpack(vp->vp_nvlist, sizeof 
(vp->vp_nvlist), - &config, 0) == 0) - break; - - if (config != NULL) { - nvlist_free(config); - config = NULL; - } - } - - zio_buf_free(vp, sizeof (vdev_phys_t)); - - return (config); -} - -/* - * Determine if a device is in use. The 'spare_guid' parameter will be filled - * in with the device guid if this spare is active elsewhere on the system. - */ -static boolean_t -vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, - uint64_t *spare_guid) -{ - spa_t *spa = vd->vdev_spa; - uint64_t state, pool_guid, device_guid, txg, spare_pool; - uint64_t vdtxg = 0; - nvlist_t *label; - - if (spare_guid) - *spare_guid = 0ULL; - - /* - * Read the label, if any, and perform some basic sanity checks. - */ - if ((label = vdev_label_read_config(vd)) == NULL) - return (B_FALSE); - - (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, - &vdtxg); - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, - &state) != 0 || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, - &device_guid) != 0) { - nvlist_free(label); - return (B_FALSE); - } - - if (state != POOL_STATE_SPARE && - (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, - &pool_guid) != 0 || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, - &txg) != 0)) { - nvlist_free(label); - return (B_FALSE); - } - - nvlist_free(label); - - /* - * Check to see if this device indeed belongs to the pool it claims to - * be a part of. The only way this is allowed is if the device is a hot - * spare (which we check for later on). - */ - if (state != POOL_STATE_SPARE && - !spa_guid_exists(pool_guid, device_guid) && - !spa_spare_exists(device_guid, NULL)) - return (B_FALSE); - - /* - * If the transaction group is zero, then this an initialized (but - * unused) label. This is only an error if the create transaction - * on-disk is the same as the one we're using now, in which case the - * user has attempted to add the same vdev multiple times in the same - * transaction. - */ - if (state != POOL_STATE_SPARE && txg == 0 && vdtxg == crtxg) - return (B_TRUE); - - /* - * Check to see if this is a spare device. We do an explicit check for - * spa_has_spare() here because it may be on our pending list of spares - * to add. - */ - if (spa_spare_exists(device_guid, &spare_pool) || - spa_has_spare(spa, device_guid)) { - if (spare_guid) - *spare_guid = device_guid; - - switch (reason) { - case VDEV_LABEL_CREATE: - return (B_TRUE); - - case VDEV_LABEL_REPLACE: - return (!spa_has_spare(spa, device_guid) || - spare_pool != 0ULL); - - case VDEV_LABEL_SPARE: - return (spa_has_spare(spa, device_guid)); - } - } - - /* - * If the device is marked ACTIVE, then this device is in use by another - * pool on the system. - */ - return (state == POOL_STATE_ACTIVE); -} - -/* - * Initialize a vdev label. We check to make sure each leaf device is not in - * use, and writable. We put down an initial label which we will later - * overwrite with a complete label. Note that it's important to do this - * sequentially, not in parallel, so that we catch cases of multiple use of the - * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with - * itself. 
- */ -int -vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) -{ - spa_t *spa = vd->vdev_spa; - nvlist_t *label; - vdev_phys_t *vp; - vdev_boot_header_t *vb; - uberblock_t *ub; - zio_t *zio; - int l, c, n; - char *buf; - size_t buflen; - int error; - uint64_t spare_guid; - - ASSERT(spa_config_held(spa, RW_WRITER)); - - for (c = 0; c < vd->vdev_children; c++) - if ((error = vdev_label_init(vd->vdev_child[c], - crtxg, reason)) != 0) - return (error); - - if (!vd->vdev_ops->vdev_op_leaf) - return (0); - - /* - * Dead vdevs cannot be initialized. - */ - if (vdev_is_dead(vd)) - return (EIO); - - /* - * Determine if the vdev is in use. - */ - if (reason != VDEV_LABEL_REMOVE && - vdev_inuse(vd, crtxg, reason, &spare_guid)) - return (EBUSY); - - ASSERT(reason != VDEV_LABEL_REMOVE || - vdev_inuse(vd, crtxg, reason, NULL)); - - /* - * If this is a request to add or replace a spare that is in use - * elsewhere on the system, then we must update the guid (which was - * initialized to a random value) to reflect the actual GUID (which is - * shared between multiple pools). - */ - if (reason != VDEV_LABEL_REMOVE && spare_guid != 0ULL) { - vdev_t *pvd = vd->vdev_parent; - - for (; pvd != NULL; pvd = pvd->vdev_parent) { - pvd->vdev_guid_sum -= vd->vdev_guid; - pvd->vdev_guid_sum += spare_guid; - } - - vd->vdev_guid = vd->vdev_guid_sum = spare_guid; - - /* - * If this is a replacement, then we want to fallthrough to the - * rest of the code. If we're adding a spare, then it's already - * labelled appropriately and we can just return. - */ - if (reason == VDEV_LABEL_SPARE) - return (0); - ASSERT(reason == VDEV_LABEL_REPLACE); - } - - /* - * Initialize its label. - */ - vp = zio_buf_alloc(sizeof (vdev_phys_t)); - bzero(vp, sizeof (vdev_phys_t)); - - /* - * Generate a label describing the pool and our top-level vdev. - * We mark it as being from txg 0 to indicate that it's not - * really part of an active pool just yet. The labels will - * be written again with a meaningful txg by spa_sync(). - */ - if (reason == VDEV_LABEL_SPARE || - (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { - /* - * For inactive hot spares, we generate a special label that - * identifies as a mutually shared hot spare. We write the - * label if we are adding a hot spare, or if we are removing an - * active hot spare (in which case we want to revert the - * labels). - */ - VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, - spa_version(spa)) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, - POOL_STATE_SPARE) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); - } else { - label = spa_config_generate(spa, vd, 0ULL, B_FALSE); - - /* - * Add our creation time. This allows us to detect multiple - * vdev uses as described above, and automatically expires if we - * fail. - */ - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, - crtxg) == 0); - } - - buf = vp->vp_nvlist; - buflen = sizeof (vp->vp_nvlist); - - error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); - if (error != 0) { - nvlist_free(label); - zio_buf_free(vp, sizeof (vdev_phys_t)); - /* EFAULT means nvlist_pack ran out of room */ - return (error == EFAULT ? ENAMETOOLONG : EINVAL); - } - - /* - * Initialize boot block header. 
- */ - vb = zio_buf_alloc(sizeof (vdev_boot_header_t)); - bzero(vb, sizeof (vdev_boot_header_t)); - vb->vb_magic = VDEV_BOOT_MAGIC; - vb->vb_version = VDEV_BOOT_VERSION; - vb->vb_offset = VDEV_BOOT_OFFSET; - vb->vb_size = VDEV_BOOT_SIZE; - - /* - * Initialize uberblock template. - */ - ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); - bzero(ub, VDEV_UBERBLOCK_SIZE(vd)); - *ub = spa->spa_uberblock; - ub->ub_txg = 0; - - /* - * Write everything in parallel. - */ - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); - - for (l = 0; l < VDEV_LABELS; l++) { - - vdev_label_write(zio, vd, l, vp, - offsetof(vdev_label_t, vl_vdev_phys), - sizeof (vdev_phys_t), NULL, NULL); - - vdev_label_write(zio, vd, l, vb, - offsetof(vdev_label_t, vl_boot_header), - sizeof (vdev_boot_header_t), NULL, NULL); - - for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { - vdev_label_write(zio, vd, l, ub, - VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), NULL, NULL); - } - } - - error = zio_wait(zio); - - nvlist_free(label); - zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd)); - zio_buf_free(vb, sizeof (vdev_boot_header_t)); - zio_buf_free(vp, sizeof (vdev_phys_t)); - - /* - * If this vdev hasn't been previously identified as a spare, then we - * mark it as such only if a) we are labelling it as a spare, or b) it - * exists as a spare elsewhere in the system. - */ - if (error == 0 && !vd->vdev_isspare && - (reason == VDEV_LABEL_SPARE || - spa_spare_exists(vd->vdev_guid, NULL))) - spa_spare_add(vd); - - return (error); -} - -/* - * ========================================================================== - * uberblock load/sync - * ========================================================================== - */ - -/* - * Consider the following situation: txg is safely synced to disk. We've - * written the first uberblock for txg + 1, and then we lose power. When we - * come back up, we fail to see the uberblock for txg + 1 because, say, - * it was on a mirrored device and the replica to which we wrote txg + 1 - * is now offline. If we then make some changes and sync txg + 1, and then - * the missing replica comes back, then for a new seconds we'll have two - * conflicting uberblocks on disk with the same txg. The solution is simple: - * among uberblocks with equal txg, choose the one with the latest timestamp. 
- */ -static int -vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) -{ - if (ub1->ub_txg < ub2->ub_txg) - return (-1); - if (ub1->ub_txg > ub2->ub_txg) - return (1); - - if (ub1->ub_timestamp < ub2->ub_timestamp) - return (-1); - if (ub1->ub_timestamp > ub2->ub_timestamp) - return (1); - - return (0); -} - -static void -vdev_uberblock_load_done(zio_t *zio) -{ - uberblock_t *ub = zio->io_data; - uberblock_t *ubbest = zio->io_private; - spa_t *spa = zio->io_spa; - - ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd)); - - if (zio->io_error == 0 && uberblock_verify(ub) == 0) { - mutex_enter(&spa->spa_uberblock_lock); - if (vdev_uberblock_compare(ub, ubbest) > 0) - *ubbest = *ub; - mutex_exit(&spa->spa_uberblock_lock); - } - - zio_buf_free(zio->io_data, zio->io_size); -} - -void -vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) -{ - int l, c, n; - - for (c = 0; c < vd->vdev_children; c++) - vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); - - if (!vd->vdev_ops->vdev_op_leaf) - return; - - if (vdev_is_dead(vd)) - return; - - for (l = 0; l < VDEV_LABELS; l++) { - for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { - vdev_label_read(zio, vd, l, - zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), - VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), - vdev_uberblock_load_done, ubbest); - } - } -} - -/* - * Write the uberblock to both labels of all leaves of the specified vdev. - * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). - */ -static void -vdev_uberblock_sync_done(zio_t *zio) -{ - uint64_t *good_writes = zio->io_root->io_private; - - if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) - atomic_add_64(good_writes, 1); -} - -static void -vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, uint64_t txg) -{ - int l, c, n; - - for (c = 0; c < vd->vdev_children; c++) - vdev_uberblock_sync(zio, ub, vd->vdev_child[c], txg); - - if (!vd->vdev_ops->vdev_op_leaf) - return; - - if (vdev_is_dead(vd)) - return; - - n = txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); - - ASSERT(ub->ub_txg == txg); - - for (l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ub, - VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), - vdev_uberblock_sync_done, NULL); - - dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg); -} - -static int -vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *vd, uint64_t txg) -{ - uberblock_t *ubbuf; - size_t size = vd->vdev_top ? VDEV_UBERBLOCK_SIZE(vd) : SPA_MAXBLOCKSIZE; - uint64_t *good_writes; - zio_t *zio; - int error; - - ubbuf = zio_buf_alloc(size); - bzero(ubbuf, size); - *ubbuf = *ub; - - good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); - - zio = zio_root(spa, NULL, good_writes, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); - - vdev_uberblock_sync(zio, ubbuf, vd, txg); - - error = zio_wait(zio); - - if (error && *good_writes != 0) { - dprintf("partial success: good_writes = %llu\n", *good_writes); - error = 0; - } - - /* - * It's possible to have no good writes and no error if every vdev is in - * the CANT_OPEN state. - */ - if (*good_writes == 0 && error == 0) - error = EIO; - - kmem_free(good_writes, sizeof (uint64_t)); - zio_buf_free(ubbuf, size); - - return (error); -} - -/* - * Sync out an individual vdev. 
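The ring-buffer indexing in vdev_uberblock_sync() above, n = txg & (count - 1), is easy to see with numbers. This standalone sketch assumes 128 uberblock slots per label (the real count depends on the device's ashift); consecutive txgs rotate through the slots, and the newest uberblock is later found by scanning all of them.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        const uint64_t slots = 128;     /* assumed VDEV_UBERBLOCK_COUNT */
        uint64_t txg;

        /* txg 300 -> slot 44, 301 -> 45, ... and txg 384 wraps to slot 0 */
        for (txg = 300; txg <= 303; txg++)
                printf("txg %llu -> uberblock slot %llu\n",
                    (unsigned long long)txg,
                    (unsigned long long)(txg & (slots - 1)));
        return (0);
}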
- */ -static void -vdev_sync_label_done(zio_t *zio) -{ - uint64_t *good_writes = zio->io_root->io_private; - - if (zio->io_error == 0) - atomic_add_64(good_writes, 1); -} - -static void -vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg) -{ - nvlist_t *label; - vdev_phys_t *vp; - char *buf; - size_t buflen; - int c; - - for (c = 0; c < vd->vdev_children; c++) - vdev_sync_label(zio, vd->vdev_child[c], l, txg); - - if (!vd->vdev_ops->vdev_op_leaf) - return; - - if (vdev_is_dead(vd)) - return; - - /* - * Generate a label describing the top-level config to which we belong. - */ - label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); - - vp = zio_buf_alloc(sizeof (vdev_phys_t)); - bzero(vp, sizeof (vdev_phys_t)); - - buf = vp->vp_nvlist; - buflen = sizeof (vp->vp_nvlist); - - if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) - vdev_label_write(zio, vd, l, vp, - offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), - vdev_sync_label_done, NULL); - - zio_buf_free(vp, sizeof (vdev_phys_t)); - nvlist_free(label); - - dprintf("%s label %d txg %llu\n", vdev_description(vd), l, txg); -} - -static int -vdev_sync_labels(vdev_t *vd, int l, uint64_t txg) -{ - uint64_t *good_writes; - zio_t *zio; - int error; - - ASSERT(vd == vd->vdev_top); - - good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); - - zio = zio_root(vd->vdev_spa, NULL, good_writes, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); - - /* - * Recursively kick off writes to all labels. - */ - vdev_sync_label(zio, vd, l, txg); - - error = zio_wait(zio); - - if (error && *good_writes != 0) { - dprintf("partial success: good_writes = %llu\n", *good_writes); - error = 0; - } - - if (*good_writes == 0 && error == 0) - error = ENODEV; - - kmem_free(good_writes, sizeof (uint64_t)); - - return (error); -} - -/* - * Sync the entire vdev configuration. - * - * The order of operations is carefully crafted to ensure that - * if the system panics or loses power at any time, the state on disk - * is still transactionally consistent. The in-line comments below - * describe the failure semantics at each stage. - * - * Moreover, it is designed to be idempotent: if spa_sync_labels() fails - * at any time, you can just call it again, and it will resume its work. - */ -int -vdev_config_sync(vdev_t *uvd, uint64_t txg) -{ - spa_t *spa = uvd->vdev_spa; - uberblock_t *ub = &spa->spa_uberblock; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *vd; - zio_t *zio; - int l, error; - - ASSERT(ub->ub_txg <= txg); - - /* - * If this isn't a resync due to I/O errors, and nothing changed - * in this transaction group, and the vdev configuration hasn't changed, - * then there's nothing to do. - */ - if (ub->ub_txg < txg && uberblock_update(ub, rvd, txg) == B_FALSE && - list_is_empty(&spa->spa_dirty_list)) { - dprintf("nothing to sync in %s in txg %llu\n", - spa_name(spa), txg); - return (0); - } - - if (txg > spa_freeze_txg(spa)) - return (0); - - ASSERT(txg <= spa->spa_final_txg); - - dprintf("syncing %s txg %llu\n", spa_name(spa), txg); - - /* - * Flush the write cache of every disk that's been written to - * in this transaction group. This ensures that all blocks - * written in this txg will be committed to stable storage - * before any uberblock that references them. 
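The good_writes accounting used by vdev_uberblock_sync_tree() and vdev_sync_labels() above boils down to a simple rule: count successful child writes, downgrade a partial failure to success, and report an error only when nothing reached disk. A minimal userland sketch of that rule, with invented child results:

#include <stdio.h>
#include <errno.h>
#include <stdint.h>

int
main(void)
{
        int child_error[] = { 0, EIO, 0 };      /* two of three copies landed */
        uint64_t good_writes = 0;
        int error = 0;
        int c;

        for (c = 0; c < 3; c++) {
                if (child_error[c] == 0)
                        good_writes++;  /* what the done callback does */
                else
                        error = child_error[c];
        }

        if (error != 0 && good_writes != 0)
                error = 0;      /* partial success: at least one copy is safe */
        if (good_writes == 0 && error == 0)
                error = EIO;    /* nothing landed at all */

        printf("good_writes=%llu error=%d\n",
            (unsigned long long)good_writes, error);
        return (0);
}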
- */ - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); - for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd; - vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) { - zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); - } - (void) zio_wait(zio); - - /* - * Sync out the even labels (L0, L2) for every dirty vdev. If the - * system dies in the middle of this process, that's OK: all of the - * even labels that made it to disk will be newer than any uberblock, - * and will therefore be considered invalid. The odd labels (L1, L3), - * which have not yet been touched, will still be valid. - */ - for (vd = list_head(&spa->spa_dirty_list); vd != NULL; - vd = list_next(&spa->spa_dirty_list, vd)) { - for (l = 0; l < VDEV_LABELS; l++) { - if (l & 1) - continue; - if ((error = vdev_sync_labels(vd, l, txg)) != 0) - return (error); - } - } - - /* - * Flush the new labels to disk. This ensures that all even-label - * updates are committed to stable storage before the uberblock update. - */ - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); - for (vd = list_head(&spa->spa_dirty_list); vd != NULL; - vd = list_next(&spa->spa_dirty_list, vd)) { - zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); - } - (void) zio_wait(zio); - - /* - * Sync the uberblocks to all vdevs in the tree specified by uvd. - * If the system dies in the middle of this step, there are two cases - * to consider, and the on-disk state is consistent either way: - * - * (1) If none of the new uberblocks made it to disk, then the - * previous uberblock will be the newest, and the odd labels - * (which had not yet been touched) will be valid with respect - * to that uberblock. - * - * (2) If one or more new uberblocks made it to disk, then they - * will be the newest, and the even labels (which had all - * been successfully committed) will be valid with respect - * to the new uberblocks. - */ - if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0) - return (error); - - /* - * Flush the uberblocks to disk. This ensures that the odd labels - * are no longer needed (because the new uberblocks and the even - * labels are safely on disk), so it is safe to overwrite them. - */ - (void) zio_wait(zio_ioctl(NULL, spa, uvd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); - - /* - * Sync out odd labels for every dirty vdev. If the system dies - * in the middle of this process, the even labels and the new - * uberblocks will suffice to open the pool. The next time - * the pool is opened, the first thing we'll do -- before any - * user data is modified -- is mark every vdev dirty so that - * all labels will be brought up to date. - */ - for (vd = list_head(&spa->spa_dirty_list); vd != NULL; - vd = list_next(&spa->spa_dirty_list, vd)) { - for (l = 0; l < VDEV_LABELS; l++) { - if ((l & 1) == 0) - continue; - if ((error = vdev_sync_labels(vd, l, txg)) != 0) - return (error); - } - } - - /* - * Flush the new labels to disk. This ensures that all odd-label - * updates are committed to stable storage before the next - * transaction group begins. 
- */ - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); - for (vd = list_head(&spa->spa_dirty_list); vd != NULL; - vd = list_next(&spa->spa_dirty_list, vd)) { - zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); - } - (void) zio_wait(zio); - - return (0); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c deleted file mode 100644 index 73d1a83..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ /dev/null @@ -1,495 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/vdev_impl.h> -#include <sys/zio.h> -#include <sys/fs/zfs.h> - -/* - * Virtual device vector for mirroring. - */ - -typedef struct mirror_child { - vdev_t *mc_vd; - uint64_t mc_offset; - int mc_error; - short mc_tried; - short mc_skipped; -} mirror_child_t; - -typedef struct mirror_map { - int mm_children; - int mm_replacing; - int mm_preferred; - int mm_root; - mirror_child_t mm_child[1]; -} mirror_map_t; - -int vdev_mirror_shift = 21; - -static mirror_map_t * -vdev_mirror_map_alloc(zio_t *zio) -{ - mirror_map_t *mm = NULL; - mirror_child_t *mc; - vdev_t *vd = zio->io_vd; - int c, d; - - if (vd == NULL) { - dva_t *dva = zio->io_bp->blk_dva; - spa_t *spa = zio->io_spa; - - c = BP_GET_NDVAS(zio->io_bp); - - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); - mm->mm_children = c; - mm->mm_replacing = B_FALSE; - mm->mm_preferred = spa_get_random(c); - mm->mm_root = B_TRUE; - - /* - * Check the other, lower-index DVAs to see if they're on - * the same vdev as the child we picked. If they are, use - * them since they are likely to have been allocated from - * the primary metaslab in use at the time, and hence are - * more likely to have locality with single-copy data. - */ - for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { - if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) - mm->mm_preferred = d; - } - - for (c = 0; c < mm->mm_children; c++) { - mc = &mm->mm_child[c]; - - mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); - mc->mc_offset = DVA_GET_OFFSET(&dva[c]); - } - } else { - c = vd->vdev_children; - - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); - mm->mm_children = c; - mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops); - mm->mm_preferred = mm->mm_replacing ? 
0 : - (zio->io_offset >> vdev_mirror_shift) % c; - mm->mm_root = B_FALSE; - - for (c = 0; c < mm->mm_children; c++) { - mc = &mm->mm_child[c]; - mc->mc_vd = vd->vdev_child[c]; - mc->mc_offset = zio->io_offset; - } - } - - zio->io_vsd = mm; - return (mm); -} - -static void -vdev_mirror_map_free(zio_t *zio) -{ - mirror_map_t *mm = zio->io_vsd; - - kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); - zio->io_vsd = NULL; -} - -static int -vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) -{ - vdev_t *cvd; - uint64_t c; - int numerrors = 0; - int ret, lasterror = 0; - - if (vd->vdev_children == 0) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); - } - - for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; - - if ((ret = vdev_open(cvd)) != 0) { - lasterror = ret; - numerrors++; - continue; - } - - *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; - *ashift = MAX(*ashift, cvd->vdev_ashift); - } - - if (numerrors == vd->vdev_children) { - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; - return (lasterror); - } - - return (0); -} - -static void -vdev_mirror_close(vdev_t *vd) -{ - uint64_t c; - - for (c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); -} - -static void -vdev_mirror_child_done(zio_t *zio) -{ - mirror_child_t *mc = zio->io_private; - - mc->mc_error = zio->io_error; - mc->mc_tried = 1; - mc->mc_skipped = 0; -} - -static void -vdev_mirror_scrub_done(zio_t *zio) -{ - mirror_child_t *mc = zio->io_private; - - if (zio->io_error == 0) { - zio_t *pio = zio->io_parent; - mutex_enter(&pio->io_lock); - ASSERT3U(zio->io_size, >=, pio->io_size); - bcopy(zio->io_data, pio->io_data, pio->io_size); - mutex_exit(&pio->io_lock); - } - - zio_buf_free(zio->io_data, zio->io_size); - - mc->mc_error = zio->io_error; - mc->mc_tried = 1; - mc->mc_skipped = 0; -} - -static void -vdev_mirror_repair_done(zio_t *zio) -{ - ASSERT(zio->io_private == zio->io_parent); - vdev_mirror_map_free(zio->io_private); -} - -/* - * Try to find a child whose DTL doesn't contain the block we want to read. - * If we can't, try the read on any vdev we haven't already tried. - */ -static int -vdev_mirror_child_select(zio_t *zio) -{ - mirror_map_t *mm = zio->io_vsd; - mirror_child_t *mc; - uint64_t txg = zio->io_txg; - int i, c; - - ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg); - - /* - * Try to find a child whose DTL doesn't contain the block to read. - * If a child is known to be completely inaccessible (indicated by - * vdev_is_dead() returning B_TRUE), don't even try. - */ - for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { - if (c >= mm->mm_children) - c = 0; - mc = &mm->mm_child[c]; - if (mc->mc_tried || mc->mc_skipped) - continue; - if (vdev_is_dead(mc->mc_vd)) { - mc->mc_error = ENXIO; - mc->mc_tried = 1; /* don't even try */ - mc->mc_skipped = 1; - continue; - } - if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1)) - return (c); - mc->mc_error = ESTALE; - mc->mc_skipped = 1; - } - - /* - * Every device is either missing or has this txg in its DTL. - * Look for any child we haven't already tried before giving up. - */ - for (c = 0; c < mm->mm_children; c++) - if (!mm->mm_child[c].mc_tried) - return (c); - - /* - * Every child failed. There's no place left to look. 
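The read "rotor" chosen in vdev_mirror_map_alloc() above deserves a worked example. This standalone sketch assumes the default vdev_mirror_shift of 21, so every 2 MB region of the mirror has a preferred child; replacing and spare vdevs always use child 0.

#include <stdio.h>
#include <stdint.h>

static int
sketch_preferred_child(uint64_t io_offset, int children, int replacing)
{
        const int vdev_mirror_shift = 21;       /* 2 MB regions */

        if (replacing)
                return (0);
        return ((int)((io_offset >> vdev_mirror_shift) % children));
}

int
main(void)
{
        uint64_t offsets[] = { 0, 1ULL << 21, 5ULL << 20, 7ULL << 21 };

        /* 0 MB -> child 0, 2 MB -> child 1, 5 MB -> child 0, 14 MB -> child 1 */
        for (int i = 0; i < 4; i++)
                printf("offset %llu -> preferred child %d\n",
                    (unsigned long long)offsets[i],
                    sketch_preferred_child(offsets[i], 2, 0));
        return (0);
}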
- */ - return (-1); -} - -static void -vdev_mirror_io_start(zio_t *zio) -{ - mirror_map_t *mm; - mirror_child_t *mc; - int c, children; - - mm = vdev_mirror_map_alloc(zio); - - if (zio->io_type == ZIO_TYPE_READ) { - if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { - /* - * For scrubbing reads we need to allocate a read - * buffer for each child and issue reads to all - * children. If any child succeeds, it will copy its - * data into zio->io_data in vdev_mirror_scrub_done. - */ - for (c = 0; c < mm->mm_children; c++) { - mc = &mm->mm_child[c]; - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, - zio_buf_alloc(zio->io_size), zio->io_size, - zio->io_type, zio->io_priority, - ZIO_FLAG_CANFAIL, - vdev_mirror_scrub_done, mc)); - } - zio_wait_children_done(zio); - return; - } - /* - * For normal reads just pick one child. - */ - c = vdev_mirror_child_select(zio); - children = (c >= 0); - } else { - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - - /* - * If this is a resilvering I/O to a replacing vdev, - * only the last child should be written -- unless the - * first child happens to have a DTL entry here as well. - * All other writes go to all children. - */ - if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing && - !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map, - zio->io_txg, 1)) { - c = mm->mm_children - 1; - children = 1; - } else { - c = 0; - children = mm->mm_children; - } - } - - while (children--) { - mc = &mm->mm_child[c]; - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, - zio->io_data, zio->io_size, zio->io_type, zio->io_priority, - ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc)); - c++; - } - - zio_wait_children_done(zio); -} - -static void -vdev_mirror_io_done(zio_t *zio) -{ - mirror_map_t *mm = zio->io_vsd; - mirror_child_t *mc; - int c; - int good_copies = 0; - int unexpected_errors = 0; - - zio->io_error = 0; - zio->io_numerrors = 0; - - for (c = 0; c < mm->mm_children; c++) { - mc = &mm->mm_child[c]; - - if (mc->mc_tried && mc->mc_error == 0) { - good_copies++; - continue; - } - - /* - * We preserve any EIOs because those may be worth retrying; - * whereas ECKSUM and ENXIO are more likely to be persistent. - */ - if (mc->mc_error) { - if (zio->io_error != EIO) - zio->io_error = mc->mc_error; - if (!mc->mc_skipped) - unexpected_errors++; - zio->io_numerrors++; - } - } - - if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * XXX -- for now, treat partial writes as success. - * XXX -- For a replacing vdev, we need to make sure the - * new child succeeds. - */ - /* XXPOLICY */ - if (good_copies != 0) - zio->io_error = 0; - vdev_mirror_map_free(zio); - zio_next_stage(zio); - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ); - - /* - * If we don't have a good copy yet, keep trying other children. 
- */ - /* XXPOLICY */ - if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { - ASSERT(c >= 0 && c < mm->mm_children); - mc = &mm->mm_child[c]; - dprintf("retrying i/o (err=%d) on child %s\n", - zio->io_error, vdev_description(mc->mc_vd)); - zio->io_error = 0; - zio_vdev_io_redone(zio); - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, - ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL, - vdev_mirror_child_done, mc)); - zio_wait_children_done(zio); - return; - } - - /* XXPOLICY */ - if (good_copies) - zio->io_error = 0; - else - ASSERT(zio->io_error != 0); - - if (good_copies && (spa_mode & FWRITE) && - (unexpected_errors || - (zio->io_flags & ZIO_FLAG_RESILVER) || - ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { - zio_t *rio; - - /* - * Use the good data we have in hand to repair damaged children. - * - * We issue all repair I/Os as children of 'rio' to arrange - * that vdev_mirror_map_free(zio) will be invoked after all - * repairs complete, but before we advance to the next stage. - */ - rio = zio_null(zio, zio->io_spa, - vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL); - - for (c = 0; c < mm->mm_children; c++) { - /* - * Don't rewrite known good children. - * Not only is it unnecessary, it could - * actually be harmful: if the system lost - * power while rewriting the only good copy, - * there would be no good copies left! - */ - mc = &mm->mm_child[c]; - - if (mc->mc_error == 0) { - if (mc->mc_tried) - continue; - if (!(zio->io_flags & ZIO_FLAG_SCRUB) && - !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, - zio->io_txg, 1)) - continue; - mc->mc_error = ESTALE; - } - - dprintf("resilvered %s @ 0x%llx error %d\n", - vdev_description(mc->mc_vd), mc->mc_offset, - mc->mc_error); - - zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd, - mc->mc_offset, zio->io_data, zio->io_size, - ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE, NULL, NULL)); - } - - zio_nowait(rio); - zio_wait_children_done(zio); - return; - } - - vdev_mirror_map_free(zio); - zio_next_stage(zio); -} - -static void -vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) -{ - if (faulted == vd->vdev_children) - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_NO_REPLICAS); - else if (degraded + faulted != 0) - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - else - vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); -} - -vdev_ops_t vdev_mirror_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - VDEV_TYPE_MIRROR, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; - -vdev_ops_t vdev_replacing_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - VDEV_TYPE_REPLACING, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; - -vdev_ops_t vdev_spare_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - VDEV_TYPE_SPARE, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c deleted file mode 100644 index b35f4a5..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c +++ /dev/null @@ -1,89 +0,0 
@@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * The 'missing' vdev is a special vdev type used only during import. It - * signifies a placeholder in the root vdev for some vdev that we know is - * missing. We pass it down to the kernel to allow the rest of the - * configuration to parsed and an attempt made to open all available devices. - * Because its GUID is always 0, we know that the guid sum will mismatch and we - * won't be able to open the pool anyway. - */ - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/vdev_impl.h> -#include <sys/fs/zfs.h> -#include <sys/zio.h> - -/* ARGSUSED */ -static int -vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) -{ - /* - * Really this should just fail. But then the root vdev will be in the - * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is - * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we - * will fail the GUID sum check before ever trying to open the pool. - */ - *psize = SPA_MINDEVSIZE; - *ashift = SPA_MINBLOCKSHIFT; - return (0); -} - -/* ARGSUSED */ -static void -vdev_missing_close(vdev_t *vd) -{ -} - -/* ARGSUSED */ -static void -vdev_missing_io_start(zio_t *zio) -{ - zio->io_error = ENOTSUP; - zio_next_stage_async(zio); -} - -/* ARGSUSED */ -static void -vdev_missing_io_done(zio_t *zio) -{ - zio_next_stage(zio); -} - -vdev_ops_t vdev_missing_ops = { - vdev_missing_open, - vdev_missing_close, - vdev_default_asize, - vdev_missing_io_start, - vdev_missing_io_done, - NULL, - VDEV_TYPE_MISSING, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c deleted file mode 100644 index 8ef524f..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ /dev/null @@ -1,323 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/vdev_impl.h> -#include <sys/zio.h> -#include <sys/avl.h> - -/* - * These tunables are for performance analysis. - */ -/* - * zfs_vdev_max_pending is the maximum number of i/os concurrently - * pending to each device. zfs_vdev_min_pending is the initial number - * of i/os pending to each device (before it starts ramping up to - * max_pending). - */ -int zfs_vdev_max_pending = 35; -int zfs_vdev_min_pending = 4; - -/* deadline = pri + (LBOLT >> time_shift) */ -int zfs_vdev_time_shift = 6; - -/* exponential I/O issue ramp-up rate */ -int zfs_vdev_ramp_rate = 2; - -/* - * i/os will be aggregated into a single large i/o up to - * zfs_vdev_aggregation_limit bytes long. - */ -int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; - -/* - * Virtual device vector for disk I/O scheduling. - */ -int -vdev_queue_deadline_compare(const void *x1, const void *x2) -{ - const zio_t *z1 = x1; - const zio_t *z2 = x2; - - if (z1->io_deadline < z2->io_deadline) - return (-1); - if (z1->io_deadline > z2->io_deadline) - return (1); - - if (z1->io_offset < z2->io_offset) - return (-1); - if (z1->io_offset > z2->io_offset) - return (1); - - if (z1 < z2) - return (-1); - if (z1 > z2) - return (1); - - return (0); -} - -int -vdev_queue_offset_compare(const void *x1, const void *x2) -{ - const zio_t *z1 = x1; - const zio_t *z2 = x2; - - if (z1->io_offset < z2->io_offset) - return (-1); - if (z1->io_offset > z2->io_offset) - return (1); - - if (z1 < z2) - return (-1); - if (z1 > z2) - return (1); - - return (0); -} - -void -vdev_queue_init(vdev_t *vd) -{ - vdev_queue_t *vq = &vd->vdev_queue; - - mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); - - avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, - sizeof (zio_t), offsetof(struct zio, io_deadline_node)); - - avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_offset_node)); - - avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_offset_node)); - - avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_offset_node)); -} - -void -vdev_queue_fini(vdev_t *vd) -{ - vdev_queue_t *vq = &vd->vdev_queue; - - avl_destroy(&vq->vq_deadline_tree); - avl_destroy(&vq->vq_read_tree); - avl_destroy(&vq->vq_write_tree); - avl_destroy(&vq->vq_pending_tree); - - mutex_destroy(&vq->vq_lock); -} - -static void -vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) -{ - avl_add(&vq->vq_deadline_tree, zio); - avl_add(zio->io_vdev_tree, zio); -} - -static void -vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) -{ - avl_remove(&vq->vq_deadline_tree, zio); - avl_remove(zio->io_vdev_tree, zio); -} - -static void -vdev_queue_agg_io_done(zio_t *aio) -{ - zio_t *dio; - uint64_t offset = 0; - - while ((dio = aio->io_delegate_list) != NULL) { - if (aio->io_type == ZIO_TYPE_READ) - bcopy((char *)aio->io_data + offset, dio->io_data, - dio->io_size); - offset += dio->io_size; - aio->io_delegate_list = dio->io_delegate_next; - dio->io_delegate_next = NULL; - dio->io_error = aio->io_error; - zio_next_stage(dio); 
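The deadline formula described by the tunables above, deadline = pri + (LBOLT >> time_shift), is easiest to see with numbers. With the default time_shift of 6, a difference of 64 clock ticks in age cancels out one unit of priority, so older low-priority requests are not starved by a stream of newer high-priority ones. A small standalone sketch with made-up tick values (smaller deadline issues first):

#include <stdio.h>
#include <stdint.h>

static uint64_t
sketch_deadline(uint64_t timestamp, uint64_t priority)
{
        const int zfs_vdev_time_shift = 6;

        return ((timestamp >> zfs_vdev_time_shift) + priority);
}

int
main(void)
{
        /* A priority-4 I/O queued at tick 1000 ... */
        uint64_t old_low = sketch_deadline(1000, 4);
        /* ... versus a priority-0 I/O queued 512 ticks later. */
        uint64_t new_high = sketch_deadline(1512, 0);

        /* 1000 >> 6 is 15, so 15 + 4 = 19; 1512 >> 6 is 23: the older I/O wins. */
        printf("old deadline %llu, new deadline %llu\n",
            (unsigned long long)old_low, (unsigned long long)new_high);
        return (0);
}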
- } - ASSERT3U(offset, ==, aio->io_size); - - zio_buf_free(aio->io_data, aio->io_size); -} - -#define IS_ADJACENT(io, nio) \ - ((io)->io_offset + (io)->io_size == (nio)->io_offset) - -typedef void zio_issue_func_t(zio_t *); - -static zio_t * -vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, - zio_issue_func_t **funcp) -{ - zio_t *fio, *lio, *aio, *dio; - avl_tree_t *tree; - uint64_t size; - - ASSERT(MUTEX_HELD(&vq->vq_lock)); - - *funcp = NULL; - - if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || - avl_numnodes(&vq->vq_deadline_tree) == 0) - return (NULL); - - fio = lio = avl_first(&vq->vq_deadline_tree); - - tree = fio->io_vdev_tree; - size = fio->io_size; - - while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && - size + dio->io_size <= zfs_vdev_aggregation_limit) { - dio->io_delegate_next = fio; - fio = dio; - size += dio->io_size; - } - - while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && - size + dio->io_size <= zfs_vdev_aggregation_limit) { - lio->io_delegate_next = dio; - lio = dio; - size += dio->io_size; - } - - if (fio != lio) { - char *buf = zio_buf_alloc(size); - uint64_t offset = 0; - int nagg = 0; - - ASSERT(size <= zfs_vdev_aggregation_limit); - - aio = zio_vdev_child_io(fio, NULL, fio->io_vd, - fio->io_offset, buf, size, fio->io_type, - ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_NOBOOKMARK, - vdev_queue_agg_io_done, NULL); - - aio->io_delegate_list = fio; - - for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { - ASSERT(dio->io_type == aio->io_type); - ASSERT(dio->io_vdev_tree == tree); - if (dio->io_type == ZIO_TYPE_WRITE) - bcopy(dio->io_data, buf + offset, dio->io_size); - offset += dio->io_size; - vdev_queue_io_remove(vq, dio); - zio_vdev_io_bypass(dio); - nagg++; - } - - ASSERT(offset == size); - - dprintf("%5s T=%llu off=%8llx agg=%3d " - "old=%5llx new=%5llx\n", - zio_type_name[fio->io_type], - fio->io_deadline, fio->io_offset, nagg, fio->io_size, size); - - avl_add(&vq->vq_pending_tree, aio); - - *funcp = zio_nowait; - return (aio); - } - - ASSERT(fio->io_vdev_tree == tree); - vdev_queue_io_remove(vq, fio); - - avl_add(&vq->vq_pending_tree, fio); - - *funcp = zio_next_stage; - - return (fio); -} - -zio_t * -vdev_queue_io(zio_t *zio) -{ - vdev_queue_t *vq = &zio->io_vd->vdev_queue; - zio_t *nio; - zio_issue_func_t *func; - - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); - - if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) - return (zio); - - zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; - - if (zio->io_type == ZIO_TYPE_READ) - zio->io_vdev_tree = &vq->vq_read_tree; - else - zio->io_vdev_tree = &vq->vq_write_tree; - - mutex_enter(&vq->vq_lock); - - zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) + - zio->io_priority; - - vdev_queue_io_add(vq, zio); - - nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func); - - mutex_exit(&vq->vq_lock); - - if (nio == NULL || func != zio_nowait) - return (nio); - - func(nio); - return (NULL); -} - -void -vdev_queue_io_done(zio_t *zio) -{ - vdev_queue_t *vq = &zio->io_vd->vdev_queue; - zio_t *nio; - zio_issue_func_t *func; - int i; - - mutex_enter(&vq->vq_lock); - - avl_remove(&vq->vq_pending_tree, zio); - - for (i = 0; i < zfs_vdev_ramp_rate; i++) { - nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func); - if (nio == NULL) - break; - mutex_exit(&vq->vq_lock); - if (func == zio_next_stage) - zio_vdev_io_reissue(nio); - func(nio); - 
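/*
 * A compact sketch (illustrative only, not from the original source) of the
 * aggregation policy used by vdev_queue_io_to_issue() above: starting from
 * the oldest request, pull in physically adjacent neighbors in both
 * directions until the combined size would exceed the aggregation limit.
 * The sorted extent array stands in for the offset-sorted AVL tree.
 */
#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t offset, size; };

/* Grow [first, last] around index i while neighbors remain contiguous. */
static uint64_t
aggregate(const struct extent *e, int n, int i, uint64_t limit,
    int *first, int *last)
{
	uint64_t size = e[i].size;

	*first = *last = i;
	while (*first > 0 &&
	    e[*first - 1].offset + e[*first - 1].size == e[*first].offset &&
	    size + e[*first - 1].size <= limit)
		size += e[--*first].size;
	while (*last < n - 1 &&
	    e[*last].offset + e[*last].size == e[*last + 1].offset &&
	    size + e[*last + 1].size <= limit)
		size += e[++*last].size;
	return (size);
}

int
main(void)
{
	struct extent e[] = {
		{ 0x0000, 0x2000 }, { 0x2000, 0x2000 },
		{ 0x4000, 0x2000 }, { 0x9000, 0x1000 },	/* gap: not adjacent */
	};
	int first, last;
	uint64_t size = aggregate(e, 4, 1, 0x20000, &first, &last);

	/* Expect extents 0..2 merged into one 0x6000-byte I/O. */
	printf("[%d..%d] size=0x%llx\n", first, last,
	    (unsigned long long)size);
	return (0);
}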
mutex_enter(&vq->vq_lock); - } - - mutex_exit(&vq->vq_lock); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c deleted file mode 100644 index 0c86630..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ /dev/null @@ -1,1237 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/vdev_impl.h> -#include <sys/zio.h> -#include <sys/zio_checksum.h> -#include <sys/fs/zfs.h> -#include <sys/fm/fs/zfs.h> - -/* - * Virtual device vector for RAID-Z. - * - * This vdev supports both single and double parity. For single parity, we - * use a simple XOR of all the data columns. For double parity, we use both - * the simple XOR as well as a technique described in "The mathematics of - * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), - * over the integers expressable in a single byte. Briefly, the operations on - * the field are defined as follows: - * - * o addition (+) is represented by a bitwise XOR - * o subtraction (-) is therefore identical to addition: A + B = A - B - * o multiplication of A by 2 is defined by the following bitwise expression: - * (A * 2)_7 = A_6 - * (A * 2)_6 = A_5 - * (A * 2)_5 = A_4 - * (A * 2)_4 = A_3 + A_7 - * (A * 2)_3 = A_2 + A_7 - * (A * 2)_2 = A_1 + A_7 - * (A * 2)_1 = A_0 - * (A * 2)_0 = A_7 - * - * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). - * - * Observe that any number in the field (except for 0) can be expressed as a - * power of 2 -- a generator for the field. We store a table of the powers of - * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can - * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather - * than field addition). The inverse of a field element A (A^-1) is A^254. - * - * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, - * can be expressed by field operations: - * - * P = D_0 + D_1 + ... + D_n-2 + D_n-1 - * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 - * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 - * - * See the reconstruction code below for how P and Q can used individually or - * in concert to recover missing data columns. 
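/*
 * A small, self-contained illustration (not from the original file) of the
 * P and Q formulas above, applied byte by byte: '+' is XOR and '* 2' is the
 * GF(2^8) doubling expression given in the comment (GF_MUL_2 here mirrors
 * VDEV_RAIDZ_MUL_2 below); Q is evaluated with the nested (Horner) form.
 */
#include <stdint.h>
#include <stdio.h>

#define	GF_MUL_2(a)	((uint8_t)(((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)))

int
main(void)
{
	uint8_t d[] = { 0x11, 0xa5, 0x3c, 0xf0 };	/* D_0 .. D_3 */
	int n = sizeof (d), i;
	uint8_t p = 0, q = 0;

	for (i = 0; i < n; i++) {
		p ^= d[i];		/* P = D_0 + D_1 + ... + D_n-1 */
		q = GF_MUL_2(q) ^ d[i];	/* Q = (...((D_0)*2 + D_1)*2 ...)*2 + D_n-1 */
	}
	printf("P=%02x Q=%02x\n", p, q);
	return (0);
}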
- */ - -typedef struct raidz_col { - uint64_t rc_devidx; /* child device index for I/O */ - uint64_t rc_offset; /* device offset */ - uint64_t rc_size; /* I/O size */ - void *rc_data; /* I/O data */ - int rc_error; /* I/O error for this device */ - uint8_t rc_tried; /* Did we attempt this I/O column? */ - uint8_t rc_skipped; /* Did we skip this I/O column? */ -} raidz_col_t; - -typedef struct raidz_map { - uint64_t rm_cols; /* Column count */ - uint64_t rm_bigcols; /* Number of oversized columns */ - uint64_t rm_asize; /* Actual total I/O size */ - uint64_t rm_missingdata; /* Count of missing data devices */ - uint64_t rm_missingparity; /* Count of missing parity devices */ - uint64_t rm_firstdatacol; /* First data column/parity count */ - raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ -} raidz_map_t; - -#define VDEV_RAIDZ_P 0 -#define VDEV_RAIDZ_Q 1 - -#define VDEV_RAIDZ_MAXPARITY 2 - -#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) - -/* - * These two tables represent powers and logs of 2 in the Galois field defined - * above. These values were computed by repeatedly multiplying by 2 as above. - */ -static const uint8_t vdev_raidz_pow2[256] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, - 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, - 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, - 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, - 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, - 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, - 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, - 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, - 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, - 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, - 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, - 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, - 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, - 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, - 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, - 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, - 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, - 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, - 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, - 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, - 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, - 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, - 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, - 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, - 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, - 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, - 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, - 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, - 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, - 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, - 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 -}; -static const uint8_t vdev_raidz_log2[256] = { - 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, - 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, - 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, - 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, - 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, - 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, - 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, - 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, - 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, - 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, - 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, - 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, - 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, - 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, - 
0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, - 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, - 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, - 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, - 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, - 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, - 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, - 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, - 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, - 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, - 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, - 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, - 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, - 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, - 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, - 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, - 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, - 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, -}; - -/* - * Multiply a given number by 2 raised to the given power. - */ -static uint8_t -vdev_raidz_exp2(uint_t a, int exp) -{ - if (a == 0) - return (0); - - ASSERT(exp >= 0); - ASSERT(vdev_raidz_log2[a] > 0 || a == 1); - - exp += vdev_raidz_log2[a]; - if (exp > 255) - exp -= 255; - - return (vdev_raidz_pow2[exp]); -} - -static raidz_map_t * -vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, - uint64_t nparity) -{ - raidz_map_t *rm; - uint64_t b = zio->io_offset >> unit_shift; - uint64_t s = zio->io_size >> unit_shift; - uint64_t f = b % dcols; - uint64_t o = (b / dcols) << unit_shift; - uint64_t q, r, c, bc, col, acols, coff, devidx; - - q = s / (dcols - nparity); - r = s - q * (dcols - nparity); - bc = (r == 0 ? 0 : r + nparity); - - acols = (q == 0 ? bc : dcols); - - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); - - rm->rm_cols = acols; - rm->rm_bigcols = bc; - rm->rm_asize = 0; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - rm->rm_firstdatacol = nparity; - - for (c = 0; c < acols; c++) { - col = f + c; - coff = o; - if (col >= dcols) { - col -= dcols; - coff += 1ULL << unit_shift; - } - rm->rm_col[c].rc_devidx = col; - rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; - rm->rm_col[c].rc_data = NULL; - rm->rm_col[c].rc_error = 0; - rm->rm_col[c].rc_tried = 0; - rm->rm_col[c].rc_skipped = 0; - rm->rm_asize += rm->rm_col[c].rc_size; - } - - rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); - - for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); - - rm->rm_col[c].rc_data = zio->io_data; - - for (c = c + 1; c < acols; c++) - rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + - rm->rm_col[c - 1].rc_size; - - /* - * If all data stored spans all columns, there's a danger that parity - * will always be on the same device and, since parity isn't read - * during normal operation, that that device's I/O bandwidth won't be - * used effectively. We therefore switch the parity every 1MB. - * - * ... at least that was, ostensibly, the theory. As a practical - * matter unless we juggle the parity between all devices evenly, we - * won't see any benefit. Further, occasional writes that aren't a - * multiple of the LCM of the number of children and the minimum - * stripe width are sufficient to avoid pessimal behavior. - * Unfortunately, this decision created an implicit on-disk format - * requirement that we need to support for all eternity, but only - * for single-parity RAID-Z. 
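/*
 * A worked example (illustrative, not part of the original source) of the
 * geometry computed by vdev_raidz_map_alloc() above: a 128K write to offset
 * 1M on a 5-child raidz1 vdev with 512-byte sectors (unit_shift == 9).
 * Each of the five columns ends up (q + (c < bc)) sectors long: here 64
 * sectors (32K) apiece, four of data plus one of parity.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t unit_shift = 9, dcols = 5, nparity = 1;
	uint64_t io_offset = 1 << 20, io_size = 128 << 10;

	uint64_t b = io_offset >> unit_shift;	/* starting sector: 2048 */
	uint64_t s = io_size >> unit_shift;	/* sectors of data: 256 */
	uint64_t f = b % dcols;			/* first column: 3 */
	uint64_t o = (b / dcols) << unit_shift;	/* per-child byte offset */
	uint64_t q = s / (dcols - nparity);	/* full rows: 64 */
	uint64_t r = s - q * (dcols - nparity);	/* leftover sectors: 0 */
	uint64_t bc = (r == 0 ? 0 : r + nparity);
	uint64_t acols = (q == 0 ? bc : dcols);	/* columns used: all 5 */

	printf("b=%llu s=%llu f=%llu o=%llu q=%llu r=%llu bc=%llu acols=%llu\n",
	    (unsigned long long)b, (unsigned long long)s,
	    (unsigned long long)f, (unsigned long long)o,
	    (unsigned long long)q, (unsigned long long)r,
	    (unsigned long long)bc, (unsigned long long)acols);
	return (0);
}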
- */ - ASSERT(rm->rm_cols >= 2); - ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); - - if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { - devidx = rm->rm_col[0].rc_devidx; - o = rm->rm_col[0].rc_offset; - rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; - rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; - rm->rm_col[1].rc_devidx = devidx; - rm->rm_col[1].rc_offset = o; - } - - zio->io_vsd = rm; - return (rm); -} - -static void -vdev_raidz_map_free(zio_t *zio) -{ - raidz_map_t *rm = zio->io_vsd; - int c; - - for (c = 0; c < rm->rm_firstdatacol; c++) - zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); - - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); - zio->io_vsd = NULL; -} - -static void -vdev_raidz_generate_parity_p(raidz_map_t *rm) -{ - uint64_t *p, *src, pcount, ccount, i; - int c; - - pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); - - if (c == rm->rm_firstdatacol) { - ASSERT(ccount == pcount); - for (i = 0; i < ccount; i++, p++, src++) { - *p = *src; - } - } else { - ASSERT(ccount <= pcount); - for (i = 0; i < ccount; i++, p++, src++) { - *p ^= *src; - } - } - } -} - -static void -vdev_raidz_generate_parity_pq(raidz_map_t *rm) -{ - uint64_t *q, *p, *src, pcount, ccount, mask, i; - int c; - - pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); - - if (c == rm->rm_firstdatacol) { - ASSERT(ccount == pcount || ccount == 0); - for (i = 0; i < ccount; i++, p++, q++, src++) { - *q = *src; - *p = *src; - } - for (; i < pcount; i++, p++, q++, src++) { - *q = 0; - *p = 0; - } - } else { - ASSERT(ccount <= pcount); - - /* - * Rather than multiplying each byte individually (as - * described above), we are able to handle 8 at once - * by generating a mask based on the high bit in each - * byte and using that to conditionally XOR in 0x1d. - */ - for (i = 0; i < ccount; i++, p++, q++, src++) { - mask = *q & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); - *q ^= *src; - *p ^= *src; - } - - /* - * Treat short columns as though they are full of 0s. 
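/*
 * A standalone check (not from the original file) that the 64-bit "mask"
 * trick used above really does apply the GF(2^8) multiply-by-2 to all
 * eight bytes of a word at once; GF_MUL_2 mirrors VDEV_RAIDZ_MUL_2.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	GF_MUL_2(a)	((uint8_t)(((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)))

static uint64_t
gf_mul_2_x8(uint64_t q)
{
	uint64_t mask = q & 0x8080808080808080ULL;

	mask = (mask << 1) - (mask >> 7);
	return (((q << 1) & 0xfefefefefefefefeULL) ^
	    (mask & 0x1d1d1d1d1d1d1d1dULL));
}

int
main(void)
{
	uint64_t q = 0x0102407f80a5ffe8ULL;
	uint64_t wide = gf_mul_2_x8(q);
	int i;

	for (i = 0; i < 8; i++) {
		uint8_t byte = (q >> (i * 8)) & 0xff;
		assert(((wide >> (i * 8)) & 0xff) == GF_MUL_2(byte));
	}
	printf("per-byte MUL_2 and the wide form agree\n");
	return (0);
}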
- */ - for (; i < pcount; i++, q++) { - mask = *q & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); - } - } - } -} - -static void -vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) -{ - uint64_t *dst, *src, xcount, ccount, count, i; - int c; - - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); - ASSERT(xcount > 0); - - src = rm->rm_col[VDEV_RAIDZ_P].rc_data; - dst = rm->rm_col[x].rc_data; - for (i = 0; i < xcount; i++, dst++, src++) { - *dst = *src; - } - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - dst = rm->rm_col[x].rc_data; - - if (c == x) - continue; - - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); - count = MIN(ccount, xcount); - - for (i = 0; i < count; i++, dst++, src++) { - *dst ^= *src; - } - } -} - -static void -vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) -{ - uint64_t *dst, *src, xcount, ccount, count, mask, i; - uint8_t *b; - int c, j, exp; - - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - dst = rm->rm_col[x].rc_data; - - if (c == x) - ccount = 0; - else - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); - - count = MIN(ccount, xcount); - - if (c == rm->rm_firstdatacol) { - for (i = 0; i < count; i++, dst++, src++) { - *dst = *src; - } - for (; i < xcount; i++, dst++) { - *dst = 0; - } - - } else { - /* - * For an explanation of this, see the comment in - * vdev_raidz_generate_parity_pq() above. - */ - for (i = 0; i < count; i++, dst++, src++) { - mask = *dst & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); - *dst ^= *src; - } - - for (; i < xcount; i++, dst++) { - mask = *dst & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); - } - } - } - - src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - dst = rm->rm_col[x].rc_data; - exp = 255 - (rm->rm_cols - 1 - x); - - for (i = 0; i < xcount; i++, dst++, src++) { - *dst ^= *src; - for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { - *b = vdev_raidz_exp2(*b, exp); - } - } -} - -static void -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) -{ - uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; - void *pdata, *qdata; - uint64_t xsize, ysize, i; - - ASSERT(x < y); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(y < rm->rm_cols); - - ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); - - /* - * Move the parity data aside -- we're going to compute parity as - * though columns x and y were full of zeros -- Pxy and Qxy. We want to - * reuse the parity generation mechanism without trashing the actual - * parity so we make those columns appear to be full of zeros by - * setting their lengths to zero. 
- */ - pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; - qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - xsize = rm->rm_col[x].rc_size; - ysize = rm->rm_col[y].rc_size; - - rm->rm_col[VDEV_RAIDZ_P].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); - rm->rm_col[VDEV_RAIDZ_Q].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); - rm->rm_col[x].rc_size = 0; - rm->rm_col[y].rc_size = 0; - - vdev_raidz_generate_parity_pq(rm); - - rm->rm_col[x].rc_size = xsize; - rm->rm_col[y].rc_size = ysize; - - p = pdata; - q = qdata; - pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; - qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - xd = rm->rm_col[x].rc_data; - yd = rm->rm_col[y].rc_data; - - /* - * We now have: - * Pxy = P + D_x + D_y - * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y - * - * We can then solve for D_x: - * D_x = A * (P + Pxy) + B * (Q + Qxy) - * where - * A = 2^(x - y) * (2^(x - y) + 1)^-1 - * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 - * - * With D_x in hand, we can easily solve for D_y: - * D_y = P + Pxy + D_x - */ - - a = vdev_raidz_pow2[255 + x - y]; - b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; - tmp = 255 - vdev_raidz_log2[a ^ 1]; - - aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; - bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; - - for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { - *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ - vdev_raidz_exp2(*q ^ *qxy, bexp); - - if (i < ysize) - *yd = *p ^ *pxy ^ *xd; - } - - zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, - rm->rm_col[VDEV_RAIDZ_P].rc_size); - zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, - rm->rm_col[VDEV_RAIDZ_Q].rc_size); - - /* - * Restore the saved parity data. - */ - rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; - rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; -} - - -static int -vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) -{ - vdev_t *cvd; - uint64_t nparity = vd->vdev_nparity; - int c, error; - int lasterror = 0; - int numerrors = 0; - - ASSERT(nparity > 0); - - if (nparity > VDEV_RAIDZ_MAXPARITY || - vd->vdev_children < nparity + 1) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); - } - - for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; - - if ((error = vdev_open(cvd)) != 0) { - lasterror = error; - numerrors++; - continue; - } - - *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; - *ashift = MAX(*ashift, cvd->vdev_ashift); - } - - *asize *= vd->vdev_children; - - if (numerrors > nparity) { - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; - return (lasterror); - } - - return (0); -} - -static void -vdev_raidz_close(vdev_t *vd) -{ - int c; - - for (c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); -} - -static uint64_t -vdev_raidz_asize(vdev_t *vd, uint64_t psize) -{ - uint64_t asize; - uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; - - asize = ((psize - 1) >> ashift) + 1; - asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); - asize = roundup(asize, nparity + 1) << ashift; - - return (asize); -} - -static void -vdev_raidz_child_done(zio_t *zio) -{ - raidz_col_t *rc = zio->io_private; - - rc->rc_error = zio->io_error; - rc->rc_tried = 1; - rc->rc_skipped = 0; -} - -static void -vdev_raidz_repair_done(zio_t *zio) -{ - ASSERT(zio->io_private == zio->io_parent); - vdev_raidz_map_free(zio->io_private); -} - -static void -vdev_raidz_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd->vdev_top; - vdev_t *cvd; - blkptr_t *bp 
= zio->io_bp; - raidz_map_t *rm; - raidz_col_t *rc; - int c; - - rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, - vd->vdev_nparity); - - ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); - - if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * Generate RAID parity in the first virtual columns. - */ - if (rm->rm_firstdatacol == 1) - vdev_raidz_generate_parity_p(rm); - else - vdev_raidz_generate_parity_pq(rm); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, - zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, - vdev_raidz_child_done, rc)); - } - zio_wait_children_done(zio); - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ); - - /* - * Iterate over the columns in reverse order so that we hit the parity - * last -- any errors along the way will force us to read the parity - * data. - */ - for (c = rm->rm_cols - 1; c >= 0; c--) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - if (vdev_is_dead(cvd)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; - else - rm->rm_missingparity++; - rc->rc_error = ENXIO; - rc->rc_tried = 1; /* don't even try */ - rc->rc_skipped = 1; - continue; - } - if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; - else - rm->rm_missingparity++; - rc->rc_error = ESTALE; - rc->rc_skipped = 1; - continue; - } - if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || - (zio->io_flags & ZIO_FLAG_SCRUB)) { - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, - zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, - vdev_raidz_child_done, rc)); - } - } - - zio_wait_children_done(zio); -} - -/* - * Report a checksum error for a child of a RAID-Z device. - */ -static void -raidz_checksum_error(zio_t *zio, raidz_col_t *rc) -{ - vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; - dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", - vdev_description(vd)); - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - } - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); -} - -/* - * Generate the parity from the data columns. If we tried and were able to - * read the parity without error, verify that the generated parity matches the - * data we read. If it doesn't, we fire off a checksum error. Return the - * number such failures. 
- */ -static int -raidz_parity_verify(zio_t *zio, raidz_map_t *rm) -{ - void *orig[VDEV_RAIDZ_MAXPARITY]; - int c, ret = 0; - raidz_col_t *rc; - - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; - if (!rc->rc_tried || rc->rc_error != 0) - continue; - orig[c] = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig[c], rc->rc_size); - } - - if (rm->rm_firstdatacol == 1) - vdev_raidz_generate_parity_p(rm); - else - vdev_raidz_generate_parity_pq(rm); - - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; - if (!rc->rc_tried || rc->rc_error != 0) - continue; - if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { - raidz_checksum_error(zio, rc); - rc->rc_error = ECKSUM; - ret++; - } - zio_buf_free(orig[c], rc->rc_size); - } - - return (ret); -} - -static uint64_t raidz_corrected_p; -static uint64_t raidz_corrected_q; -static uint64_t raidz_corrected_pq; - -static void -vdev_raidz_io_done(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_t *cvd; - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc, *rc1; - int unexpected_errors = 0; - int parity_errors = 0; - int parity_untried = 0; - int data_errors = 0; - int n, c, c1; - - ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ - - zio->io_error = 0; - zio->io_numerrors = 0; - - ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); - ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - - /* - * We preserve any EIOs because those may be worth retrying; - * whereas ECKSUM and ENXIO are more likely to be persistent. - */ - if (rc->rc_error) { - if (zio->io_error != EIO) - zio->io_error = rc->rc_error; - - if (c < rm->rm_firstdatacol) - parity_errors++; - else - data_errors++; - - if (!rc->rc_skipped) - unexpected_errors++; - - zio->io_numerrors++; - } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { - parity_untried++; - } - } - - if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * If this is not a failfast write, and we were able to - * write enough columns to reconstruct the data, good enough. - */ - /* XXPOLICY */ - if (zio->io_numerrors <= rm->rm_firstdatacol && - !(zio->io_flags & ZIO_FLAG_FAILFAST)) - zio->io_error = 0; - - vdev_raidz_map_free(zio); - zio_next_stage(zio); - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ); - /* - * There are three potential phases for a read: - * 1. produce valid data from the columns read - * 2. read all disks and try again - * 3. perform combinatorial reconstruction - * - * Each phase is progressively both more expensive and less likely to - * occur. If we encounter more errors than we can repair or all phases - * fail, we have no choice but to return an error. - */ - - /* - * If the number of errors we saw was correctable -- less than or equal - * to the number of parity disks read -- attempt to produce data that - * has a valid checksum. Naturally, this case applies in the absence of - * any errors. - */ - if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) { - switch (data_errors) { - case 0: - if (zio_checksum_error(zio) == 0) { - zio->io_error = 0; - - /* - * If we read parity information (unnecessarily - * as it happens since no reconstruction was - * needed) regenerate and verify the parity. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. 
- */ - if (parity_errors + parity_untried < - rm->rm_firstdatacol || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - goto done; - } - break; - - case 1: - /* - * We either attempt to read all the parity columns or - * none of them. If we didn't try to read parity, we - * wouldn't be here in the correctable case. There must - * also have been fewer parity errors than parity - * columns or, again, we wouldn't be in this code path. - */ - ASSERT(parity_untried == 0); - ASSERT(parity_errors < rm->rm_firstdatacol); - - /* - * Find the column that reported the error. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; - } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { - vdev_raidz_reconstruct_p(rm, c); - } else { - ASSERT(rm->rm_firstdatacol > 1); - vdev_raidz_reconstruct_q(rm, c); - } - - if (zio_checksum_error(zio) == 0) { - zio->io_error = 0; - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) - atomic_inc_64(&raidz_corrected_p); - else - atomic_inc_64(&raidz_corrected_q); - - /* - * If there's more than one parity disk that - * was successfully read, confirm that the - * other parity disk produced the correct data. - * This routine is suboptimal in that it - * regenerates both the parity we wish to test - * as well as the parity we just used to - * perform the reconstruction, but this should - * be a relatively uncommon case, and can be - * optimized if it becomes a problem. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. - */ - if (parity_errors < rm->rm_firstdatacol - 1 || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - - goto done; - } - break; - - case 2: - /* - * Two data column errors require double parity. - */ - ASSERT(rm->rm_firstdatacol == 2); - - /* - * Find the two columns that reported errors. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; - } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - - for (c1 = c++; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; - } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - - vdev_raidz_reconstruct_pq(rm, c1, c); - - if (zio_checksum_error(zio) == 0) { - zio->io_error = 0; - atomic_inc_64(&raidz_corrected_pq); - - goto done; - } - break; - - default: - ASSERT(rm->rm_firstdatacol <= 2); - ASSERT(0); - } - } - - /* - * This isn't a typical situation -- either we got a read error or - * a child silently returned bad data. Read every block so we can - * try again with as much data and parity as we can track down. If - * we've already been through once before, all children will be marked - * as tried so we'll proceed to combinatorial reconstruction. 
- */ - unexpected_errors = 1; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - - for (c = 0; c < rm->rm_cols; c++) { - if (rm->rm_col[c].rc_tried) - continue; - - zio->io_error = 0; - zio_vdev_io_redone(zio); - do { - rc = &rm->rm_col[c]; - if (rc->rc_tried) - continue; - zio_nowait(zio_vdev_child_io(zio, NULL, - vd->vdev_child[rc->rc_devidx], - rc->rc_offset, rc->rc_data, rc->rc_size, - zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, - vdev_raidz_child_done, rc)); - } while (++c < rm->rm_cols); - dprintf("rereading\n"); - zio_wait_children_done(zio); - return; - } - - /* - * At this point we've attempted to reconstruct the data given the - * errors we detected, and we've attempted to read all columns. There - * must, therefore, be one or more additional problems -- silent errors - * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. Before we attempt combinatorial reconstruction make - * sure we have a chance of coming up with the right answer. - */ - if (zio->io_numerrors >= rm->rm_firstdatacol) { - ASSERT(zio->io_error != 0); - goto done; - } - - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { - /* - * Attempt to reconstruct the data from parity P. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - void *orig; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - vdev_raidz_reconstruct_p(rm, c); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - zio->io_error = 0; - atomic_inc_64(&raidz_corrected_p); - - /* - * If this child didn't know that it returned - * bad data, inform it. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - rc->rc_error = ECKSUM; - goto done; - } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - - if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { - /* - * Attempt to reconstruct the data from parity Q. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - void *orig; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - vdev_raidz_reconstruct_q(rm, c); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - zio->io_error = 0; - atomic_inc_64(&raidz_corrected_q); - - /* - * If this child didn't know that it returned - * bad data, inform it. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - rc->rc_error = ECKSUM; - goto done; - } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - - if (rm->rm_firstdatacol > 1 && - rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && - rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { - /* - * Attempt to reconstruct the data from both P and Q. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { - void *orig, *orig1; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - - for (c1 = c + 1; c1 < rm->rm_cols; c1++) { - rc1 = &rm->rm_col[c1]; - - orig1 = zio_buf_alloc(rc1->rc_size); - bcopy(rc1->rc_data, orig1, rc1->rc_size); - - vdev_raidz_reconstruct_pq(rm, c, c1); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - zio_buf_free(orig1, rc1->rc_size); - zio->io_error = 0; - atomic_inc_64(&raidz_corrected_pq); - - /* - * If these children didn't know they - * returned bad data, inform them. 
- */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - if (rc1->rc_tried && rc1->rc_error == 0) - raidz_checksum_error(zio, rc1); - - rc->rc_error = ECKSUM; - rc1->rc_error = ECKSUM; - - goto done; - } - - bcopy(orig1, rc1->rc_data, rc1->rc_size); - zio_buf_free(orig1, rc1->rc_size); - } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - - /* - * All combinations failed to checksum. Generate checksum ereports for - * all children. - */ - zio->io_error = ECKSUM; - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, - rc->rc_offset, rc->rc_size); - } - } - -done: - zio_checksum_verified(zio); - - if (zio->io_error == 0 && (spa_mode & FWRITE) && - (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { - zio_t *rio; - - /* - * Use the good data we have in hand to repair damaged children. - * - * We issue all repair I/Os as children of 'rio' to arrange - * that vdev_raidz_map_free(zio) will be invoked after all - * repairs complete, but before we advance to the next stage. - */ - rio = zio_null(zio, zio->io_spa, - vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - - if (rc->rc_error == 0) - continue; - - dprintf("%s resilvered %s @ 0x%llx error %d\n", - vdev_description(vd), - vdev_description(cvd), - zio->io_offset, rc->rc_error); - - zio_nowait(zio_vdev_child_io(rio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, - ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_CANFAIL, NULL, NULL)); - } - - zio_nowait(rio); - zio_wait_children_done(zio); - return; - } - - vdev_raidz_map_free(zio); - zio_next_stage(zio); -} - -static void -vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) -{ - if (faulted > vd->vdev_nparity) - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_NO_REPLICAS); - else if (degraded + faulted != 0) - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - else - vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); -} - -vdev_ops_t vdev_raidz_ops = { - vdev_raidz_open, - vdev_raidz_close, - vdev_raidz_asize, - vdev_raidz_io_start, - vdev_raidz_io_done, - vdev_raidz_state_change, - VDEV_TYPE_RAIDZ, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c deleted file mode 100644 index 0e8752c..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/vdev_impl.h> -#include <sys/zio.h> -#include <sys/fs/zfs.h> - -/* - * Virtual device vector for the pool's root vdev. - */ - -/* - * We should be able to tolerate one failure with absolutely no damage - * to our metadata. Two failures will take out space maps, a bunch of - * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy - * place to live. When we get smarter, we can liberalize this policy. - * e.g. If we haven't lost two consecutive top-level vdevs, then we are - * probably fine. Adding bean counters during alloc/free can make this - * future guesswork more accurate. - */ -/*ARGSUSED*/ -static int -too_many_errors(vdev_t *vd, int numerrors) -{ - return (numerrors > 0); -} - -static int -vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) -{ - vdev_t *cvd; - int c, error; - int lasterror = 0; - int numerrors = 0; - - if (vd->vdev_children == 0) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); - } - - for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; - - if ((error = vdev_open(cvd)) != 0) { - lasterror = error; - numerrors++; - continue; - } - } - - if (too_many_errors(vd, numerrors)) { - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; - return (lasterror); - } - - *asize = 0; - *ashift = 0; - - return (0); -} - -static void -vdev_root_close(vdev_t *vd) -{ - int c; - - for (c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); -} - -static void -vdev_root_state_change(vdev_t *vd, int faulted, int degraded) -{ - if (too_many_errors(vd, faulted)) - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_NO_REPLICAS); - else if (degraded != 0) - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - else - vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); -} - -vdev_ops_t vdev_root_ops = { - vdev_root_open, - vdev_root_close, - vdev_default_asize, - NULL, /* io_start - not applicable to the root */ - NULL, /* io_done - not applicable to the root */ - vdev_root_state_change, - VDEV_TYPE_ROOT, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c deleted file mode 100644 index 4246ec0..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ /dev/null @@ -1,1071 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - - -/* - * This file contains the top half of the zfs directory structure - * implementation. The bottom half is in zap_leaf.c. - * - * The zdir is an extendable hash data structure. There is a table of - * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are - * each a constant size and hold a variable number of directory entries. - * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. - * - * The pointer table holds a power of 2 number of pointers. - * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to - * by the pointer at index i in the table holds entries whose hash value - * has a zd_prefix_len - bit prefix - */ - -#include <sys/spa.h> -#include <sys/dmu.h> -#include <sys/zfs_context.h> -#include <sys/zap.h> -#include <sys/refcount.h> -#include <sys/zap_impl.h> -#include <sys/zap_leaf.h> -#include <sys/zfs_znode.h> - -int fzap_default_block_shift = 14; /* 16k blocksize */ - -static void zap_leaf_pageout(dmu_buf_t *db, void *vl); -static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); - - -void -fzap_byteswap(void *vbuf, size_t size) -{ - uint64_t block_type; - - block_type = *(uint64_t *)vbuf; - - if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) - zap_leaf_byteswap(vbuf, size); - else { - /* it's a ptrtbl block */ - byteswap_uint64_array(vbuf, size); - } -} - -void -fzap_upgrade(zap_t *zap, dmu_tx_t *tx) -{ - dmu_buf_t *db; - zap_leaf_t *l; - int i; - zap_phys_t *zp; - - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - zap->zap_ismicro = FALSE; - - (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, - &zap->zap_f.zap_phys, zap_evict); - - mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, MUTEX_DEFAULT, 0); - zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1; - - zp = zap->zap_f.zap_phys; - /* - * explicitly zero it since it might be coming from an - * initialized microzap - */ - bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); - zp->zap_block_type = ZBT_HEADER; - zp->zap_magic = ZAP_MAGIC; - - zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); - - zp->zap_freeblk = 2; /* block 1 will be the first leaf */ - zp->zap_num_leafs = 1; - zp->zap_num_entries = 0; - zp->zap_salt = zap->zap_salt; - - /* block 1 will be the first leaf */ - for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++) - ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; - - /* - * set up block 1 - the first leaf - */ - VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, - 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db)); - dmu_buf_will_dirty(db, tx); - - l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); - l->l_dbuf = db; - l->l_phys = db->db_data; - - zap_leaf_init(l); - - kmem_free(l, sizeof (zap_leaf_t)); - dmu_buf_rele(db, FTAG); -} - -static int -zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) -{ - if (RW_WRITE_HELD(&zap->zap_rwlock)) - return (1); - if (rw_tryupgrade(&zap->zap_rwlock)) { - dmu_buf_will_dirty(zap->zap_dbuf, tx); - return (1); - } - return (0); -} - -/* - * Generic routines for dealing with the pointer & cookie tables. 
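/*
 * A small illustrative sketch (not from the original source) of the
 * extendable-hash indexing described in the comment above: the bucket index
 * is taken from the top zd_prefix_len bits of the 64-bit hash, so growing
 * the pointer table by one bit of prefix simply splits each bucket's hash
 * range in two.  hash_idx() is a hypothetical stand-in for ZAP_HASH_IDX().
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
hash_idx(uint64_t hash, int prefix_len)
{
	return (prefix_len == 0 ? 0 : hash >> (64 - prefix_len));
}

int
main(void)
{
	uint64_t hash = 0xdeadbeefcafef00dULL;
	int n = 10;

	/* With n+1 prefix bits, the index is the old index with one more bit. */
	printf("idx(%d)=%llu idx(%d)=%llu\n",
	    n, (unsigned long long)hash_idx(hash, n),
	    n + 1, (unsigned long long)hash_idx(hash, n + 1));
	return (0);
}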
- */ - -static int -zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, - void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), - dmu_tx_t *tx) -{ - uint64_t b, newblk; - dmu_buf_t *db_old, *db_new; - int err; - int bs = FZAP_BLOCK_SHIFT(zap); - int hepb = 1<<(bs-4); - /* hepb = half the number of entries in a block */ - - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT(tbl->zt_blk != 0); - ASSERT(tbl->zt_numblks > 0); - - if (tbl->zt_nextblk != 0) { - newblk = tbl->zt_nextblk; - } else { - newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); - tbl->zt_nextblk = newblk; - ASSERT3U(tbl->zt_blks_copied, ==, 0); - dmu_prefetch(zap->zap_objset, zap->zap_object, - tbl->zt_blk << bs, tbl->zt_numblks << bs); - } - - /* - * Copy the ptrtbl from the old to new location. - */ - - b = tbl->zt_blks_copied; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + b) << bs, FTAG, &db_old); - if (err) - return (err); - - /* first half of entries in old[b] go to new[2*b+0] */ - VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, - (newblk + 2*b+0) << bs, FTAG, &db_new)); - dmu_buf_will_dirty(db_new, tx); - transfer_func(db_old->db_data, db_new->db_data, hepb); - dmu_buf_rele(db_new, FTAG); - - /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, - (newblk + 2*b+1) << bs, FTAG, &db_new)); - dmu_buf_will_dirty(db_new, tx); - transfer_func((uint64_t *)db_old->db_data + hepb, - db_new->db_data, hepb); - dmu_buf_rele(db_new, FTAG); - - dmu_buf_rele(db_old, FTAG); - - tbl->zt_blks_copied++; - - dprintf("copied block %llu of %llu\n", - tbl->zt_blks_copied, tbl->zt_numblks); - - if (tbl->zt_blks_copied == tbl->zt_numblks) { - (void) dmu_free_range(zap->zap_objset, zap->zap_object, - tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); - - tbl->zt_blk = newblk; - tbl->zt_numblks *= 2; - tbl->zt_shift++; - tbl->zt_nextblk = 0; - tbl->zt_blks_copied = 0; - - dprintf("finished; numblocks now %llu (%lluk entries)\n", - tbl->zt_numblks, 1<<(tbl->zt_shift-10)); - } - - return (0); -} - -static int -zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, - dmu_tx_t *tx) -{ - int err; - uint64_t blk, off; - int bs = FZAP_BLOCK_SHIFT(zap); - dmu_buf_t *db; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT(tbl->zt_blk != 0); - - dprintf("storing %llx at index %llx\n", val, idx); - - blk = idx >> (bs-3); - off = idx & ((1<<(bs-3))-1); - - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + blk) << bs, FTAG, &db); - if (err) - return (err); - dmu_buf_will_dirty(db, tx); - - if (tbl->zt_nextblk != 0) { - uint64_t idx2 = idx * 2; - uint64_t blk2 = idx2 >> (bs-3); - uint64_t off2 = idx2 & ((1<<(bs-3))-1); - dmu_buf_t *db2; - - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_nextblk + blk2) << bs, FTAG, &db2); - if (err) { - dmu_buf_rele(db, FTAG); - return (err); - } - dmu_buf_will_dirty(db2, tx); - ((uint64_t *)db2->db_data)[off2] = val; - ((uint64_t *)db2->db_data)[off2+1] = val; - dmu_buf_rele(db2, FTAG); - } - - ((uint64_t *)db->db_data)[off] = val; - dmu_buf_rele(db, FTAG); - - return (0); -} - -static int -zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) -{ - uint64_t blk, off; - int err; - dmu_buf_t *db; - int bs = FZAP_BLOCK_SHIFT(zap); - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - blk = idx >> (bs-3); - off = idx & ((1<<(bs-3))-1); - - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + blk) << bs, FTAG, &db); - if 
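/*
 * A minimal sketch (not from the original file) of the index-to-location
 * arithmetic used by zap_table_load()/zap_table_store() above: with 8-byte
 * entries and a block shift of bs, each block holds 1 << (bs - 3) pointers,
 * so an index splits into a block number and an offset within that block.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int bs = 14;				/* 16K blocks, the default shift */
	uint64_t idx = 100000;
	uint64_t per_block = 1ULL << (bs - 3);	/* 2048 entries per block */
	uint64_t blk = idx >> (bs - 3);
	uint64_t off = idx & (per_block - 1);

	/* While a grow is in flight, entry idx also lands at 2*idx and 2*idx+1. */
	printf("idx=%llu -> blk=%llu off=%llu (grown copies at %llu,%llu)\n",
	    (unsigned long long)idx, (unsigned long long)blk,
	    (unsigned long long)off, (unsigned long long)(2 * idx),
	    (unsigned long long)(2 * idx + 1));
	return (0);
}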
(err) - return (err); - *valp = ((uint64_t *)db->db_data)[off]; - dmu_buf_rele(db, FTAG); - - if (tbl->zt_nextblk != 0) { - /* - * read the nextblk for the sake of i/o error checking, - * so that zap_table_load() will catch errors for - * zap_table_store. - */ - blk = (idx*2) >> (bs-3); - - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_nextblk + blk) << bs, FTAG, &db); - dmu_buf_rele(db, FTAG); - } - return (err); -} - -/* - * Routines for growing the ptrtbl. - */ - -static void -zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) -{ - int i; - for (i = 0; i < n; i++) { - uint64_t lb = src[i]; - dst[2*i+0] = lb; - dst[2*i+1] = lb; - } -} - -static int -zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) -{ - /* In case things go horribly wrong. */ - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2) - return (ENOSPC); - - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { - /* - * We are outgrowing the "embedded" ptrtbl (the one - * stored in the header block). Give it its own entire - * block, which will double the size of the ptrtbl. - */ - uint64_t newblk; - dmu_buf_t *db_new; - int err; - - ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, - ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0); - - newblk = zap_allocate_blocks(zap, 1); - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new); - if (err) - return (err); - dmu_buf_will_dirty(db_new, tx); - zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), - db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - dmu_buf_rele(db_new, FTAG); - - zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk; - zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1; - zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++; - - ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, - zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << - (FZAP_BLOCK_SHIFT(zap)-3)); - - return (0); - } else { - return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl, - zap_ptrtbl_transfer, tx)); - } -} - -static void -zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) -{ - dmu_buf_will_dirty(zap->zap_dbuf, tx); - mutex_enter(&zap->zap_f.zap_num_entries_mtx); - ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta); - zap->zap_f.zap_phys->zap_num_entries += delta; - mutex_exit(&zap->zap_f.zap_num_entries_mtx); -} - -static uint64_t -zap_allocate_blocks(zap_t *zap, int nblocks) -{ - uint64_t newblk; - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - newblk = zap->zap_f.zap_phys->zap_freeblk; - zap->zap_f.zap_phys->zap_freeblk += nblocks; - return (newblk); -} - -static zap_leaf_t * -zap_create_leaf(zap_t *zap, dmu_tx_t *tx) -{ - void *winner; - zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); - - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0); - rw_enter(&l->l_rwlock, RW_WRITER); - l->l_blkid = zap_allocate_blocks(zap, 1); - l->l_dbuf = NULL; - l->l_phys = NULL; - - VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, - l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf)); - winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout); - ASSERT(winner == NULL); - dmu_buf_will_dirty(l->l_dbuf, tx); - - zap_leaf_init(l); - - zap->zap_f.zap_phys->zap_num_leafs++; - - return (l); -} - -int -fzap_count(zap_t *zap, uint64_t *count) -{ - ASSERT(!zap->zap_ismicro); - mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ - *count = zap->zap_f.zap_phys->zap_num_entries; - 
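/*
 * A tiny model (illustrative only) of the pointer-table growth invariant
 * asserted in zap_grow_ptrtbl() above: once the table lives in its own
 * blocks, the pointer count (1 << zt_shift) always equals
 * zt_numblks << (bs - 3), and each grow doubles the block count while
 * adding one bit of prefix.  The starting shift of bs - 3 follows from
 * that assertion with zt_numblks == 1.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int bs = 14;			/* 16K blocks */
	uint64_t numblks = 1;
	int shift = bs - 3;		/* first external table: 2048 pointers */
	int i;

	for (i = 0; i < 4; i++) {
		assert((1ULL << shift) == (numblks << (bs - 3)));
		printf("shift=%d numblks=%llu\n", shift,
		    (unsigned long long)numblks);
		numblks *= 2;		/* zap_table_grow() copies old[b] ... */
		shift++;		/* ... into new[2b] and new[2b+1] */
	}
	return (0);
}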
mutex_exit(&zap->zap_f.zap_num_entries_mtx); - return (0); -} - -/* - * Routines for obtaining zap_leaf_t's - */ - -void -zap_put_leaf(zap_leaf_t *l) -{ - rw_exit(&l->l_rwlock); - dmu_buf_rele(l->l_dbuf, NULL); -} - -_NOTE(ARGSUSED(0)) -static void -zap_leaf_pageout(dmu_buf_t *db, void *vl) -{ - zap_leaf_t *l = vl; - - rw_destroy(&l->l_rwlock); - kmem_free(l, sizeof (zap_leaf_t)); -} - -static zap_leaf_t * -zap_open_leaf(uint64_t blkid, dmu_buf_t *db) -{ - zap_leaf_t *l, *winner; - - ASSERT(blkid != 0); - - l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); - rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0); - rw_enter(&l->l_rwlock, RW_WRITER); - l->l_blkid = blkid; - l->l_bs = highbit(db->db_size)-1; - l->l_dbuf = db; - l->l_phys = NULL; - - winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout); - - rw_exit(&l->l_rwlock); - if (winner != NULL) { - /* someone else set it first */ - zap_leaf_pageout(NULL, l); - l = winner; - } - - /* - * lhr_pad was previously used for the next leaf in the leaf - * chain. There should be no chained leafs (as we have removed - * support for them). - */ - ASSERT3U(l->l_phys->l_hdr.lh_pad1, ==, 0); - - /* - * There should be more hash entries than there can be - * chunks to put in the hash table - */ - ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); - - /* The chunks should begin at the end of the hash table */ - ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, - &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); - - /* The chunks should end at the end of the block */ - ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - - (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size); - - return (l); -} - -static int -zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, - zap_leaf_t **lp) -{ - dmu_buf_t *db; - zap_leaf_t *l; - int bs = FZAP_BLOCK_SHIFT(zap); - int err; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - blkid << bs, NULL, &db); - if (err) - return (err); - - ASSERT3U(db->db_object, ==, zap->zap_object); - ASSERT3U(db->db_offset, ==, blkid << bs); - ASSERT3U(db->db_size, ==, 1 << bs); - ASSERT(blkid != 0); - - l = dmu_buf_get_user(db); - - if (l == NULL) - l = zap_open_leaf(blkid, db); - - rw_enter(&l->l_rwlock, lt); - /* - * Must lock before dirtying, otherwise l->l_phys could change, - * causing ASSERT below to fail. 
- */ - if (lt == RW_WRITER) - dmu_buf_will_dirty(db, tx); - ASSERT3U(l->l_blkid, ==, blkid); - ASSERT3P(l->l_dbuf, ==, db); - ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data); - ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF); - ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - - *lp = l; - return (0); -} - -static int -zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) -{ - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { - ASSERT3U(idx, <, - (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift)); - *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); - return (0); - } else { - return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl, - idx, valp)); - } -} - -static int -zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) -{ - ASSERT(tx != NULL); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) { - ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; - return (0); - } else { - return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl, - idx, blk, tx)); - } -} - -static int -zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) -{ - uint64_t idx, blk; - int err; - - ASSERT(zap->zap_dbuf == NULL || - zap->zap_f.zap_phys == zap->zap_dbuf->db_data); - ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC); - idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); - err = zap_idx_to_blk(zap, idx, &blk); - if (err != 0) - return (err); - err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); - - ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) == - (*lp)->l_phys->l_hdr.lh_prefix); - return (err); -} - -static int -zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx, - zap_leaf_t **lp) -{ - zap_leaf_t *nl; - int prefix_diff, i, err; - uint64_t sibling; - int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len; - - ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, - l->l_phys->l_hdr.lh_prefix); - - if (zap_tryupgradedir(zap, tx) == 0 || - old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { - /* We failed to upgrade, or need to grow the pointer table */ - objset_t *os = zap->zap_objset; - uint64_t object = zap->zap_object; - - zap_put_leaf(l); - zap_unlockdir(zap); - err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap); - if (err) - return (err); - ASSERT(!zap->zap_ismicro); - - while (old_prefix_len == - zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { - err = zap_grow_ptrtbl(zap, tx); - if (err) - return (err); - } - - err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); - if (err) - return (err); - - if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) { - /* it split while our locks were down */ - *lp = l; - return (0); - } - } - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); - ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, - l->l_phys->l_hdr.lh_prefix); - - prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - - (old_prefix_len + 1); - sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; - - /* check for i/o errors before doing zap_leaf_split */ - for (i = 0; i < (1ULL<<prefix_diff); i++) { - uint64_t blk; - err = zap_idx_to_blk(zap, sibling+i, &blk); - if (err) - return (err); - ASSERT3U(blk, ==, l->l_blkid); - } - - nl = zap_create_leaf(zap, tx); - zap_leaf_split(l, nl); - - /* set sibling pointers */ 
- for (i = 0; i < (1ULL<<prefix_diff); i++) { - err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx); - ASSERT3U(err, ==, 0); /* we checked for i/o errors above */ - } - - if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) { - /* we want the sibling */ - zap_put_leaf(l); - *lp = nl; - } else { - zap_put_leaf(nl); - *lp = l; - } - - return (0); -} - -static void -zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx) -{ - int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; - int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift && - l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); - - zap_put_leaf(l); - - if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) { - int err; - - /* - * We are in the middle of growing the pointer table, or - * this leaf will soon make us grow it. - */ - if (zap_tryupgradedir(zap, tx) == 0) { - objset_t *os = zap->zap_objset; - uint64_t zapobj = zap->zap_object; - - zap_unlockdir(zap); - err = zap_lockdir(os, zapobj, tx, - RW_WRITER, FALSE, &zap); - if (err) - return; - } - - /* could have finished growing while our locks were down */ - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift) - (void) zap_grow_ptrtbl(zap, tx); - } -} - - -static int -fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers) -{ - if (name && strlen(name) > ZAP_MAXNAMELEN) - return (E2BIG); - - /* Only integer sizes supported by C */ - switch (integer_size) { - case 1: - case 2: - case 4: - case 8: - break; - default: - return (EINVAL); - } - - if (integer_size * num_integers > ZAP_MAXVALUELEN) - return (E2BIG); - - return (0); -} - -/* - * Routines for maniplulating attributes. - */ -int -fzap_lookup(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf) -{ - zap_leaf_t *l; - int err; - uint64_t hash; - zap_entry_handle_t zeh; - - err = fzap_checksize(name, integer_size, num_integers); - if (err != 0) - return (err); - - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, name, hash, &zeh); - if (err == 0) - err = zap_entry_read(&zeh, integer_size, num_integers, buf); - - zap_put_leaf(l); - return (err); -} - -int -fzap_add_cd(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, dmu_tx_t *tx) -{ - zap_leaf_t *l; - uint64_t hash; - int err; - zap_entry_handle_t zeh; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT(!zap->zap_ismicro); - ASSERT(fzap_checksize(name, integer_size, num_integers) == 0); - - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); -retry: - err = zap_leaf_lookup(l, name, hash, &zeh); - if (err == 0) { - err = EEXIST; - goto out; - } - if (err != ENOENT) - goto out; - - err = zap_entry_create(l, name, hash, cd, - integer_size, num_integers, val, &zeh); - - if (err == 0) { - zap_increment_num_entries(zap, 1, tx); - } else if (err == EAGAIN) { - err = zap_expand_leaf(zap, l, hash, tx, &l); - if (err == 0) - goto retry; - } - -out: - zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); - return (err); -} - -int -fzap_add(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - int err = fzap_checksize(name, integer_size, num_integers); - if (err != 0) - return (err); - - return (fzap_add_cd(zap, name, integer_size, num_integers, - val, ZAP_MAXCD, tx)); -} - -int -fzap_update(zap_t *zap, const char *name, - int 
integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) -{ - zap_leaf_t *l; - uint64_t hash; - int err, create; - zap_entry_handle_t zeh; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - err = fzap_checksize(name, integer_size, num_integers); - if (err != 0) - return (err); - - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); -retry: - err = zap_leaf_lookup(l, name, hash, &zeh); - create = (err == ENOENT); - ASSERT(err == 0 || err == ENOENT); - - /* XXX If this leaf is chained, split it if we can. */ - - if (create) { - err = zap_entry_create(l, name, hash, ZAP_MAXCD, - integer_size, num_integers, val, &zeh); - if (err == 0) - zap_increment_num_entries(zap, 1, tx); - } else { - err = zap_entry_update(&zeh, integer_size, num_integers, val); - } - - if (err == EAGAIN) { - err = zap_expand_leaf(zap, l, hash, tx, &l); - if (err == 0) - goto retry; - } - - zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); - return (err); -} - -int -fzap_length(zap_t *zap, const char *name, - uint64_t *integer_size, uint64_t *num_integers) -{ - zap_leaf_t *l; - int err; - uint64_t hash; - zap_entry_handle_t zeh; - - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, name, hash, &zeh); - if (err != 0) - goto out; - - if (integer_size) - *integer_size = zeh.zeh_integer_size; - if (num_integers) - *num_integers = zeh.zeh_num_integers; -out: - zap_put_leaf(l); - return (err); -} - -int -fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx) -{ - zap_leaf_t *l; - uint64_t hash; - int err; - zap_entry_handle_t zeh; - - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, name, hash, &zeh); - if (err == 0) { - zap_entry_remove(&zeh); - zap_increment_num_entries(zap, -1, tx); - } - zap_put_leaf(l); - dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n", - zap->zap_objset, zap->zap_object, name, err); - return (err); -} - -int -zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name) -{ - zap_cursor_t zc; - zap_attribute_t *za; - int err; - - za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - for (zap_cursor_init(&zc, os, zapobj); - (err = zap_cursor_retrieve(&zc, za)) == 0; - zap_cursor_advance(&zc)) { - if (ZFS_DIRENT_OBJ(za->za_first_integer) == value) { - (void) strcpy(name, za->za_name); - break; - } - } - zap_cursor_fini(&zc); - kmem_free(za, sizeof (zap_attribute_t)); - return (err); -} - - -/* - * Routines for iterating over the attributes. 
- */ - -int -fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) -{ - int err = ENOENT; - zap_entry_handle_t zeh; - zap_leaf_t *l; - - /* retrieve the next entry at or after zc_hash/zc_cd */ - /* if no entry, return ENOENT */ - - if (zc->zc_leaf && - (ZAP_HASH_IDX(zc->zc_hash, - zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) != - zc->zc_leaf->l_phys->l_hdr.lh_prefix)) { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - } - -again: - if (zc->zc_leaf == NULL) { - err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, - &zc->zc_leaf); - if (err != 0) - return (err); - } else { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - } - l = zc->zc_leaf; - - err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); - - if (err == ENOENT) { - uint64_t nocare = - (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1; - zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; - zc->zc_cd = 0; - if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) { - zc->zc_hash = -1ULL; - } else { - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - goto again; - } - } - - if (err == 0) { - zc->zc_hash = zeh.zeh_hash; - zc->zc_cd = zeh.zeh_cd; - za->za_integer_length = zeh.zeh_integer_size; - za->za_num_integers = zeh.zeh_num_integers; - if (zeh.zeh_num_integers == 0) { - za->za_first_integer = 0; - } else { - err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); - ASSERT(err == 0 || err == EOVERFLOW); - } - err = zap_entry_read_name(&zeh, - sizeof (za->za_name), za->za_name); - ASSERT(err == 0); - } - rw_exit(&zc->zc_leaf->l_rwlock); - return (err); -} - - -static void -zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) -{ - int i, err; - uint64_t lastblk = 0; - - /* - * NB: if a leaf has more pointers than an entire ptrtbl block - * can hold, then it'll be accounted for more than once, since - * we won't have lastblk. - */ - for (i = 0; i < len; i++) { - zap_leaf_t *l; - - if (tbl[i] == lastblk) - continue; - lastblk = tbl[i]; - - err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); - if (err == 0) { - zap_leaf_stats(zap, l, zs); - zap_put_leaf(l); - } - } -} - -void -fzap_get_stats(zap_t *zap, zap_stats_t *zs) -{ - int bs = FZAP_BLOCK_SHIFT(zap); - zs->zs_blocksize = 1ULL << bs; - - /* - * Set zap_phys_t fields - */ - zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs; - zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries; - zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk; - zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type; - zs->zs_magic = zap->zap_f.zap_phys->zap_magic; - zs->zs_salt = zap->zap_f.zap_phys->zap_salt; - - /* - * Set zap_ptrtbl fields - */ - zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; - zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk; - zs->zs_ptrtbl_blks_copied = - zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied; - zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk; - zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; - zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; - - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { - /* the ptrtbl is entirely in the header block. 
*/ - zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), - 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); - } else { - int b; - - dmu_prefetch(zap->zap_objset, zap->zap_object, - zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs, - zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs); - - for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; - b++) { - dmu_buf_t *db; - int err; - - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs, - FTAG, &db); - if (err == 0) { - zap_stats_ptrtbl(zap, db->db_data, - 1<<(bs-3), zs); - dmu_buf_rele(db, FTAG); - } - } - } -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c deleted file mode 100644 index 5dff514..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c +++ /dev/null @@ -1,741 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * The 512-byte leaf is broken into 32 16-byte chunks. - * chunk number n means l_chunk[n], even though the header precedes it. - * the names are stored null-terminated. 
- */ - -#include <sys/zfs_context.h> -#include <sys/zap.h> -#include <sys/zap_impl.h> -#include <sys/zap_leaf.h> -#include <sys/spa.h> -#include <sys/dmu.h> - -#define CHAIN_END 0xffff /* end of the chunk chain */ - -/* half the (current) minimum block size */ -#define MAX_ARRAY_BYTES (8<<10) - -#define LEAF_HASH(l, h) \ - ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ - ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len))) - -#define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)]) - - -static void -zap_memset(void *a, int c, size_t n) -{ - char *cp = a; - char *cpend = cp + n; - - while (cp < cpend) - *cp++ = c; -} - -static void -stv(int len, void *addr, uint64_t value) -{ - switch (len) { - case 1: - *(uint8_t *)addr = value; - return; - case 2: - *(uint16_t *)addr = value; - return; - case 4: - *(uint32_t *)addr = value; - return; - case 8: - *(uint64_t *)addr = value; - return; - } - ASSERT(!"bad int len"); -} - -static uint64_t -ldv(int len, const void *addr) -{ - switch (len) { - case 1: - return (*(uint8_t *)addr); - case 2: - return (*(uint16_t *)addr); - case 4: - return (*(uint32_t *)addr); - case 8: - return (*(uint64_t *)addr); - } - ASSERT(!"bad int len"); - return (0xFEEDFACEDEADBEEFULL); -} - -void -zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) -{ - int i; - zap_leaf_t l; - l.l_bs = highbit(size)-1; - l.l_phys = buf; - - buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type); - buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix); - buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic); - buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree); - buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries); - buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); - buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); - - for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) - buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); - - for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { - zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); - struct zap_leaf_entry *le; - - switch (lc->l_free.lf_type) { - case ZAP_CHUNK_ENTRY: - le = &lc->l_entry; - - le->le_type = BSWAP_8(le->le_type); - le->le_int_size = BSWAP_8(le->le_int_size); - le->le_next = BSWAP_16(le->le_next); - le->le_name_chunk = BSWAP_16(le->le_name_chunk); - le->le_name_length = BSWAP_16(le->le_name_length); - le->le_value_chunk = BSWAP_16(le->le_value_chunk); - le->le_value_length = BSWAP_16(le->le_value_length); - le->le_cd = BSWAP_32(le->le_cd); - le->le_hash = BSWAP_64(le->le_hash); - break; - case ZAP_CHUNK_FREE: - lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type); - lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next); - break; - case ZAP_CHUNK_ARRAY: - lc->l_array.la_type = BSWAP_8(lc->l_array.la_type); - lc->l_array.la_next = BSWAP_16(lc->l_array.la_next); - /* la_array doesn't need swapping */ - break; - default: - ASSERT(!"bad leaf type"); - } - } -} - -void -zap_leaf_init(zap_leaf_t *l) -{ - int i; - - l->l_bs = highbit(l->l_dbuf->db_size)-1; - zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header)); - zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { - ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; - ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; - } - ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END; - l->l_phys->l_hdr.lh_block_type = ZBT_LEAF; - l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC; - l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); -} - -/* - * Routines which manipulate leaf 
chunks (l_chunk[]). - */ - -static uint16_t -zap_leaf_chunk_alloc(zap_leaf_t *l) -{ - int chunk; - - ASSERT(l->l_phys->l_hdr.lh_nfree > 0); - - chunk = l->l_phys->l_hdr.lh_freelist; - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); - - l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; - - l->l_phys->l_hdr.lh_nfree--; - - return (chunk); -} - -static void -zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) -{ - struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free; - ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT(zlf->lf_type != ZAP_CHUNK_FREE); - - zlf->lf_type = ZAP_CHUNK_FREE; - zlf->lf_next = l->l_phys->l_hdr.lh_freelist; - bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */ - l->l_phys->l_hdr.lh_freelist = chunk; - - l->l_phys->l_hdr.lh_nfree++; -} - -/* - * Routines which manipulate leaf arrays (zap_leaf_array type chunks). - */ - -static uint16_t -zap_leaf_array_create(zap_leaf_t *l, const char *buf, - int integer_size, int num_integers) -{ - uint16_t chunk_head; - uint16_t *chunkp = &chunk_head; - int byten = 0; - uint64_t value; - int shift = (integer_size-1)*8; - int len = num_integers; - - ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES); - - while (len > 0) { - uint16_t chunk = zap_leaf_chunk_alloc(l); - struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int i; - - la->la_type = ZAP_CHUNK_ARRAY; - for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { - if (byten == 0) - value = ldv(integer_size, buf); - la->la_array[i] = value >> shift; - value <<= 8; - if (++byten == integer_size) { - byten = 0; - buf += integer_size; - if (--len == 0) - break; - } - } - - *chunkp = chunk; - chunkp = &la->la_next; - } - *chunkp = CHAIN_END; - - return (chunk_head); -} - -static void -zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) -{ - uint16_t chunk = *chunkp; - - *chunkp = CHAIN_END; - - while (chunk != CHAIN_END) { - int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; - ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, - ZAP_CHUNK_ARRAY); - zap_leaf_chunk_free(l, chunk); - chunk = nextchunk; - } -} - -/* array_len and buf_len are in integers, not bytes */ -static void -zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, - int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, - char *buf) -{ - int len = MIN(array_len, buf_len); - int byten = 0; - uint64_t value = 0; - - ASSERT3U(array_int_len, <=, buf_int_len); - - /* Fast path for one 8-byte integer */ - if (array_int_len == 8 && buf_int_len == 8 && len == 1) { - struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - uint8_t *ip = la->la_array; - uint64_t *buf64 = (uint64_t *)buf; - - *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | - (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | - (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 | - (uint64_t)ip[6] << 8 | (uint64_t)ip[7]; - return; - } - - /* Fast path for an array of 1-byte integers (eg. 
the entry name) */ - if (array_int_len == 1 && buf_int_len == 1 && - buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) { - while (chunk != CHAIN_END) { - struct zap_leaf_array *la = - &ZAP_LEAF_CHUNK(l, chunk).l_array; - bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES); - buf += ZAP_LEAF_ARRAY_BYTES; - chunk = la->la_next; - } - return; - } - - while (len > 0) { - struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int i; - - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { - value = (value << 8) | la->la_array[i]; - byten++; - if (byten == array_int_len) { - stv(buf_int_len, buf, value); - byten = 0; - len--; - if (len == 0) - return; - buf += buf_int_len; - } - } - chunk = la->la_next; - } -} - -/* - * Only to be used on 8-bit arrays. - * array_len is actual len in bytes (not encoded le_value_length). - * buf is null-terminated. - */ -static int -zap_leaf_array_equal(zap_leaf_t *l, int chunk, - int array_len, const char *buf) -{ - int bseen = 0; - - while (bseen < array_len) { - struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES); - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - if (bcmp(la->la_array, buf + bseen, toread)) - break; - chunk = la->la_next; - bseen += toread; - } - return (bseen == array_len); -} - -/* - * Routines which manipulate leaf entries. - */ - -int -zap_leaf_lookup(zap_leaf_t *l, - const char *name, uint64_t h, zap_entry_handle_t *zeh) -{ - uint16_t *chunkp; - struct zap_leaf_entry *le; - - ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - - for (chunkp = LEAF_HASH_ENTPTR(l, h); - *chunkp != CHAIN_END; chunkp = &le->le_next) { - uint16_t chunk = *chunkp; - le = ZAP_LEAF_ENTRY(l, chunk); - - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - if (le->le_hash != h) - continue; - - if (zap_leaf_array_equal(l, le->le_name_chunk, - le->le_name_length, name)) { - zeh->zeh_num_integers = le->le_value_length; - zeh->zeh_integer_size = le->le_int_size; - zeh->zeh_cd = le->le_cd; - zeh->zeh_hash = le->le_hash; - zeh->zeh_chunkp = chunkp; - zeh->zeh_leaf = l; - return (0); - } - } - - return (ENOENT); -} - -/* Return (h1,cd1 >= h2,cd2) */ -#define HCD_GTEQ(h1, cd1, h2, cd2) \ - ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE)) - -int -zap_leaf_lookup_closest(zap_leaf_t *l, - uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) -{ - uint16_t chunk; - uint64_t besth = -1ULL; - uint32_t bestcd = ZAP_MAXCD; - uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; - uint16_t lh; - struct zap_leaf_entry *le; - - ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - - for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { - for (chunk = l->l_phys->l_hash[lh]; - chunk != CHAIN_END; chunk = le->le_next) { - le = ZAP_LEAF_ENTRY(l, chunk); - - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) && - HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) { - ASSERT3U(bestlh, >=, lh); - bestlh = lh; - besth = le->le_hash; - bestcd = le->le_cd; - - zeh->zeh_num_integers = le->le_value_length; - zeh->zeh_integer_size = le->le_int_size; - zeh->zeh_cd = le->le_cd; - zeh->zeh_hash = le->le_hash; - zeh->zeh_fakechunk = chunk; - zeh->zeh_chunkp = &zeh->zeh_fakechunk; - zeh->zeh_leaf = l; - } - } - } - - return (bestcd == ZAP_MAXCD ? 
ENOENT : 0); -} - -int -zap_entry_read(const zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, void *buf) -{ - struct zap_leaf_entry *le = - ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - if (le->le_int_size > integer_size) - return (EINVAL); - - zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_int_size, - le->le_value_length, integer_size, num_integers, buf); - - if (zeh->zeh_num_integers > num_integers) - return (EOVERFLOW); - return (0); - -} - -int -zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf) -{ - struct zap_leaf_entry *le = - ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, - le->le_name_length, 1, buflen, buf); - if (le->le_name_length > buflen) - return (EOVERFLOW); - return (0); -} - -int -zap_entry_update(zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, const void *buf) -{ - int delta_chunks; - zap_leaf_t *l = zeh->zeh_leaf; - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); - - delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size); - - if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks) - return (EAGAIN); - - /* - * We should search other chained leaves (via - * zap_entry_remove,create?) otherwise returning EAGAIN will - * just send us into an infinite loop if we have to chain - * another leaf block, rather than being able to split this - * block. - */ - - zap_leaf_array_free(l, &le->le_value_chunk); - le->le_value_chunk = - zap_leaf_array_create(l, buf, integer_size, num_integers); - le->le_value_length = num_integers; - le->le_int_size = integer_size; - return (0); -} - -void -zap_entry_remove(zap_entry_handle_t *zeh) -{ - uint16_t entry_chunk; - struct zap_leaf_entry *le; - zap_leaf_t *l = zeh->zeh_leaf; - - ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); - - entry_chunk = *zeh->zeh_chunkp; - le = ZAP_LEAF_ENTRY(l, entry_chunk); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - zap_leaf_array_free(l, &le->le_name_chunk); - zap_leaf_array_free(l, &le->le_value_chunk); - - *zeh->zeh_chunkp = le->le_next; - zap_leaf_chunk_free(l, entry_chunk); - - l->l_phys->l_hdr.lh_nentries--; -} - -int -zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, - uint8_t integer_size, uint64_t num_integers, const void *buf, - zap_entry_handle_t *zeh) -{ - uint16_t chunk; - uint16_t *chunkp; - struct zap_leaf_entry *le; - uint64_t namelen, valuelen; - int numchunks; - - valuelen = integer_size * num_integers; - namelen = strlen(name) + 1; - ASSERT(namelen >= 2); - - numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) + - ZAP_LEAF_ARRAY_NCHUNKS(valuelen); - if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) - return (E2BIG); - - if (cd == ZAP_MAXCD) { - for (cd = 0; cd < ZAP_MAXCD; cd++) { - for (chunk = *LEAF_HASH_ENTPTR(l, h); - chunk != CHAIN_END; chunk = le->le_next) { - le = ZAP_LEAF_ENTRY(l, chunk); - if (le->le_hash == h && - le->le_cd == cd) { - break; - } - } - /* If this cd is not in use, we are good. */ - if (chunk == CHAIN_END) - break; - } - /* If we tried all the cd's, we lose. 
*/ - if (cd == ZAP_MAXCD) - return (ENOSPC); - } - - if (l->l_phys->l_hdr.lh_nfree < numchunks) - return (EAGAIN); - - /* make the entry */ - chunk = zap_leaf_chunk_alloc(l); - le = ZAP_LEAF_ENTRY(l, chunk); - le->le_type = ZAP_CHUNK_ENTRY; - le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen); - le->le_name_length = namelen; - le->le_value_chunk = - zap_leaf_array_create(l, buf, integer_size, num_integers); - le->le_value_length = num_integers; - le->le_int_size = integer_size; - le->le_hash = h; - le->le_cd = cd; - - /* link it into the hash chain */ - chunkp = LEAF_HASH_ENTPTR(l, h); - le->le_next = *chunkp; - *chunkp = chunk; - - l->l_phys->l_hdr.lh_nentries++; - - zeh->zeh_leaf = l; - zeh->zeh_num_integers = num_integers; - zeh->zeh_integer_size = le->le_int_size; - zeh->zeh_cd = le->le_cd; - zeh->zeh_hash = le->le_hash; - zeh->zeh_chunkp = chunkp; - - return (0); -} - -/* - * Routines for transferring entries between leafs. - */ - -static void -zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) -{ - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); - uint16_t *ptr = LEAF_HASH_ENTPTR(l, le->le_hash); - le->le_next = *ptr; - *ptr = entry; -} - -static uint16_t -zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) -{ - uint16_t new_chunk; - uint16_t *nchunkp = &new_chunk; - - while (chunk != CHAIN_END) { - uint16_t nchunk = zap_leaf_chunk_alloc(nl); - struct zap_leaf_array *nla = - &ZAP_LEAF_CHUNK(nl, nchunk).l_array; - struct zap_leaf_array *la = - &ZAP_LEAF_CHUNK(l, chunk).l_array; - int nextchunk = la->la_next; - - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); - - *nla = *la; /* structure assignment */ - - zap_leaf_chunk_free(l, chunk); - chunk = nextchunk; - *nchunkp = nchunk; - nchunkp = &nla->la_next; - } - *nchunkp = CHAIN_END; - return (new_chunk); -} - -static void -zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) -{ - struct zap_leaf_entry *le, *nle; - uint16_t chunk; - - le = ZAP_LEAF_ENTRY(l, entry); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - chunk = zap_leaf_chunk_alloc(nl); - nle = ZAP_LEAF_ENTRY(nl, chunk); - *nle = *le; /* structure assignment */ - - zap_leaf_rehash_entry(nl, chunk); - - nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); - nle->le_value_chunk = - zap_leaf_transfer_array(l, le->le_value_chunk, nl); - - zap_leaf_chunk_free(l, entry); - - l->l_phys->l_hdr.lh_nentries--; - nl->l_phys->l_hdr.lh_nentries++; -} - -/* - * Transfer the entries whose hash prefix ends in 1 to the new leaf. - */ -void -zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl) -{ - int i; - int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len; - - /* set new prefix and prefix_len */ - l->l_phys->l_hdr.lh_prefix <<= 1; - l->l_phys->l_hdr.lh_prefix_len++; - nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1; - nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len; - - /* break existing hash chains */ - zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - - /* - * Transfer entries whose hash bit 'bit' is set to nl; rehash - * the remaining entries - * - * NB: We could find entries via the hashtable instead. That - * would be O(hashents+numents) rather than O(numblks+numents), - * but this accesses memory more sequentially, and when we're - * called, the block is usually pretty full. 
- */ - for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); - if (le->le_type != ZAP_CHUNK_ENTRY) - continue; - - if (le->le_hash & (1ULL << bit)) - zap_leaf_transfer_entry(l, i, nl); - else - zap_leaf_rehash_entry(l, i); - } -} - -void -zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) -{ - int i, n; - - n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - - l->l_phys->l_hdr.lh_prefix_len; - n = MIN(n, ZAP_HISTOGRAM_SIZE-1); - zs->zs_leafs_with_2n_pointers[n]++; - - - n = l->l_phys->l_hdr.lh_nentries/5; - n = MIN(n, ZAP_HISTOGRAM_SIZE-1); - zs->zs_blocks_with_n5_entries[n]++; - - n = ((1<<FZAP_BLOCK_SHIFT(zap)) - - l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / - (1<<FZAP_BLOCK_SHIFT(zap)); - n = MIN(n, ZAP_HISTOGRAM_SIZE-1); - zs->zs_blocks_n_tenths_full[n]++; - - for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { - int nentries = 0; - int chunk = l->l_phys->l_hash[i]; - - while (chunk != CHAIN_END) { - struct zap_leaf_entry *le = - ZAP_LEAF_ENTRY(l, chunk); - - n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) + - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * - le->le_int_size); - n = MIN(n, ZAP_HISTOGRAM_SIZE-1); - zs->zs_entries_using_n_chunks[n]++; - - chunk = le->le_next; - nentries++; - } - - n = nentries; - n = MIN(n, ZAP_HISTOGRAM_SIZE-1); - zs->zs_buckets_with_n_entries[n]++; - } -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c deleted file mode 100644 index 9a882a5..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c +++ /dev/null @@ -1,857 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/spa.h> -#include <sys/dmu.h> -#include <sys/zfs_context.h> -#include <sys/zap.h> -#include <sys/refcount.h> -#include <sys/zap_impl.h> -#include <sys/zap_leaf.h> -#include <sys/avl.h> - - -static void mzap_upgrade(zap_t *zap, dmu_tx_t *tx); - - -static void -mzap_byteswap(mzap_phys_t *buf, size_t size) -{ - int i, max; - buf->mz_block_type = BSWAP_64(buf->mz_block_type); - buf->mz_salt = BSWAP_64(buf->mz_salt); - max = (size / MZAP_ENT_LEN) - 1; - for (i = 0; i < max; i++) { - buf->mz_chunk[i].mze_value = - BSWAP_64(buf->mz_chunk[i].mze_value); - buf->mz_chunk[i].mze_cd = - BSWAP_32(buf->mz_chunk[i].mze_cd); - } -} - -void -zap_byteswap(void *buf, size_t size) -{ - uint64_t block_type; - - block_type = *(uint64_t *)buf; - - if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { - /* ASSERT(magic == ZAP_LEAF_MAGIC); */ - mzap_byteswap(buf, size); - } else { - fzap_byteswap(buf, size); - } -} - -static int -mze_compare(const void *arg1, const void *arg2) -{ - const mzap_ent_t *mze1 = arg1; - const mzap_ent_t *mze2 = arg2; - - if (mze1->mze_hash > mze2->mze_hash) - return (+1); - if (mze1->mze_hash < mze2->mze_hash) - return (-1); - if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd) - return (+1); - if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd) - return (-1); - return (0); -} - -static void -mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) -{ - mzap_ent_t *mze; - - ASSERT(zap->zap_ismicro); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT(mzep->mze_cd < ZAP_MAXCD); - ASSERT3U(zap_hash(zap, mzep->mze_name), ==, hash); - - mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); - mze->mze_chunkid = chunkid; - mze->mze_hash = hash; - mze->mze_phys = *mzep; - avl_add(&zap->zap_m.zap_avl, mze); -} - -static mzap_ent_t * -mze_find(zap_t *zap, const char *name, uint64_t hash) -{ - mzap_ent_t mze_tofind; - mzap_ent_t *mze; - avl_index_t idx; - avl_tree_t *avl = &zap->zap_m.zap_avl; - - ASSERT(zap->zap_ismicro); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT3U(zap_hash(zap, name), ==, hash); - - if (strlen(name) >= sizeof (mze_tofind.mze_phys.mze_name)) - return (NULL); - - mze_tofind.mze_hash = hash; - mze_tofind.mze_phys.mze_cd = 0; - - mze = avl_find(avl, &mze_tofind, &idx); - if (mze == NULL) - mze = avl_nearest(avl, idx, AVL_AFTER); - for (; mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { - if (strcmp(name, mze->mze_phys.mze_name) == 0) - return (mze); - } - return (NULL); -} - -static uint32_t -mze_find_unused_cd(zap_t *zap, uint64_t hash) -{ - mzap_ent_t mze_tofind; - mzap_ent_t *mze; - avl_index_t idx; - avl_tree_t *avl = &zap->zap_m.zap_avl; - uint32_t cd; - - ASSERT(zap->zap_ismicro); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - mze_tofind.mze_hash = hash; - mze_tofind.mze_phys.mze_cd = 0; - - cd = 0; - for (mze = avl_find(avl, &mze_tofind, &idx); - mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { - if (mze->mze_phys.mze_cd != cd) - break; - cd++; - } - - return (cd); -} - -static void -mze_remove(zap_t *zap, mzap_ent_t *mze) -{ - ASSERT(zap->zap_ismicro); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - avl_remove(&zap->zap_m.zap_avl, mze); - kmem_free(mze, sizeof (mzap_ent_t)); -} - -static void -mze_destroy(zap_t *zap) -{ - mzap_ent_t *mze; - void *avlcookie = NULL; - - while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) - kmem_free(mze, sizeof (mzap_ent_t)); - avl_destroy(&zap->zap_m.zap_avl); -} - -static zap_t * -mzap_open(objset_t *os, uint64_t obj, 
dmu_buf_t *db) -{ - zap_t *winner; - zap_t *zap; - int i; - - ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); - - zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); - rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, 0); - rw_enter(&zap->zap_rwlock, RW_WRITER); - zap->zap_objset = os; - zap->zap_object = obj; - zap->zap_dbuf = db; - - if (((uint64_t *)db->db_data)[0] != ZBT_MICRO) { - mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, - MUTEX_DEFAULT, 0); - zap->zap_f.zap_block_shift = highbit(db->db_size) - 1; - } else { - zap->zap_ismicro = TRUE; - } - - /* - * Make sure that zap_ismicro is set before we let others see - * it, because zap_lockdir() checks zap_ismicro without the lock - * held. - */ - winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict); - - if (winner != NULL) { - rw_exit(&zap->zap_rwlock); - rw_destroy(&zap->zap_rwlock); - if (!zap->zap_ismicro) - mutex_destroy(&zap->zap_f.zap_num_entries_mtx); - kmem_free(zap, sizeof (zap_t)); - return (winner); - } - - if (zap->zap_ismicro) { - zap->zap_salt = zap->zap_m.zap_phys->mz_salt; - zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; - avl_create(&zap->zap_m.zap_avl, mze_compare, - sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); - - for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { - mzap_ent_phys_t *mze = - &zap->zap_m.zap_phys->mz_chunk[i]; - if (mze->mze_name[0]) { - zap->zap_m.zap_num_entries++; - mze_insert(zap, i, - zap_hash(zap, mze->mze_name), mze); - } - } - } else { - zap->zap_salt = zap->zap_f.zap_phys->zap_salt; - - ASSERT3U(sizeof (struct zap_leaf_header), ==, - 2*ZAP_LEAF_CHUNKSIZE); - - /* - * The embedded pointer table should not overlap the - * other members. - */ - ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, - &zap->zap_f.zap_phys->zap_salt); - - /* - * The embedded pointer table should end at the end of - * the block - */ - ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, - 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) - - (uintptr_t)zap->zap_f.zap_phys, ==, - zap->zap_dbuf->db_size); - } - rw_exit(&zap->zap_rwlock); - return (zap); -} - -int -zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, int fatreader, zap_t **zapp) -{ - zap_t *zap; - dmu_buf_t *db; - krw_t lt; - int err; - - *zapp = NULL; - - err = dmu_buf_hold(os, obj, 0, NULL, &db); - if (err) - return (err); - -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); - } -#endif - - zap = dmu_buf_get_user(db); - if (zap == NULL) - zap = mzap_open(os, obj, db); - - /* - * We're checking zap_ismicro without the lock held, in order to - * tell what type of lock we want. Once we have some sort of - * lock, see if it really is the right type. In practice this - * can only be different if it was upgraded from micro to fat, - * and micro wanted WRITER but fat only needs READER. - */ - lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; - rw_enter(&zap->zap_rwlock, lt); - if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { - /* it was upgraded, now we only need reader */ - ASSERT(lt == RW_WRITER); - ASSERT(RW_READER == - (!zap->zap_ismicro && fatreader) ? 
RW_READER : lti); - rw_downgrade(&zap->zap_rwlock); - lt = RW_READER; - } - - zap->zap_objset = os; - - if (lt == RW_WRITER) - dmu_buf_will_dirty(db, tx); - - ASSERT3P(zap->zap_dbuf, ==, db); - - ASSERT(!zap->zap_ismicro || - zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); - if (zap->zap_ismicro && tx && - zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { - uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; - if (newsz > MZAP_MAX_BLKSZ) { - dprintf("upgrading obj %llu: num_entries=%u\n", - obj, zap->zap_m.zap_num_entries); - mzap_upgrade(zap, tx); - *zapp = zap; - return (0); - } - err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); - ASSERT3U(err, ==, 0); - zap->zap_m.zap_num_chunks = - db->db_size / MZAP_ENT_LEN - 1; - } - - *zapp = zap; - return (0); -} - -void -zap_unlockdir(zap_t *zap) -{ - rw_exit(&zap->zap_rwlock); - dmu_buf_rele(zap->zap_dbuf, NULL); -} - -static void -mzap_upgrade(zap_t *zap, dmu_tx_t *tx) -{ - mzap_phys_t *mzp; - int i, sz, nchunks, err; - - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - sz = zap->zap_dbuf->db_size; - mzp = kmem_alloc(sz, KM_SLEEP); - bcopy(zap->zap_dbuf->db_data, mzp, sz); - nchunks = zap->zap_m.zap_num_chunks; - - err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, - 1ULL << fzap_default_block_shift, 0, tx); - ASSERT(err == 0); - - dprintf("upgrading obj=%llu with %u chunks\n", - zap->zap_object, nchunks); - mze_destroy(zap); - - fzap_upgrade(zap, tx); - - for (i = 0; i < nchunks; i++) { - int err; - mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; - if (mze->mze_name[0] == 0) - continue; - dprintf("adding %s=%llu\n", - mze->mze_name, mze->mze_value); - err = fzap_add_cd(zap, - mze->mze_name, 8, 1, &mze->mze_value, - mze->mze_cd, tx); - ASSERT3U(err, ==, 0); - } - kmem_free(mzp, sz); -} - -uint64_t -zap_hash(zap_t *zap, const char *name) -{ - const uint8_t *cp; - uint8_t c; - uint64_t crc = zap->zap_salt; - - ASSERT(crc != 0); - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; - - /* - * Only use 28 bits, since we need 4 bits in the cookie for the - * collision differentiator. We MUST use the high bits, since - * those are the onces that we first pay attention to when - * chosing the bucket. 
- */ - crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); - - return (crc); -} - - -static void -mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx) -{ - dmu_buf_t *db; - mzap_phys_t *zp; - - VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db)); - -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); - } -#endif - - dmu_buf_will_dirty(db, tx); - zp = db->db_data; - zp->mz_block_type = ZBT_MICRO; - zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; - ASSERT(zp->mz_salt != 0); - dmu_buf_rele(db, FTAG); -} - -int -zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - int err; - - err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); - if (err != 0) - return (err); - mzap_create_impl(os, obj, tx); - return (0); -} - -uint64_t -zap_create(objset_t *os, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); - - mzap_create_impl(os, obj, tx); - return (obj); -} - -int -zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) -{ - /* - * dmu_object_free will free the object number and free the - * data. Freeing the data will cause our pageout function to be - * called, which will destroy our data (zap_leaf_t's and zap_t). - */ - - return (dmu_object_free(os, zapobj, tx)); -} - -_NOTE(ARGSUSED(0)) -void -zap_evict(dmu_buf_t *db, void *vzap) -{ - zap_t *zap = vzap; - - rw_destroy(&zap->zap_rwlock); - - if (zap->zap_ismicro) - mze_destroy(zap); - else - mutex_destroy(&zap->zap_f.zap_num_entries_mtx); - - kmem_free(zap, sizeof (zap_t)); -} - -int -zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) -{ - zap_t *zap; - int err; - - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); - if (err) - return (err); - if (!zap->zap_ismicro) { - err = fzap_count(zap, count); - } else { - *count = zap->zap_m.zap_num_entries; - } - zap_unlockdir(zap); - return (err); -} - -/* - * Routines for maniplulating attributes. 
- */ - -int -zap_lookup(objset_t *os, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf) -{ - zap_t *zap; - int err; - mzap_ent_t *mze; - - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); - if (err) - return (err); - if (!zap->zap_ismicro) { - err = fzap_lookup(zap, name, - integer_size, num_integers, buf); - } else { - mze = mze_find(zap, name, zap_hash(zap, name)); - if (mze == NULL) { - err = ENOENT; - } else { - if (num_integers < 1) - err = EOVERFLOW; - else if (integer_size != 8) - err = EINVAL; - else - *(uint64_t *)buf = mze->mze_phys.mze_value; - } - } - zap_unlockdir(zap); - return (err); -} - -int -zap_length(objset_t *os, uint64_t zapobj, const char *name, - uint64_t *integer_size, uint64_t *num_integers) -{ - zap_t *zap; - int err; - mzap_ent_t *mze; - - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); - if (err) - return (err); - if (!zap->zap_ismicro) { - err = fzap_length(zap, name, integer_size, num_integers); - } else { - mze = mze_find(zap, name, zap_hash(zap, name)); - if (mze == NULL) { - err = ENOENT; - } else { - if (integer_size) - *integer_size = 8; - if (num_integers) - *num_integers = 1; - } - } - zap_unlockdir(zap); - return (err); -} - -static void -mzap_addent(zap_t *zap, const char *name, uint64_t hash, uint64_t value) -{ - int i; - int start = zap->zap_m.zap_alloc_next; - uint32_t cd; - - dprintf("obj=%llu %s=%llu\n", zap->zap_object, name, value); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - -#ifdef ZFS_DEBUG - for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { - mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; - ASSERT(strcmp(name, mze->mze_name) != 0); - } -#endif - - cd = mze_find_unused_cd(zap, hash); - /* given the limited size of the microzap, this can't happen */ - ASSERT(cd != ZAP_MAXCD); - -again: - for (i = start; i < zap->zap_m.zap_num_chunks; i++) { - mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; - if (mze->mze_name[0] == 0) { - mze->mze_value = value; - mze->mze_cd = cd; - (void) strcpy(mze->mze_name, name); - zap->zap_m.zap_num_entries++; - zap->zap_m.zap_alloc_next = i+1; - if (zap->zap_m.zap_alloc_next == - zap->zap_m.zap_num_chunks) - zap->zap_m.zap_alloc_next = 0; - mze_insert(zap, i, hash, mze); - return; - } - } - if (start != 0) { - start = 0; - goto again; - } - ASSERT(!"out of entries!"); -} - -int -zap_add(objset_t *os, uint64_t zapobj, const char *name, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - mzap_ent_t *mze; - const uint64_t *intval = val; - uint64_t hash; - - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); - if (err) - return (err); - if (!zap->zap_ismicro) { - err = fzap_add(zap, name, integer_size, num_integers, val, tx); - } else if (integer_size != 8 || num_integers != 1 || - strlen(name) >= MZAP_NAME_LEN) { - dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", - zapobj, integer_size, num_integers, name); - mzap_upgrade(zap, tx); - err = fzap_add(zap, name, integer_size, num_integers, val, tx); - } else { - hash = zap_hash(zap, name); - mze = mze_find(zap, name, hash); - if (mze != NULL) { - err = EEXIST; - } else { - mzap_addent(zap, name, hash, *intval); - } - } - zap_unlockdir(zap); - return (err); -} - -int -zap_update(objset_t *os, uint64_t zapobj, const char *name, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - mzap_ent_t *mze; - const uint64_t *intval = val; - uint64_t hash; - int err; - - err = 
zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); - if (err) - return (err); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - if (!zap->zap_ismicro) { - err = fzap_update(zap, name, - integer_size, num_integers, val, tx); - } else if (integer_size != 8 || num_integers != 1 || - strlen(name) >= MZAP_NAME_LEN) { - dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", - zapobj, integer_size, num_integers, name); - mzap_upgrade(zap, tx); - err = fzap_update(zap, name, - integer_size, num_integers, val, tx); - } else { - hash = zap_hash(zap, name); - mze = mze_find(zap, name, hash); - if (mze != NULL) { - mze->mze_phys.mze_value = *intval; - zap->zap_m.zap_phys->mz_chunk - [mze->mze_chunkid].mze_value = *intval; - } else { - mzap_addent(zap, name, hash, *intval); - } - } - zap_unlockdir(zap); - return (err); -} - -int -zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - mzap_ent_t *mze; - - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); - if (err) - return (err); - if (!zap->zap_ismicro) { - err = fzap_remove(zap, name, tx); - } else { - mze = mze_find(zap, name, zap_hash(zap, name)); - if (mze == NULL) { - dprintf("fail: %s\n", name); - err = ENOENT; - } else { - dprintf("success: %s\n", name); - zap->zap_m.zap_num_entries--; - bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], - sizeof (mzap_ent_phys_t)); - mze_remove(zap, mze); - } - } - zap_unlockdir(zap); - return (err); -} - - -/* - * Routines for iterating over the attributes. - */ - -/* - * We want to keep the high 32 bits of the cursor zero if we can, so - * that 32-bit programs can access this. So use a small hash value so - * we can fit 4 bits of cd into the 32-bit cursor. - * - * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ] - */ -void -zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, - uint64_t serialized) -{ - zc->zc_objset = os; - zc->zc_zap = NULL; - zc->zc_leaf = NULL; - zc->zc_zapobj = zapobj; - if (serialized == -1ULL) { - zc->zc_hash = -1ULL; - zc->zc_cd = 0; - } else { - zc->zc_hash = serialized << (64-ZAP_HASHBITS); - zc->zc_cd = serialized >> ZAP_HASHBITS; - if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */ - zc->zc_cd = 0; - } -} - -void -zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) -{ - zap_cursor_init_serialized(zc, os, zapobj, 0); -} - -void -zap_cursor_fini(zap_cursor_t *zc) -{ - if (zc->zc_zap) { - rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - zap_unlockdir(zc->zc_zap); - zc->zc_zap = NULL; - } - if (zc->zc_leaf) { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - } - zc->zc_objset = NULL; -} - -uint64_t -zap_cursor_serialize(zap_cursor_t *zc) -{ - if (zc->zc_hash == -1ULL) - return (-1ULL); - ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0); - ASSERT(zc->zc_cd < ZAP_MAXCD); - return ((zc->zc_hash >> (64-ZAP_HASHBITS)) | - ((uint64_t)zc->zc_cd << ZAP_HASHBITS)); -} - -int -zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) -{ - int err; - avl_index_t idx; - mzap_ent_t mze_tofind; - mzap_ent_t *mze; - - if (zc->zc_hash == -1ULL) - return (ENOENT); - - if (zc->zc_zap == NULL) { - err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, - RW_READER, TRUE, &zc->zc_zap); - if (err) - return (err); - } else { - rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - } - if (!zc->zc_zap->zap_ismicro) { - err = fzap_cursor_retrieve(zc->zc_zap, zc, za); - } else { - err = ENOENT; - - mze_tofind.mze_hash = zc->zc_hash; - 
mze_tofind.mze_phys.mze_cd = zc->zc_cd; - - mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); - ASSERT(mze == NULL || 0 == bcmp(&mze->mze_phys, - &zc->zc_zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], - sizeof (mze->mze_phys))); - if (mze == NULL) { - mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, - idx, AVL_AFTER); - } - if (mze) { - za->za_integer_length = 8; - za->za_num_integers = 1; - za->za_first_integer = mze->mze_phys.mze_value; - (void) strcpy(za->za_name, mze->mze_phys.mze_name); - zc->zc_hash = mze->mze_hash; - zc->zc_cd = mze->mze_phys.mze_cd; - err = 0; - } else { - zc->zc_hash = -1ULL; - } - } - rw_exit(&zc->zc_zap->zap_rwlock); - return (err); -} - -void -zap_cursor_advance(zap_cursor_t *zc) -{ - if (zc->zc_hash == -1ULL) - return; - zc->zc_cd++; - if (zc->zc_cd >= ZAP_MAXCD) { - zc->zc_cd = 0; - zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS); - if (zc->zc_hash == 0) /* EOF */ - zc->zc_hash = -1ULL; - } -} - -int -zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) -{ - int err; - zap_t *zap; - - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); - if (err) - return (err); - - bzero(zs, sizeof (zap_stats_t)); - - if (zap->zap_ismicro) { - zs->zs_blocksize = zap->zap_dbuf->db_size; - zs->zs_num_entries = zap->zap_m.zap_num_entries; - zs->zs_num_blocks = 1; - } else { - fzap_get_stats(zap, zs); - } - zap_unlockdir(zap); - return (0); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf deleted file mode 100644 index 0988190..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf +++ /dev/null @@ -1,28 +0,0 @@ -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END -# -# -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# -# ident "%Z%%M% %I% %E% SMI" -# -name="zfs" parent="pseudo"; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c deleted file mode 100644 index dd94618..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c +++ /dev/null @@ -1,1608 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/time.h> -#include <sys/systm.h> -#include <sys/sysmacros.h> -#include <sys/resource.h> -#include <sys/vfs.h> -#include <sys/vnode.h> -#include <sys/file.h> -#include <sys/stat.h> -#include <sys/kmem.h> -#include <sys/cmn_err.h> -#include <sys/errno.h> -#include <sys/unistd.h> -#include <sys/sdt.h> -#include <sys/fs/zfs.h> -#include <sys/policy.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_acl.h> -#include <sys/zfs_dir.h> -#include <sys/zfs_vfsops.h> -#include <sys/dmu.h> -#include <sys/zap.h> -#include <acl/acl_common.h> - -#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE -#define DENY ACE_ACCESS_DENIED_ACE_TYPE - -#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) -#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ - ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) -#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) -#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) -#define WRITE_MASK (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS| \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|ACE_WRITE_OWNER) - -#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ - ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) - -#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ - ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) - -#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ - ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE) - -#define SECURE_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) - -#define OGE_PAD 6 /* traditional owner/group/everyone ACES */ - -static int zfs_ace_can_use(znode_t *zp, ace_t *); - -static zfs_acl_t * -zfs_acl_alloc(int slots) -{ - zfs_acl_t *aclp; - - aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); - if (slots != 0) { - aclp->z_acl = kmem_alloc(ZFS_ACL_SIZE(slots), KM_SLEEP); - aclp->z_acl_count = 0; - aclp->z_state = ACL_DATA_ALLOCED; - } else { - aclp->z_state = 0; - } - aclp->z_slots = slots; - return (aclp); -} - -void -zfs_acl_free(zfs_acl_t *aclp) -{ - if (aclp->z_state == ACL_DATA_ALLOCED) { - kmem_free(aclp->z_acl, ZFS_ACL_SIZE(aclp->z_slots)); - } - kmem_free(aclp, sizeof (zfs_acl_t)); -} - -static uint32_t -zfs_v4_to_unix(uint32_t access_mask) -{ - uint32_t new_mask = 0; - - /* - * This is used for mapping v4 permissions into permissions - * that can be passed to secpolicy_vnode_access() - */ - if (access_mask & (ACE_READ_DATA | ACE_LIST_DIRECTORY | - ACE_READ_ATTRIBUTES | ACE_READ_ACL)) - new_mask |= S_IROTH; - if (access_mask & (ACE_WRITE_DATA | ACE_APPEND_DATA | - ACE_WRITE_ATTRIBUTES | ACE_ADD_FILE | ACE_WRITE_NAMED_ATTRS)) - new_mask |= S_IWOTH; - if (access_mask & (ACE_EXECUTE | ACE_READ_NAMED_ATTRS)) - new_mask |= S_IXOTH; - - return (new_mask); -} - -/* - * Convert unix access mask to v4 access mask - */ -static uint32_t -zfs_unix_to_v4(uint32_t access_mask) -{ - uint32_t new_mask = 0; - - if (access_mask & 01) - new_mask |= (ACE_EXECUTE); - if (access_mask & 02) { - new_mask |= (ACE_WRITE_DATA); - } if (access_mask & 04) 
{ - new_mask |= ACE_READ_DATA; - } - return (new_mask); -} - -static void -zfs_set_ace(ace_t *zacep, uint32_t access_mask, int access_type, - uid_t uid, int entry_type) -{ - zacep->a_access_mask = access_mask; - zacep->a_type = access_type; - zacep->a_who = uid; - zacep->a_flags = entry_type; -} - -static uint64_t -zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) -{ - int i; - int entry_type; - mode_t mode = (zp->z_phys->zp_mode & - (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); - mode_t seen = 0; - ace_t *acep; - - for (i = 0, acep = aclp->z_acl; - i != aclp->z_acl_count; i++, acep++) { - entry_type = (acep->a_flags & ACE_TYPE_FLAGS); - if (entry_type == ACE_OWNER) { - if ((acep->a_access_mask & ACE_READ_DATA) && - (!(seen & S_IRUSR))) { - seen |= S_IRUSR; - if (acep->a_type == ALLOW) { - mode |= S_IRUSR; - } - } - if ((acep->a_access_mask & ACE_WRITE_DATA) && - (!(seen & S_IWUSR))) { - seen |= S_IWUSR; - if (acep->a_type == ALLOW) { - mode |= S_IWUSR; - } - } - if ((acep->a_access_mask & ACE_EXECUTE) && - (!(seen & S_IXUSR))) { - seen |= S_IXUSR; - if (acep->a_type == ALLOW) { - mode |= S_IXUSR; - } - } - } else if (entry_type == OWNING_GROUP) { - if ((acep->a_access_mask & ACE_READ_DATA) && - (!(seen & S_IRGRP))) { - seen |= S_IRGRP; - if (acep->a_type == ALLOW) { - mode |= S_IRGRP; - } - } - if ((acep->a_access_mask & ACE_WRITE_DATA) && - (!(seen & S_IWGRP))) { - seen |= S_IWGRP; - if (acep->a_type == ALLOW) { - mode |= S_IWGRP; - } - } - if ((acep->a_access_mask & ACE_EXECUTE) && - (!(seen & S_IXGRP))) { - seen |= S_IXGRP; - if (acep->a_type == ALLOW) { - mode |= S_IXGRP; - } - } - } else if (entry_type == ACE_EVERYONE) { - if ((acep->a_access_mask & ACE_READ_DATA)) { - if (!(seen & S_IRUSR)) { - seen |= S_IRUSR; - if (acep->a_type == ALLOW) { - mode |= S_IRUSR; - } - } - if (!(seen & S_IRGRP)) { - seen |= S_IRGRP; - if (acep->a_type == ALLOW) { - mode |= S_IRGRP; - } - } - if (!(seen & S_IROTH)) { - seen |= S_IROTH; - if (acep->a_type == ALLOW) { - mode |= S_IROTH; - } - } - } - if ((acep->a_access_mask & ACE_WRITE_DATA)) { - if (!(seen & S_IWUSR)) { - seen |= S_IWUSR; - if (acep->a_type == ALLOW) { - mode |= S_IWUSR; - } - } - if (!(seen & S_IWGRP)) { - seen |= S_IWGRP; - if (acep->a_type == ALLOW) { - mode |= S_IWGRP; - } - } - if (!(seen & S_IWOTH)) { - seen |= S_IWOTH; - if (acep->a_type == ALLOW) { - mode |= S_IWOTH; - } - } - } - if ((acep->a_access_mask & ACE_EXECUTE)) { - if (!(seen & S_IXUSR)) { - seen |= S_IXUSR; - if (acep->a_type == ALLOW) { - mode |= S_IXUSR; - } - } - if (!(seen & S_IXGRP)) { - seen |= S_IXGRP; - if (acep->a_type == ALLOW) { - mode |= S_IXGRP; - } - } - if (!(seen & S_IXOTH)) { - seen |= S_IXOTH; - if (acep->a_type == ALLOW) { - mode |= S_IXOTH; - } - } - } - } - } - return (mode); -} - -static zfs_acl_t * -zfs_acl_node_read_internal(znode_t *zp) -{ - zfs_acl_t *aclp; - - aclp = zfs_acl_alloc(0); - aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; - aclp->z_acl = &zp->z_phys->zp_acl.z_ace_data[0]; - - return (aclp); -} - -/* - * Read an external acl object. 
- */ -static int -zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp) -{ - uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj; - zfs_acl_t *aclp; - int error; - - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - - if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { - *aclpp = zfs_acl_node_read_internal(zp); - return (0); - } - - aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count); - - error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, - ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl); - if (error != 0) { - zfs_acl_free(aclp); - return (error); - } - - aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; - - *aclpp = aclp; - return (0); -} - -static boolean_t -zfs_acl_valid(znode_t *zp, ace_t *uace, int aclcnt, int *inherit) -{ - ace_t *acep; - int i; - - *inherit = 0; - - if (aclcnt > MAX_ACL_ENTRIES || aclcnt <= 0) { - return (B_FALSE); - } - - for (i = 0, acep = uace; i != aclcnt; i++, acep++) { - - /* - * first check type of entry - */ - - switch (acep->a_flags & ACE_TYPE_FLAGS) { - case ACE_OWNER: - acep->a_who = -1; - break; - case (ACE_IDENTIFIER_GROUP | ACE_GROUP): - case ACE_IDENTIFIER_GROUP: - if (acep->a_flags & ACE_GROUP) { - acep->a_who = -1; - } - break; - case ACE_EVERYONE: - acep->a_who = -1; - break; - } - - /* - * next check inheritance level flags - */ - - if (acep->a_type != ALLOW && acep->a_type != DENY) - return (B_FALSE); - - /* - * Only directories should have inheritance flags. - */ - if (ZTOV(zp)->v_type != VDIR && (acep->a_flags & - (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE| - ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) { - return (B_FALSE); - } - - if (acep->a_flags & - (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)) - *inherit = 1; - - if (acep->a_flags & - (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { - if ((acep->a_flags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE)) == 0) { - return (B_FALSE); - } - } - } - - return (B_TRUE); -} -/* - * common code for setting acl's. - * - * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. - * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's - * already checked the acl and knows whether to inherit. - */ -int -zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, dmu_tx_t *tx, int *ihp) -{ - int inherit = 0; - int error; - znode_phys_t *zphys = zp->z_phys; - zfs_znode_acl_t *zacl = &zphys->zp_acl; - uint32_t acl_phys_size = ZFS_ACL_SIZE(aclp->z_acl_count); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t aoid = zphys->zp_acl.z_acl_extern_obj; - - ASSERT(MUTEX_HELD(&zp->z_lock)); - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - - if (ihp) - inherit = *ihp; /* already determined by caller */ - else if (!zfs_acl_valid(zp, aclp->z_acl, - aclp->z_acl_count, &inherit)) { - return (EINVAL); - } - - dmu_buf_will_dirty(zp->z_dbuf, tx); - - /* - * Will ACL fit internally? - */ - if (aclp->z_acl_count > ACE_SLOT_CNT) { - if (aoid == 0) { - aoid = dmu_object_alloc(zfsvfs->z_os, - DMU_OT_ACL, acl_phys_size, DMU_OT_NONE, 0, tx); - } else { - (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, - acl_phys_size, 0, tx); - } - zphys->zp_acl.z_acl_extern_obj = aoid; - zphys->zp_acl.z_acl_count = aclp->z_acl_count; - dmu_write(zfsvfs->z_os, aoid, 0, - acl_phys_size, aclp->z_acl, tx); - } else { - /* - * Migrating back embedded? 
- */ - if (zphys->zp_acl.z_acl_extern_obj) { - error = dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx); - if (error) - return (error); - zphys->zp_acl.z_acl_extern_obj = 0; - } - bcopy(aclp->z_acl, zacl->z_ace_data, - aclp->z_acl_count * sizeof (ace_t)); - zacl->z_acl_count = aclp->z_acl_count; - } - - zp->z_phys->zp_flags &= ~(ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE); - if (inherit) { - zp->z_phys->zp_flags |= ZFS_INHERIT_ACE; - } else if (ace_trivial(zacl->z_ace_data, zacl->z_acl_count) == 0) { - zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; - } - - zphys->zp_mode = zfs_mode_compute(zp, aclp); - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); - - return (0); -} - -/* - * Create space for slots_needed ACEs to be append - * to aclp. - */ -static void -zfs_acl_append(zfs_acl_t *aclp, int slots_needed) -{ - ace_t *newacep; - ace_t *oldaclp; - int slot_cnt; - int slots_left = aclp->z_slots - aclp->z_acl_count; - - if (aclp->z_state == ACL_DATA_ALLOCED) - ASSERT(aclp->z_slots >= aclp->z_acl_count); - if (slots_left < slots_needed || aclp->z_state != ACL_DATA_ALLOCED) { - slot_cnt = aclp->z_slots + 1 + (slots_needed - slots_left); - newacep = kmem_alloc(ZFS_ACL_SIZE(slot_cnt), KM_SLEEP); - bcopy(aclp->z_acl, newacep, - ZFS_ACL_SIZE(aclp->z_acl_count)); - oldaclp = aclp->z_acl; - if (aclp->z_state == ACL_DATA_ALLOCED) - kmem_free(oldaclp, ZFS_ACL_SIZE(aclp->z_slots)); - aclp->z_acl = newacep; - aclp->z_slots = slot_cnt; - aclp->z_state = ACL_DATA_ALLOCED; - } -} - -/* - * Remove "slot" ACE from aclp - */ -static void -zfs_ace_remove(zfs_acl_t *aclp, int slot) -{ - if (aclp->z_acl_count > 1) { - (void) memmove(&aclp->z_acl[slot], - &aclp->z_acl[slot +1], sizeof (ace_t) * - (--aclp->z_acl_count - slot)); - } else - aclp->z_acl_count--; -} - -/* - * Update access mask for prepended ACE - * - * This applies the "groupmask" value for aclmode property. - */ -static void -zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner) -{ - - int rmask, wmask, xmask; - int user_ace; - - user_ace = (!(acep->a_flags & - (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP))); - - if (user_ace && (acep->a_who == owner)) { - rmask = S_IRUSR; - wmask = S_IWUSR; - xmask = S_IXUSR; - } else { - rmask = S_IRGRP; - wmask = S_IWGRP; - xmask = S_IXGRP; - } - - if (origacep->a_access_mask & ACE_READ_DATA) { - if (mode & rmask) - acep->a_access_mask &= ~ACE_READ_DATA; - else - acep->a_access_mask |= ACE_READ_DATA; - } - - if (origacep->a_access_mask & ACE_WRITE_DATA) { - if (mode & wmask) - acep->a_access_mask &= ~ACE_WRITE_DATA; - else - acep->a_access_mask |= ACE_WRITE_DATA; - } - - if (origacep->a_access_mask & ACE_APPEND_DATA) { - if (mode & wmask) - acep->a_access_mask &= ~ACE_APPEND_DATA; - else - acep->a_access_mask |= ACE_APPEND_DATA; - } - - if (origacep->a_access_mask & ACE_EXECUTE) { - if (mode & xmask) - acep->a_access_mask &= ~ACE_EXECUTE; - else - acep->a_access_mask |= ACE_EXECUTE; - } -} - -/* - * Apply mode to canonical six ACEs. 
- */ -static void -zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode) -{ - int cnt; - ace_t *acep; - - cnt = aclp->z_acl_count -1; - acep = aclp->z_acl; - - /* - * Fixup final ACEs to match the mode - */ - - ASSERT(cnt >= 5); - adjust_ace_pair(&acep[cnt - 1], mode); /* everyone@ */ - adjust_ace_pair(&acep[cnt - 3], (mode & 0070) >> 3); /* group@ */ - adjust_ace_pair(&acep[cnt - 5], (mode & 0700) >> 6); /* owner@ */ -} - - -static int -zfs_acl_ace_match(ace_t *acep, int allow_deny, int type, int mask) -{ - return (acep->a_access_mask == mask && acep->a_type == allow_deny && - ((acep->a_flags & ACE_TYPE_FLAGS) == type)); -} - -/* - * Can prepended ACE be reused? - */ -static int -zfs_reuse_deny(ace_t *acep, int i) -{ - int okay_masks; - - if (i < 1) - return (B_FALSE); - - if (acep[i-1].a_type != DENY) - return (B_FALSE); - - if (acep[i-1].a_flags != (acep[i].a_flags & ACE_IDENTIFIER_GROUP)) - return (B_FALSE); - - okay_masks = (acep[i].a_access_mask & OKAY_MASK_BITS); - - if (acep[i-1].a_access_mask & ~okay_masks) - return (B_FALSE); - - return (B_TRUE); -} - -/* - * Create space to prepend an ACE - */ -static void -zfs_acl_prepend(zfs_acl_t *aclp, int i) -{ - ace_t *oldaclp = NULL; - ace_t *to, *from; - int slots_left = aclp->z_slots - aclp->z_acl_count; - int oldslots; - int need_free = 0; - - if (aclp->z_state == ACL_DATA_ALLOCED) - ASSERT(aclp->z_slots >= aclp->z_acl_count); - - if (slots_left == 0 || aclp->z_state != ACL_DATA_ALLOCED) { - - to = kmem_alloc(ZFS_ACL_SIZE(aclp->z_acl_count + - OGE_PAD), KM_SLEEP); - if (aclp->z_state == ACL_DATA_ALLOCED) - need_free++; - from = aclp->z_acl; - oldaclp = aclp->z_acl; - (void) memmove(to, from, - sizeof (ace_t) * aclp->z_acl_count); - aclp->z_state = ACL_DATA_ALLOCED; - } else { - from = aclp->z_acl; - to = aclp->z_acl; - } - - - (void) memmove(&to[i + 1], &from[i], - sizeof (ace_t) * (aclp->z_acl_count - i)); - - if (oldaclp) { - aclp->z_acl = to; - oldslots = aclp->z_slots; - aclp->z_slots = aclp->z_acl_count + OGE_PAD; - if (need_free) - kmem_free(oldaclp, ZFS_ACL_SIZE(oldslots)); - } - -} - -/* - * Prepend deny ACE - */ -static void -zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, int i, - mode_t mode) -{ - ace_t *acep; - - zfs_acl_prepend(aclp, i); - - acep = aclp->z_acl; - zfs_set_ace(&acep[i], 0, DENY, acep[i + 1].a_who, - (acep[i + 1].a_flags & ACE_TYPE_FLAGS)); - zfs_acl_prepend_fixup(&acep[i], &acep[i+1], mode, zp->z_phys->zp_uid); - aclp->z_acl_count++; -} - -/* - * Split an inherited ACE into inherit_only ACE - * and original ACE with inheritance flags stripped off. - */ -static void -zfs_acl_split_ace(zfs_acl_t *aclp, int i) -{ - ace_t *acep = aclp->z_acl; - - zfs_acl_prepend(aclp, i); - acep = aclp->z_acl; - acep[i] = acep[i + 1]; - acep[i].a_flags |= ACE_INHERIT_ONLY_ACE; - acep[i + 1].a_flags &= ~ALL_INHERIT; - aclp->z_acl_count++; -} - -/* - * Are ACES started at index i, the canonical six ACES? 
- */ -static int -zfs_have_canonical_six(zfs_acl_t *aclp, int i) -{ - ace_t *acep = aclp->z_acl; - - if ((zfs_acl_ace_match(&acep[i], - DENY, ACE_OWNER, 0) && - zfs_acl_ace_match(&acep[i + 1], ALLOW, ACE_OWNER, - OWNER_ALLOW_MASK) && zfs_acl_ace_match(&acep[i + 2], - DENY, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 3], - ALLOW, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 4], - DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) && - zfs_acl_ace_match(&acep[i + 5], ALLOW, ACE_EVERYONE, - EVERYONE_ALLOW_MASK))) { - return (1); - } else { - return (0); - } -} - -/* - * Apply step 1g, to group entries - * - * Need to deal with corner case where group may have - * greater permissions than owner. If so then limit - * group permissions, based on what extra permissions - * group has. - */ -static void -zfs_fixup_group_entries(ace_t *acep, mode_t mode) -{ - mode_t extramode = (mode >> 3) & 07; - mode_t ownermode = (mode >> 6); - - if (acep[0].a_flags & ACE_IDENTIFIER_GROUP) { - - extramode &= ~ownermode; - - if (extramode) { - if (extramode & 04) { - acep[0].a_access_mask &= ~ACE_READ_DATA; - acep[1].a_access_mask &= ~ACE_READ_DATA; - } - if (extramode & 02) { - acep[0].a_access_mask &= - ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - acep[1].a_access_mask &= - ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - } - if (extramode & 01) { - acep[0].a_access_mask &= ~ACE_EXECUTE; - acep[1].a_access_mask &= ~ACE_EXECUTE; - } - } - } -} - -/* - * Apply the chmod algorithm as described - * in PSARC/2002/240 - */ -static int -zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp, - dmu_tx_t *tx) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ace_t *acep; - int i; - int error; - int entry_type; - int reuse_deny; - int need_canonical_six = 1; - int inherit = 0; - int iflags; - - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - ASSERT(MUTEX_HELD(&zp->z_lock)); - - i = 0; - while (i < aclp->z_acl_count) { - acep = aclp->z_acl; - entry_type = (acep[i].a_flags & ACE_TYPE_FLAGS); - iflags = (acep[i].a_flags & ALL_INHERIT); - - if ((acep[i].a_type != ALLOW && acep[i].a_type != DENY) || - (iflags & ACE_INHERIT_ONLY_ACE)) { - i++; - if (iflags) - inherit = 1; - continue; - } - - - if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) { - zfs_ace_remove(aclp, i); - continue; - } - - /* - * Need to split ace into two? - */ - if ((iflags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE)) && - (!(iflags & ACE_INHERIT_ONLY_ACE))) { - zfs_acl_split_ace(aclp, i); - i++; - inherit = 1; - continue; - } - - if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || - (entry_type == OWNING_GROUP)) { - acep[i].a_access_mask &= ~OGE_CLEAR; - i++; - continue; - - } else { - if (acep[i].a_type == ALLOW) { - - /* - * Check preceding ACE if any, to see - * if we need to prepend a DENY ACE. - * This is only applicable when the acl_mode - * property == groupmask. - */ - if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) { - - reuse_deny = zfs_reuse_deny(acep, i); - - if (reuse_deny == B_FALSE) { - zfs_acl_prepend_deny(zp, aclp, - i, mode); - i++; - acep = aclp->z_acl; - } else { - zfs_acl_prepend_fixup( - &acep[i - 1], - &acep[i], mode, - zp->z_phys->zp_uid); - } - zfs_fixup_group_entries(&acep[i - 1], - mode); - } - } - i++; - } - } - - /* - * Check out last six aces, if we have six. 
- */ - - if (aclp->z_acl_count >= 6) { - i = aclp->z_acl_count - 6; - - if (zfs_have_canonical_six(aclp, i)) { - need_canonical_six = 0; - } - } - - if (need_canonical_six) { - - zfs_acl_append(aclp, 6); - i = aclp->z_acl_count; - acep = aclp->z_acl; - zfs_set_ace(&acep[i++], 0, DENY, -1, ACE_OWNER); - zfs_set_ace(&acep[i++], OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER); - zfs_set_ace(&acep[i++], 0, DENY, -1, OWNING_GROUP); - zfs_set_ace(&acep[i++], 0, ALLOW, -1, OWNING_GROUP); - zfs_set_ace(&acep[i++], EVERYONE_DENY_MASK, - DENY, -1, ACE_EVERYONE); - zfs_set_ace(&acep[i++], EVERYONE_ALLOW_MASK, - ALLOW, -1, ACE_EVERYONE); - aclp->z_acl_count += 6; - } - - zfs_acl_fixup_canonical_six(aclp, mode); - - zp->z_phys->zp_mode = mode; - error = zfs_aclset_common(zp, aclp, tx, &inherit); - return (error); -} - - -int -zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx) -{ - zfs_acl_t *aclp = NULL; - int error; - - ASSERT(MUTEX_HELD(&zp->z_lock)); - mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, &aclp); - if (error == 0) - error = zfs_acl_chmod(zp, mode, aclp, tx); - mutex_exit(&zp->z_acl_lock); - if (aclp) - zfs_acl_free(aclp); - return (error); -} - -/* - * strip off write_owner and write_acl - */ -static void -zfs_securemode_update(zfsvfs_t *zfsvfs, ace_t *acep) -{ - if ((zfsvfs->z_acl_inherit == ZFS_ACL_SECURE) && - (acep->a_type == ALLOW)) - acep->a_access_mask &= ~SECURE_CLEAR; -} - -/* - * inherit inheritable ACEs from parent - */ -static zfs_acl_t * -zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ace_t *pacep; - ace_t *acep; - int ace_cnt = 0; - int pace_cnt; - int i, j; - zfs_acl_t *aclp = NULL; - - i = j = 0; - pace_cnt = paclp->z_acl_count; - pacep = paclp->z_acl; - if (zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) { - for (i = 0; i != pace_cnt; i++) { - - if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW && - pacep[i].a_type == ALLOW) - continue; - - if (zfs_ace_can_use(zp, &pacep[i])) { - ace_cnt++; - if (!(pacep[i].a_flags & - ACE_NO_PROPAGATE_INHERIT_ACE)) - ace_cnt++; - } - } - } - - aclp = zfs_acl_alloc(ace_cnt + OGE_PAD); - if (ace_cnt && zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) { - acep = aclp->z_acl; - pacep = paclp->z_acl; - for (i = 0; i != pace_cnt; i++) { - - if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW && - pacep[i].a_type == ALLOW) - continue; - - if (zfs_ace_can_use(zp, &pacep[i])) { - - /* - * Now create entry for inherited ace - */ - - acep[j] = pacep[i]; - - /* - * When AUDIT/ALARM a_types are supported - * they should be inherited here. - */ - - if ((pacep[i].a_flags & - ACE_NO_PROPAGATE_INHERIT_ACE) || - (ZTOV(zp)->v_type != VDIR)) { - acep[j].a_flags &= ~ALL_INHERIT; - zfs_securemode_update(zfsvfs, &acep[j]); - j++; - continue; - } - - ASSERT(ZTOV(zp)->v_type == VDIR); - - /* - * If we are inheriting an ACE targeted for - * only files, then make sure inherit_only - * is on for future propagation. - */ - if ((pacep[i].a_flags & (ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE)) != - ACE_FILE_INHERIT_ACE) { - j++; - acep[j] = acep[j-1]; - acep[j-1].a_flags |= - ACE_INHERIT_ONLY_ACE; - acep[j].a_flags &= ~ALL_INHERIT; - } else { - acep[j].a_flags |= ACE_INHERIT_ONLY_ACE; - } - zfs_securemode_update(zfsvfs, &acep[j]); - j++; - } - } - } - aclp->z_acl_count = j; - ASSERT(aclp->z_slots >= aclp->z_acl_count); - - return (aclp); -} - -/* - * Create file system object initial permissions - * including inheritable ACEs. 
- */ -void -zfs_perm_init(znode_t *zp, znode_t *parent, int flag, - vattr_t *vap, dmu_tx_t *tx, cred_t *cr) -{ - uint64_t mode; - uid_t uid; - gid_t gid; - int error; - int pull_down; - zfs_acl_t *aclp, *paclp; - - mode = MAKEIMODE(vap->va_type, vap->va_mode); - - /* - * Determine uid and gid. - */ - if ((flag & (IS_ROOT_NODE | IS_REPLAY)) || - ((flag & IS_XATTR) && (vap->va_type == VDIR))) { - uid = vap->va_uid; - gid = vap->va_gid; - } else { - uid = crgetuid(cr); - if ((vap->va_mask & AT_GID) && - ((vap->va_gid == parent->z_phys->zp_gid) || - groupmember(vap->va_gid, cr) || - secpolicy_vnode_create_gid(cr) == 0)) - gid = vap->va_gid; - else -#ifdef __FreeBSD__ - gid = parent->z_phys->zp_gid; -#else - gid = (parent->z_phys->zp_mode & S_ISGID) ? - parent->z_phys->zp_gid : crgetgid(cr); -#endif - } - - /* - * If we're creating a directory, and the parent directory has the - * set-GID bit set, set in on the new directory. - * Otherwise, if the user is neither privileged nor a member of the - * file's new group, clear the file's set-GID bit. - */ - - if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) - mode |= S_ISGID; - else { - if ((mode & S_ISGID) && - secpolicy_vnode_setids_setgids(cr, gid) != 0) - mode &= ~S_ISGID; - } - - zp->z_phys->zp_uid = uid; - zp->z_phys->zp_gid = gid; - zp->z_phys->zp_mode = mode; - - mutex_enter(&parent->z_lock); - pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE); - if (pull_down) { - mutex_enter(&parent->z_acl_lock); - VERIFY(0 == zfs_acl_node_read(parent, &paclp)); - mutex_exit(&parent->z_acl_lock); - aclp = zfs_acl_inherit(zp, paclp); - zfs_acl_free(paclp); - } else { - aclp = zfs_acl_alloc(6); - } - mutex_exit(&parent->z_lock); - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); - error = zfs_acl_chmod(zp, mode, aclp, tx); - mutex_exit(&zp->z_lock); - mutex_exit(&zp->z_acl_lock); - ASSERT3U(error, ==, 0); - zfs_acl_free(aclp); -} - -/* - * Should ACE be inherited? - */ -static int -zfs_ace_can_use(znode_t *zp, ace_t *acep) -{ - int vtype = ZTOV(zp)->v_type; - - int iflags = (acep->a_flags & 0xf); - - if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) - return (1); - else if (iflags & ACE_FILE_INHERIT_ACE) - return (!((vtype == VDIR) && - (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); - return (0); -} - -#ifdef TODO -/* - * Retrieve a files ACL - */ -int -zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) -{ - zfs_acl_t *aclp; - ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); - int error; - - if (error = zfs_zaccess(zp, ACE_READ_ACL, cr)) { - /* - * If owner of file then allow reading of the - * ACL. 
- */ - if (crgetuid(cr) != zp->z_phys->zp_uid) - return (error); - } - - if (mask == 0) - return (ENOSYS); - - mutex_enter(&zp->z_acl_lock); - - error = zfs_acl_node_read(zp, &aclp); - if (error != 0) { - mutex_exit(&zp->z_acl_lock); - return (error); - } - - - if (mask & VSA_ACECNT) { - vsecp->vsa_aclcnt = aclp->z_acl_count; - } - - if (mask & VSA_ACE) { - vsecp->vsa_aclentp = kmem_alloc(aclp->z_acl_count * - sizeof (ace_t), KM_SLEEP); - bcopy(aclp->z_acl, vsecp->vsa_aclentp, - aclp->z_acl_count * sizeof (ace_t)); - } - - mutex_exit(&zp->z_acl_lock); - - zfs_acl_free(aclp); - - return (0); -} -#endif /* TODO */ - -#ifdef TODO -/* - * Set a files ACL - */ -int -zfs_setacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - ace_t *acep = vsecp->vsa_aclentp; - int aclcnt = vsecp->vsa_aclcnt; - ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); - dmu_tx_t *tx; - int error; - int inherit; - zfs_acl_t *aclp; - - if (mask == 0) - return (EINVAL); - - if (!zfs_acl_valid(zp, acep, aclcnt, &inherit)) - return (EINVAL); -top: - error = zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr); - if (error == EACCES || error == ACCESS_UNDETERMINED) { - if ((error = secpolicy_vnode_setdac(cr, - zp->z_phys->zp_uid)) != 0) { - return (error); - } - } else if (error) { - return (error == EROFS ? error : EPERM); - } - - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - - if (zp->z_phys->zp_acl.z_acl_extern_obj) { - dmu_tx_hold_write(tx, zp->z_phys->zp_acl.z_acl_extern_obj, - 0, ZFS_ACL_SIZE(aclcnt)); - } else if (aclcnt > ACE_SLOT_CNT) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ZFS_ACL_SIZE(aclcnt)); - } - - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - mutex_exit(&zp->z_acl_lock); - mutex_exit(&zp->z_lock); - - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - return (error); - } - - aclp = zfs_acl_alloc(aclcnt); - bcopy(acep, aclp->z_acl, sizeof (ace_t) * aclcnt); - aclp->z_acl_count = aclcnt; - error = zfs_aclset_common(zp, aclp, tx, &inherit); - ASSERT(error == 0); - - zfs_acl_free(aclp); - zfs_log_acl(zilog, tx, TX_ACL, zp, aclcnt, acep); - dmu_tx_commit(tx); -done: - mutex_exit(&zp->z_acl_lock); - mutex_exit(&zp->z_lock); - - return (error); -} -#endif /* TODO */ - -static int -zfs_ace_access(ace_t *zacep, int *working_mode) -{ - if (*working_mode == 0) { - return (0); - } - - if (zacep->a_access_mask & *working_mode) { - if (zacep->a_type == ALLOW) { - *working_mode &= - ~(*working_mode & zacep->a_access_mask); - if (*working_mode == 0) - return (0); - } else if (zacep->a_type == DENY) { - return (EACCES); - } - } - - /* - * haven't been specifcally denied at this point - * so return UNDETERMINED. 
- */ - - return (ACCESS_UNDETERMINED); -} - - -static int -zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr) -{ - zfs_acl_t *aclp; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ace_t *zacep; - gid_t gid; - int cnt; - int i; - int error; - int access_deny = ACCESS_UNDETERMINED; - uint_t entry_type; - uid_t uid = crgetuid(cr); - - if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ - *working_mode = 0; - return (0); - } - - *working_mode = v4_mode; - - if ((v4_mode & WRITE_MASK) && - (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && - (!IS_DEVVP(ZTOV(zp)))) { - return (EROFS); - } - - mutex_enter(&zp->z_acl_lock); - - error = zfs_acl_node_read(zp, &aclp); - if (error != 0) { - mutex_exit(&zp->z_acl_lock); - return (error); - } - - - zacep = aclp->z_acl; - cnt = aclp->z_acl_count; - - for (i = 0; i != cnt; i++) { - - DTRACE_PROBE2(zfs__access__common, - ace_t *, &zacep[i], int, *working_mode); - - if (zacep[i].a_flags & ACE_INHERIT_ONLY_ACE) - continue; - - entry_type = (zacep[i].a_flags & ACE_TYPE_FLAGS); - switch (entry_type) { - case ACE_OWNER: - if (uid == zp->z_phys->zp_uid) { - access_deny = zfs_ace_access(&zacep[i], - working_mode); - } - break; - case (ACE_IDENTIFIER_GROUP | ACE_GROUP): - case ACE_IDENTIFIER_GROUP: - /* - * Owning group gid is in znode not ACL - */ - if (entry_type == (ACE_IDENTIFIER_GROUP | ACE_GROUP)) - gid = zp->z_phys->zp_gid; - else - gid = zacep[i].a_who; - - if (groupmember(gid, cr)) { - access_deny = zfs_ace_access(&zacep[i], - working_mode); - } - break; - case ACE_EVERYONE: - access_deny = zfs_ace_access(&zacep[i], working_mode); - break; - - /* USER Entry */ - default: - if (entry_type == 0) { - if (uid == zacep[i].a_who) { - access_deny = zfs_ace_access(&zacep[i], - working_mode); - } - break; - } - zfs_acl_free(aclp); - mutex_exit(&zp->z_acl_lock); - return (EIO); - } - - if (access_deny != ACCESS_UNDETERMINED) - break; - } - - mutex_exit(&zp->z_acl_lock); - zfs_acl_free(aclp); - - return (access_deny); -} - - -/* - * Determine whether Access should be granted/denied, invoking least - * priv subsytem when a deny is determined. - */ -int -zfs_zaccess(znode_t *zp, int mode, cred_t *cr) -{ - int working_mode; - int error; - int is_attr; - znode_t *xzp; - znode_t *check_zp = zp; - - is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) && - (ZTOV(zp)->v_type == VDIR)); - - /* - * If attribute then validate against base file - */ - if (is_attr) { - if ((error = zfs_zget(zp->z_zfsvfs, - zp->z_phys->zp_parent, &xzp)) != 0) { - return (error); - } - check_zp = xzp; - /* - * fixup mode to map to xattr perms - */ - - if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { - mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - mode |= ACE_WRITE_NAMED_ATTRS; - } - - if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { - mode &= ~(ACE_READ_DATA|ACE_EXECUTE); - mode |= ACE_READ_NAMED_ATTRS; - } - } - - error = zfs_zaccess_common(check_zp, mode, &working_mode, cr); - - if (error == EROFS) { - if (is_attr) - VN_RELE(ZTOV(xzp)); - return (error); - } - - if (error || working_mode) { - working_mode = (zfs_v4_to_unix(working_mode) << 6); - error = secpolicy_vnode_access(cr, ZTOV(check_zp), - check_zp->z_phys->zp_uid, working_mode); - } - - if (is_attr) - VN_RELE(ZTOV(xzp)); - - return (error); -} - -/* - * Special zaccess function to check for special nfsv4 perm. - * doesn't call secpolicy_vnode_access() for failure, since that - * would probably be the wrong policy function to call. - * instead its up to the caller to handle that situation. 
- */ - -int -zfs_zaccess_v4_perm(znode_t *zp, int mode, cred_t *cr) -{ - int working_mode = 0; - return (zfs_zaccess_common(zp, mode, &working_mode, cr)); -} - -/* - * Translate tradition unix VREAD/VWRITE/VEXEC mode into - * native ACL format and call zfs_zaccess() - */ -int -zfs_zaccess_rwx(znode_t *zp, mode_t mode, cred_t *cr) -{ - int v4_mode = zfs_unix_to_v4(mode >> 6); - - return (zfs_zaccess(zp, v4_mode, cr)); -} - -static int -zfs_delete_final_check(znode_t *zp, znode_t *dzp, cred_t *cr) -{ - int error; - - error = secpolicy_vnode_access(cr, ZTOV(zp), - dzp->z_phys->zp_uid, S_IWRITE|S_IEXEC); - - if (error == 0) - error = zfs_sticky_remove_access(dzp, zp, cr); - - return (error); -} - -/* - * Determine whether Access should be granted/deny, without - * consulting least priv subsystem. - * - * - * The following chart is the recommended NFSv4 enforcement for - * ability to delete an object. - * - * ------------------------------------------------------- - * | Parent Dir | Target Object Permissions | - * | permissions | | - * ------------------------------------------------------- - * | | ACL Allows | ACL Denies| Delete | - * | | Delete | Delete | unspecified| - * ------------------------------------------------------- - * | ACL Allows | Permit | Permit | Permit | - * | DELETE_CHILD | | - * ------------------------------------------------------- - * | ACL Denies | Permit | Deny | Deny | - * | DELETE_CHILD | | | | - * ------------------------------------------------------- - * | ACL specifies | | | | - * | only allow | Permit | Permit | Permit | - * | write and | | | | - * | execute | | | | - * ------------------------------------------------------- - * | ACL denies | | | | - * | write and | Permit | Deny | Deny | - * | execute | | | | - * ------------------------------------------------------- - * ^ - * | - * No search privilege, can't even look up file? - * - */ -int -zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) -{ - int dzp_working_mode = 0; - int zp_working_mode = 0; - int dzp_error, zp_error; - - /* - * Arghh, this check is going to require a couple of questions - * to be asked. We want specific DELETE permissions to - * take precedence over WRITE/EXECUTE. We don't - * want an ACL such as this to mess us up. - * user:joe:write_data:deny,user:joe:delete:allow - * - * However, deny permissions may ultimately be overridden - * by secpolicy_vnode_access(). - */ - - dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, - &dzp_working_mode, cr); - zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, cr); - - if (dzp_error == EROFS || zp_error == EROFS) - return (dzp_error); - - /* - * First check the first row. - * We only need to see if parent Allows delete_child - */ - if ((dzp_working_mode & ACE_DELETE_CHILD) == 0) - return (0); - - /* - * Second row - * we already have the necessary information in - * zp_working_mode, zp_error and dzp_error. - */ - - if ((zp_working_mode & ACE_DELETE) == 0) - return (0); - - /* - * Now zp_error should either be EACCES which indicates - * a "deny" delete entry or ACCESS_UNDETERMINED if the "delete" - * entry exists on the target. - * - * dzp_error should be either EACCES which indicates a "deny" - * entry for delete_child or ACCESS_UNDETERMINED if no delete_child - * entry exists. If value is EACCES then we are done - * and zfs_delete_final_check() will make the final decision - * regarding to allow the delete. 
- */ - - ASSERT(zp_error != 0 && dzp_error != 0); - if (dzp_error == EACCES) - return (zfs_delete_final_check(zp, dzp, cr)); - - /* - * Third Row - * Only need to check for write/execute on parent - */ - - dzp_error = zfs_zaccess_common(dzp, ACE_WRITE_DATA|ACE_EXECUTE, - &dzp_working_mode, cr); - - if (dzp_error == EROFS) - return (dzp_error); - - if ((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) == 0) - return (zfs_sticky_remove_access(dzp, zp, cr)); - - /* - * Fourth Row - */ - - if (((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) != 0) && - ((zp_working_mode & ACE_DELETE) == 0)) - return (zfs_sticky_remove_access(dzp, zp, cr)); - - return (zfs_delete_final_check(zp, dzp, cr)); -} - -int -zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, - znode_t *tzp, cred_t *cr) -{ - int add_perm; - int error; - - add_perm = (ZTOV(szp)->v_type == VDIR) ? - ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; - - /* - * Rename permissions are combination of delete permission + - * add file/subdir permission. - */ - - /* - * first make sure we do the delete portion. - * - * If that succeeds then check for add_file/add_subdir permissions - */ - - if (error = zfs_zaccess_delete(sdzp, szp, cr)) - return (error); - - /* - * If we have a tzp, see if we can delete it? - */ - if (tzp) { - if (error = zfs_zaccess_delete(tdzp, tzp, cr)) - return (error); - } - - /* - * Now check for add permissions - */ - error = zfs_zaccess(tdzp, add_perm, cr); - - return (error); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c deleted file mode 100644 index c8450d4..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/vfs.h> -#include <sys/fs/zfs.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_acl.h> - -void -zfs_ace_byteswap(ace_t *ace, int ace_cnt) -{ - int i; - - for (i = 0; i != ace_cnt; i++, ace++) { - ace->a_who = BSWAP_32(ace->a_who); - ace->a_access_mask = BSWAP_32(ace->a_access_mask); - ace->a_flags = BSWAP_16(ace->a_flags); - ace->a_type = BSWAP_16(ace->a_type); - } -} - -/* ARGSUSED */ -void -zfs_acl_byteswap(void *buf, size_t size) -{ - int cnt; - - /* - * Arggh, since we don't know how many ACEs are in - * the array, we have to swap the entire block - */ - - cnt = size / sizeof (ace_t); - - zfs_ace_byteswap((ace_t *)buf, cnt); -} - -void -zfs_znode_byteswap(void *buf, size_t size) -{ - znode_phys_t *zp = buf; - - ASSERT(size >= sizeof (znode_phys_t)); - - zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]); - zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]); - zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]); - zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]); - zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]); - zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]); - zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]); - zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]); - zp->zp_gen = BSWAP_64(zp->zp_gen); - zp->zp_mode = BSWAP_64(zp->zp_mode); - zp->zp_size = BSWAP_64(zp->zp_size); - zp->zp_parent = BSWAP_64(zp->zp_parent); - zp->zp_links = BSWAP_64(zp->zp_links); - zp->zp_xattr = BSWAP_64(zp->zp_xattr); - zp->zp_rdev = BSWAP_64(zp->zp_rdev); - zp->zp_flags = BSWAP_64(zp->zp_flags); - zp->zp_uid = BSWAP_64(zp->zp_uid); - zp->zp_gid = BSWAP_64(zp->zp_gid); - zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]); - zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]); - zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]); - zp->zp_pad[3] = BSWAP_64(zp->zp_pad[3]); - - zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj); - zp->zp_acl.z_acl_count = BSWAP_32(zp->zp_acl.z_acl_count); - zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version); - zp->zp_acl.z_acl_pad = BSWAP_16(zp->zp_acl.z_acl_pad); - zfs_ace_byteswap(&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c deleted file mode 100644 index 0c2fb02..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c +++ /dev/null @@ -1,1119 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * ZFS control directory (a.k.a. ".zfs") - * - * This directory provides a common location for all ZFS meta-objects. 
- * Currently, this is only the 'snapshot' directory, but this may expand in the - * future. The elements are built using the GFS primitives, as the hierarchy - * does not actually exist on disk. - * - * For 'snapshot', we don't want to have all snapshots always mounted, because - * this would take up a huge amount of space in /etc/mnttab. We have three - * types of objects: - * - * ctldir ------> snapshotdir -------> snapshot - * | - * | - * V - * mounted fs - * - * The 'snapshot' node contains just enough information to lookup '..' and act - * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we - * perform an automount of the underlying filesystem and return the - * corresponding vnode. - * - * All mounts are handled automatically by the kernel, but unmounts are - * (currently) handled from user land. The main reason is that there is no - * reliable way to auto-unmount the filesystem when it's "no longer in use". - * When the user unmounts a filesystem, we call zfsctl_unmount(), which - * unmounts any snapshots within the snapshot directory. - */ - -#include <sys/zfs_context.h> -#include <sys/zfs_ctldir.h> -#include <sys/zfs_ioctl.h> -#include <sys/zfs_vfsops.h> -#include <sys/namei.h> -#include <sys/gfs.h> -#include <sys/stat.h> -#include <sys/dmu.h> -#include <sys/mount.h> - -typedef struct { - char *se_name; - vnode_t *se_root; - avl_node_t se_node; -} zfs_snapentry_t; - -static int -snapentry_compare(const void *a, const void *b) -{ - const zfs_snapentry_t *sa = a; - const zfs_snapentry_t *sb = b; - int ret = strcmp(sa->se_name, sb->se_name); - - if (ret < 0) - return (-1); - else if (ret > 0) - return (1); - else - return (0); -} - -static struct vop_vector zfsctl_ops_root; -static struct vop_vector zfsctl_ops_snapdir; -static struct vop_vector zfsctl_ops_snapshot; - -static vnode_t *zfsctl_mknode_snapdir(vnode_t *); -static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); - -typedef struct zfsctl_node { - gfs_dir_t zc_gfs_private; - uint64_t zc_id; - timestruc_t zc_cmtime; /* ctime and mtime, always the same */ -} zfsctl_node_t; - -typedef struct zfsctl_snapdir { - zfsctl_node_t sd_node; - kmutex_t sd_lock; - avl_tree_t sd_snaps; -} zfsctl_snapdir_t; - -/* - * Root directory elements. We have only a single static entry, 'snapshot'. - */ -static gfs_dirent_t zfsctl_root_entries[] = { - { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE }, - { NULL } -}; - -/* include . and .. in the calculation */ -#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \ - sizeof (gfs_dirent_t)) + 1) - - -/* - * Initialize the various GFS pieces we'll need to create and manipulate .zfs - * directories. This is called from the ZFS init routine, and initializes the - * vnode ops vectors that we'll be using. - */ -void -zfsctl_init(void) -{ -} - -void -zfsctl_fini(void) -{ -} - -/* - * Return the inode number associated with the 'snapshot' directory. - */ -/* ARGSUSED */ -static ino64_t -zfsctl_root_inode_cb(vnode_t *vp, int index) -{ - ASSERT(index == 0); - return (ZFSCTL_INO_SNAPDIR); -} - -/* - * Create the '.zfs' directory. This directory is cached as part of the VFS - * structure. This results in a hold on the vfs_t. The code in zfs_umount() - * therefore checks against a vfs_count of 2 instead of 1. This reference - * is removed when the ctldir is destroyed in the unmount. 
- */ -void -zfsctl_create(zfsvfs_t *zfsvfs) -{ - vnode_t *vp, *rvp; - zfsctl_node_t *zcp; - - ASSERT(zfsvfs->z_ctldir == NULL); - - vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs, - &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries, - zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL); - zcp = vp->v_data; - zcp->zc_id = ZFSCTL_INO_ROOT; - - VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp, curthread) == 0); - ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime); - VN_URELE(rvp); - - /* - * We're only faking the fact that we have a root of a filesystem for - * the sake of the GFS interfaces. Undo the flag manipulation it did - * for us. - */ - vp->v_vflag &= ~VV_ROOT; - - zfsvfs->z_ctldir = vp; -} - -/* - * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. - * There might still be more references if we were force unmounted, but only - * new zfs_inactive() calls can occur and they don't reference .zfs - */ -void -zfsctl_destroy(zfsvfs_t *zfsvfs) -{ - VN_RELE(zfsvfs->z_ctldir); - zfsvfs->z_ctldir = NULL; -} - -/* - * Given a root znode, retrieve the associated .zfs directory. - * Add a hold to the vnode and return it. - */ -vnode_t * -zfsctl_root(znode_t *zp) -{ - ASSERT(zfs_has_ctldir(zp)); - VN_HOLD(zp->z_zfsvfs->z_ctldir); - return (zp->z_zfsvfs->z_ctldir); -} - -/* - * Common open routine. Disallow any write access. - */ -/* ARGSUSED */ -static int -zfsctl_common_open(struct vop_open_args *ap) -{ - int flags = ap->a_mode; - - if (flags & FWRITE) - return (EACCES); - - return (0); -} - -/* - * Common close routine. Nothing to do here. - */ -/* ARGSUSED */ -static int -zfsctl_common_close(struct vop_close_args *ap) -{ - return (0); -} - -/* - * Common access routine. Disallow writes. - */ -/* ARGSUSED */ -static int -zfsctl_common_access(ap) - struct vop_access_args /* { - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - int mode = ap->a_mode; - - if (mode & VWRITE) - return (EACCES); - - return (0); -} - -/* - * Common getattr function. Fill in basic information. - */ -static void -zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) -{ - zfsctl_node_t *zcp = vp->v_data; - timestruc_t now; - - vap->va_uid = 0; - vap->va_gid = 0; - vap->va_rdev = 0; - /* - * We are a purly virtual object, so we have no - * blocksize or allocated blocks. - */ - vap->va_blksize = 0; - vap->va_nblocks = 0; - vap->va_seq = 0; - vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; - vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | - S_IROTH | S_IXOTH; - vap->va_type = VDIR; - /* - * We live in the now (for atime). - */ - gethrestime(&now); - vap->va_atime = now; - vap->va_mtime = vap->va_ctime = vap->va_birthtime = zcp->zc_cmtime; - /* FreeBSD: Reset chflags(2) flags. 
*/ - vap->va_flags = 0; -} - -static int -zfsctl_common_fid(ap) - struct vop_fid_args /* { - struct vnode *a_vp; - struct fid *a_fid; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - fid_t *fidp = (void *)ap->a_fid; - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - zfsctl_node_t *zcp = vp->v_data; - uint64_t object = zcp->zc_id; - zfid_short_t *zfid; - int i; - - ZFS_ENTER(zfsvfs); - - fidp->fid_len = SHORT_FID_LEN; - - zfid = (zfid_short_t *)fidp; - - zfid->zf_len = SHORT_FID_LEN; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); - - /* .zfs znodes always have a generation number of 0 */ - for (i = 0; i < sizeof (zfid->zf_gen); i++) - zfid->zf_gen[i] = 0; - - ZFS_EXIT(zfsvfs); - return (0); -} - -static int -zfsctl_common_reclaim(ap) - struct vop_reclaim_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - - /* - * Destroy the vm object and flush associated pages. - */ - vnode_destroy_vobject(vp); - VI_LOCK(vp); - vp->v_data = NULL; - VI_UNLOCK(vp); - return (0); -} - -/* - * .zfs inode namespace - * - * We need to generate unique inode numbers for all files and directories - * within the .zfs pseudo-filesystem. We use the following scheme: - * - * ENTRY ZFSCTL_INODE - * .zfs 1 - * .zfs/snapshot 2 - * .zfs/snapshot/<snap> objectid(snap) - */ - -#define ZFSCTL_INO_SNAP(id) (id) - -/* - * Get root directory attributes. - */ -/* ARGSUSED */ -static int -zfsctl_root_getattr(ap) - struct vop_getattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct vattr *vap = ap->a_vap; - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - - ZFS_ENTER(zfsvfs); - vap->va_nodeid = ZFSCTL_INO_ROOT; - vap->va_nlink = vap->va_size = NROOT_ENTRIES; - - zfsctl_common_getattr(vp, vap); - ZFS_EXIT(zfsvfs); - - return (0); -} - -/* - * Special case the handling of "..". - */ -/* ARGSUSED */ -int -zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, - int flags, vnode_t *rdir, cred_t *cr) -{ - zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; - int err; - - ZFS_ENTER(zfsvfs); - - if (strcmp(nm, "..") == 0) { - err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp, curthread); - if (err == 0) - VOP_UNLOCK(*vpp, 0); - } else { - err = gfs_dir_lookup(dvp, nm, vpp); - } - - ZFS_EXIT(zfsvfs); - - return (err); -} - -/* - * Special case the handling of "..". - */ -/* ARGSUSED */ -int -zfsctl_root_lookup_vop(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; -{ - vnode_t *dvp = ap->a_dvp; - vnode_t **vpp = ap->a_vpp; - cred_t *cr = ap->a_cnp->cn_cred; - int flags = ap->a_cnp->cn_flags; - int nameiop = ap->a_cnp->cn_nameiop; - char nm[NAME_MAX + 1]; - int err; - - if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE)) - return (EOPNOTSUPP); - - ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); - strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); - - err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr); - if (err == 0 && (nm[0] != '.' 
|| nm[1] != '\0')) - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - - return (err); -} - -static struct vop_vector zfsctl_ops_root = { - .vop_default = &default_vnodeops, - .vop_open = zfsctl_common_open, - .vop_close = zfsctl_common_close, - .vop_ioctl = VOP_EINVAL, - .vop_getattr = zfsctl_root_getattr, - .vop_access = zfsctl_common_access, - .vop_readdir = gfs_vop_readdir, - .vop_lookup = zfsctl_root_lookup_vop, - .vop_inactive = gfs_vop_inactive, - .vop_reclaim = zfsctl_common_reclaim, - .vop_fid = zfsctl_common_fid, -}; - -static int -zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) -{ - objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; - - dmu_objset_name(os, zname); - if (strlen(zname) + 1 + strlen(name) >= len) - return (ENAMETOOLONG); - (void) strcat(zname, "@"); - (void) strcat(zname, name); - return (0); -} - -static int -zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr) -{ - zfsctl_snapdir_t *sdp = dvp->v_data; - zfs_snapentry_t search, *sep; - struct vop_inactive_args ap; - avl_index_t where; - int err; - - ASSERT(MUTEX_HELD(&sdp->sd_lock)); - - search.se_name = (char *)name; - if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) - return (ENOENT); - - ASSERT(vn_ismntpt(sep->se_root)); - - /* this will be dropped by dounmount() */ - if ((err = vn_vfswlock(sep->se_root)) != 0) - return (err); - - err = dounmount(vn_mountedvfs(sep->se_root), force, curthread); - if (err) - return (err); - ASSERT(sep->se_root->v_count == 1); - ap.a_vp = sep->se_root; - gfs_vop_inactive(&ap); - - avl_remove(&sdp->sd_snaps, sep); - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - kmem_free(sep, sizeof (zfs_snapentry_t)); - - return (0); -} - -#if 0 -static void -zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) -{ - avl_index_t where; - vfs_t *vfsp; - refstr_t *pathref; - char newpath[MAXNAMELEN]; - char *tail; - - ASSERT(MUTEX_HELD(&sdp->sd_lock)); - ASSERT(sep != NULL); - - vfsp = vn_mountedvfs(sep->se_root); - ASSERT(vfsp != NULL); - - vfs_lock_wait(vfsp); - - /* - * Change the name in the AVL tree. 
- */ - avl_remove(&sdp->sd_snaps, sep); - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); - (void) strcpy(sep->se_name, nm); - VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL); - avl_insert(&sdp->sd_snaps, sep, where); - - /* - * Change the current mountpoint info: - * - update the tail of the mntpoint path - * - update the tail of the resource path - */ - pathref = vfs_getmntpoint(vfsp); - (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); - VERIFY((tail = strrchr(newpath, '/')) != NULL); - *(tail+1) = '\0'; - ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); - (void) strcat(newpath, nm); - refstr_rele(pathref); - vfs_setmntpoint(vfsp, newpath); - - pathref = vfs_getresource(vfsp); - (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); - VERIFY((tail = strrchr(newpath, '@')) != NULL); - *(tail+1) = '\0'; - ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); - (void) strcat(newpath, nm); - refstr_rele(pathref); - vfs_setresource(vfsp, newpath); - - vfs_unlock(vfsp); -} -#endif - -#if 0 -static int -zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, - cred_t *cr) -{ - zfsctl_snapdir_t *sdp = sdvp->v_data; - zfs_snapentry_t search, *sep; - avl_index_t where; - char from[MAXNAMELEN], to[MAXNAMELEN]; - int err; - - err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from); - if (err) - return (err); - err = zfs_secpolicy_write(from, cr); - if (err) - return (err); - - /* - * Cannot move snapshots out of the snapdir. - */ - if (sdvp != tdvp) - return (EINVAL); - - if (strcmp(snm, tnm) == 0) - return (0); - - err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to); - if (err) - return (err); - - mutex_enter(&sdp->sd_lock); - - search.se_name = (char *)snm; - if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) { - mutex_exit(&sdp->sd_lock); - return (ENOENT); - } - - err = dmu_objset_rename(from, to, B_FALSE); - if (err == 0) - zfsctl_rename_snap(sdp, sep, tnm); - - mutex_exit(&sdp->sd_lock); - - return (err); -} -#endif - -#if 0 -/* ARGSUSED */ -static int -zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) -{ - zfsctl_snapdir_t *sdp = dvp->v_data; - char snapname[MAXNAMELEN]; - int err; - - err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname); - if (err) - return (err); - err = zfs_secpolicy_write(snapname, cr); - if (err) - return (err); - - mutex_enter(&sdp->sd_lock); - - err = zfsctl_unmount_snap(dvp, name, 0, cr); - if (err) { - mutex_exit(&sdp->sd_lock); - return (err); - } - - err = dmu_objset_destroy(snapname); - - mutex_exit(&sdp->sd_lock); - - return (err); -} -#endif - -/* - * Lookup entry point for the 'snapshot' directory. Try to open the - * snapshot if it exist, creating the pseudo filesystem vnode as necessary. - * Perform a mount of the associated dataset on top of the vnode. 
- */ -/* ARGSUSED */ -int -zfsctl_snapdir_lookup(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; -{ - vnode_t *dvp = ap->a_dvp; - vnode_t **vpp = ap->a_vpp; - char nm[NAME_MAX + 1]; - zfsctl_snapdir_t *sdp = dvp->v_data; - objset_t *snap; - char snapname[MAXNAMELEN]; - char *mountpoint; - zfs_snapentry_t *sep, search; - size_t mountpoint_len; - avl_index_t where; - zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; - int err; - - ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); - strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); - - ASSERT(dvp->v_type == VDIR); - - if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) - return (0); - - *vpp = NULL; - - /* - * If we get a recursive call, that means we got called - * from the domount() code while it was trying to look up the - * spec (which looks like a local path for zfs). We need to - * add some flag to domount() to tell it not to do this lookup. - */ - if (MUTEX_HELD(&sdp->sd_lock)) - return (ENOENT); - - ZFS_ENTER(zfsvfs); - - mutex_enter(&sdp->sd_lock); - search.se_name = (char *)nm; - if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { - *vpp = sep->se_root; - VN_HOLD(*vpp); - if ((*vpp)->v_mountedhere == NULL) { - /* - * The snapshot was unmounted behind our backs, - * try to remount it. - */ - goto domount; - } - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - mutex_exit(&sdp->sd_lock); - ZFS_EXIT(zfsvfs); - return (0); - } - - /* - * The requested snapshot is not currently mounted, look it up. - */ - err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname); - if (err) { - mutex_exit(&sdp->sd_lock); - ZFS_EXIT(zfsvfs); - return (err); - } - if (dmu_objset_open(snapname, DMU_OST_ZFS, - DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) { - mutex_exit(&sdp->sd_lock); - ZFS_EXIT(zfsvfs); - return (ENOENT); - } - - sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP); - sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); - (void) strcpy(sep->se_name, nm); - *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); - VN_HOLD(*vpp); - avl_insert(&sdp->sd_snaps, sep, where); - - dmu_objset_close(snap); -domount: - mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + - strlen("/.zfs/snapshot/") + strlen(nm) + 1; - mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); - (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s", - dvp->v_vfsp->mnt_stat.f_mntonname, nm); - err = domount(curthread, *vpp, "zfs", mountpoint, snapname, 0); - kmem_free(mountpoint, mountpoint_len); - /* FreeBSD: This line was moved from below to avoid a lock recursion. */ - if (err == 0) - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - mutex_exit(&sdp->sd_lock); - - /* - * If we had an error, drop our hold on the vnode and - * zfsctl_snapshot_inactive() will clean up. 
- */ - if (err) { - VN_RELE(*vpp); - *vpp = NULL; - } - return (err); -} - -/* ARGSUSED */ -static int -zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, - offset_t *offp, offset_t *nextp, void *data) -{ - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - char snapname[MAXNAMELEN]; - uint64_t id, cookie; - - ZFS_ENTER(zfsvfs); - - cookie = *offp; - if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, - &cookie) == ENOENT) { - *eofp = 1; - ZFS_EXIT(zfsvfs); - return (0); - } - - (void) strcpy(dp->d_name, snapname); - dp->d_ino = ZFSCTL_INO_SNAP(id); - *nextp = cookie; - - ZFS_EXIT(zfsvfs); - - return (0); -} - -vnode_t * -zfsctl_mknode_snapdir(vnode_t *pvp) -{ - vnode_t *vp; - zfsctl_snapdir_t *sdp; - - vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp, - &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN, - zfsctl_snapdir_readdir_cb, NULL); - sdp = vp->v_data; - sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR; - sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; - mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&sdp->sd_snaps, snapentry_compare, - sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node)); - return (vp); -} - -/* ARGSUSED */ -static int -zfsctl_snapdir_getattr(ap) - struct vop_getattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct vattr *vap = ap->a_vap; - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - zfsctl_snapdir_t *sdp = vp->v_data; - - ZFS_ENTER(zfsvfs); - zfsctl_common_getattr(vp, vap); - vap->va_nodeid = gfs_file_inode(vp); - vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2; - ZFS_EXIT(zfsvfs); - - return (0); -} - -/* ARGSUSED */ -static int -zfsctl_snapdir_inactive(ap) - struct vop_inactive_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - zfsctl_snapdir_t *sdp = vp->v_data; - void *private; - - private = gfs_dir_inactive(vp); - if (private != NULL) { - ASSERT(avl_numnodes(&sdp->sd_snaps) == 0); - mutex_destroy(&sdp->sd_lock); - avl_destroy(&sdp->sd_snaps); - kmem_free(private, sizeof (zfsctl_snapdir_t)); - } - return (0); -} - -static struct vop_vector zfsctl_ops_snapdir = { - .vop_default = &default_vnodeops, - .vop_open = zfsctl_common_open, - .vop_close = zfsctl_common_close, - .vop_ioctl = VOP_EINVAL, - .vop_getattr = zfsctl_snapdir_getattr, - .vop_access = zfsctl_common_access, - .vop_readdir = gfs_vop_readdir, - .vop_lookup = zfsctl_snapdir_lookup, - .vop_inactive = zfsctl_snapdir_inactive, - .vop_reclaim = zfsctl_common_reclaim, - .vop_fid = zfsctl_common_fid, -}; - -static vnode_t * -zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) -{ - vnode_t *vp; - zfsctl_node_t *zcp; - - vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp, - &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); - zcp = vp->v_data; - zcp->zc_id = objset; - - return (vp); -} - -static int -zfsctl_snapshot_inactive(ap) - struct vop_inactive_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - struct vop_inactive_args iap; - zfsctl_snapdir_t *sdp; - zfs_snapentry_t *sep, *next; - int locked; - vnode_t *dvp; - - VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0); - sdp = dvp->v_data; - VOP_UNLOCK(dvp, 0); - - if (!(locked = MUTEX_HELD(&sdp->sd_lock))) - mutex_enter(&sdp->sd_lock); - - if (vp->v_count > 1) { - if (!locked) - mutex_exit(&sdp->sd_lock); - return (0); - } - ASSERT(!vn_ismntpt(vp)); - - 
sep = avl_first(&sdp->sd_snaps); - while (sep != NULL) { - next = AVL_NEXT(&sdp->sd_snaps, sep); - - if (sep->se_root == vp) { - avl_remove(&sdp->sd_snaps, sep); - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - kmem_free(sep, sizeof (zfs_snapentry_t)); - break; - } - sep = next; - } - ASSERT(sep != NULL); - - if (!locked) - mutex_exit(&sdp->sd_lock); - VN_RELE(dvp); - - /* - * Dispose of the vnode for the snapshot mount point. - * This is safe to do because once this entry has been removed - * from the AVL tree, it can't be found again, so cannot become - * "active". If we lookup the same name again we will end up - * creating a new vnode. - */ - iap.a_vp = vp; - return (gfs_vop_inactive(&iap)); -} - -static int -zfsctl_traverse_begin(vnode_t **vpp, int lktype, kthread_t *td) -{ - - VN_HOLD(*vpp); - /* Snapshot should be already mounted, but just in case. */ - if (vn_mountedvfs(*vpp) == NULL) - return (ENOENT); - return (traverse(vpp, lktype)); -} - -static void -zfsctl_traverse_end(vnode_t *vp, int err) -{ - - if (err == 0) - vput(vp); - else - VN_RELE(vp); -} - -static int -zfsctl_snapshot_getattr(ap) - struct vop_getattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - int err; - - err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY, ap->a_td); - if (err == 0) - err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td); - zfsctl_traverse_end(vp, err); - return (err); -} - -static int -zfsctl_snapshot_fid(ap) - struct vop_fid_args /* { - struct vnode *a_vp; - struct fid *a_fid; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - int err; - - err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY, curthread); - if (err == 0) - err = VOP_VPTOFH(vp, (void *)ap->a_fid); - zfsctl_traverse_end(vp, err); - return (err); -} - -/* - * These VP's should never see the light of day. They should always - * be covered. - */ -static struct vop_vector zfsctl_ops_snapshot = { - .vop_default = &default_vnodeops, - .vop_inactive = zfsctl_snapshot_inactive, - .vop_reclaim = zfsctl_common_reclaim, - .vop_getattr = zfsctl_snapshot_getattr, - .vop_fid = zfsctl_snapshot_fid, -}; - -int -zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - vnode_t *dvp, *vp; - zfsctl_snapdir_t *sdp; - zfsctl_node_t *zcp; - zfs_snapentry_t *sep; - int error; - - ASSERT(zfsvfs->z_ctldir != NULL); - error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, - NULL, 0, NULL, kcred); - if (error != 0) - return (error); - sdp = dvp->v_data; - - mutex_enter(&sdp->sd_lock); - sep = avl_first(&sdp->sd_snaps); - while (sep != NULL) { - vp = sep->se_root; - zcp = vp->v_data; - if (zcp->zc_id == objsetid) - break; - - sep = AVL_NEXT(&sdp->sd_snaps, sep); - } - - if (sep != NULL) { - VN_HOLD(vp); - error = traverse(&vp, LK_SHARED | LK_RETRY); - if (error == 0) { - if (vp == sep->se_root) - error = EINVAL; - else - *zfsvfsp = VTOZ(vp)->z_zfsvfs; - } - mutex_exit(&sdp->sd_lock); - if (error == 0) - VN_URELE(vp); - else - VN_RELE(vp); - } else { - error = EINVAL; - mutex_exit(&sdp->sd_lock); - } - - VN_RELE(dvp); - - return (error); -} - -/* - * Unmount any snapshots for the given filesystem. This is called from - * zfs_umount() - if we have a ctldir, then go through and unmount all the - * snapshots. 
- */ -int -zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) -{ - struct vop_inactive_args ap; - zfsvfs_t *zfsvfs = vfsp->vfs_data; - vnode_t *dvp, *svp; - zfsctl_snapdir_t *sdp; - zfs_snapentry_t *sep, *next; - int error; - - ASSERT(zfsvfs->z_ctldir != NULL); - error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, - NULL, 0, NULL, cr); - if (error != 0) - return (error); - sdp = dvp->v_data; - - mutex_enter(&sdp->sd_lock); - - sep = avl_first(&sdp->sd_snaps); - while (sep != NULL) { - svp = sep->se_root; - next = AVL_NEXT(&sdp->sd_snaps, sep); - - /* - * If this snapshot is not mounted, then it must - * have just been unmounted by somebody else, and - * will be cleaned up by zfsctl_snapdir_inactive(). - */ - if (vn_ismntpt(svp)) { - if ((error = vn_vfswlock(svp)) != 0) - goto out; - - /* - * Increase usecount, so dounmount() won't vrele() it - * to 0 and call zfsctl_snapdir_inactive(). - */ - VN_HOLD(svp); - vfsp = vn_mountedvfs(svp); - mtx_lock(&Giant); - error = dounmount(vfsp, fflags, curthread); - mtx_unlock(&Giant); - if (error != 0) { - VN_RELE(svp); - goto out; - } - - avl_remove(&sdp->sd_snaps, sep); - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - kmem_free(sep, sizeof (zfs_snapentry_t)); - - /* - * We can't use VN_RELE(), as that will try to - * invoke zfsctl_snapdir_inactive(), and that - * would lead to an attempt to re-grab the sd_lock. - */ - ASSERT3U(svp->v_count, ==, 1); - ap.a_vp = svp; - gfs_vop_inactive(&ap); - } - sep = next; - } -out: - mutex_exit(&sdp->sd_lock); - VN_RELE(dvp); - - return (error); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c deleted file mode 100644 index f233b8f..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c +++ /dev/null @@ -1,797 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/time.h> -#include <sys/systm.h> -#include <sys/sysmacros.h> -#include <sys/resource.h> -#include <sys/vfs.h> -#include <sys/vnode.h> -#include <sys/file.h> -#include <sys/kmem.h> -#include <sys/uio.h> -#include <sys/cmn_err.h> -#include <sys/errno.h> -#include <sys/stat.h> -#include <sys/unistd.h> -#include <sys/random.h> -#include <sys/policy.h> -#include <sys/kcondvar.h> -#include <sys/callb.h> -#include <sys/smp.h> -#include <sys/zfs_dir.h> -#include <sys/zfs_acl.h> -#include <sys/fs/zfs.h> -#include <sys/zap.h> -#include <sys/dmu.h> -#include <sys/atomic.h> -#include <sys/zfs_ctldir.h> -#include <sys/dnlc.h> - -/* - * Lock a directory entry. 
A dirlock on <dzp, name> protects that name - * in dzp's directory zap object. As long as you hold a dirlock, you can - * assume two things: (1) dzp cannot be reaped, and (2) no other thread - * can change the zap entry for (i.e. link or unlink) this name. - * - * Input arguments: - * dzp - znode for directory - * name - name of entry to lock - * flag - ZNEW: if the entry already exists, fail with EEXIST. - * ZEXISTS: if the entry does not exist, fail with ENOENT. - * ZSHARED: allow concurrent access with other ZSHARED callers. - * ZXATTR: we want dzp's xattr directory - * - * Output arguments: - * zpp - pointer to the znode for the entry (NULL if there isn't one) - * dlpp - pointer to the dirlock for this entry (NULL on error) - * - * Return value: 0 on success or errno on failure. - * - * NOTE: Always checks for, and rejects, '.' and '..'. - */ -int -zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - int flag) -{ - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zfs_dirlock_t *dl; - uint64_t zoid; - int error; - vnode_t *vp; - - *zpp = NULL; - *dlpp = NULL; - - /* - * Verify that we are not trying to lock '.', '..', or '.zfs' - */ - if (name[0] == '.' && - (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) || - zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) - return (EEXIST); - - /* - * Wait until there are no locks on this name. - */ - rw_enter(&dzp->z_name_lock, RW_READER); - mutex_enter(&dzp->z_lock); - for (;;) { - if (dzp->z_unlinked) { - mutex_exit(&dzp->z_lock); - rw_exit(&dzp->z_name_lock); - return (ENOENT); - } - for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) - if (strcmp(name, dl->dl_name) == 0) - break; - if (dl == NULL) { - /* - * Allocate a new dirlock and add it to the list. - */ - dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); - cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); - dl->dl_name = name; - dl->dl_sharecnt = 0; - dl->dl_namesize = 0; - dl->dl_dzp = dzp; - dl->dl_next = dzp->z_dirlocks; - dzp->z_dirlocks = dl; - break; - } - if ((flag & ZSHARED) && dl->dl_sharecnt != 0) - break; - cv_wait(&dl->dl_cv, &dzp->z_lock); - } - - if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { - /* - * We're the second shared reference to dl. Make a copy of - * dl_name in case the first thread goes away before we do. - * Note that we initialize the new name before storing its - * pointer into dl_name, because the first thread may load - * dl->dl_name at any time. He'll either see the old value, - * which is his, or the new shared copy; either is OK. - */ - dl->dl_namesize = strlen(dl->dl_name) + 1; - name = kmem_alloc(dl->dl_namesize, KM_SLEEP); - bcopy(dl->dl_name, name, dl->dl_namesize); - dl->dl_name = name; - } - - mutex_exit(&dzp->z_lock); - - /* - * We have a dirlock on the name. (Note that it is the dirlock, - * not the dzp's z_lock, that protects the name in the zap object.) - * See if there's an object by this name; if so, put a hold on it. - */ - if (flag & ZXATTR) { - zoid = dzp->z_phys->zp_xattr; - error = (zoid == 0 ? 
ENOENT : 0); - } else { - vp = dnlc_lookup(ZTOV(dzp), name); - if (vp == DNLC_NO_VNODE) { - VN_RELE(vp); - error = ENOENT; - } else if (vp) { - if (flag & ZNEW) { - zfs_dirent_unlock(dl); - VN_RELE(vp); - return (EEXIST); - } - *dlpp = dl; - *zpp = VTOZ(vp); - return (0); - } else { - error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, - 8, 1, &zoid); - zoid = ZFS_DIRENT_OBJ(zoid); - if (error == ENOENT) - dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE); - } - } - if (error) { - if (error != ENOENT || (flag & ZEXISTS)) { - zfs_dirent_unlock(dl); - return (error); - } - } else { - if (flag & ZNEW) { - zfs_dirent_unlock(dl); - return (EEXIST); - } - error = zfs_zget(zfsvfs, zoid, zpp); - if (error) { - zfs_dirent_unlock(dl); - return (error); - } - if (!(flag & ZXATTR)) - dnlc_update(ZTOV(dzp), name, ZTOV(*zpp)); - } - - *dlpp = dl; - - return (0); -} - -/* - * Unlock this directory entry and wake anyone who was waiting for it. - */ -void -zfs_dirent_unlock(zfs_dirlock_t *dl) -{ - znode_t *dzp = dl->dl_dzp; - zfs_dirlock_t **prev_dl, *cur_dl; - - mutex_enter(&dzp->z_lock); - rw_exit(&dzp->z_name_lock); - if (dl->dl_sharecnt > 1) { - dl->dl_sharecnt--; - mutex_exit(&dzp->z_lock); - return; - } - prev_dl = &dzp->z_dirlocks; - while ((cur_dl = *prev_dl) != dl) - prev_dl = &cur_dl->dl_next; - *prev_dl = dl->dl_next; - cv_broadcast(&dl->dl_cv); - mutex_exit(&dzp->z_lock); - - if (dl->dl_namesize != 0) - kmem_free(dl->dl_name, dl->dl_namesize); - cv_destroy(&dl->dl_cv); - kmem_free(dl, sizeof (*dl)); -} - -/* - * Look up an entry in a directory. - * - * NOTE: '.' and '..' are handled as special cases because - * no directory entries are actually stored for them. If this is - * the root of a filesystem, then '.zfs' is also treated as a - * special pseudo-directory. - */ -int -zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp) -{ - zfs_dirlock_t *dl; - znode_t *zp; - int error = 0; - - if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { - *vpp = ZTOV(dzp); - VN_HOLD(*vpp); - } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - /* - * If we are a snapshot mounted under .zfs, return - * the vp for the snapshot directory. - */ - if (dzp->z_phys->zp_parent == dzp->z_id && - zfsvfs->z_parent != zfsvfs) { - error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, - "snapshot", vpp, NULL, 0, NULL, kcred); - return (error); - } - rw_enter(&dzp->z_parent_lock, RW_READER); - error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp); - if (error == 0) - *vpp = ZTOV(zp); - rw_exit(&dzp->z_parent_lock); - } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { - *vpp = zfsctl_root(dzp); - } else { - error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED); - if (error == 0) { - *vpp = ZTOV(zp); - zfs_dirent_unlock(dl); - dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ - } - } - - return (error); -} - -static char * -zfs_unlinked_hexname(char namebuf[17], uint64_t x) -{ - char *name = &namebuf[16]; - const char digits[16] = "0123456789abcdef"; - - *name = '\0'; - do { - *--name = digits[x & 0xf]; - x >>= 4; - } while (x != 0); - - return (name); -} - -/* - * unlinked Set (formerly known as the "delete queue") Error Handling - * - * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we - * don't specify the name of the entry that we will be manipulating. 
We - * also fib and say that we won't be adding any new entries to the - * unlinked set, even though we might (this is to lower the minimum file - * size that can be deleted in a full filesystem). So on the small - * chance that the nlink list is using a fat zap (ie. has more than - * 2000 entries), we *may* not pre-read a block that's needed. - * Therefore it is remotely possible for some of the assertions - * regarding the unlinked set below to fail due to i/o error. On a - * nondebug system, this will result in the space being leaked. - */ -void -zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - char obj_name[17]; - int error; - - ASSERT(zp->z_unlinked); - ASSERT3U(zp->z_phys->zp_links, ==, 0); - - error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj, - zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx); - ASSERT3U(error, ==, 0); -} - -/* - * Clean up any znodes that had no links when we either crashed or - * (force) umounted the file system. - */ -void -zfs_unlinked_drain(zfsvfs_t *zfsvfs) -{ - zap_cursor_t zc; - zap_attribute_t zap; - dmu_object_info_t doi; - znode_t *zp; - int error; - - /* - * Interate over the contents of the unlinked set. - */ - for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); - zap_cursor_retrieve(&zc, &zap) == 0; - zap_cursor_advance(&zc)) { - - /* - * See what kind of object we have in list - */ - - error = dmu_object_info(zfsvfs->z_os, - zap.za_first_integer, &doi); - if (error != 0) - continue; - - ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || - (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); - /* - * We need to re-mark these list entries for deletion, - * so we pull them back into core and set zp->z_unlinked. - */ - error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); - - /* - * We may pick up znodes that are already marked for deletion. - * This could happen during the purge of an extended attribute - * directory. All we need to do is skip over them, since they - * are already in the system marked z_unlinked. - */ - if (error != 0) - continue; - - zp->z_unlinked = B_TRUE; - VN_RELE(ZTOV(zp)); - } - zap_cursor_fini(&zc); -} - -/* - * Delete the entire contents of a directory. Return a count - * of the number of entries that could not be deleted. - * - * NOTE: this function assumes that the directory is inactive, - * so there is no need to lock its entries before deletion. - * Also, it assumes the directory contents is *only* regular - * files. 
- */ -static int -zfs_purgedir(znode_t *dzp) -{ - zap_cursor_t zc; - zap_attribute_t zap; - znode_t *xzp; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zfs_dirlock_t dl; - int skipped = 0; - int error; - - for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); - (error = zap_cursor_retrieve(&zc, &zap)) == 0; - zap_cursor_advance(&zc)) { - error = zfs_zget(zfsvfs, - ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); - ASSERT3U(error, ==, 0); - - ASSERT((ZTOV(xzp)->v_type == VREG) || - (ZTOV(xzp)->v_type == VLNK)); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, dzp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); - dmu_tx_hold_bonus(tx, xzp->z_id); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - VN_RELE(ZTOV(xzp)); - skipped += 1; - continue; - } - bzero(&dl, sizeof (dl)); - dl.dl_dzp = dzp; - dl.dl_name = zap.za_name; - - error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); - ASSERT3U(error, ==, 0); - dmu_tx_commit(tx); - - VN_RELE(ZTOV(xzp)); - } - zap_cursor_fini(&zc); - ASSERT(error == ENOENT); - return (skipped); -} - -void -zfs_rmnode(znode_t *zp) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os = zfsvfs->z_os; - znode_t *xzp = NULL; - char obj_name[17]; - dmu_tx_t *tx; - uint64_t acl_obj; - int error; - int vfslocked; - - vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs); - - ASSERT(zp->z_phys->zp_links == 0); - - /* - * If this is an attribute directory, purge its contents. - */ - if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && - (zp->z_phys->zp_flags & ZFS_XATTR)) { - if (zfs_purgedir(zp) != 0) { - /* - * Not enough space to delete some xattrs. - * Leave it on the unlinked set. - */ - VFS_UNLOCK_GIANT(vfslocked); - return; - } - } - - /* - * If the file has extended attributes, we're going to unlink - * the xattr dir. - */ - if (zp->z_phys->zp_xattr) { - error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); - ASSERT(error == 0); - } - - acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; - - /* - * Set up the transaction. - */ - tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - if (xzp) { - dmu_tx_hold_bonus(tx, xzp->z_id); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); - } - if (acl_obj) - dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - /* - * Not enough space to delete the file. Leave it in the - * unlinked set, leaking it until the fs is remounted (at - * which point we'll call zfs_unlinked_drain() to process it). - */ - dmu_tx_abort(tx); - VFS_UNLOCK_GIANT(vfslocked); - return; - } - - if (xzp) { - dmu_buf_will_dirty(xzp->z_dbuf, tx); - mutex_enter(&xzp->z_lock); - xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ - xzp->z_phys->zp_links = 0; /* no more links to it */ - mutex_exit(&xzp->z_lock); - zfs_unlinked_add(xzp, tx); - } - - /* Remove this znode from the unlinked set */ - error = zap_remove(os, zfsvfs->z_unlinkedobj, - zfs_unlinked_hexname(obj_name, zp->z_id), tx); - ASSERT3U(error, ==, 0); - - zfs_znode_delete(zp, tx); - - dmu_tx_commit(tx); - - if (xzp) - VN_RELE(ZTOV(xzp)); - VFS_UNLOCK_GIANT(vfslocked); -} - -/* - * Link zp into dl. Can only fail if zp has been unlinked. 
- */ -int -zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) -{ - znode_t *dzp = dl->dl_dzp; - vnode_t *vp = ZTOV(zp); - uint64_t value; - int zp_is_dir = (vp->v_type == VDIR); - int error; - - dmu_buf_will_dirty(zp->z_dbuf, tx); - mutex_enter(&zp->z_lock); - - if (!(flag & ZRENAMING)) { - if (zp->z_unlinked) { /* no new links to unlinked zp */ - ASSERT(!(flag & (ZNEW | ZEXISTS))); - mutex_exit(&zp->z_lock); - return (ENOENT); - } - zp->z_phys->zp_links++; - } - zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */ - - if (!(flag & ZNEW)) - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); - mutex_exit(&zp->z_lock); - - dmu_buf_will_dirty(dzp->z_dbuf, tx); - mutex_enter(&dzp->z_lock); - dzp->z_phys->zp_size++; /* one dirent added */ - dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */ - zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); - mutex_exit(&dzp->z_lock); - - /* - * MacOS X will fill in the 4-bit object type here. - */ - value = ZFS_DIRENT_MAKE(IFTODT(zp->z_phys->zp_mode), zp->z_id); - error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, - 8, 1, &value, tx); - ASSERT(error == 0); - - dnlc_update(ZTOV(dzp), dl->dl_name, vp); - - return (0); -} - -/* - * Unlink zp from dl, and mark zp for deletion if this was the last link. - * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). - * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. - * If it's non-NULL, we use it to indicate whether the znode needs deletion, - * and it's the caller's job to do it. - */ -int -zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, - boolean_t *unlinkedp) -{ - znode_t *dzp = dl->dl_dzp; - vnode_t *vp = ZTOV(zp); - int zp_is_dir = (vp->v_type == VDIR); - boolean_t unlinked = B_FALSE; - int error; - - dnlc_remove(ZTOV(dzp), dl->dl_name); - - if (!(flag & ZRENAMING)) { - dmu_buf_will_dirty(zp->z_dbuf, tx); - - if (vn_vfswlock(vp)) /* prevent new mounts on zp */ - return (EBUSY); - - if (vn_ismntpt(vp)) { /* don't remove mount point */ - vn_vfsunlock(vp); - return (EBUSY); - } - - mutex_enter(&zp->z_lock); - if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */ - mutex_exit(&zp->z_lock); - vn_vfsunlock(vp); - return (ENOTEMPTY); - } - if (zp->z_phys->zp_links <= zp_is_dir) { - zfs_panic_recover("zfs: link count on vnode %p is %u, " - "should be at least %u", zp->z_vnode, - (int)zp->z_phys->zp_links, - zp_is_dir + 1); - zp->z_phys->zp_links = zp_is_dir + 1; - } - if (--zp->z_phys->zp_links == zp_is_dir) { - zp->z_unlinked = B_TRUE; - zp->z_phys->zp_links = 0; - unlinked = B_TRUE; - } else { - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); - } - mutex_exit(&zp->z_lock); - vn_vfsunlock(vp); - } - - dmu_buf_will_dirty(dzp->z_dbuf, tx); - mutex_enter(&dzp->z_lock); - dzp->z_phys->zp_size--; /* one dirent removed */ - dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */ - zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); - mutex_exit(&dzp->z_lock); - - error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, tx); - ASSERT(error == 0); - - if (unlinkedp != NULL) - *unlinkedp = unlinked; - else if (unlinked) - zfs_unlinked_add(zp, tx); - - return (0); -} - -/* - * Indicate whether the directory is empty. Works with or without z_lock - * held, but can only be consider a hint in the latter case. Returns true - * if only "." and ".." remain and there's no work in progress. 
- */ -boolean_t -zfs_dirempty(znode_t *dzp) -{ - return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0); -} - -int -zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_t *xzp; - dmu_tx_t *tx; - uint64_t xoid; - int error; - - *xvpp = NULL; - - if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, cr)) - return (error); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) - dmu_tx_wait(tx); - dmu_tx_abort(tx); - return (error); - } - zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0); - ASSERT(xzp->z_id == xoid); - ASSERT(xzp->z_phys->zp_parent == zp->z_id); - dmu_buf_will_dirty(zp->z_dbuf, tx); - zp->z_phys->zp_xattr = xoid; - - (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, ""); - dmu_tx_commit(tx); - - *xvpp = ZTOV(xzp); - - return (0); -} - -/* - * Return a znode for the extended attribute directory for zp. - * ** If the directory does not already exist, it is created ** - * - * IN: zp - znode to obtain attribute directory from - * cr - credentials of caller - * flags - flags from the VOP_LOOKUP call - * - * OUT: xzpp - pointer to extended attribute znode - * - * RETURN: 0 on success - * error number on failure - */ -int -zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_t *xzp; - zfs_dirlock_t *dl; - vattr_t va; - int error; -top: - error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR); - if (error) - return (error); - - if (xzp != NULL) { - *xvpp = ZTOV(xzp); - zfs_dirent_unlock(dl); - return (0); - } - - ASSERT(zp->z_phys->zp_xattr == 0); - -#ifdef TODO - if (!(flags & CREATE_XATTR_DIR)) { - zfs_dirent_unlock(dl); - return (ENOENT); - } -#endif - - if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - zfs_dirent_unlock(dl); - return (EROFS); - } - - /* - * The ability to 'create' files in an attribute - * directory comes from the write_xattr permission on the base file. - * - * The ability to 'search' an attribute directory requires - * read_xattr permission on the base file. - * - * Once in a directory the ability to read/write attributes - * is controlled by the permissions on the attribute file. - */ - va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; - va.va_type = VDIR; - va.va_mode = S_IFDIR | S_ISVTX | 0777; - va.va_uid = (uid_t)zp->z_phys->zp_uid; - va.va_gid = (gid_t)zp->z_phys->zp_gid; - - error = zfs_make_xattrdir(zp, &va, xvpp, cr); - zfs_dirent_unlock(dl); - - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - /* NB: we already did dmu_tx_wait() if necessary */ - goto top; - } - - return (error); -} - -/* - * Decide whether it is okay to remove within a sticky directory. - * - * In sticky directories, write access is not sufficient; - * you can remove entries from a directory only if: - * - * you own the directory, - * you own the entry, - * the entry is a plain file and you have write access, - * or you are privileged (checked in secpolicy...). - * - * The function returns 0 if remove access is granted. 
- */ -int -zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) -{ - uid_t uid; - - if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ - return (0); - - if ((zdp->z_phys->zp_mode & S_ISVTX) == 0 || - (uid = crgetuid(cr)) == zdp->z_phys->zp_uid || - uid == zp->z_phys->zp_uid || - (ZTOV(zp)->v_type == VREG && - zfs_zaccess(zp, ACE_WRITE_DATA, cr) == 0)) - return (0); - else - return (secpolicy_vnode_remove(cr)); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c deleted file mode 100644 index e2385a0..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c +++ /dev/null @@ -1,335 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/spa.h> -#include <sys/spa_impl.h> -#include <sys/vdev.h> -#include <sys/vdev_impl.h> -#include <sys/zio.h> - -#include <sys/fm/fs/zfs.h> -#include <sys/fm/protocol.h> -#include <sys/fm/util.h> - -#ifdef _KERNEL -/* Including sys/bus.h is just too hard, so I declare what I need here. */ -extern void devctl_notify(const char *__system, const char *__subsystem, - const char *__type, const char *__data); -#endif - -/* - * This general routine is responsible for generating all the different ZFS - * ereports. The payload is dependent on the class, and which arguments are - * supplied to the function: - * - * EREPORT POOL VDEV IO - * block X X X - * data X X - * device X X - * pool X - * - * If we are in a loading state, all errors are chained together by the same - * SPA-wide ENA. - * - * For isolated I/O requests, we get the ENA from the zio_t. The propagation - * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want - * to chain together all ereports associated with a logical piece of data. For - * read I/Os, there are basically three 'types' of I/O, which form a roughly - * layered diagram: - * - * +---------------+ - * | Aggregate I/O | No associated logical data or device - * +---------------+ - * | - * V - * +---------------+ Reads associated with a piece of logical data. - * | Read I/O | This includes reads on behalf of RAID-Z, - * +---------------+ mirrors, gang blocks, retries, etc. - * | - * V - * +---------------+ Reads associated with a particular device, but - * | Physical I/O | no logical data. Issued as part of vdev caching - * +---------------+ and I/O aggregation. - * - * Note that 'physical I/O' here is not the same terminology as used in the rest - * of ZIO. Typically, 'physical I/O' simply means that there is no attached - * blockpointer. 
But I/O with no associated block pointer can still be related - * to a logical piece of data (i.e. RAID-Z requests). - * - * Purely physical I/O always have unique ENAs. They are not related to a - * particular piece of logical data, and therefore cannot be chained together. - * We still generate an ereport, but the DE doesn't correlate it with any - * logical piece of data. When such an I/O fails, the delegated I/O requests - * will issue a retry, which will trigger the 'real' ereport with the correct - * ENA. - * - * We keep track of the ENA for a ZIO chain through the 'io_logical' member. - * When a new logical I/O is issued, we set this to point to itself. Child I/Os - * then inherit this pointer, so that when it is first set subsequent failures - * will use the same ENA. If a physical I/O is issued (by passing the - * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a - * unique ENA will be generated. For an aggregate I/O, this pointer is set to - * NULL, and no ereport will be generated (since it doesn't actually correspond - * to any particular device or piece of data). - */ -void -zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, - uint64_t stateoroffset, uint64_t size) -{ -#ifdef _KERNEL - char buf[1024]; - struct sbuf sb; - struct timespec ts; - - /* - * If we are doing a spa_tryimport(), ignore errors. - */ - if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) - return; - - /* - * If we are in the middle of opening a pool, and the previous attempt - * failed, don't bother logging any new ereports - we're just going to - * get the same diagnosis anyway. - */ - if (spa->spa_load_state != SPA_LOAD_NONE && - spa->spa_last_open_failed) - return; - - /* - * Ignore any errors from I/Os that we are going to retry anyway - we - * only generate errors from the final failure. - */ - if (zio && zio_should_retry(zio)) - return; - - /* - * If this is not a read or write zio, ignore the error. This can occur - * if the DKIOCFLUSHWRITECACHE ioctl fails. - */ - if (zio && zio->io_type != ZIO_TYPE_READ && - zio->io_type != ZIO_TYPE_WRITE) - return; - - nanotime(&ts); - - sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); - sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec); - - /* - * Serialize ereport generation - */ - mutex_enter(&spa->spa_errlist_lock); - -#if 0 - /* - * Determine the ENA to use for this event. If we are in a loading - * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use - * a root zio-wide ENA. Otherwise, simply use a unique ENA. - */ - if (spa->spa_load_state != SPA_LOAD_NONE) { -#if 0 - if (spa->spa_ena == 0) - spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); -#endif - ena = spa->spa_ena; - } else if (zio != NULL && zio->io_logical != NULL) { -#if 0 - if (zio->io_logical->io_ena == 0) - zio->io_logical->io_ena = - fm_ena_generate(0, FM_ENA_FMT1); -#endif - ena = zio->io_logical->io_ena; - } else { -#if 0 - ena = fm_ena_generate(0, FM_ENA_FMT1); -#else - ena = 0; -#endif - } -#endif - - /* - * Construct the full class, detector, and other standard FMA fields. - */ - sbuf_printf(&sb, " ereport_version=%u", FM_EREPORT_VERSION); - sbuf_printf(&sb, " class=%s.%s", ZFS_ERROR_CLASS, subclass); - - sbuf_printf(&sb, " zfs_scheme_version=%u", FM_ZFS_SCHEME_VERSION); - - /* - * Construct the per-ereport payload, depending on which parameters are - * passed in. - */ - - /* - * Generic payload members common to all ereports. 
- * - * The direct reference to spa_name is used rather than spa_name() - * because of the asynchronous nature of the zio pipeline. spa_name() - * asserts that the config lock is held in some form. This is always - * the case in I/O context, but because the check for RW_WRITER compares - * against 'curthread', we may be in an asynchronous context and blow - * this assert. Rather than loosen this assert, we acknowledge that all - * contexts in which this function is called (pool open, I/O) are safe, - * and dereference the name directly. - */ - sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa->spa_name); - sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, - spa_guid(spa)); - sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, - spa->spa_load_state); - - if (vd != NULL) { - vdev_t *pvd = vd->vdev_parent; - - sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, - vd->vdev_guid); - sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, - vd->vdev_ops->vdev_op_type); - if (vd->vdev_path) - sbuf_printf(&sb, " %s=%s", - FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path); - if (vd->vdev_devid) - sbuf_printf(&sb, " %s=%s", - FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid); - - if (pvd != NULL) { - sbuf_printf(&sb, " %s=%ju", - FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, pvd->vdev_guid); - sbuf_printf(&sb, " %s=%s", - FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, - pvd->vdev_ops->vdev_op_type); - if (pvd->vdev_path) - sbuf_printf(&sb, " %s=%s", - FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, - pvd->vdev_path); - if (pvd->vdev_devid) - sbuf_printf(&sb, " %s=%s", - FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, - pvd->vdev_devid); - } - } - - if (zio != NULL) { - /* - * Payload common to all I/Os. - */ - sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, - zio->io_error); - - /* - * If the 'size' parameter is non-zero, it indicates this is a - * RAID-Z or other I/O where the physical offset and length are - * provided for us, instead of within the zio_t. - */ - if (vd != NULL) { - if (size) { - sbuf_printf(&sb, " %s=%ju", - FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, - stateoroffset); - sbuf_printf(&sb, " %s=%ju", - FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, size); - } else { - sbuf_printf(&sb, " %s=%ju", - FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, - zio->io_offset); - sbuf_printf(&sb, " %s=%ju", - FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, - zio->io_size); - } - } - - /* - * Payload for I/Os with corresponding logical information. - */ - if (zio->io_logical != NULL) { - sbuf_printf(&sb, " %s=%ju", - FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, - zio->io_logical->io_bookmark.zb_object); - sbuf_printf(&sb, " %s=%ju", - FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, - zio->io_logical->io_bookmark.zb_level); - sbuf_printf(&sb, " %s=%ju", - FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, - zio->io_logical->io_bookmark.zb_blkid); - } - } else if (vd != NULL) { - /* - * If we have a vdev but no zio, this is a device fault, and the - * 'stateoroffset' parameter indicates the previous state of the - * vdev. - */ - sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, - stateoroffset); - } - mutex_exit(&spa->spa_errlist_lock); - - sbuf_finish(&sb); - ZFS_LOG(1, "%s", sbuf_data(&sb)); - devctl_notify("ZFS", spa->spa_name, subclass, sbuf_data(&sb)); - if (sbuf_overflowed(&sb)) - printf("ZFS WARNING: sbuf overflowed\n"); - sbuf_delete(&sb); -#endif -} - -/* - * The 'resource.fs.zfs.ok' event is an internal signal that the associated - * resource (pool or disk) has been identified by ZFS as healthy. 
This will - * then trigger the DE to close the associated case, if any. - */ -void -zfs_post_ok(spa_t *spa, vdev_t *vd) -{ -#ifdef _KERNEL - char buf[1024]; - char class[64]; - struct sbuf sb; - struct timespec ts; - - nanotime(&ts); - - sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); - sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec); - - snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE, - ZFS_ERROR_CLASS, FM_RESOURCE_OK); - sbuf_printf(&sb, " %s=%hhu", FM_VERSION, FM_RSRC_VERSION); - sbuf_printf(&sb, " %s=%s", FM_CLASS, class); - sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, - spa_guid(spa)); - if (vd) - sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, - vd->vdev_guid); - sbuf_finish(&sb); - devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb)); - if (sbuf_overflowed(&sb)) - printf("ZFS WARNING: sbuf overflowed\n"); - sbuf_delete(&sb); -#endif -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c deleted file mode 100644 index c9424be..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ /dev/null @@ -1,1826 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/conf.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/malloc.h> -#include <sys/mutex.h> -#include <sys/proc.h> -#include <sys/errno.h> -#include <sys/uio.h> -#include <sys/buf.h> -#include <sys/file.h> -#include <sys/kmem.h> -#include <sys/conf.h> -#include <sys/cmn_err.h> -#include <sys/stat.h> -#include <sys/zfs_ioctl.h> -#include <sys/zap.h> -#include <sys/spa.h> -#include <sys/spa_impl.h> -#include <sys/vdev.h> -#include <sys/vdev_impl.h> -#include <sys/dmu.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_prop.h> -#include <sys/sunddi.h> -#include <sys/policy.h> -#include <sys/zone.h> -#include <sys/nvpair.h> -#include <sys/mount.h> -#include <sys/taskqueue.h> -#include <sys/sdt.h> -#include <sys/varargs.h> -#include <sys/fs/zfs.h> -#include <sys/zfs_ctldir.h> -#include <sys/zvol.h> - -#include "zfs_namecheck.h" -#include "zfs_prop.h" - -CTASSERT(sizeof(zfs_cmd_t) <= PAGE_SIZE); - -static struct cdev *zfsdev; - -extern void zfs_init(void); -extern void zfs_fini(void); - -typedef int zfs_ioc_func_t(zfs_cmd_t *); -typedef int zfs_secpolicy_func_t(const char *, cred_t *); - -typedef struct zfs_ioc_vec { - zfs_ioc_func_t *zvec_func; - zfs_secpolicy_func_t *zvec_secpolicy; - enum { - no_name, - pool_name, - dataset_name - } zvec_namecheck; -} zfs_ioc_vec_t; - -/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ -void -__dprintf(const char *file, const char *func, int line, const char *fmt, ...) -{ - const char *newfile; - char buf[256]; - va_list adx; - - /* - * Get rid of annoying "../common/" prefix to filename. - */ - newfile = strrchr(file, '/'); - if (newfile != NULL) { - newfile = newfile + 1; /* Get rid of leading / */ - } else { - newfile = file; - } - - va_start(adx, fmt); - (void) vsnprintf(buf, sizeof (buf), fmt, adx); - va_end(adx); - - /* - * To get this data, use the zfs-dprintf probe as so: - * dtrace -q -n 'zfs-dprintf \ - * /stringof(arg0) == "dbuf.c"/ \ - * {printf("%s: %s", stringof(arg1), stringof(arg3))}' - * arg0 = file name - * arg1 = function name - * arg2 = line number - * arg3 = message - */ - DTRACE_PROBE4(zfs__dprintf, - char *, newfile, char *, func, int, line, char *, buf); -} - -/* - * Policy for top-level read operations (list pools). Requires no privileges, - * and can be used in the local zone, as there is no associated dataset. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_none(const char *unused1, cred_t *cr) -{ - return (0); -} - -/* - * Policy for dataset read operations (list children, get statistics). Requires - * no privileges, but must be visible in the local zone. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_read(const char *dataset, cred_t *cr) -{ - if (INGLOBALZONE(curproc) || - zone_dataset_visible(dataset, NULL)) - return (0); - - return (ENOENT); -} - -static int -zfs_dozonecheck(const char *dataset, cred_t *cr) -{ - uint64_t zoned; - int writable = 1; - - /* - * The dataset must be visible by this zone -- check this first - * so they don't see EPERM on something they shouldn't know about. - */ - if (!INGLOBALZONE(curproc) && - !zone_dataset_visible(dataset, &writable)) - return (ENOENT); - - if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL)) - return (ENOENT); - - if (INGLOBALZONE(curproc)) { - /* - * If the fs is zoned, only root can access it from the - * global zone. 
- */ - if (secpolicy_zfs(cr) && zoned) - return (EPERM); - } else { - /* - * If we are in a local zone, the 'zoned' property must be set. - */ - if (!zoned) - return (EPERM); - - /* must be writable by this zone */ - if (!writable) - return (EPERM); - } - return (0); -} - -/* - * Policy for dataset write operations (create children, set properties, etc). - * Requires SYS_MOUNT privilege, and must be writable in the local zone. - */ -int -zfs_secpolicy_write(const char *dataset, cred_t *cr) -{ - int error; - - if (error = zfs_dozonecheck(dataset, cr)) - return (error); - - return (secpolicy_zfs(cr)); -} - -/* - * Policy for operations that want to write a dataset's parent: - * create, destroy, snapshot, clone, restore. - */ -static int -zfs_secpolicy_parent(const char *dataset, cred_t *cr) -{ - char parentname[MAXNAMELEN]; - char *cp; - - /* - * Remove the @bla or /bla from the end of the name to get the parent. - */ - (void) strncpy(parentname, dataset, sizeof (parentname)); - cp = strrchr(parentname, '@'); - if (cp != NULL) { - cp[0] = '\0'; - } else { - cp = strrchr(parentname, '/'); - if (cp == NULL) - return (ENOENT); - cp[0] = '\0'; - - } - - return (zfs_secpolicy_write(parentname, cr)); -} - -/* - * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires - * SYS_CONFIG privilege, which is not available in a local zone. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_config(const char *unused, cred_t *cr) -{ - if (secpolicy_sys_config(cr, B_FALSE) != 0) - return (EPERM); - - return (0); -} - -/* - * Policy for fault injection. Requires all privileges. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_inject(const char *unused, cred_t *cr) -{ - return (secpolicy_zinject(cr)); -} - -/* - * Policy for dataset backup operations (sendbackup). - * Requires SYS_MOUNT privilege, and must be writable in the local zone. - */ -static int -zfs_secpolicy_operator(const char *dataset, cred_t *cr) -{ - int writable = 1; - - if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) - return (ENOENT); - if (secpolicy_zfs(cr) != 0 && !groupmember(GID_OPERATOR, cr)) - return (EPERM); - return (0); -} - -/* - * Returns the nvlist as specified by the user in the zfs_cmd_t. - */ -static int -get_nvlist(zfs_cmd_t *zc, nvlist_t **nvp) -{ - char *packed; - size_t size; - int error; - nvlist_t *config = NULL; - - /* - * Read in and unpack the user-supplied nvlist. - */ - if ((size = zc->zc_nvlist_src_size) == 0) - return (EINVAL); - - packed = kmem_alloc(size, KM_SLEEP); - - if ((error = xcopyin((void *)(uintptr_t)zc->zc_nvlist_src, packed, - size)) != 0) { - kmem_free(packed, size); - return (error); - } - - if ((error = nvlist_unpack(packed, size, &config, 0)) != 0) { - kmem_free(packed, size); - return (error); - } - - kmem_free(packed, size); - - *nvp = config; - return (0); -} - -static int -put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) -{ - char *packed = NULL; - size_t size; - int error; - - VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); - - if (size > zc->zc_nvlist_dst_size) { - /* - * Solaris returns ENOMEM here, because even if an error is - * returned from an ioctl(2), new zc_nvlist_dst_size will be - * passed to the userland. This is not the case for FreeBSD. - * We need to return 0, so the kernel will copy the - * zc_nvlist_dst_size back and the userland can discover that a - * bigger buffer is needed. 
- */ - error = 0; - } else { - VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, - KM_SLEEP) == 0); - error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, - size); - kmem_free(packed, size); - } - - zc->zc_nvlist_dst_size = size; - return (error); -} - -static int -zfs_ioc_pool_create(zfs_cmd_t *zc) -{ - int error; - nvlist_t *config; - - if ((error = get_nvlist(zc, &config)) != 0) - return (error); - - error = spa_create(zc->zc_name, config, zc->zc_value[0] == '\0' ? - NULL : zc->zc_value); - - nvlist_free(config); - - return (error); -} - -static int -zfs_ioc_pool_destroy(zfs_cmd_t *zc) -{ - return (spa_destroy(zc->zc_name)); -} - -static int -zfs_ioc_pool_import(zfs_cmd_t *zc) -{ - int error; - nvlist_t *config; - uint64_t guid; - - if ((error = get_nvlist(zc, &config)) != 0) - return (error); - - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || - guid != zc->zc_guid) - error = EINVAL; - else - error = spa_import(zc->zc_name, config, - zc->zc_value[0] == '\0' ? NULL : zc->zc_value); - - nvlist_free(config); - - return (error); -} - -static int -zfs_ioc_pool_export(zfs_cmd_t *zc) -{ - return (spa_export(zc->zc_name, NULL)); -} - -static int -zfs_ioc_pool_configs(zfs_cmd_t *zc) -{ - nvlist_t *configs; - int error; - - if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) - return (EEXIST); - - error = put_nvlist(zc, configs); - - nvlist_free(configs); - - return (error); -} - -static int -zfs_ioc_pool_stats(zfs_cmd_t *zc) -{ - nvlist_t *config; - int error; - int ret = 0; - - error = spa_get_stats(zc->zc_name, &config, zc->zc_value, - sizeof (zc->zc_value)); - - if (config != NULL) { - ret = put_nvlist(zc, config); - nvlist_free(config); - - /* - * The config may be present even if 'error' is non-zero. - * In this case we return success, and preserve the real errno - * in 'zc_cookie'. - */ - zc->zc_cookie = error; - } else { - ret = error; - } - - return (ret); -} - -/* - * Try to import the given pool, returning pool stats as appropriate so that - * user land knows which devices are available and overall pool health. 
- */ -static int -zfs_ioc_pool_tryimport(zfs_cmd_t *zc) -{ - nvlist_t *tryconfig, *config; - int error; - - if ((error = get_nvlist(zc, &tryconfig)) != 0) - return (error); - - config = spa_tryimport(tryconfig); - - nvlist_free(tryconfig); - - if (config == NULL) - return (EINVAL); - - error = put_nvlist(zc, config); - nvlist_free(config); - - return (error); -} - -static int -zfs_ioc_pool_scrub(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_scrub(spa, zc->zc_cookie, B_FALSE); - - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_pool_freeze(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error == 0) { - spa_freeze(spa); - spa_close(spa, FTAG); - } - return (error); -} - -static int -zfs_ioc_pool_upgrade(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - spa_upgrade(spa); - - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_pool_get_history(zfs_cmd_t *zc) -{ - spa_t *spa; - char *hist_buf; - uint64_t size; - int error; - - if ((size = zc->zc_history_len) == 0) - return (EINVAL); - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) { - spa_close(spa, FTAG); - return (ENOTSUP); - } - - hist_buf = kmem_alloc(size, KM_SLEEP); - if ((error = spa_history_get(spa, &zc->zc_history_offset, - &zc->zc_history_len, hist_buf)) == 0) { - error = xcopyout(hist_buf, (char *)(uintptr_t)zc->zc_history, - zc->zc_history_len); - } - - spa_close(spa, FTAG); - kmem_free(hist_buf, size); - return (error); -} - -static int -zfs_ioc_pool_log_history(zfs_cmd_t *zc) -{ - spa_t *spa; - char *history_str = NULL; - size_t size; - int error; - - size = zc->zc_history_len; - if (size == 0 || size > HIS_MAX_RECORD_LEN) - return (EINVAL); - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) { - spa_close(spa, FTAG); - return (ENOTSUP); - } - - /* add one for the NULL delimiter */ - size++; - history_str = kmem_alloc(size, KM_SLEEP); - if ((error = xcopyin((void *)(uintptr_t)zc->zc_history, history_str, - size)) != 0) { - spa_close(spa, FTAG); - kmem_free(history_str, size); - return (error); - } - history_str[size - 1] = '\0'; - - error = spa_history_log(spa, history_str, zc->zc_history_offset); - - spa_close(spa, FTAG); - kmem_free(history_str, size); - - return (error); -} - -static int -zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) -{ - int error; - - if (error = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)) - return (error); - - return (0); -} - -static int -zfs_ioc_obj_to_path(zfs_cmd_t *zc) -{ - objset_t *osp; - int error; - - if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS, - DS_MODE_NONE | DS_MODE_READONLY, &osp)) != 0) - return (error); - - error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value, - sizeof (zc->zc_value)); - dmu_objset_close(osp); - - return (error); -} - -static int -zfs_ioc_vdev_add(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - nvlist_t *config; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) - return (error); - - /* - * A root pool with concatenated devices is not supported. - * Thus, can not add a device to a root pool with one device. 
- */ - if (spa->spa_root_vdev->vdev_children == 1 && spa->spa_bootfs != 0) { - spa_close(spa, FTAG); - return (EDOM); - } - - if ((error = get_nvlist(zc, &config)) == 0) { - error = spa_vdev_add(spa, config); - nvlist_free(config); - } - - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_remove(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) - return (error); - error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_online(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - error = vdev_online(spa, zc->zc_guid); - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_offline(zfs_cmd_t *zc) -{ - spa_t *spa; - int istmp = zc->zc_cookie; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - error = vdev_offline(spa, zc->zc_guid, istmp); - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_attach(zfs_cmd_t *zc) -{ - spa_t *spa; - int replacing = zc->zc_cookie; - nvlist_t *config; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if ((error = get_nvlist(zc, &config)) == 0) { - error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); - nvlist_free(config); - } - - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_detach(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE); - - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_setpath(zfs_cmd_t *zc) -{ - spa_t *spa; - char *path = zc->zc_value; - uint64_t guid = zc->zc_guid; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) - return (error); - - error = spa_vdev_setpath(spa, guid, path); - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_objset_stats(zfs_cmd_t *zc) -{ - objset_t *os = NULL; - int error; - nvlist_t *nv; - -retry: - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &os); - if (error != 0) { - /* - * This is ugly: dmu_objset_open() can return EBUSY if - * the objset is held exclusively. Fortunately this hold is - * only for a short while, so we retry here. - * This avoids user code having to handle EBUSY, - * for example for a "zfs list". - */ - if (error == EBUSY) { - delay(1); - goto retry; - } - return (error); - } - - dmu_objset_fast_stat(os, &zc->zc_objset_stats); - - if (zc->zc_nvlist_dst != 0 && - (error = dsl_prop_get_all(os, &nv)) == 0) { - dmu_objset_stats(os, nv); - /* - * NB: zvol_get_stats() will read the objset contents, - * which we aren't supposed to do with a - * DS_MODE_STANDARD open, because it could be - * inconsistent. So this is a bit of a workaround... 
- */ - if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) - VERIFY(zvol_get_stats(os, nv) == 0); - error = put_nvlist(zc, nv); - nvlist_free(nv); - } - - spa_altroot(dmu_objset_spa(os), zc->zc_value, sizeof (zc->zc_value)); - - dmu_objset_close(os); - if (error == ENOMEM) - error = 0; - return (error); -} - -static int -zfs_ioc_dataset_list_next(zfs_cmd_t *zc) -{ - objset_t *os; - int error; - char *p; - -retry: - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &os); - if (error != 0) { - /* - * This is ugly: dmu_objset_open() can return EBUSY if - * the objset is held exclusively. Fortunately this hold is - * only for a short while, so we retry here. - * This avoids user code having to handle EBUSY, - * for example for a "zfs list". - */ - if (error == EBUSY) { - delay(1); - goto retry; - } - if (error == ENOENT) - error = ESRCH; - return (error); - } - - p = strrchr(zc->zc_name, '/'); - if (p == NULL || p[1] != '\0') - (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); - p = zc->zc_name + strlen(zc->zc_name); - - do { - error = dmu_dir_list_next(os, - sizeof (zc->zc_name) - (p - zc->zc_name), p, - NULL, &zc->zc_cookie); - if (error == ENOENT) - error = ESRCH; - } while (error == 0 && !INGLOBALZONE(curproc) && - !zone_dataset_visible(zc->zc_name, NULL)); - - /* - * If it's a hidden dataset (ie. with a '$' in its name), don't - * try to get stats for it. Userland will skip over it. - */ - if (error == 0 && strchr(zc->zc_name, '$') == NULL) - error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - - dmu_objset_close(os); - return (error); -} - -static int -zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) -{ - objset_t *os; - int error; - -retry: - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &os); - if (error != 0) { - /* - * This is ugly: dmu_objset_open() can return EBUSY if - * the objset is held exclusively. Fortunately this hold is - * only for a short while, so we retry here. - * This avoids user code having to handle EBUSY, - * for example for a "zfs list". - */ - if (error == EBUSY) { - delay(1); - goto retry; - } - if (error == ENOENT) - error = ESRCH; - return (error); - } - - /* - * A dataset name of maximum length cannot have any snapshots, - * so exit immediately. - */ - if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) { - dmu_objset_close(os); - return (ESRCH); - } - - error = dmu_snapshot_list_next(os, - sizeof (zc->zc_name) - strlen(zc->zc_name), - zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie); - if (error == ENOENT) - error = ESRCH; - - if (error == 0) - error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - - dmu_objset_close(os); - return (error); -} - -static int -zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl) -{ - nvpair_t *elem; - int error; - const char *propname; - zfs_prop_t prop; - uint64_t intval; - char *strval; - char buf[MAXNAMELEN]; - const char *p; - spa_t *spa; - - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - propname = nvpair_name(elem); - - if ((prop = zfs_name_to_prop(propname)) == - ZFS_PROP_INVAL) { - /* - * If this is a user-defined property, it must be a - * string, and there is no further validation to do. 
- */ - if (!zfs_prop_user(propname) || - nvpair_type(elem) != DATA_TYPE_STRING) - return (EINVAL); - - VERIFY(nvpair_value_string(elem, &strval) == 0); - error = dsl_prop_set(name, propname, 1, - strlen(strval) + 1, strval); - if (error == 0) - continue; - else - return (error); - } - - /* - * Check permissions for special properties. - */ - switch (prop) { - case ZFS_PROP_ZONED: - /* - * Disallow setting of 'zoned' from within a local zone. - */ - if (!INGLOBALZONE(curproc)) - return (EPERM); - break; - - case ZFS_PROP_QUOTA: - if (error = zfs_dozonecheck(name, cr)) - return (error); - - if (!INGLOBALZONE(curproc)) { - uint64_t zoned; - char setpoint[MAXNAMELEN]; - int dslen; - /* - * Unprivileged users are allowed to modify the - * quota on things *under* (ie. contained by) - * the thing they own. - */ - if (dsl_prop_get_integer(name, "jailed", &zoned, - setpoint)) - return (EPERM); - if (!zoned) /* this shouldn't happen */ - return (EPERM); - dslen = strlen(name); - if (dslen <= strlen(setpoint)) - return (EPERM); - } - break; - - case ZFS_PROP_COMPRESSION: - /* - * If the user specified gzip compression, make sure - * the SPA supports it. We ignore any errors here since - * we'll catch them later. - */ - if (nvpair_type(elem) == DATA_TYPE_UINT64 && - nvpair_value_uint64(elem, &intval) == 0 && - intval >= ZIO_COMPRESS_GZIP_1 && - intval <= ZIO_COMPRESS_GZIP_9) { - if ((p = strchr(name, '/')) == NULL) { - p = name; - } else { - bcopy(name, buf, p - name); - buf[p - name] = '\0'; - p = buf; - } - - if (spa_open(p, &spa, FTAG) == 0) { - if (spa_version(spa) < - ZFS_VERSION_GZIP_COMPRESSION) { - spa_close(spa, FTAG); - return (ENOTSUP); - } - - spa_close(spa, FTAG); - } - } - break; - } - - switch (prop) { - case ZFS_PROP_QUOTA: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dir_set_quota(name, - intval)) != 0) - return (error); - break; - - case ZFS_PROP_RESERVATION: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dir_set_reservation(name, - intval)) != 0) - return (error); - break; - - case ZFS_PROP_VOLSIZE: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volsize(name, dev, - intval)) != 0) - return (error); - break; - - case ZFS_PROP_VOLBLOCKSIZE: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volblocksize(name, - intval)) != 0) - return (error); - break; - - default: - if (nvpair_type(elem) == DATA_TYPE_STRING) { - if (zfs_prop_get_type(prop) != - prop_type_string) - return (EINVAL); - VERIFY(nvpair_value_string(elem, &strval) == 0); - if ((error = dsl_prop_set(name, - nvpair_name(elem), 1, strlen(strval) + 1, - strval)) != 0) - return (error); - } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { - const char *unused; - - VERIFY(nvpair_value_uint64(elem, &intval) == 0); - - switch (zfs_prop_get_type(prop)) { - case prop_type_number: - break; - case prop_type_boolean: - if (intval > 1) - return (EINVAL); - break; - case prop_type_string: - return (EINVAL); - case prop_type_index: - if (zfs_prop_index_to_string(prop, - intval, &unused) != 0) - return (EINVAL); - break; - default: - cmn_err(CE_PANIC, "unknown property " - "type"); - break; - } - - if ((error = dsl_prop_set(name, propname, - 8, 1, &intval)) != 0) - return (error); - } else { - return (EINVAL); - } - break; - } - } - - return (0); -} - -static int -zfs_ioc_set_prop(zfs_cmd_t *zc) -{ - nvlist_t *nvl; - int error; - zfs_prop_t prop; - - /* - * If zc_value is set, then this is an attempt to inherit a value. 
- * Otherwise, zc_nvlist refers to a list of properties to set. - */ - if (zc->zc_value[0] != '\0') { - if (!zfs_prop_user(zc->zc_value) && - ((prop = zfs_name_to_prop(zc->zc_value)) == - ZFS_PROP_INVAL || - !zfs_prop_inheritable(prop))) - return (EINVAL); - - return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL)); - } - - if ((error = get_nvlist(zc, &nvl)) != 0) - return (error); - - error = zfs_set_prop_nvlist(zc->zc_name, zc->zc_dev, - (cred_t *)(uintptr_t)zc->zc_cred, nvl); - nvlist_free(nvl); - return (error); -} - -static int -zfs_ioc_pool_set_props(zfs_cmd_t *zc) -{ - nvlist_t *nvl; - int error, reset_bootfs = 0; - uint64_t objnum; - zpool_prop_t prop; - nvpair_t *elem; - char *propname, *strval; - spa_t *spa; - vdev_t *rvdev; - char *vdev_type; - objset_t *os; - - if ((error = get_nvlist(zc, &nvl)) != 0) - return (error); - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { - nvlist_free(nvl); - return (error); - } - - if (spa_version(spa) < ZFS_VERSION_BOOTFS) { - nvlist_free(nvl); - spa_close(spa, FTAG); - return (ENOTSUP); - } - - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - - propname = nvpair_name(elem); - - if ((prop = zpool_name_to_prop(propname)) == - ZFS_PROP_INVAL) { - nvlist_free(nvl); - spa_close(spa, FTAG); - return (EINVAL); - } - - switch (prop) { - case ZFS_PROP_BOOTFS: - /* - * A bootable filesystem can not be on a RAIDZ pool - * nor a striped pool with more than 1 device. - */ - rvdev = spa->spa_root_vdev; - vdev_type = - rvdev->vdev_child[0]->vdev_ops->vdev_op_type; - if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || - (strcmp(vdev_type, VDEV_TYPE_MIRROR) != 0 && - rvdev->vdev_children > 1)) { - error = ENOTSUP; - break; - } - - reset_bootfs = 1; - - VERIFY(nvpair_value_string(elem, &strval) == 0); - if (strval == NULL || strval[0] == '\0') { - objnum = - zfs_prop_default_numeric(ZFS_PROP_BOOTFS); - break; - } - - if (error = dmu_objset_open(strval, DMU_OST_ZFS, - DS_MODE_STANDARD | DS_MODE_READONLY, &os)) - break; - objnum = dmu_objset_id(os); - dmu_objset_close(os); - break; - - default: - error = EINVAL; - } - - if (error) - break; - } - if (error == 0) { - if (reset_bootfs) { - VERIFY(nvlist_remove(nvl, - zpool_prop_to_name(ZFS_PROP_BOOTFS), - DATA_TYPE_STRING) == 0); - VERIFY(nvlist_add_uint64(nvl, - zpool_prop_to_name(ZFS_PROP_BOOTFS), objnum) == 0); - } - error = spa_set_props(spa, nvl); - } - - nvlist_free(nvl); - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_pool_get_props(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - nvlist_t *nvp = NULL; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_get_props(spa, &nvp); - - if (error == 0 && zc->zc_nvlist_dst != 0) - error = put_nvlist(zc, nvp); - else - error = EFAULT; - - spa_close(spa, FTAG); - - if (nvp) - nvlist_free(nvp); - return (error); -} - -static int -zfs_ioc_create_minor(zfs_cmd_t *zc) -{ - return (zvol_create_minor(zc->zc_name, zc->zc_dev)); -} - -static int -zfs_ioc_remove_minor(zfs_cmd_t *zc) -{ - return (zvol_remove_minor(zc->zc_name)); -} - -/* - * Search the vfs list for a specified resource. Returns a pointer to it - * or NULL if no suitable entry is found. The caller of this routine - * is responsible for releasing the returned vfs pointer. 
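A minimal usage sketch of the hold/release contract described above; the dataset name is hypothetical:

	vfs_t *vfsp;

	if ((vfsp = zfs_get_vfs("tank/home@monday")) != NULL) {
		/* ... examine or unmount the mounted snapshot ... */
		VFS_RELE(vfsp);
	}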
- */ -static vfs_t * -zfs_get_vfs(const char *resource) -{ - vfs_t *vfsp; - - mtx_lock(&mountlist_mtx); - TAILQ_FOREACH(vfsp, &mountlist, mnt_list) { - if (strcmp(vfsp->mnt_stat.f_mntfromname, resource) == 0) { - VFS_HOLD(vfsp); - break; - } - } - mtx_unlock(&mountlist_mtx); - return (vfsp); -} - -static void -zfs_create_cb(objset_t *os, void *arg, dmu_tx_t *tx) -{ - zfs_create_data_t *zc = arg; - - zfs_create_fs(os, (cred_t *)(uintptr_t)zc->zc_cred, tx); -} - -static int -zfs_ioc_create(zfs_cmd_t *zc) -{ - objset_t *clone; - int error = 0; - zfs_create_data_t cbdata = { 0 }; - void (*cbfunc)(objset_t *os, void *arg, dmu_tx_t *tx); - dmu_objset_type_t type = zc->zc_objset_type; - - switch (type) { - - case DMU_OST_ZFS: - cbfunc = zfs_create_cb; - break; - - case DMU_OST_ZVOL: - cbfunc = zvol_create_cb; - break; - - default: - cbfunc = NULL; - } - if (strchr(zc->zc_name, '@')) - return (EINVAL); - - if (zc->zc_nvlist_src != 0 && - (error = get_nvlist(zc, &cbdata.zc_props)) != 0) - return (error); - - cbdata.zc_cred = (cred_t *)(uintptr_t)zc->zc_cred; - cbdata.zc_dev = (dev_t)zc->zc_dev; - - if (zc->zc_value[0] != '\0') { - /* - * We're creating a clone of an existing snapshot. - */ - zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; - if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) { - nvlist_free(cbdata.zc_props); - return (EINVAL); - } - - error = dmu_objset_open(zc->zc_value, type, - DS_MODE_STANDARD | DS_MODE_READONLY, &clone); - if (error) { - nvlist_free(cbdata.zc_props); - return (error); - } - error = dmu_objset_create(zc->zc_name, type, clone, NULL, NULL); - dmu_objset_close(clone); - } else { - if (cbfunc == NULL) { - nvlist_free(cbdata.zc_props); - return (EINVAL); - } - - if (type == DMU_OST_ZVOL) { - uint64_t volsize, volblocksize; - - if (cbdata.zc_props == NULL || - nvlist_lookup_uint64(cbdata.zc_props, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), - &volsize) != 0) { - nvlist_free(cbdata.zc_props); - return (EINVAL); - } - - if ((error = nvlist_lookup_uint64(cbdata.zc_props, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - &volblocksize)) != 0 && error != ENOENT) { - nvlist_free(cbdata.zc_props); - return (EINVAL); - } - - if (error != 0) - volblocksize = zfs_prop_default_numeric( - ZFS_PROP_VOLBLOCKSIZE); - - if ((error = zvol_check_volblocksize( - volblocksize)) != 0 || - (error = zvol_check_volsize(volsize, - volblocksize)) != 0) { - nvlist_free(cbdata.zc_props); - return (error); - } - } - - error = dmu_objset_create(zc->zc_name, type, NULL, cbfunc, - &cbdata); - } - - /* - * It would be nice to do this atomically. - */ - if (error == 0) { - if ((error = zfs_set_prop_nvlist(zc->zc_name, - zc->zc_dev, (cred_t *)(uintptr_t)zc->zc_cred, - cbdata.zc_props)) != 0) - (void) dmu_objset_destroy(zc->zc_name); - } - - nvlist_free(cbdata.zc_props); - return (error); -} - -static int -zfs_ioc_snapshot(zfs_cmd_t *zc) -{ - if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); - return (dmu_objset_snapshot(zc->zc_name, - zc->zc_value, zc->zc_cookie)); -} - -int -zfs_unmount_snap(char *name, void *arg) -{ - char *snapname = arg; - char *cp; - vfs_t *vfsp = NULL; - - /* - * Snapshots (which are under .zfs control) must be unmounted - * before they can be destroyed. - */ - - if (snapname) { - (void) strcat(name, "@"); - (void) strcat(name, snapname); - vfsp = zfs_get_vfs(name); - cp = strchr(name, '@'); - *cp = '\0'; - } else if (strchr(name, '@')) { - vfsp = zfs_get_vfs(name); - } - - if (vfsp) { - /* - * Always force the unmount for snapshots. 
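A sketch of the name-splicing pattern used just above in zfs_unmount_snap(): the buffer is temporarily extended to "fs@snap" for the vfs lookup and then truncated back. The names here are hypothetical:

	char name[MAXNAMELEN] = "tank/home";
	char *cp;

	(void) strcat(name, "@");
	(void) strcat(name, "monday");		/* now "tank/home@monday" */
	/* ... vfsp = zfs_get_vfs(name) ... */
	cp = strchr(name, '@');
	*cp = '\0';				/* back to "tank/home" */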
- */ - int flag = MS_FORCE; - int err; - - if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) { - VFS_RELE(vfsp); - return (err); - } - VFS_RELE(vfsp); - mtx_lock(&Giant); /* dounmount() */ - dounmount(vfsp, flag, curthread); - mtx_unlock(&Giant); /* dounmount() */ - } - return (0); -} - -static int -zfs_ioc_destroy_snaps(zfs_cmd_t *zc) -{ - int err; - - if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); - err = dmu_objset_find(zc->zc_name, - zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN); - if (err) - return (err); - return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value)); -} - -static int -zfs_ioc_destroy(zfs_cmd_t *zc) -{ - if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { - int err = zfs_unmount_snap(zc->zc_name, NULL); - if (err) - return (err); - } - - return (dmu_objset_destroy(zc->zc_name)); -} - -static int -zfs_ioc_rollback(zfs_cmd_t *zc) -{ - return (dmu_objset_rollback(zc->zc_name)); -} - -static int -zfs_ioc_rename(zfs_cmd_t *zc) -{ - int recursive = zc->zc_cookie & 1; - - zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; - if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); - - /* - * Unmount snapshot unless we're doing a recursive rename, - * in which case the dataset code figures out which snapshots - * to unmount. - */ - if (!recursive && strchr(zc->zc_name, '@') != NULL && - zc->zc_objset_type == DMU_OST_ZFS) { - int err = zfs_unmount_snap(zc->zc_name, NULL); - if (err) - return (err); - } - - return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive)); -} - -static int -zfs_ioc_recvbackup(zfs_cmd_t *zc) -{ - kthread_t *td = curthread; - struct file *fp; - int error; - offset_t new_off; - - if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || - strchr(zc->zc_value, '@') == NULL) - return (EINVAL); - - error = fget_read(td, zc->zc_cookie, &fp); - if (error) - return (error); - - error = dmu_recvbackup(zc->zc_value, &zc->zc_begin_record, - &zc->zc_cookie, (boolean_t)zc->zc_guid, fp, - fp->f_offset); - - new_off = fp->f_offset + zc->zc_cookie; - fp->f_offset = new_off; - - fdrop(fp, td); - return (error); -} - -static int -zfs_ioc_sendbackup(zfs_cmd_t *zc) -{ - kthread_t *td = curthread; - struct file *fp; - objset_t *fromsnap = NULL; - objset_t *tosnap; - int error, fd; - - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap); - if (error) - return (error); - - if (zc->zc_value[0] != '\0') { - char buf[MAXPATHLEN]; - char *cp; - - (void) strncpy(buf, zc->zc_name, sizeof (buf)); - cp = strchr(buf, '@'); - if (cp) - *(cp+1) = 0; - (void) strlcat(buf, zc->zc_value, sizeof (buf)); - error = dmu_objset_open(buf, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap); - if (error) { - dmu_objset_close(tosnap); - return (error); - } - } - - fd = zc->zc_cookie; - error = fget_write(td, fd, &fp); - if (error) { - dmu_objset_close(tosnap); - if (fromsnap) - dmu_objset_close(fromsnap); - return (error); - } - - error = dmu_sendbackup(tosnap, fromsnap, fp); - - fdrop(fp, td); - if (fromsnap) - dmu_objset_close(fromsnap); - dmu_objset_close(tosnap); - return (error); -} - -static int -zfs_ioc_inject_fault(zfs_cmd_t *zc) -{ - int id, error; - - error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, - &zc->zc_inject_record); - - if (error == 0) - zc->zc_guid = (uint64_t)id; - - return (error); -} - -static int -zfs_ioc_clear_fault(zfs_cmd_t *zc) -{ - return (zio_clear_fault((int)zc->zc_guid)); -} - -static int -zfs_ioc_inject_list_next(zfs_cmd_t *zc) -{ - 
int id = (int)zc->zc_guid; - int error; - - error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), - &zc->zc_inject_record); - - zc->zc_guid = id; - - return (error); -} - -static int -zfs_ioc_error_log(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - size_t count = (size_t)zc->zc_nvlist_dst_size; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, - &count); - if (error == 0) - zc->zc_nvlist_dst_size = count; - else - zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); - - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_clear(zfs_cmd_t *zc) -{ - spa_t *spa; - vdev_t *vd; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - spa_config_enter(spa, RW_WRITER, FTAG); - - if (zc->zc_guid == 0) { - vd = NULL; - } else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) { - spa_config_exit(spa, FTAG); - spa_close(spa, FTAG); - return (ENODEV); - } - - vdev_clear(spa, vd); - - spa_config_exit(spa, FTAG); - - spa_close(spa, FTAG); - - return (0); -} - -static int -zfs_ioc_promote(zfs_cmd_t *zc) -{ - char *cp; - - /* - * We don't need to unmount *all* the origin fs's snapshots, but - * it's easier. - */ - cp = strchr(zc->zc_value, '@'); - if (cp) - *cp = '\0'; - (void) dmu_objset_find(zc->zc_value, - zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS); - return (dsl_dataset_promote(zc->zc_name)); -} - -static int -zfs_ioc_jail(zfs_cmd_t *zc) -{ - - return (zone_dataset_attach((cred_t *)(uintptr_t)zc->zc_cred, - zc->zc_name, (int)zc->zc_jailid)); -} - -static int -zfs_ioc_unjail(zfs_cmd_t *zc) -{ - - return (zone_dataset_detach((cred_t *)(uintptr_t)zc->zc_cred, - zc->zc_name, (int)zc->zc_jailid)); -} - -static zfs_ioc_vec_t zfs_ioc_vec[] = { - { zfs_ioc_pool_create, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_import, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_export, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_configs, zfs_secpolicy_none, no_name }, - { zfs_ioc_pool_stats, zfs_secpolicy_read, pool_name }, - { zfs_ioc_pool_tryimport, zfs_secpolicy_config, no_name }, - { zfs_ioc_pool_scrub, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_freeze, zfs_secpolicy_config, no_name }, - { zfs_ioc_pool_upgrade, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_get_history, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_log_history, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_add, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_remove, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_online, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_offline, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_attach, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_detach, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_setpath, zfs_secpolicy_config, pool_name }, - { zfs_ioc_objset_stats, zfs_secpolicy_read, dataset_name }, - { zfs_ioc_dataset_list_next, zfs_secpolicy_read, dataset_name }, - { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, dataset_name }, - { zfs_ioc_set_prop, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_create_minor, zfs_secpolicy_config, dataset_name }, - { zfs_ioc_remove_minor, zfs_secpolicy_config, dataset_name }, - { zfs_ioc_create, zfs_secpolicy_parent, dataset_name }, - { zfs_ioc_destroy, zfs_secpolicy_parent, dataset_name }, - { zfs_ioc_rollback, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_rename, 
zfs_secpolicy_write, dataset_name }, - { zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_sendbackup, zfs_secpolicy_operator, dataset_name }, - { zfs_ioc_inject_fault, zfs_secpolicy_inject, no_name }, - { zfs_ioc_clear_fault, zfs_secpolicy_inject, no_name }, - { zfs_ioc_inject_list_next, zfs_secpolicy_inject, no_name }, - { zfs_ioc_error_log, zfs_secpolicy_inject, pool_name }, - { zfs_ioc_clear, zfs_secpolicy_config, pool_name }, - { zfs_ioc_promote, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_destroy_snaps, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_snapshot, zfs_secpolicy_operator, dataset_name }, - { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, pool_name }, - { zfs_ioc_obj_to_path, zfs_secpolicy_config, no_name }, - { zfs_ioc_pool_set_props, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_get_props, zfs_secpolicy_read, pool_name }, - { zfs_ioc_jail, zfs_secpolicy_config, dataset_name }, - { zfs_ioc_unjail, zfs_secpolicy_config, dataset_name } -}; - -static int -zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, - struct thread *td) -{ - zfs_cmd_t *zc = (void *)addr; - uint_t vec; - int error; - - vec = ZFS_IOC(cmd); - - if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) - return (EINVAL); - - zc->zc_cred = (uintptr_t)td->td_ucred; - zc->zc_dev = (uintptr_t)dev; - error = zfs_ioc_vec[vec].zvec_secpolicy(zc->zc_name, td->td_ucred); - - /* - * Ensure that all pool/dataset names are valid before we pass down to - * the lower layers. - */ - if (error == 0) { - zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; - switch (zfs_ioc_vec[vec].zvec_namecheck) { - case pool_name: - if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) - error = EINVAL; - break; - - case dataset_name: - if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) - error = EINVAL; - break; - - case no_name: - break; - } - } - - if (error == 0) - error = zfs_ioc_vec[vec].zvec_func(zc); - - return (error); -} - -/* - * OK, so this is a little weird. - * - * /dev/zfs is the control node, i.e. minor 0. - * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0. - * - * /dev/zfs has basically nothing to do except serve up ioctls, - * so most of the standard driver entry points are in zvol.c. 
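A sketch of how the dispatch table above is meant to be extended; the handler name is hypothetical:

	/*
	 * Appending a handler to zfs_ioc_vec and defining the matching
	 * ZFS_IOC_* command number lets ZFS_IOC(cmd) index it:
	 *
	 *	{ zfs_ioc_example, zfs_secpolicy_read, dataset_name },
	 *
	 * zfsdev_ioctl() then runs the entry's zvec_secpolicy check and the
	 * pool/dataset name check before zvec_func is ever called.
	 */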
- */ -static struct cdevsw zfs_cdevsw = { - .d_version = D_VERSION, - .d_ioctl = zfsdev_ioctl, - .d_name = ZFS_DEV_NAME -}; - -static void -zfsdev_init(void) -{ - zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0660, - ZFS_DEV_NAME); -} - -static void -zfsdev_fini(void) -{ - if (zfsdev != NULL) - destroy_dev(zfsdev); -} - -static struct task zfs_start_task; -static struct root_hold_token *zfs_root_token; - -static void -zfs_start(void *context __unused, int pending __unused) -{ - - zfsdev_init(); - spa_init(FREAD | FWRITE); - zfs_init(); - zvol_init(); - printf("ZFS storage pool version " ZFS_VERSION_STRING "\n"); - root_mount_rel(zfs_root_token); -} - -static int -zfs_modevent(module_t mod, int type, void *unused __unused) -{ - int error; - - error = EOPNOTSUPP; - switch (type) { - case MOD_LOAD: - zfs_root_token = root_mount_hold("ZFS"); - printf("WARNING: ZFS is considered to be an experimental " - "feature in FreeBSD.\n"); - TASK_INIT(&zfs_start_task, 0, zfs_start, NULL); - taskqueue_enqueue(taskqueue_thread, &zfs_start_task); - error = 0; - break; - case MOD_UNLOAD: - if (spa_busy() || zfs_busy() || zvol_busy() || - zio_injection_enabled) { - error = EBUSY; - break; - } - zvol_fini(); - zfs_fini(); - spa_fini(); - zfsdev_fini(); - error = 0; - break; - } - return (error); -} - -static moduledata_t zfs_mod = { - "zfsctrl", - zfs_modevent, - 0 -}; -DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY); diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c deleted file mode 100644 index dde9ec1..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c +++ /dev/null @@ -1,349 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/sysmacros.h> -#include <sys/cmn_err.h> -#include <sys/kmem.h> -#include <sys/file.h> -#include <sys/vfs.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_dir.h> -#include <sys/zil.h> -#include <sys/byteorder.h> -#include <sys/policy.h> -#include <sys/stat.h> -#include <sys/acl.h> -#include <sys/dmu.h> -#include <sys/spa.h> - -/* - * All the functions in this file are used to construct the log entries - * to record transactions. They allocate * a intent log transaction - * structure (itx_t) and save within it all the information necessary to - * possibly replay the transaction. The itx is then assigned a sequence - * number and inserted in the in-memory list anchored in the zilog. 
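A worked sizing example for the pattern shared by the zfs_log_*() functions below:

	/*
	 * A TX_REMOVE record for a file named "foo" allocates an itx of
	 * sizeof (lr_remove_t) + 4 bytes; the extra 4 bytes hold "foo"
	 * plus its terminating NUL, copied in with bcopy() just past the
	 * fixed-size record.
	 */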
- */ - -/* - * zfs_log_create() is used to handle TX_CREATE, TX_MKDIR and TX_MKXATTR - * transactions. - */ -void -zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *dzp, znode_t *zp, char *name) -{ - itx_t *itx; - uint64_t seq; - lr_create_t *lr; - size_t namesize = strlen(name) + 1; - - if (zilog == NULL) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + namesize); - lr = (lr_create_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - lr->lr_foid = zp->z_id; - lr->lr_mode = zp->z_phys->zp_mode; - lr->lr_uid = zp->z_phys->zp_uid; - lr->lr_gid = zp->z_phys->zp_gid; - lr->lr_gen = zp->z_phys->zp_gen; - lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; - lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; - lr->lr_rdev = zp->z_phys->zp_rdev; - bcopy(name, (char *)(lr + 1), namesize); - - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; -} - -/* - * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions. - */ -void -zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *dzp, char *name) -{ - itx_t *itx; - uint64_t seq; - lr_remove_t *lr; - size_t namesize = strlen(name) + 1; - - if (zilog == NULL) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + namesize); - lr = (lr_remove_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - bcopy(name, (char *)(lr + 1), namesize); - - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; -} - -/* - * zfs_log_link() handles TX_LINK transactions. - */ -void -zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *dzp, znode_t *zp, char *name) -{ - itx_t *itx; - uint64_t seq; - lr_link_t *lr; - size_t namesize = strlen(name) + 1; - - if (zilog == NULL) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + namesize); - lr = (lr_link_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - lr->lr_link_obj = zp->z_id; - bcopy(name, (char *)(lr + 1), namesize); - - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; -} - -/* - * zfs_log_symlink() handles TX_SYMLINK transactions. - */ -void -zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *dzp, znode_t *zp, char *name, char *link) -{ - itx_t *itx; - uint64_t seq; - lr_create_t *lr; - size_t namesize = strlen(name) + 1; - size_t linksize = strlen(link) + 1; - - if (zilog == NULL) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); - lr = (lr_create_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - lr->lr_foid = zp->z_id; - lr->lr_mode = zp->z_phys->zp_mode; - lr->lr_uid = zp->z_phys->zp_uid; - lr->lr_gid = zp->z_phys->zp_gid; - lr->lr_gen = zp->z_phys->zp_gen; - lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; - lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; - bcopy(name, (char *)(lr + 1), namesize); - bcopy(link, (char *)(lr + 1) + namesize, linksize); - - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; -} - -/* - * zfs_log_rename() handles TX_RENAME transactions. 
- */ -void -zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) -{ - itx_t *itx; - uint64_t seq; - lr_rename_t *lr; - size_t snamesize = strlen(sname) + 1; - size_t dnamesize = strlen(dname) + 1; - - if (zilog == NULL) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); - lr = (lr_rename_t *)&itx->itx_lr; - lr->lr_sdoid = sdzp->z_id; - lr->lr_tdoid = tdzp->z_id; - bcopy(sname, (char *)(lr + 1), snamesize); - bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); - - seq = zil_itx_assign(zilog, itx, tx); - sdzp->z_last_itx = seq; - tdzp->z_last_itx = seq; - szp->z_last_itx = seq; -} - -/* - * zfs_log_write() handles TX_WRITE transactions. - */ -ssize_t zfs_immediate_write_sz = 32768; - -void -zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t len, int ioflag) -{ - itx_t *itx; - uint64_t seq; - lr_write_t *lr; - itx_wr_state_t write_state; - int err; - - if (zilog == NULL || zp->z_unlinked) - return; - - /* - * Writes are handled in three different ways: - * - * WR_INDIRECT: - * If the write is greater than zfs_immediate_write_sz then - * later *if* we need to log the write then dmu_sync() is used - * to immediately write the block and it's block pointer is put - * in the log record. - * WR_COPIED: - * If we know we'll immediately be committing the - * transaction (FDSYNC (O_DSYNC)), the we allocate a larger - * log record here for the data and copy the data in. - * WR_NEED_COPY: - * Otherwise we don't allocate a buffer, and *if* we need to - * flush the write later then a buffer is allocated and - * we retrieve the data using the dmu. - */ - if (len > zfs_immediate_write_sz) - write_state = WR_INDIRECT; - else if (ioflag & FDSYNC) - write_state = WR_COPIED; - else - write_state = WR_NEED_COPY; - - itx = zil_itx_create(txtype, sizeof (*lr) + - (write_state == WR_COPIED ? len : 0)); - lr = (lr_write_t *)&itx->itx_lr; - if (write_state == WR_COPIED) { - err = dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1); - if (err) { - kmem_free(itx, offsetof(itx_t, itx_lr) + - itx->itx_lr.lrc_reclen); - itx = zil_itx_create(txtype, sizeof (*lr)); - lr = (lr_write_t *)&itx->itx_lr; - write_state = WR_NEED_COPY; - } - } - - itx->itx_wr_state = write_state; - lr->lr_foid = zp->z_id; - lr->lr_offset = off; - lr->lr_length = len; - lr->lr_blkoff = 0; - BP_ZERO(&lr->lr_blkptr); - - itx->itx_private = zp->z_zfsvfs; - - itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; -} - -/* - * zfs_log_truncate() handles TX_TRUNCATE transactions. - */ -void -zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, uint64_t off, uint64_t len) -{ - itx_t *itx; - uint64_t seq; - lr_truncate_t *lr; - - if (zilog == NULL || zp->z_unlinked) - return; - - itx = zil_itx_create(txtype, sizeof (*lr)); - lr = (lr_truncate_t *)&itx->itx_lr; - lr->lr_foid = zp->z_id; - lr->lr_offset = off; - lr->lr_length = len; - - itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; -} - -/* - * zfs_log_setattr() handles TX_SETATTR transactions. 
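A worked example of the write-state choice made by zfs_log_write() above, assuming the default zfs_immediate_write_sz of 32768 bytes:

	/*
	 *	64 KB write, any flags   -> WR_INDIRECT (dmu_sync() writes the
	 *	                            block only if the itx must be committed)
	 *	 4 KB write, FDSYNC set  -> WR_COPIED (data copied into the log
	 *	                            record immediately)
	 *	 4 KB write, async       -> WR_NEED_COPY (data fetched through the
	 *	                            DMU only if a flush is later required)
	 */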
- */ -void -zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, vattr_t *vap, uint_t mask_applied) -{ - itx_t *itx; - uint64_t seq; - lr_setattr_t *lr; - - if (zilog == NULL || zp->z_unlinked) - return; - - itx = zil_itx_create(txtype, sizeof (*lr)); - lr = (lr_setattr_t *)&itx->itx_lr; - lr->lr_foid = zp->z_id; - lr->lr_mask = (uint64_t)mask_applied; - lr->lr_mode = (uint64_t)vap->va_mode; - lr->lr_uid = (uint64_t)vap->va_uid; - lr->lr_gid = (uint64_t)vap->va_gid; - lr->lr_size = (uint64_t)vap->va_size; - ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); - ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); - - itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; -} - -/* - * zfs_log_acl() handles TX_ACL transactions. - */ -void -zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, int aclcnt, ace_t *z_ace) -{ - itx_t *itx; - uint64_t seq; - lr_acl_t *lr; - - if (zilog == NULL || zp->z_unlinked) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + aclcnt * sizeof (ace_t)); - lr = (lr_acl_t *)&itx->itx_lr; - lr->lr_foid = zp->z_id; - lr->lr_aclcnt = (uint64_t)aclcnt; - bcopy(z_ace, (ace_t *)(lr + 1), aclcnt * sizeof (ace_t)); - - itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c deleted file mode 100644 index 2be3093..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c +++ /dev/null @@ -1,430 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/sysmacros.h> -#include <sys/cmn_err.h> -#include <sys/kmem.h> -#include <sys/file.h> -#include <sys/fcntl.h> -#include <sys/vfs.h> -#include <sys/fs/zfs.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_dir.h> -#include <sys/zfs_acl.h> -#include <sys/spa.h> -#include <sys/zil.h> -#include <sys/byteorder.h> -#include <sys/stat.h> -#include <sys/acl.h> -#include <sys/atomic.h> -#include <sys/cred.h> -#include <sys/namei.h> - -/* - * Functions to replay ZFS intent log (ZIL) records - * The functions are called through a function vector (zfs_replay_vector) - * which is indexed by the transaction type. 
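A dispatch example for the vector defined at the end of this file:

	/*
	 * A TX_RMDIR log record is replayed by zfs_replay_remove(), a
	 * TX_WRITE record by zfs_replay_write(), and an unrecognized
	 * transaction type falls into slot 0, zfs_replay_error(), which
	 * returns ENOTSUP.
	 */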
- */ - -static void -zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, - uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) -{ - VATTR_NULL(vap); - vap->va_mask = (uint_t)mask; - vap->va_type = IFTOVT(mode); - vap->va_mode = mode & MODEMASK; - vap->va_uid = (uid_t)uid; - vap->va_gid = (gid_t)gid; - vap->va_rdev = zfs_cmpldev(rdev); - vap->va_nodeid = nodeid; -} - -/* ARGSUSED */ -static int -zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap) -{ - return (ENOTSUP); -} - -static int -zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) -{ - char *name = (char *)(lr + 1); /* name follows lr_create_t */ - char *link; /* symlink content follows name */ - znode_t *dzp; - vnode_t *vp = NULL; - vattr_t va; - struct componentname cn; - int error; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID, - lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); - - /* - * All forms of zfs create (create, mkdir, mkxattrdir, symlink) - * eventually end up in zfs_mknode(), which assigns the object's - * creation time and generation number. The generic VOP_CREATE() - * doesn't have either concept, so we smuggle the values inside - * the vattr's otherwise unused va_ctime and va_nblocks fields. - */ - ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime); - va.va_nblocks = lr->lr_gen; - - cn.cn_nameptr = name; - cn.cn_cred = kcred; - cn.cn_thread = curthread; - cn.cn_flags = SAVENAME; - - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); - switch ((int)lr->lr_common.lrc_txtype) { - case TX_CREATE: - error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &va); - break; - case TX_MKDIR: - error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &va); - break; - case TX_MKXATTR: - error = zfs_make_xattrdir(dzp, &va, &vp, kcred); - break; - case TX_SYMLINK: - link = name + strlen(name) + 1; - error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &va, link); - break; - default: - error = ENOTSUP; - } - VOP_UNLOCK(ZTOV(dzp), 0); - - if (error == 0 && vp != NULL) { - VOP_UNLOCK(vp, 0); - VN_RELE(vp); - } - - VN_RELE(ZTOV(dzp)); - - return (error); -} - -static int -zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) -{ - char *name = (char *)(lr + 1); /* name follows lr_remove_t */ - znode_t *dzp; - struct componentname cn; - vnode_t *vp; - int error; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - bzero(&cn, sizeof(cn)); - cn.cn_nameptr = name; - cn.cn_namelen = strlen(name); - cn.cn_nameiop = DELETE; - cn.cn_flags = ISLASTCN | SAVENAME; - cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; - cn.cn_cred = kcred; - cn.cn_thread = curthread; - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); - error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn); - if (error != 0) { - VOP_UNLOCK(ZTOV(dzp), 0); - goto fail; - } - - switch ((int)lr->lr_common.lrc_txtype) { - case TX_REMOVE: - error = VOP_REMOVE(ZTOV(dzp), vp, &cn); - break; - case TX_RMDIR: - error = VOP_RMDIR(ZTOV(dzp), vp, &cn); - break; - default: - error = ENOTSUP; - } - vput(vp); - VOP_UNLOCK(ZTOV(dzp), 0); -fail: - VN_RELE(ZTOV(dzp)); - - return (error); -} - -static int -zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) -{ - char *name = (char *)(lr + 1); /* name follows lr_link_t */ - znode_t *dzp, *zp; - struct componentname cn; - int error; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if 
((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { - VN_RELE(ZTOV(dzp)); - return (error); - } - - cn.cn_nameptr = name; - cn.cn_cred = kcred; - cn.cn_thread = curthread; - cn.cn_flags = SAVENAME; - - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); - vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); - error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn); - VOP_UNLOCK(ZTOV(zp), 0); - VOP_UNLOCK(ZTOV(dzp), 0); - - VN_RELE(ZTOV(zp)); - VN_RELE(ZTOV(dzp)); - - return (error); -} - -static int -zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) -{ - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; - znode_t *sdzp, *tdzp; - struct componentname scn, tcn; - vnode_t *svp, *tvp; - kthread_t *td = curthread; - int error; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) - return (error); - - if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { - VN_RELE(ZTOV(sdzp)); - return (error); - } - - svp = tvp = NULL; - - bzero(&scn, sizeof(scn)); - scn.cn_nameptr = sname; - scn.cn_namelen = strlen(sname); - scn.cn_nameiop = DELETE; - scn.cn_flags = ISLASTCN | SAVENAME; - scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; - scn.cn_cred = kcred; - scn.cn_thread = td; - vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY); - error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn); - VOP_UNLOCK(ZTOV(sdzp), 0); - if (error != 0) - goto fail; - VOP_UNLOCK(svp, 0); - - bzero(&tcn, sizeof(tcn)); - tcn.cn_nameptr = tname; - tcn.cn_namelen = strlen(tname); - tcn.cn_nameiop = RENAME; - tcn.cn_flags = ISLASTCN | SAVENAME; - tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; - tcn.cn_cred = kcred; - tcn.cn_thread = td; - vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY); - error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn); - if (error == EJUSTRETURN) - tvp = NULL; - else if (error != 0) { - VOP_UNLOCK(ZTOV(tdzp), 0); - goto fail; - } - - error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn); - return (error); -fail: - if (svp != NULL) - vrele(svp); - if (tvp != NULL) - vrele(tvp); - VN_RELE(ZTOV(tdzp)); - VN_RELE(ZTOV(sdzp)); - - return (error); -} - -static int -zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) -{ - char *data = (char *)(lr + 1); /* data follows lr_write_t */ - znode_t *zp; - int error; - ssize_t resid; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log writes out of order, it's possible the - * file has been removed. In this case just drop the write - * and return success. - */ - if (error == ENOENT) - error = 0; - return (error); - } - - error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length, - lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); - - VN_RELE(ZTOV(zp)); - - return (error); -} - -static int -zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap) -{ - - ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org"); - return (EOPNOTSUPP); -} - -static int -zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) -{ - znode_t *zp; - vattr_t va; - vnode_t *vp; - int error; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log setattrs out of order, it's possible the - * file has been removed. 
In this case just drop the setattr - * and return success. - */ - if (error == ENOENT) - error = 0; - return (error); - } - - zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode, - lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); - - va.va_size = lr->lr_size; - ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime); - ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime); - - vp = ZTOV(zp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - error = VOP_SETATTR(vp, &va, kcred, curthread); - VOP_UNLOCK(vp, 0); - VN_RELE(vp); - - return (error); -} - -static int -zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) -{ - ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ -#ifdef TODO - vsecattr_t vsa; -#endif - znode_t *zp; - int error; - - if (byteswap) { - byteswap_uint64_array(lr, sizeof (*lr)); - zfs_ace_byteswap(ace, lr->lr_aclcnt); - } - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log acls out of order, it's possible the - * file has been removed. In this case just drop the acl - * and return success. - */ - if (error == ENOENT) - error = 0; - return (error); - } - -#ifdef TODO - bzero(&vsa, sizeof (vsa)); - vsa.vsa_mask = VSA_ACE | VSA_ACECNT; - vsa.vsa_aclcnt = lr->lr_aclcnt; - vsa.vsa_aclentp = ace; - - error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred); -#else - error = EOPNOTSUPP; -#endif - - VN_RELE(ZTOV(zp)); - - return (error); -} - -/* - * Callback vectors for replaying records - */ -zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { - zfs_replay_error, /* 0 no such transaction type */ - zfs_replay_create, /* TX_CREATE */ - zfs_replay_create, /* TX_MKDIR */ - zfs_replay_create, /* TX_MKXATTR */ - zfs_replay_create, /* TX_SYMLINK */ - zfs_replay_remove, /* TX_REMOVE */ - zfs_replay_remove, /* TX_RMDIR */ - zfs_replay_link, /* TX_LINK */ - zfs_replay_rename, /* TX_RENAME */ - zfs_replay_write, /* TX_WRITE */ - zfs_replay_truncate, /* TX_TRUNCATE */ - zfs_replay_setattr, /* TX_SETATTR */ - zfs_replay_acl, /* TX_ACL */ -}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c deleted file mode 100644 index 07ec0f6..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c +++ /dev/null @@ -1,594 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * This file contains the code to implement file range locking in - * ZFS, although there isn't much specific to ZFS (all that comes to mind - * support for growing the blocksize). 
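A usage sketch of the interface summarized below; zp is a znode the caller already holds, and the offsets and lengths are hypothetical:

	rl_t *rl;

	/* Shared lock while reading a range. */
	rl = zfs_range_lock(zp, off, len, RL_READER);
	/* ... read the data through the DMU ... */
	zfs_range_unlock(rl);

	/* Append-mode write: the real offset is resolved under the lock. */
	rl = zfs_range_lock(zp, 0, nbytes, RL_APPEND);
	/* ... rl->r_off now holds the end-of-file offset ... */
	zfs_range_unlock(rl);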
- * - * Interface - * --------- - * Defined in zfs_rlock.h but essentially: - * rl = zfs_range_lock(zp, off, len, lock_type); - * zfs_range_unlock(rl); - * zfs_range_reduce(rl, off, len); - * - * AVL tree - * -------- - * An AVL tree is used to maintain the state of the existing ranges - * that are locked for exclusive (writer) or shared (reader) use. - * The starting range offset is used for searching and sorting the tree. - * - * Common case - * ----------- - * The (hopefully) usual case is of no overlaps or contention for - * locks. On entry to zfs_lock_range() a rl_t is allocated; the tree - * searched that finds no overlap, and *this* rl_t is placed in the tree. - * - * Overlaps/Reference counting/Proxy locks - * --------------------------------------- - * The avl code only allows one node at a particular offset. Also it's very - * inefficient to search through all previous entries looking for overlaps - * (because the very 1st in the ordered list might be at offset 0 but - * cover the whole file). - * So this implementation uses reference counts and proxy range locks. - * Firstly, only reader locks use reference counts and proxy locks, - * because writer locks are exclusive. - * When a reader lock overlaps with another then a proxy lock is created - * for that range and replaces the original lock. If the overlap - * is exact then the reference count of the proxy is simply incremented. - * Otherwise, the proxy lock is split into smaller lock ranges and - * new proxy locks created for non overlapping ranges. - * The reference counts are adjusted accordingly. - * Meanwhile, the orginal lock is kept around (this is the callers handle) - * and its offset and length are used when releasing the lock. - * - * Thread coordination - * ------------------- - * In order to make wakeups efficient and to ensure multiple continuous - * readers on a range don't starve a writer for the same range lock, - * two condition variables are allocated in each rl_t. - * If a writer (or reader) can't get a range it initialises the writer - * (or reader) cv; sets a flag saying there's a writer (or reader) waiting; - * and waits on that cv. When a thread unlocks that range it wakes up all - * writers then all readers before destroying the lock. - * - * Append mode writes - * ------------------ - * Append mode writes need to lock a range at the end of a file. - * The offset of the end of the file is determined under the - * range locking mutex, and the lock type converted from RL_APPEND to - * RL_WRITER and the range locked. - * - * Grow block handling - * ------------------- - * ZFS supports multiple block sizes currently upto 128K. The smallest - * block size is used for the file which is grown as needed. During this - * growth all other writers and readers must be excluded. - * So if the block size needs to be grown then the whole file is - * exclusively locked, then later the caller will reduce the lock - * range to just the range to be written using zfs_reduce_range. - */ - -#include <sys/zfs_rlock.h> - -/* - * Check if a write lock can be grabbed, or wait and recheck until available. - */ -static void -zfs_range_lock_writer(znode_t *zp, rl_t *new) -{ - avl_tree_t *tree = &zp->z_range_avl; - rl_t *rl; - avl_index_t where; - uint64_t end_size; - uint64_t off = new->r_off; - uint64_t len = new->r_len; - - for (;;) { - /* - * Range locking is also used by zvol and uses a - * dummied up znode. 
However, for zvol, we don't need to - * append or grow blocksize, and besides we don't have - * a z_phys or z_zfsvfs - so skip that processing. - * - * Yes, this is ugly, and would be solved by not handling - * grow or append in range lock code. If that was done then - * we could make the range locking code generically available - * to other non-zfs consumers. - */ - if (zp->z_vnode) { /* caller is ZPL */ - /* - * If in append mode pick up the current end of file. - * This is done under z_range_lock to avoid races. - */ - if (new->r_type == RL_APPEND) - new->r_off = zp->z_phys->zp_size; - - /* - * If we need to grow the block size then grab the whole - * file range. This is also done under z_range_lock to - * avoid races. - */ - end_size = MAX(zp->z_phys->zp_size, new->r_off + len); - if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || - zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { - new->r_off = 0; - new->r_len = UINT64_MAX; - } - } - - /* - * First check for the usual case of no locks - */ - if (avl_numnodes(tree) == 0) { - new->r_type = RL_WRITER; /* convert to writer */ - avl_add(tree, new); - return; - } - - /* - * Look for any locks in the range. - */ - rl = avl_find(tree, new, &where); - if (rl) - goto wait; /* already locked at same offset */ - - rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER); - if (rl && (rl->r_off < new->r_off + new->r_len)) - goto wait; - - rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); - if (rl && rl->r_off + rl->r_len > new->r_off) - goto wait; - - new->r_type = RL_WRITER; /* convert possible RL_APPEND */ - avl_insert(tree, new, where); - return; -wait: - if (!rl->r_write_wanted) { - cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); - rl->r_write_wanted = B_TRUE; - } - cv_wait(&rl->r_wr_cv, &zp->z_range_lock); - - /* reset to original */ - new->r_off = off; - new->r_len = len; - } -} - -/* - * If this is an original (non-proxy) lock then replace it by - * a proxy and return the proxy. - */ -static rl_t * -zfs_range_proxify(avl_tree_t *tree, rl_t *rl) -{ - rl_t *proxy; - - if (rl->r_proxy) - return (rl); /* already a proxy */ - - ASSERT3U(rl->r_cnt, ==, 1); - ASSERT(rl->r_write_wanted == B_FALSE); - ASSERT(rl->r_read_wanted == B_FALSE); - avl_remove(tree, rl); - rl->r_cnt = 0; - - /* create a proxy range lock */ - proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP); - proxy->r_off = rl->r_off; - proxy->r_len = rl->r_len; - proxy->r_cnt = 1; - proxy->r_type = RL_READER; - proxy->r_proxy = B_TRUE; - proxy->r_write_wanted = B_FALSE; - proxy->r_read_wanted = B_FALSE; - avl_add(tree, proxy); - - return (proxy); -} - -/* - * Split the range lock at the supplied offset - * returning the *front* proxy. - */ -static rl_t * -zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) -{ - rl_t *front, *rear; - - ASSERT3U(rl->r_len, >, 1); - ASSERT3U(off, >, rl->r_off); - ASSERT3U(off, <, rl->r_off + rl->r_len); - ASSERT(rl->r_write_wanted == B_FALSE); - ASSERT(rl->r_read_wanted == B_FALSE); - - /* create the rear proxy range lock */ - rear = kmem_alloc(sizeof (rl_t), KM_SLEEP); - rear->r_off = off; - rear->r_len = rl->r_off + rl->r_len - off; - rear->r_cnt = rl->r_cnt; - rear->r_type = RL_READER; - rear->r_proxy = B_TRUE; - rear->r_write_wanted = B_FALSE; - rear->r_read_wanted = B_FALSE; - - front = zfs_range_proxify(tree, rl); - front->r_len = off - rl->r_off; - - avl_insert_here(tree, rear, front, AVL_AFTER); - return (front); -} - -/* - * Create and add a new proxy range lock for the supplied range. 
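A worked example of the proxy and reference-count behaviour implemented by zfs_range_add_reader() and zfs_range_unlock_reader() below; the byte offsets are hypothetical:

	/*
	 *	reader A locks [0, 100)    one rl_t in the tree, r_cnt == 1
	 *	reader B locks [50, 150)   A's entry is replaced by proxies:
	 *	                             [0, 50)     r_cnt 1  (A only)
	 *	                             [50, 100)   r_cnt 2  (A and B)
	 *	                             [100, 150)  r_cnt 1  (B only)
	 *	A unlocks [0, 100)         [0, 50) drops to zero and is freed,
	 *	                           [50, 100) falls back to r_cnt 1
	 */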
- */ -static void -zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) -{ - rl_t *rl; - - ASSERT(len); - rl = kmem_alloc(sizeof (rl_t), KM_SLEEP); - rl->r_off = off; - rl->r_len = len; - rl->r_cnt = 1; - rl->r_type = RL_READER; - rl->r_proxy = B_TRUE; - rl->r_write_wanted = B_FALSE; - rl->r_read_wanted = B_FALSE; - avl_add(tree, rl); -} - -static void -zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) -{ - rl_t *next; - uint64_t off = new->r_off; - uint64_t len = new->r_len; - - /* - * prev arrives either: - * - pointing to an entry at the same offset - * - pointing to the entry with the closest previous offset whose - * range may overlap with the new range - * - null, if there were no ranges starting before the new one - */ - if (prev) { - if (prev->r_off + prev->r_len <= off) { - prev = NULL; - } else if (prev->r_off != off) { - /* - * convert to proxy if needed then - * split this entry and bump ref count - */ - prev = zfs_range_split(tree, prev, off); - prev = AVL_NEXT(tree, prev); /* move to rear range */ - } - } - ASSERT((prev == NULL) || (prev->r_off == off)); - - if (prev) - next = prev; - else - next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); - - if (next == NULL || off + len <= next->r_off) { - /* no overlaps, use the original new rl_t in the tree */ - avl_insert(tree, new, where); - return; - } - - if (off < next->r_off) { - /* Add a proxy for initial range before the overlap */ - zfs_range_new_proxy(tree, off, next->r_off - off); - } - - new->r_cnt = 0; /* will use proxies in tree */ - /* - * We now search forward through the ranges, until we go past the end - * of the new range. For each entry we make it a proxy if it - * isn't already, then bump its reference count. If there's any - * gaps between the ranges then we create a new proxy range. - */ - for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { - if (off + len <= next->r_off) - break; - if (prev && prev->r_off + prev->r_len < next->r_off) { - /* there's a gap */ - ASSERT3U(next->r_off, >, prev->r_off + prev->r_len); - zfs_range_new_proxy(tree, prev->r_off + prev->r_len, - next->r_off - (prev->r_off + prev->r_len)); - } - if (off + len == next->r_off + next->r_len) { - /* exact overlap with end */ - next = zfs_range_proxify(tree, next); - next->r_cnt++; - return; - } - if (off + len < next->r_off + next->r_len) { - /* new range ends in the middle of this block */ - next = zfs_range_split(tree, next, off + len); - next->r_cnt++; - return; - } - ASSERT3U(off + len, >, next->r_off + next->r_len); - next = zfs_range_proxify(tree, next); - next->r_cnt++; - } - - /* Add the remaining end range. */ - zfs_range_new_proxy(tree, prev->r_off + prev->r_len, - (off + len) - (prev->r_off + prev->r_len)); -} - -/* - * Check if a reader lock can be grabbed, or wait and recheck until available. - */ -static void -zfs_range_lock_reader(znode_t *zp, rl_t *new) -{ - avl_tree_t *tree = &zp->z_range_avl; - rl_t *prev, *next; - avl_index_t where; - uint64_t off = new->r_off; - uint64_t len = new->r_len; - - /* - * Look for any writer locks in the range. - */ -retry: - prev = avl_find(tree, new, &where); - if (prev == NULL) - prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); - - /* - * Check the previous range for a writer lock overlap. 
- */ - if (prev && (off < prev->r_off + prev->r_len)) { - if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) { - if (!prev->r_read_wanted) { - cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL); - prev->r_read_wanted = B_TRUE; - } - cv_wait(&prev->r_rd_cv, &zp->z_range_lock); - goto retry; - } - if (off + len < prev->r_off + prev->r_len) - goto got_lock; - } - - /* - * Search through the following ranges to see if there's - * write lock any overlap. - */ - if (prev) - next = AVL_NEXT(tree, prev); - else - next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); - for (; next; next = AVL_NEXT(tree, next)) { - if (off + len <= next->r_off) - goto got_lock; - if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) { - if (!next->r_read_wanted) { - cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL); - next->r_read_wanted = B_TRUE; - } - cv_wait(&next->r_rd_cv, &zp->z_range_lock); - goto retry; - } - if (off + len <= next->r_off + next->r_len) - goto got_lock; - } - -got_lock: - /* - * Add the read lock, which may involve splitting existing - * locks and bumping ref counts (r_cnt). - */ - zfs_range_add_reader(tree, new, prev, where); -} - -/* - * Lock a range (offset, length) as either shared (RL_READER) - * or exclusive (RL_WRITER). Returns the range lock structure - * for later unlocking or reduce range (if entire file - * previously locked as RL_WRITER). - */ -rl_t * -zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) -{ - rl_t *new; - - ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); - - new = kmem_alloc(sizeof (rl_t), KM_SLEEP); - new->r_zp = zp; - new->r_off = off; - new->r_len = len; - new->r_cnt = 1; /* assume it's going to be in the tree */ - new->r_type = type; - new->r_proxy = B_FALSE; - new->r_write_wanted = B_FALSE; - new->r_read_wanted = B_FALSE; - - mutex_enter(&zp->z_range_lock); - if (type == RL_READER) { - /* - * First check for the usual case of no locks - */ - if (avl_numnodes(&zp->z_range_avl) == 0) - avl_add(&zp->z_range_avl, new); - else - zfs_range_lock_reader(zp, new); - } else - zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */ - mutex_exit(&zp->z_range_lock); - return (new); -} - -/* - * Unlock a reader lock - */ -static void -zfs_range_unlock_reader(znode_t *zp, rl_t *remove) -{ - avl_tree_t *tree = &zp->z_range_avl; - rl_t *rl, *next; - uint64_t len; - - /* - * The common case is when the remove entry is in the tree - * (cnt == 1) meaning there's been no other reader locks overlapping - * with this one. Otherwise the remove entry will have been - * removed from the tree and replaced by proxies (one or - * more ranges mapping to the entire range). - */ - if (remove->r_cnt == 1) { - avl_remove(tree, remove); - if (remove->r_write_wanted) - cv_broadcast(&remove->r_wr_cv); - if (remove->r_read_wanted) - cv_broadcast(&remove->r_rd_cv); - } else { - ASSERT3U(remove->r_cnt, ==, 0); - ASSERT3U(remove->r_write_wanted, ==, 0); - ASSERT3U(remove->r_read_wanted, ==, 0); - /* - * Find start proxy representing this reader lock, - * then decrement ref count on all proxies - * that make up this range, freeing them as needed. 
- */ - rl = avl_find(tree, remove, NULL); - ASSERT(rl); - ASSERT(rl->r_cnt); - ASSERT(rl->r_type == RL_READER); - for (len = remove->r_len; len != 0; rl = next) { - len -= rl->r_len; - if (len) { - next = AVL_NEXT(tree, rl); - ASSERT(next); - ASSERT(rl->r_off + rl->r_len == next->r_off); - ASSERT(next->r_cnt); - ASSERT(next->r_type == RL_READER); - } - rl->r_cnt--; - if (rl->r_cnt == 0) { - avl_remove(tree, rl); - if (rl->r_write_wanted) - cv_broadcast(&rl->r_wr_cv); - if (rl->r_read_wanted) - cv_broadcast(&rl->r_rd_cv); - kmem_free(rl, sizeof (rl_t)); - } - } - } - kmem_free(remove, sizeof (rl_t)); -} - -/* - * Unlock range and destroy range lock structure. - */ -void -zfs_range_unlock(rl_t *rl) -{ - znode_t *zp = rl->r_zp; - - ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER); - ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0); - ASSERT(!rl->r_proxy); - - mutex_enter(&zp->z_range_lock); - if (rl->r_type == RL_WRITER) { - /* writer locks can't be shared or split */ - avl_remove(&zp->z_range_avl, rl); - mutex_exit(&zp->z_range_lock); - if (rl->r_write_wanted) { - cv_broadcast(&rl->r_wr_cv); - cv_destroy(&rl->r_wr_cv); - } - if (rl->r_read_wanted) { - cv_broadcast(&rl->r_rd_cv); - cv_destroy(&rl->r_rd_cv); - } - kmem_free(rl, sizeof (rl_t)); - } else { - /* - * lock may be shared, let zfs_range_unlock_reader() - * release the lock and free the rl_t - */ - zfs_range_unlock_reader(zp, rl); - mutex_exit(&zp->z_range_lock); - } -} - -/* - * Reduce range locked as RL_WRITER from whole file to specified range. - * Asserts the whole file is exclusivly locked and so there's only one - * entry in the tree. - */ -void -zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) -{ - znode_t *zp = rl->r_zp; - - /* Ensure there are no other locks */ - ASSERT(avl_numnodes(&zp->z_range_avl) == 1); - ASSERT(rl->r_off == 0); - ASSERT(rl->r_type == RL_WRITER); - ASSERT(!rl->r_proxy); - ASSERT3U(rl->r_len, ==, UINT64_MAX); - ASSERT3U(rl->r_cnt, ==, 1); - - mutex_enter(&zp->z_range_lock); - rl->r_off = off; - rl->r_len = len; - mutex_exit(&zp->z_range_lock); - if (rl->r_write_wanted) - cv_broadcast(&rl->r_wr_cv); - if (rl->r_read_wanted) - cv_broadcast(&rl->r_rd_cv); -} - -/* - * AVL comparison function used to order range locks - * Locks are ordered on the start offset of the range. - */ -int -zfs_range_compare(const void *arg1, const void *arg2) -{ - const rl_t *rl1 = arg1; - const rl_t *rl2 = arg2; - - if (rl1->r_off > rl2->r_off) - return (1); - if (rl1->r_off < rl2->r_off) - return (-1); - return (0); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c deleted file mode 100644 index 28f3293..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ /dev/null @@ -1,1021 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/kernel.h> -#include <sys/sysmacros.h> -#include <sys/kmem.h> -#include <sys/acl.h> -#include <sys/vnode.h> -#include <sys/vfs.h> -#include <sys/mntent.h> -#include <sys/mount.h> -#include <sys/cmn_err.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_dir.h> -#include <sys/zil.h> -#include <sys/fs/zfs.h> -#include <sys/dmu.h> -#include <sys/dsl_prop.h> -#include <sys/dsl_dataset.h> -#include <sys/spa.h> -#include <sys/zap.h> -#include <sys/varargs.h> -#include <sys/policy.h> -#include <sys/atomic.h> -#include <sys/zfs_ioctl.h> -#include <sys/zfs_ctldir.h> -#include <sys/sunddi.h> -#include <sys/dnlc.h> - -struct mtx zfs_debug_mtx; -MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); -SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); -int zfs_debug_level = 0; -TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level); -SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0, - "Debug level"); - -static int zfs_mount(vfs_t *vfsp, kthread_t *td); -static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td); -static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td); -static int zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td); -static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); -static int zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td); -static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp); -static void zfs_objset_close(zfsvfs_t *zfsvfs); -static void zfs_freevfs(vfs_t *vfsp); - -static struct vfsops zfs_vfsops = { - .vfs_mount = zfs_mount, - .vfs_unmount = zfs_umount, - .vfs_root = zfs_root, - .vfs_statfs = zfs_statfs, - .vfs_vget = zfs_vget, - .vfs_sync = zfs_sync, - .vfs_fhtovp = zfs_fhtovp, -}; - -VFS_SET(zfs_vfsops, zfs, VFCF_JAIL); - -/* - * We need to keep a count of active fs's. - * This is necessary to prevent our module - * from being unloaded after a umount -f - */ -static uint32_t zfs_active_fs_count = 0; - -/*ARGSUSED*/ -static int -zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td) -{ - - /* - * Data integrity is job one. We don't want a compromised kernel - * writing to the storage pool, so we never sync during panic. - */ - if (panicstr) - return (0); - - if (vfsp != NULL) { - /* - * Sync a specific filesystem. - */ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - int error; - - error = vfs_stdsync(vfsp, waitfor, td); - if (error != 0) - return (error); - - ZFS_ENTER(zfsvfs); - if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, UINT64_MAX, 0); - else - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - ZFS_EXIT(zfsvfs); - } else { - /* - * Sync all ZFS filesystems. This is what happens when you - * run sync(1M). Unlike other filesystems, ZFS honors the - * request by waiting for all pools to commit all dirty data. 
- */ - spa_sync_allpools(); - } - - return (0); -} - -static void -atime_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == TRUE) { - zfsvfs->z_atime = TRUE; - zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); - } else { - zfsvfs->z_atime = FALSE; - zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); - } -} - -static void -xattr_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == TRUE) { - /* XXX locking on vfs_flag? */ -#ifdef TODO - zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; -#endif - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); - } else { - /* XXX locking on vfs_flag? */ -#ifdef TODO - zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; -#endif - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); - } -} - -static void -blksz_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval < SPA_MINBLOCKSIZE || - newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) - newval = SPA_MAXBLOCKSIZE; - - zfsvfs->z_max_blksz = newval; - zfsvfs->z_vfs->vfs_bsize = newval; -} - -static void -readonly_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval) { - /* XXX locking on vfs_flag? */ - zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); - } else { - /* XXX locking on vfs_flag? */ - zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); - } -} - -static void -setuid_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == FALSE) { - zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); - } else { - zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); - } -} - -static void -exec_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == FALSE) { - zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); - } else { - zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); - } -} - -static void -snapdir_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_show_ctldir = newval; -} - -static void -acl_mode_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_acl_mode = newval; -} - -static void -acl_inherit_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_acl_inherit = newval; -} - -static int -zfs_refresh_properties(vfs_t *vfsp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - - /* - * Remount operations default to "rw" unless "ro" is explicitly - * specified. 
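blksz_changed_cb() above accepts a new recordsize only if it is a power of two inside the [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE] range, and silently falls back to the maximum otherwise. A small userland sketch of that clamping rule, assuming the conventional ISP2() power-of-two test and illustrative 512-byte and 128K limits:

#include <stdint.h>
#include <stdio.h>

/* Conventional power-of-two test, matching the usual ISP2() definition. */
#define ISP2(x)         (((x) & ((x) - 1)) == 0)

/* Illustrative stand-ins for SPA_MINBLOCKSIZE / SPA_MAXBLOCKSIZE. */
#define MINBLOCKSIZE    (1ULL << 9)     /* 512 */
#define MAXBLOCKSIZE    (1ULL << 17)    /* 128K */

static uint64_t
clamp_recordsize(uint64_t newval)
{
        /* Out-of-range or non-power-of-two values fall back to the maximum. */
        if (newval < MINBLOCKSIZE || newval > MAXBLOCKSIZE || !ISP2(newval))
                return (MAXBLOCKSIZE);
        return (newval);
}

int
main(void)
{
        uint64_t samples[] = { 512, 4096, 131072, 100000, 262144 };

        for (int i = 0; i < 5; i++)
                printf("%ju -> %ju\n", (uintmax_t)samples[i],
                    (uintmax_t)clamp_recordsize(samples[i]));
        return (0);
}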
- */ - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { - readonly_changed_cb(zfsvfs, B_TRUE); - } else { - if (!dmu_objset_is_snapshot(zfsvfs->z_os)) - readonly_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) - return (EROFS); - } - - if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { - setuid_changed_cb(zfsvfs, B_FALSE); - } else { - if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) - setuid_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) - setuid_changed_cb(zfsvfs, B_TRUE); - } - - if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) - exec_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) - exec_changed_cb(zfsvfs, B_TRUE); - - if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) - atime_changed_cb(zfsvfs, B_TRUE); - else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) - atime_changed_cb(zfsvfs, B_FALSE); - - if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) - xattr_changed_cb(zfsvfs, B_TRUE); - else if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) - xattr_changed_cb(zfsvfs, B_FALSE); - - return (0); -} - -static int -zfs_register_callbacks(vfs_t *vfsp) -{ - struct dsl_dataset *ds = NULL; - objset_t *os = NULL; - zfsvfs_t *zfsvfs = NULL; - int readonly, do_readonly = FALSE; - int setuid, do_setuid = FALSE; - int exec, do_exec = FALSE; - int xattr, do_xattr = FALSE; - int error = 0; - - ASSERT(vfsp); - zfsvfs = vfsp->vfs_data; - ASSERT(zfsvfs); - os = zfsvfs->z_os; - - /* - * The act of registering our callbacks will destroy any mount - * options we may have. In order to enable temporary overrides - * of mount options, we stash away the current values and - * restore them after we register the callbacks. - */ - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { - readonly = B_TRUE; - do_readonly = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { - readonly = B_FALSE; - do_readonly = B_TRUE; - } - if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { - setuid = B_FALSE; - do_setuid = B_TRUE; - } else { - if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { - setuid = B_FALSE; - do_setuid = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { - setuid = B_TRUE; - do_setuid = B_TRUE; - } - } - if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { - exec = B_FALSE; - do_exec = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { - exec = B_TRUE; - do_exec = B_TRUE; - } - if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { - xattr = B_FALSE; - do_xattr = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { - xattr = B_TRUE; - do_xattr = B_TRUE; - } - - /* - * Register property callbacks. - * - * It would probably be fine to just check for i/o error from - * the first prop_register(), but I guess I like to go - * overboard... - */ - ds = dmu_objset_ds(os); - error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "xattr", xattr_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "recordsize", blksz_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "readonly", readonly_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "setuid", setuid_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "exec", exec_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "snapdir", snapdir_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "aclmode", acl_mode_changed_cb, zfsvfs); - error = error ? 
error : dsl_prop_register(ds, - "aclinherit", acl_inherit_changed_cb, zfsvfs); - if (error) - goto unregister; - - /* - * Invoke our callbacks to restore temporary mount options. - */ - if (do_readonly) - readonly_changed_cb(zfsvfs, readonly); - if (do_setuid) - setuid_changed_cb(zfsvfs, setuid); - if (do_exec) - exec_changed_cb(zfsvfs, exec); - if (do_xattr) - xattr_changed_cb(zfsvfs, xattr); - - return (0); - -unregister: - /* - * We may attempt to unregister some callbacks that are not - * registered, but this is OK; it will simply return ENOMSG, - * which we will ignore. - */ - (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, - zfsvfs); - return (error); - -} - -static int -zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td) -{ - cred_t *cr = td->td_ucred; - uint64_t recordsize, readonly; - int error = 0; - int mode; - zfsvfs_t *zfsvfs; - znode_t *zp = NULL; - - ASSERT(vfsp); - ASSERT(osname); - - /* - * Initialize the zfs-specific filesystem structure. - * Should probably make this a kmem cache, shuffle fields, - * and just bzero up to z_hold_mtx[]. - */ - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); - zfsvfs->z_vfs = vfsp; - zfsvfs->z_parent = zfsvfs; - zfsvfs->z_assign = TXG_NOWAIT; - zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; - zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; - - mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), - offsetof(znode_t, z_link_node)); - rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL); - - if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, - NULL)) - goto out; - zfsvfs->z_vfs->vfs_bsize = recordsize; - - vfsp->vfs_data = zfsvfs; - vfsp->mnt_flag |= MNT_LOCAL; - vfsp->mnt_kern_flag |= MNTK_MPSAFE; - vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; - - if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) - goto out; - - if (readonly) - mode = DS_MODE_PRIMARY | DS_MODE_READONLY; - else - mode = DS_MODE_PRIMARY; - - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); - if (error == EROFS) { - mode = DS_MODE_PRIMARY | DS_MODE_READONLY; - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, - &zfsvfs->z_os); - } - - if (error) - goto out; - - if (error = zfs_init_fs(zfsvfs, &zp, cr)) - goto out; - - if (dmu_objset_is_snapshot(zfsvfs->z_os)) { - uint64_t xattr; - - ASSERT(mode & DS_MODE_READONLY); - atime_changed_cb(zfsvfs, B_FALSE); - readonly_changed_cb(zfsvfs, B_TRUE); - if (error = dsl_prop_get_integer(osname, "xattr", &xattr, NULL)) - goto out; - xattr_changed_cb(zfsvfs, xattr); - zfsvfs->z_issnap = B_TRUE; - } else { - error = zfs_register_callbacks(vfsp); - if (error) - goto out; - - zfs_unlinked_drain(zfsvfs); - - /* - * Parse and replay the intent log. 
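zfs_register_callbacks() above strings its dsl_prop_register() calls together with the error = error ? error : ... idiom: the first failure short-circuits the remaining registrations, and a single blanket unregister pass cleans up, ignoring ENOMSG for callbacks that never got registered. A compact sketch of the same stop-at-first-error chaining, with hypothetical step functions in place of the real DSL calls:

#include <stdio.h>

/* Hypothetical registration steps; each returns 0 on success or an errno. */
static int register_atime(void) { return (0); }
static int register_xattr(void) { return (0); }
static int register_recsz(void) { return (5); }  /* pretend this one fails */

static void unregister_all(void) { /* undo every step, errors ignored */ }

static int
register_all(void)
{
        int error;

        /*
         * Chain the calls: once error is nonzero the remaining
         * registrations are skipped and the first failure is preserved.
         */
        error = register_atime();
        error = error ? error : register_xattr();
        error = error ? error : register_recsz();

        if (error) {
                /*
                 * Unconditionally undo everything; undoing a step that was
                 * never registered is treated as harmless, the way
                 * zfs_register_callbacks() ignores ENOMSG on unregister.
                 */
                unregister_all();
        }
        return (error);
}

int
main(void)
{
        printf("register_all() = %d\n", register_all());
        return (0);
}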
- */ - zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, - zfs_replay_vector); - - if (!zil_disable) - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); - } - - vfs_mountedfrom(vfsp, osname); - - if (!zfsvfs->z_issnap) - zfsctl_create(zfsvfs); -out: - if (error) { - if (zfsvfs->z_os) - dmu_objset_close(zfsvfs->z_os); - rw_destroy(&zfsvfs->z_um_lock); - mutex_destroy(&zfsvfs->z_znodes_lock); - kmem_free(zfsvfs, sizeof (zfsvfs_t)); - } else { - atomic_add_32(&zfs_active_fs_count, 1); - } - - return (error); - -} - -void -zfs_unregister_callbacks(zfsvfs_t *zfsvfs) -{ - objset_t *os = zfsvfs->z_os; - struct dsl_dataset *ds; - - /* - * Unregister properties. - */ - if (!dmu_objset_is_snapshot(os)) { - ds = dmu_objset_ds(os); - VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "aclinherit", - acl_inherit_changed_cb, zfsvfs) == 0); - } -} - -/*ARGSUSED*/ -static int -zfs_mount(vfs_t *vfsp, kthread_t *td) -{ - char *from; - int error; - - /* - * When doing a remount, we simply refresh our temporary properties - * according to those options set in the current VFS options. - */ - if (vfsp->vfs_flag & MS_REMOUNT) - return (zfs_refresh_properties(vfsp)); - - if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&from, NULL)) - return (EINVAL); - - DROP_GIANT(); - error = zfs_domount(vfsp, from, td); - PICKUP_GIANT(); - return (error); -} - -static int -zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - uint64_t refdbytes, availbytes, usedobjs, availobjs; - - statp->f_version = STATFS_VERSION; - - ZFS_ENTER(zfsvfs); - - dmu_objset_space(zfsvfs->z_os, - &refdbytes, &availbytes, &usedobjs, &availobjs); - - /* - * The underlying storage pool actually uses multiple block sizes. - * We report the fragsize as the smallest block size we support, - * and we report our blocksize as the filesystem's maximum blocksize. - */ - statp->f_bsize = zfsvfs->z_vfs->vfs_bsize; - statp->f_iosize = zfsvfs->z_vfs->vfs_bsize; - - /* - * The following report "total" blocks of various kinds in the - * file system, but reported in terms of f_frsize - the - * "fragment" size. - */ - - statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize; - statp->f_bfree = availbytes / statp->f_bsize; - statp->f_bavail = statp->f_bfree; /* no root reservation */ - - /* - * statvfs() should really be called statufs(), because it assumes - * static metadata. ZFS doesn't preallocate files, so the best - * we can do is report the max that could possibly fit in f_files, - * and that minus the number actually used in f_ffree. - * For f_ffree, report the smaller of the number of object available - * and the number of blocks (each object will take at least a block). - */ - statp->f_ffree = MIN(availobjs, statp->f_bfree); - statp->f_files = statp->f_ffree + usedobjs; - - /* - * We're a zfs filesystem. 
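The zfs_statfs() arithmetic above reports space in units of the filesystem's maximum blocksize and, because ZFS does not preallocate inodes, derives the file counts from object counts: f_ffree is the smaller of the objects still available and the free blocks, and f_files is that plus the objects already in use. A worked example of the same calculations with made-up inputs:

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b)       ((a) < (b) ? (a) : (b))

int
main(void)
{
        /* Made-up inputs in the shape dmu_objset_space() reports them. */
        uint64_t refdbytes  = 40ULL << 30;      /* 40 GB referenced */
        uint64_t availbytes = 60ULL << 30;      /* 60 GB available */
        uint64_t usedobjs   = 1000000;          /* objects in use */
        uint64_t availobjs  = 50000000;         /* objects still available */
        uint64_t bsize      = 131072;           /* vfs_bsize (128K recordsize) */

        uint64_t f_blocks = (refdbytes + availbytes) / bsize;
        uint64_t f_bfree  = availbytes / bsize;
        uint64_t f_bavail = f_bfree;                    /* no root reservation */
        uint64_t f_ffree  = MIN(availobjs, f_bfree);    /* an object needs a block */
        uint64_t f_files  = f_ffree + usedobjs;

        printf("blocks=%ju bfree=%ju bavail=%ju files=%ju ffree=%ju\n",
            (uintmax_t)f_blocks, (uintmax_t)f_bfree, (uintmax_t)f_bavail,
            (uintmax_t)f_files, (uintmax_t)f_ffree);
        return (0);
}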
- */ - (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); - - strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, - sizeof(statp->f_mntfromname)); - strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, - sizeof(statp->f_mntonname)); - - statp->f_namemax = ZFS_MAXNAMELEN; - - ZFS_EXIT(zfsvfs); - return (0); -} - -static int -zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - znode_t *rootzp; - int error; - - ZFS_ENTER(zfsvfs); - - error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); - if (error == 0) { - *vpp = ZTOV(rootzp); - error = vn_lock(*vpp, flags); - (*vpp)->v_vflag |= VV_ROOT; - } - - ZFS_EXIT(zfsvfs); - return (error); -} - -/*ARGSUSED*/ -static int -zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - cred_t *cr = td->td_ucred; - int ret; - - if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0) - return (ret); - - (void) dnlc_purge_vfsp(vfsp, 0); - - /* - * Unmount any snapshots mounted under .zfs before unmounting the - * dataset itself. - */ - if (zfsvfs->z_ctldir != NULL) { - if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) - return (ret); - ret = vflush(vfsp, 0, 0, td); - ASSERT(ret == EBUSY); - if (!(fflag & MS_FORCE)) { - if (zfsvfs->z_ctldir->v_count > 1) - return (EBUSY); - ASSERT(zfsvfs->z_ctldir->v_count == 1); - } - zfsctl_destroy(zfsvfs); - ASSERT(zfsvfs->z_ctldir == NULL); - } - - /* - * Flush all the files. - */ - ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); - if (ret != 0) { - if (!zfsvfs->z_issnap) { - zfsctl_create(zfsvfs); - ASSERT(zfsvfs->z_ctldir != NULL); - } - return (ret); - } - - if (fflag & MS_FORCE) { - MNT_ILOCK(vfsp); - vfsp->mnt_kern_flag |= MNTK_UNMOUNTF; - MNT_IUNLOCK(vfsp); - zfsvfs->z_unmounted1 = B_TRUE; - - /* - * Wait for all zfs threads to leave zfs. - * Grabbing a rwlock as reader in all vops and - * as writer here doesn't work because it too easy to get - * multiple reader enters as zfs can re-enter itself. - * This can lead to deadlock if there is an intervening - * rw_enter as writer. - * So a file system threads ref count (z_op_cnt) is used. - * A polling loop on z_op_cnt may seem inefficient, but - * - this saves all threads on exit from having to grab a - * mutex in order to cv_signal - * - only occurs on forced unmount in the rare case when - * there are outstanding threads within the file system. 
- */ - while (zfsvfs->z_op_cnt) { - delay(1); - } - } - - zfs_objset_close(zfsvfs); - VFS_RELE(vfsp); - zfs_freevfs(vfsp); - - return (0); -} - -static int -zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - znode_t *zp; - int err; - - ZFS_ENTER(zfsvfs); - err = zfs_zget(zfsvfs, ino, &zp); - if (err == 0 && zp->z_unlinked) { - VN_RELE(ZTOV(zp)); - err = EINVAL; - } - if (err != 0) - *vpp = NULL; - else { - *vpp = ZTOV(zp); - vn_lock(*vpp, flags); - } - ZFS_EXIT(zfsvfs); - return (err); -} - -static int -zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) -{ - kthread_t *td = curthread; - zfsvfs_t *zfsvfs = vfsp->vfs_data; - znode_t *zp; - uint64_t object = 0; - uint64_t fid_gen = 0; - uint64_t gen_mask; - uint64_t zp_gen; - int i, err; - - *vpp = NULL; - - ZFS_ENTER(zfsvfs); - - if (fidp->fid_len == LONG_FID_LEN) { - zfid_long_t *zlfid = (zfid_long_t *)fidp; - uint64_t objsetid = 0; - uint64_t setgen = 0; - - for (i = 0; i < sizeof (zlfid->zf_setid); i++) - objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); - - for (i = 0; i < sizeof (zlfid->zf_setgen); i++) - setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); - - ZFS_EXIT(zfsvfs); - - err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); - if (err) - return (EINVAL); - ZFS_ENTER(zfsvfs); - } - - if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { - zfid_short_t *zfid = (zfid_short_t *)fidp; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); - - for (i = 0; i < sizeof (zfid->zf_gen); i++) - fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); - } else { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* A zero fid_gen means we are in the .zfs control directories */ - if (fid_gen == 0 && - (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { - *vpp = zfsvfs->z_ctldir; - ASSERT(*vpp != NULL); - if (object == ZFSCTL_INO_SNAPDIR) { - VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, - 0, NULL, NULL) == 0); - } else { - VN_HOLD(*vpp); - } - ZFS_EXIT(zfsvfs); - /* XXX: LK_RETRY? */ - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - return (0); - } - - gen_mask = -1ULL >> (64 - 8 * i); - - dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); - if (err = zfs_zget(zfsvfs, object, &zp)) { - ZFS_EXIT(zfsvfs); - return (err); - } - zp_gen = zp->z_phys->zp_gen & gen_mask; - if (zp_gen == 0) - zp_gen = 1; - if (zp->z_unlinked || zp_gen != fid_gen) { - dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); - VN_RELE(ZTOV(zp)); - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - *vpp = ZTOV(zp); - /* XXX: LK_RETRY? */ - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - vnode_create_vobject(*vpp, zp->z_phys->zp_size, td); - ZFS_EXIT(zfsvfs); - return (0); -} - -static void -zfs_objset_close(zfsvfs_t *zfsvfs) -{ - znode_t *zp, *nextzp; - objset_t *os = zfsvfs->z_os; - - /* - * For forced unmount, at this point all vops except zfs_inactive - * are erroring EIO. We need to now suspend zfs_inactive threads - * while we are freeing dbufs before switching zfs_inactive - * to use behaviour without a objset. - */ - rw_enter(&zfsvfs->z_um_lock, RW_WRITER); - - /* - * Release all holds on dbufs - * Note, although we have stopped all other vop threads and - * zfs_inactive(), the dmu can callback via znode_pageout_func() - * which can zfs_znode_free() the znode. 
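zfs_fhtovp() above reassembles the 64-bit object number and generation from the byte arrays carried in the file handle, least-significant byte first, and then masks the znode's stored generation with gen_mask so only the bytes that actually fit in the fid are compared. A self-contained round trip of that byte packing (the 6-byte width is illustrative of the idea, not a statement of the exact zfid layout):

#include <stdint.h>
#include <stdio.h>

/* Pack a value into a little-endian byte array, one byte per element. */
static void
pack(uint8_t *buf, int n, uint64_t v)
{
        for (int i = 0; i < n; i++)
                buf[i] = (v >> (8 * i)) & 0xff;
}

/* Rebuild the value the way zfs_fhtovp() does when decoding a fid. */
static uint64_t
unpack(const uint8_t *buf, int n)
{
        uint64_t v = 0;

        for (int i = 0; i < n; i++)
                v |= ((uint64_t)buf[i]) << (8 * i);
        return (v);
}

int
main(void)
{
        uint8_t zf_object[6];   /* illustrative width */
        uint64_t object = 0x123456789aULL;

        pack(zf_object, (int)sizeof (zf_object), object);

        /* Same masking idea as gen_mask: keep only the bytes the fid stores. */
        uint64_t mask = -1ULL >> (64 - 8 * (int)sizeof (zf_object));

        printf("round trip: %jx -> %jx (mask %jx)\n", (uintmax_t)object,
            (uintmax_t)unpack(zf_object, (int)sizeof (zf_object)),
            (uintmax_t)mask);
        return (0);
}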
- * So we lock z_all_znodes; search the list for a held - * dbuf; drop the lock (we know zp can't disappear if we hold - * a dbuf lock; then regrab the lock and restart. - */ - mutex_enter(&zfsvfs->z_znodes_lock); - for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) { - nextzp = list_next(&zfsvfs->z_all_znodes, zp); - if (zp->z_dbuf_held) { - /* dbufs should only be held when force unmounting */ - zp->z_dbuf_held = 0; - mutex_exit(&zfsvfs->z_znodes_lock); - dmu_buf_rele(zp->z_dbuf, NULL); - /* Start again */ - mutex_enter(&zfsvfs->z_znodes_lock); - nextzp = list_head(&zfsvfs->z_all_znodes); - } - } - mutex_exit(&zfsvfs->z_znodes_lock); - - /* - * Unregister properties. - */ - if (!dmu_objset_is_snapshot(os)) - zfs_unregister_callbacks(zfsvfs); - - /* - * Switch zfs_inactive to behaviour without an objset. - * It just tosses cached pages and frees the znode & vnode. - * Then re-enable zfs_inactive threads in that new behaviour. - */ - zfsvfs->z_unmounted2 = B_TRUE; - rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */ - - /* - * Close the zil. Can't close the zil while zfs_inactive - * threads are blocked as zil_close can call zfs_inactive. - */ - if (zfsvfs->z_log) { - zil_close(zfsvfs->z_log); - zfsvfs->z_log = NULL; - } - - /* - * Evict all dbufs so that cached znodes will be freed - */ - if (dmu_objset_evict_dbufs(os, 1)) { - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - (void) dmu_objset_evict_dbufs(os, 0); - } - - /* - * Finally close the objset - */ - dmu_objset_close(os); -} - -static void -zfs_freevfs(vfs_t *vfsp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - int i; - - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); - rw_destroy(&zfsvfs->z_um_lock); - mutex_destroy(&zfsvfs->z_znodes_lock); - kmem_free(zfsvfs, sizeof (zfsvfs_t)); - - atomic_add_32(&zfs_active_fs_count, -1); -} - -#ifdef __i386__ -static int desiredvnodes_backup; -#endif - -static void -zfs_vnodes_adjust(void) -{ -#ifdef __i386__ - int val; - - desiredvnodes_backup = desiredvnodes; - - /* - * We calculate newdesiredvnodes the same way it is done in - * vntblinit(). If it is equal to desiredvnodes, it means that - * it wasn't tuned by the administrator and we can tune it down. - */ - val = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size / - (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); - if (desiredvnodes == val) - desiredvnodes = (3 * desiredvnodes) / 4; -#endif -} - -static void -zfs_vnodes_adjust_back(void) -{ - -#ifdef __i386__ - desiredvnodes = desiredvnodes_backup; -#endif -} - -void -zfs_init(void) -{ - - printf("ZFS filesystem version " ZFS_VERSION_STRING "\n"); - - /* - * Initialize .zfs directory structures - */ - zfsctl_init(); - - /* - * Initialize znode cache, vnode ops, etc... - */ - zfs_znode_init(); - - /* - * Reduce number of vnodes. Originally number of vnodes is calculated - * with UFS inode in mind. We reduce it here, because it's too big for - * ZFS/i386. 
- */ - zfs_vnodes_adjust(); -} - -void -zfs_fini(void) -{ - zfsctl_fini(); - zfs_znode_fini(); - zfs_vnodes_adjust_back(); -} - -int -zfs_busy(void) -{ - return (zfs_active_fs_count != 0); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c deleted file mode 100644 index 088103a..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ /dev/null @@ -1,3623 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* Portions Copyright 2007 Jeremy Teo */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/time.h> -#include <sys/systm.h> -#include <sys/sysmacros.h> -#include <sys/resource.h> -#include <sys/vfs.h> -#include <sys/vnode.h> -#include <sys/file.h> -#include <sys/stat.h> -#include <sys/kmem.h> -#include <sys/taskq.h> -#include <sys/uio.h> -#include <sys/atomic.h> -#include <sys/namei.h> -#include <sys/mman.h> -#include <sys/cmn_err.h> -#include <sys/errno.h> -#include <sys/unistd.h> -#include <sys/zfs_vfsops.h> -#include <sys/zfs_dir.h> -#include <sys/zfs_acl.h> -#include <sys/zfs_ioctl.h> -#include <sys/fs/zfs.h> -#include <sys/dmu.h> -#include <sys/spa.h> -#include <sys/txg.h> -#include <sys/dbuf.h> -#include <sys/zap.h> -#include <sys/dirent.h> -#include <sys/policy.h> -#include <sys/sunddi.h> -#include <sys/filio.h> -#include <sys/zfs_ctldir.h> -#include <sys/dnlc.h> -#include <sys/zfs_rlock.h> -#include <sys/bio.h> -#include <sys/buf.h> -#include <sys/sf_buf.h> -#include <sys/sched.h> - -/* - * Programming rules. - * - * Each vnode op performs some logical unit of work. To do this, the ZPL must - * properly lock its in-core state, create a DMU transaction, do the work, - * record this work in the intent log (ZIL), commit the DMU transaction, - * and wait the the intent log to commit if it's is a synchronous operation. - * Morover, the vnode ops must work in both normal and log replay context. - * The ordering of events is important to avoid deadlocks and references - * to freed memory. The example below illustrates the following Big Rules: - * - * (1) A check must be made in each zfs thread for a mounted file system. - * This is done avoiding races using ZFS_ENTER(zfsvfs). - * A ZFS_EXIT(zfsvfs) is needed before all returns. - * - * (2) VN_RELE() should always be the last thing except for zil_commit() - * (if necessary) and ZFS_EXIT(). This is for 3 reasons: - * First, if it's the last reference, the vnode/znode - * can be freed, so the zp may point to freed memory. 
Second, the last - * reference will call zfs_zinactive(), which may induce a lot of work -- - * pushing cached pages (which acquires range locks) and syncing out - * cached atime changes. Third, zfs_zinactive() may require a new tx, - * which could deadlock the system if you were already holding one. - * - * (3) All range locks must be grabbed before calling dmu_tx_assign(), - * as they can span dmu_tx_assign() calls. - * - * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). - * In normal operation, this will be TXG_NOWAIT. During ZIL replay, - * it will be a specific txg. Either way, dmu_tx_assign() never blocks. - * This is critical because we don't want to block while holding locks. - * Note, in particular, that if a lock is sometimes acquired before - * the tx assigns, and sometimes after (e.g. z_lock), then failing to - * use a non-blocking assign can deadlock the system. The scenario: - * - * Thread A has grabbed a lock before calling dmu_tx_assign(). - * Thread B is in an already-assigned tx, and blocks for this lock. - * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() - * forever, because the previous txg can't quiesce until B's tx commits. - * - * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, - * then drop all locks, call dmu_tx_wait(), and try again. - * - * (5) If the operation succeeded, generate the intent log entry for it - * before dropping locks. This ensures that the ordering of events - * in the intent log matches the order in which they actually occurred. - * - * (6) At the end of each vnode op, the DMU tx must always commit, - * regardless of whether there were any errors. - * - * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) - * to ensure that synchronous semantics are provided when necessary. - * - * In general, this is how things should be ordered in each vnode op: - * - * ZFS_ENTER(zfsvfs); // exit if unmounted - * top: - * zfs_dirent_lock(&dl, ...) 
// lock directory entry (may VN_HOLD()) - * rw_enter(...); // grab any other locks you need - * tx = dmu_tx_create(...); // get DMU tx - * dmu_tx_hold_*(); // hold each object you might modify - * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign - * if (error) { - * rw_exit(...); // drop locks - * zfs_dirent_unlock(dl); // unlock directory entry - * VN_RELE(...); // release held vnodes - * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - * dmu_tx_wait(tx); - * dmu_tx_abort(tx); - * goto top; - * } - * dmu_tx_abort(tx); // abort DMU tx - * ZFS_EXIT(zfsvfs); // finished in zfs - * return (error); // really out of space - * } - * error = do_real_work(); // do whatever this VOP does - * if (error == 0) - * zfs_log_*(...); // on success, make ZIL entry - * dmu_tx_commit(tx); // commit DMU tx -- error or not - * rw_exit(...); // drop locks - * zfs_dirent_unlock(dl); // unlock directory entry - * VN_RELE(...); // release held vnodes - * zil_commit(zilog, seq, foid); // synchronous when necessary - * ZFS_EXIT(zfsvfs); // finished in zfs - * return (error); // done, report error - */ -/* ARGSUSED */ -static int -zfs_open(vnode_t **vpp, int flag, cred_t *cr) -{ - znode_t *zp = VTOZ(*vpp); - - /* Keep a count of the synchronous opens in the znode */ - if (flag & (FSYNC | FDSYNC)) - atomic_inc_32(&zp->z_sync_cnt); - return (0); -} - -/* ARGSUSED */ -static int -zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - - /* Decrement the synchronous opens in the znode */ - if (flag & (FSYNC | FDSYNC)) - atomic_dec_32(&zp->z_sync_cnt); - - /* - * Clean up any locks held by this process on the vp. - */ - cleanlocks(vp, ddi_get_pid(), 0); - cleanshares(vp, ddi_get_pid()); - - return (0); -} - -/* - * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and - * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. - */ -static int -zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) -{ - znode_t *zp = VTOZ(vp); - uint64_t noff = (uint64_t)*off; /* new offset */ - uint64_t file_sz; - int error; - boolean_t hole; - - file_sz = zp->z_phys->zp_size; - if (noff >= file_sz) { - return (ENXIO); - } - - if (cmd == _FIO_SEEK_HOLE) - hole = B_TRUE; - else - hole = B_FALSE; - - error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); - - /* end of file? */ - if ((error == ESRCH) || (noff > file_sz)) { - /* - * Handle the virtual hole at the end of file. - */ - if (hole) { - *off = file_sz; - return (0); - } - return (ENXIO); - } - - if (noff < *off) - return (error); - *off = noff; - return (error); -} - -/* ARGSUSED */ -static int -zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, - int *rvalp) -{ - offset_t off; - int error; - zfsvfs_t *zfsvfs; - - switch (com) { - case _FIOFFS: - return (0); - - /* - * The following two ioctls are used by bfu. Faking out, - * necessary to avoid bfu errors. - */ - case _FIOGDIO: - case _FIOSDIO: - return (0); - - case _FIO_SEEK_DATA: - case _FIO_SEEK_HOLE: - if (ddi_copyin((void *)data, &off, sizeof (off), flag)) - return (EFAULT); - - zfsvfs = VTOZ(vp)->z_zfsvfs; - ZFS_ENTER(zfsvfs); - - /* offset parameter is in/out */ - error = zfs_holey(vp, com, &off); - ZFS_EXIT(zfsvfs); - if (error) - return (error); - if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) - return (EFAULT); - return (0); - } - return (ENOTTY); -} - -/* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. 
What this means: - * - * On Write: If we find a memory mapped page, we write to *both* - * the page and the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. - */ -static int -mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) -{ - znode_t *zp = VTOZ(vp); - objset_t *os = zp->z_zfsvfs->z_os; - vm_object_t obj; - vm_page_t m; - struct sf_buf *sf; - int64_t start, off; - int len = nbytes; - int error = 0; - uint64_t dirbytes; - - ASSERT(vp->v_mount != NULL); - obj = vp->v_object; - ASSERT(obj != NULL); - - start = uio->uio_loffset; - off = start & PAGEOFFSET; - dirbytes = 0; - VM_OBJECT_LOCK(obj); - for (start &= PAGEMASK; len > 0; start += PAGESIZE) { - uint64_t bytes = MIN(PAGESIZE - off, len); - uint64_t fsize; - -again: - if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && - vm_page_is_valid(m, (vm_offset_t)off, bytes)) { - uint64_t woff; - caddr_t va; - - if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb")) - goto again; - fsize = obj->un_pager.vnp.vnp_size; - vm_page_busy(m); - vm_page_lock_queues(); - vm_page_undirty(m); - vm_page_unlock_queues(); - VM_OBJECT_UNLOCK(obj); - if (dirbytes > 0) { - error = dmu_write_uio(os, zp->z_id, uio, - dirbytes, tx); - dirbytes = 0; - } - if (error == 0) { - sched_pin(); - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - va = (caddr_t)sf_buf_kva(sf); - woff = uio->uio_loffset - off; - error = uiomove(va + off, bytes, UIO_WRITE, uio); - /* - * The uiomove() above could have been partially - * successful, that's why we call dmu_write() - * below unconditionally. The page was marked - * non-dirty above and we would lose the changes - * without doing so. If the uiomove() failed - * entirely, well, we just write what we got - * before one more time. - */ - dmu_write(os, zp->z_id, woff, - MIN(PAGESIZE, fsize - woff), va, tx); - sf_buf_free(sf); - sched_unpin(); - } - VM_OBJECT_LOCK(obj); - vm_page_wakeup(m); - } else { - if (__predict_false(obj->cache != NULL)) { - vm_page_cache_free(obj, OFF_TO_IDX(start), - OFF_TO_IDX(start) + 1); - } - dirbytes += bytes; - } - len -= bytes; - off = 0; - if (error) - break; - } - VM_OBJECT_UNLOCK(obj); - if (error == 0 && dirbytes > 0) - error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx); - return (error); -} - -/* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Read: We "read" preferentially from memory mapped pages, - * else we default from the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. 
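mappedwrite() above and mappedread() below both carve the uio into page-sized pieces so each piece can be checked against the vnode's VM object individually: bytes = MIN(PAGESIZE - off, len), where off is nonzero only for the first page. A small userland sketch of that splitting loop (PAGESIZE is hard-coded to 4096 here purely for illustration):

#include <stdint.h>
#include <stdio.h>

#define PAGESIZE        4096            /* illustrative only */
#define PAGEOFFSET      (PAGESIZE - 1)
#define PAGEMASK        (~(uint64_t)PAGEOFFSET)
#define MIN(a, b)       ((a) < (b) ? (a) : (b))

int
main(void)
{
        int64_t start = 6000;           /* uio_loffset at entry */
        int64_t len = 10000;            /* bytes in this request */
        int64_t off = start & PAGEOFFSET;       /* offset within the first page */

        /* Walk the request one page at a time, as mappedread/mappedwrite do. */
        for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
                uint64_t bytes = MIN(PAGESIZE - off, len);

                printf("page at %jd: %ju bytes starting at page offset %jd\n",
                    (intmax_t)start, (uintmax_t)bytes, (intmax_t)off);
                len -= bytes;
                off = 0;        /* only the first piece can start mid-page */
        }
        return (0);
}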
- */ -static int -mappedread(vnode_t *vp, int nbytes, uio_t *uio) -{ - znode_t *zp = VTOZ(vp); - objset_t *os = zp->z_zfsvfs->z_os; - vm_object_t obj; - vm_page_t m; - struct sf_buf *sf; - int64_t start, off; - caddr_t va; - int len = nbytes; - int error = 0; - uint64_t dirbytes; - - ASSERT(vp->v_mount != NULL); - obj = vp->v_object; - ASSERT(obj != NULL); - - start = uio->uio_loffset; - off = start & PAGEOFFSET; - dirbytes = 0; - VM_OBJECT_LOCK(obj); - for (start &= PAGEMASK; len > 0; start += PAGESIZE) { - uint64_t bytes = MIN(PAGESIZE - off, len); - -again: - if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && - vm_page_is_valid(m, (vm_offset_t)off, bytes)) { - if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) - goto again; - vm_page_busy(m); - VM_OBJECT_UNLOCK(obj); - if (dirbytes > 0) { - error = dmu_read_uio(os, zp->z_id, uio, - dirbytes); - dirbytes = 0; - } - if (error == 0) { - sched_pin(); - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - va = (caddr_t)sf_buf_kva(sf); - error = uiomove(va + off, bytes, UIO_READ, uio); - sf_buf_free(sf); - sched_unpin(); - } - VM_OBJECT_LOCK(obj); - vm_page_wakeup(m); - } else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) { - /* - * The code below is here to make sendfile(2) work - * correctly with ZFS. As pointed out by ups@ - * sendfile(2) should be changed to use VOP_GETPAGES(), - * but it pessimize performance of sendfile/UFS, that's - * why I handle this special case in ZFS code. - */ - if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) - goto again; - vm_page_busy(m); - VM_OBJECT_UNLOCK(obj); - if (dirbytes > 0) { - error = dmu_read_uio(os, zp->z_id, uio, - dirbytes); - dirbytes = 0; - } - if (error == 0) { - sched_pin(); - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - va = (caddr_t)sf_buf_kva(sf); - error = dmu_read(os, zp->z_id, start + off, - bytes, (void *)(va + off)); - sf_buf_free(sf); - sched_unpin(); - } - VM_OBJECT_LOCK(obj); - vm_page_wakeup(m); - if (error == 0) - uio->uio_resid -= bytes; - } else { - dirbytes += bytes; - } - len -= bytes; - off = 0; - if (error) - break; - } - VM_OBJECT_UNLOCK(obj); - if (error == 0 && dirbytes > 0) - error = dmu_read_uio(os, zp->z_id, uio, dirbytes); - return (error); -} - -offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ - -/* - * Read bytes from specified file into supplied buffer. - * - * IN: vp - vnode of file to be read from. - * uio - structure supplying read location, range info, - * and return buffer. - * ioflag - SYNC flags; used to provide FRSYNC semantics. - * cr - credentials of caller. - * - * OUT: uio - updated offset and range, buffer filled. - * - * RETURN: 0 if success - * error code if failure - * - * Side Effects: - * vp - atime updated if byte count > 0 - */ -/* ARGSUSED */ -static int -zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os = zfsvfs->z_os; - ssize_t n, nbytes; - int error; - rl_t *rl; - - ZFS_ENTER(zfsvfs); - - /* - * Validate file offset - */ - if (uio->uio_loffset < (offset_t)0) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* - * Fasttrack empty reads - */ - if (uio->uio_resid == 0) { - ZFS_EXIT(zfsvfs); - return (0); - } - - /* - * Check for mandatory locks - */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { - if (error = chklock(vp, FREAD, - uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - /* - * If we're in FRSYNC mode, sync out this znode before reading it. 
- */ - if (ioflag & FRSYNC) - zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); - - /* - * Lock the range against changes. - */ - rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); - - /* - * If we are reading past end-of-file we can skip - * to the end; but we might still need to set atime. - */ - if (uio->uio_loffset >= zp->z_phys->zp_size) { - error = 0; - goto out; - } - - ASSERT(uio->uio_loffset < zp->z_phys->zp_size); - n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); - - while (n > 0) { - nbytes = MIN(n, zfs_read_chunk_size - - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); - - if (vn_has_cached_data(vp)) - error = mappedread(vp, nbytes, uio); - else - error = dmu_read_uio(os, zp->z_id, uio, nbytes); - if (error) - break; - - n -= nbytes; - } - -out: - zfs_range_unlock(rl); - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Fault in the pages of the first n bytes specified by the uio structure. - * 1 byte in each page is touched and the uio struct is unmodified. - * Any error will exit this routine as this is only a best - * attempt to get the pages resident. This is a copy of ufs_trans_touch(). - */ -static void -zfs_prefault_write(ssize_t n, struct uio *uio) -{ - struct iovec *iov; - ulong_t cnt, incr; - caddr_t p; - - if (uio->uio_segflg != UIO_USERSPACE) - return; - - iov = uio->uio_iov; - - while (n) { - cnt = MIN(iov->iov_len, n); - if (cnt == 0) { - /* empty iov entry */ - iov++; - continue; - } - n -= cnt; - /* - * touch each page in this segment. - */ - p = iov->iov_base; - while (cnt) { - if (fubyte(p) == -1) - return; - incr = MIN(cnt, PAGESIZE); - p += incr; - cnt -= incr; - } - /* - * touch the last byte in case it straddles a page. - */ - p--; - if (fubyte(p) == -1) - return; - iov++; - } -} - -/* - * Write the bytes to a file. - * - * IN: vp - vnode of file to be written to. - * uio - structure supplying write location, range info, - * and data buffer. - * ioflag - IO_APPEND flag set if in append mode. - * cr - credentials of caller. - * - * OUT: uio - updated offset and range. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * vp - ctime|mtime updated if byte count > 0 - */ -/* ARGSUSED */ -static int -zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - rlim64_t limit = MAXOFFSET_T; - ssize_t start_resid = uio->uio_resid; - ssize_t tx_bytes; - uint64_t end_size; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - offset_t woff; - ssize_t n, nbytes; - rl_t *rl; - int max_blksz = zfsvfs->z_max_blksz; - int error; - - /* - * Fasttrack empty write - */ - n = start_resid; - if (n == 0) - return (0); - - if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) - limit = MAXOFFSET_T; - - ZFS_ENTER(zfsvfs); - - /* - * Pre-fault the pages to ensure slow (eg NFS) pages - * don't hold up txg. - */ - zfs_prefault_write(n, uio); - - /* - * If in append mode, set the io offset pointer to eof. - */ - if (ioflag & IO_APPEND) { - /* - * Range lock for a file append: - * The value for the start of range will be determined by - * zfs_range_lock() (to guarantee append semantics). - * If this write will cause the block size to increase, - * zfs_range_lock() will lock the entire file, so we must - * later reduce the range after we grow the block size. 
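zfs_read() above never moves more than zfs_read_chunk_size bytes per dmu_read_uio()/mappedread() call, and it trims the first chunk so every later chunk starts on a chunk-aligned offset: nbytes = MIN(n, chunk - P2PHASE(off, chunk)). A sketch of that chunking, assuming P2PHASE() has its usual power-of-two remainder definition:

#include <stdint.h>
#include <stdio.h>

/* Offset of x within an align-sized block; align must be a power of two. */
#define P2PHASE(x, align)       ((x) & ((align) - 1))
#define MIN(a, b)               ((a) < (b) ? (a) : (b))

int
main(void)
{
        const uint64_t chunk = 1024 * 1024;     /* zfs_read_chunk_size */
        uint64_t loffset = 300 * 1024;          /* current uio_loffset */
        uint64_t n = 3ULL * 1024 * 1024;        /* bytes left to read */

        while (n > 0) {
                uint64_t nbytes = MIN(n, chunk - P2PHASE(loffset, chunk));

                printf("read %ju bytes at offset %ju\n",
                    (uintmax_t)nbytes, (uintmax_t)loffset);
                loffset += nbytes;
                n -= nbytes;
        }
        return (0);
}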
- */ - rl = zfs_range_lock(zp, 0, n, RL_APPEND); - if (rl->r_len == UINT64_MAX) { - /* overlocked, zp_size can't change */ - woff = uio->uio_loffset = zp->z_phys->zp_size; - } else { - woff = uio->uio_loffset = rl->r_off; - } - } else { - woff = uio->uio_loffset; - /* - * Validate file offset - */ - if (woff < 0) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* - * If we need to grow the block size then zfs_range_lock() - * will lock a wider range than we request here. - * Later after growing the block size we reduce the range. - */ - rl = zfs_range_lock(zp, woff, n, RL_WRITER); - } - - if (woff >= limit) { - zfs_range_unlock(rl); - ZFS_EXIT(zfsvfs); - return (EFBIG); - } - - if ((woff + n) > limit || woff > (limit - n)) - n = limit - woff; - - /* - * Check for mandatory locks - */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode) && - (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { - zfs_range_unlock(rl); - ZFS_EXIT(zfsvfs); - return (error); - } - end_size = MAX(zp->z_phys->zp_size, woff + n); - - /* - * Write the file in reasonable size chunks. Each chunk is written - * in a separate transaction; this keeps the intent log records small - * and allows us to do more fine-grained space accounting. - */ - while (n > 0) { - /* - * Start a transaction. - */ - woff = uio->uio_loffset; - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - continue; - } - dmu_tx_abort(tx); - break; - } - - /* - * If zfs_range_lock() over-locked we grow the blocksize - * and then reduce the lock range. This will only happen - * on the first iteration since zfs_range_reduce() will - * shrink down r_len to the appropriate size. - */ - if (rl->r_len == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); - zfs_range_reduce(rl, woff, n); - } - - /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? - */ - nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - - if (woff + nbytes > zp->z_phys->zp_size) - vnode_pager_setsize(vp, woff + nbytes); - - rw_enter(&zp->z_map_lock, RW_READER); - - tx_bytes = uio->uio_resid; - if (vn_has_cached_data(vp)) { - rw_exit(&zp->z_map_lock); - error = mappedwrite(vp, nbytes, uio, tx); - } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - uio, nbytes, tx); - rw_exit(&zp->z_map_lock); - } - tx_bytes -= uio->uio_resid; - - /* - * If we made no progress, we're done. If we made even - * partial progress, update the znode and ZIL accordingly. - */ - if (tx_bytes == 0) { - dmu_tx_commit(tx); - ASSERT(error != 0); - break; - } - - /* - * Clear Set-UID/Set-GID bits on successful write if not - * privileged and at least one of the excute bits is set. - * - * It would be nice to to this after all writes have - * been done, but that would still expose the ISUID/ISGID - * to another app after the partial write is committed. 
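The write loop above is the canonical transaction-assign retry described in the "Programming rules" comment at the top of this file: on ERESTART with z_assign == TXG_NOWAIT, drop what you hold, dmu_tx_wait(), abort the tx, and start the chunk over, so nothing blocks while inside an assigned transaction. A stripped-down sketch of just that control flow, with the DMU calls replaced by hypothetical stubs (a real caller recreates the tx and its holds on every pass, exactly as zfs_write() does):

#include <stdio.h>

#define ERESTART        85      /* illustrative value, not the system one */

/* Hypothetical stand-ins for the dmu_tx_*() calls used by zfs_write(). */
static int attempts;
static int tx_assign_nowait(void) { return (attempts++ < 2 ? ERESTART : 0); }
static void tx_wait(void)   { /* wait for the next txg to open */ }
static void tx_abort(void)  { /* drop the tx without committing */ }
static void tx_commit(void) { /* commit the tx */ }

static int
write_one_chunk(void)
{
        int error;

        for (;;) {
                /* (re)create the tx and declare its holds here */
                error = tx_assign_nowait();
                if (error == 0)
                        break;
                if (error == ERESTART) {
                        /* drop locks, wait for the open txg, then retry */
                        tx_wait();
                        tx_abort();
                        continue;
                }
                tx_abort();
                return (error);         /* e.g. genuinely out of space */
        }

        /* ... perform the write under the assigned tx ... */
        tx_commit();
        return (0);
}

int
main(void)
{
        int error = write_one_chunk();

        printf("write_one_chunk() = %d after %d assign attempts\n",
            error, attempts);
        return (0);
}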
- */ - mutex_enter(&zp->z_acl_lock); - if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | - (S_IXUSR >> 6))) != 0 && - (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, - (zp->z_phys->zp_mode & S_ISUID) != 0 && - zp->z_phys->zp_uid == 0) != 0) { - zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); - } - mutex_exit(&zp->z_acl_lock); - - /* - * Update time stamp. NOTE: This marks the bonus buffer as - * dirty, so we don't have to do it again for zp_size. - */ - zfs_time_stamper(zp, CONTENT_MODIFIED, tx); - - /* - * Update the file size (zp_size) if it has changed; - * account for possible concurrent updates. - */ - while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) - (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, - uio->uio_loffset); - zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); - dmu_tx_commit(tx); - - if (error != 0) - break; - ASSERT(tx_bytes == nbytes); - n -= nbytes; - } - - zfs_range_unlock(rl); - - /* - * If we're in replay mode, or we made no progress, return error. - * Otherwise, it's at least a partial write, so it's successful. - */ - if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (ioflag & (FSYNC | FDSYNC)) - zil_commit(zilog, zp->z_last_itx, zp->z_id); - - ZFS_EXIT(zfsvfs); - return (0); -} - -void -zfs_get_done(dmu_buf_t *db, void *vzgd) -{ - zgd_t *zgd = (zgd_t *)vzgd; - rl_t *rl = zgd->zgd_rl; - vnode_t *vp = ZTOV(rl->r_zp); - int vfslocked; - - vfslocked = VFS_LOCK_GIANT(vp->v_vfsp); - dmu_buf_rele(db, vzgd); - zfs_range_unlock(rl); - VN_RELE(vp); - zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp))); - kmem_free(zgd, sizeof (zgd_t)); - VFS_UNLOCK_GIANT(vfslocked); -} - -/* - * Get data to generate a TX_WRITE intent log record. - */ -int -zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) -{ - zfsvfs_t *zfsvfs = arg; - objset_t *os = zfsvfs->z_os; - znode_t *zp; - uint64_t off = lr->lr_offset; - dmu_buf_t *db; - rl_t *rl; - zgd_t *zgd; - int dlen = lr->lr_length; /* length of user data */ - int error = 0; - - ASSERT(zio); - ASSERT(dlen != 0); - - /* - * Nothing to do if the file has been removed - */ - if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) - return (ENOENT); - if (zp->z_unlinked) { - VN_RELE(ZTOV(zp)); - return (ENOENT); - } - - /* - * Write records come in two flavors: immediate and indirect. - * For small writes it's cheaper to store the data with the - * log record (immediate); for large writes it's cheaper to - * sync the data and get a pointer to it (indirect) so that - * we don't have to write the data twice. - */ - if (buf != NULL) { /* immediate write */ - rl = zfs_range_lock(zp, off, dlen, RL_READER); - /* test for truncation needs to be done while range locked */ - if (off >= zp->z_phys->zp_size) { - error = ENOENT; - goto out; - } - VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); - } else { /* indirect write */ - uint64_t boff; /* block starting offset */ - - /* - * Have to lock the whole block to ensure when it's - * written out and it's checksum is being calculated - * that no one can change the data. We need to re-check - * blocksize after we get the lock in case it's changed! 
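For the indirect case, the loop that follows must range-lock the whole block containing the record's offset, rounding the offset down with P2ALIGN when the block size is a power of two, and it retries if the block size changed before the lock was obtained. A tiny sketch of the alignment step alone, assuming the usual P2ALIGN definition:

#include <stdint.h>
#include <stdio.h>

/* Round x down to a multiple of align; align must be a power of two. */
#define P2ALIGN(x, align)       ((x) & -(align))

int
main(void)
{
        uint64_t blksz = 131072;        /* current zp->z_blksz (power of two) */
        uint64_t off = 200000;          /* offset named in the log record */

        /* Lock [boff, boff + blksz): the whole block containing off. */
        uint64_t boff = P2ALIGN(off, blksz);

        printf("offset %ju lies in block [%ju, %ju)\n", (uintmax_t)off,
            (uintmax_t)boff, (uintmax_t)(boff + blksz));
        return (0);
}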
- */ - for (;;) { - if (ISP2(zp->z_blksz)) { - boff = P2ALIGN_TYPED(off, zp->z_blksz, - uint64_t); - } else { - boff = 0; - } - dlen = zp->z_blksz; - rl = zfs_range_lock(zp, boff, dlen, RL_READER); - if (zp->z_blksz == dlen) - break; - zfs_range_unlock(rl); - } - /* test for truncation needs to be done while range locked */ - if (off >= zp->z_phys->zp_size) { - error = ENOENT; - goto out; - } - zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_rl = rl; - zgd->zgd_zilog = zfsvfs->z_log; - zgd->zgd_bp = &lr->lr_blkptr; - VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); - ASSERT(boff == db->db_offset); - lr->lr_blkoff = off - boff; - error = dmu_sync(zio, db, &lr->lr_blkptr, - lr->lr_common.lrc_txg, zfs_get_done, zgd); - ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz); - if (error == 0) { - zil_add_vdev(zfsvfs->z_log, - DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr))); - } - /* - * If we get EINPROGRESS, then we need to wait for a - * write IO initiated by dmu_sync() to complete before - * we can release this dbuf. We will finish everything - * up in the zfs_get_done() callback. - */ - if (error == EINPROGRESS) - return (0); - dmu_buf_rele(db, zgd); - kmem_free(zgd, sizeof (zgd_t)); - } -out: - zfs_range_unlock(rl); - VN_RELE(ZTOV(zp)); - return (error); -} - -/*ARGSUSED*/ -static int -zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - ZFS_ENTER(zfsvfs); - error = zfs_zaccess_rwx(zp, mode, cr); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Lookup an entry in a directory, or an extended attribute directory. - * If it exists, return a held vnode reference for it. - * - * IN: dvp - vnode of directory to search. - * nm - name of entry to lookup. - * pnp - full pathname to lookup [UNUSED]. - * flags - LOOKUP_XATTR set if looking for an attribute. - * rdir - root directory vnode [UNUSED]. - * cr - credentials of caller. - * - * OUT: vpp - vnode of located entry, NULL if not found. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * NA - */ -/* ARGSUSED */ -static int -zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, - int nameiop, cred_t *cr, kthread_t *td) -{ - - znode_t *zdp = VTOZ(dvp); - zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - int error; - - ZFS_ENTER(zfsvfs); - - *vpp = NULL; - -#ifdef TODO - if (flags & LOOKUP_XATTR) { - /* - * If the xattr property is off, refuse the lookup request. - */ - if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* - * We don't allow recursive attributes.. - * Maybe someday we will. - */ - if (zdp->z_phys->zp_flags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Do we have permission to get into attribute directory? - */ - - if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) { - VN_RELE(*vpp); - } - - ZFS_EXIT(zfsvfs); - return (error); - } -#endif /* TODO */ - - if (dvp->v_type != VDIR) { - ZFS_EXIT(zfsvfs); - return (ENOTDIR); - } - - /* - * Check accessibility of directory. 
- */ - - if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) { - - /* - * Convert device special files - */ - if (IS_DEVVP(*vpp)) { - vnode_t *svp; - - svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); - VN_RELE(*vpp); - if (svp == NULL) - error = ENOSYS; - else - *vpp = svp; - } - } - - ZFS_EXIT(zfsvfs); - - /* Translate errors and add SAVENAME when needed. */ - if (cnp->cn_flags & ISLASTCN) { - switch (nameiop) { - case CREATE: - case RENAME: - if (error == ENOENT) { - error = EJUSTRETURN; - cnp->cn_flags |= SAVENAME; - break; - } - /* FALLTHROUGH */ - case DELETE: - if (error == 0) - cnp->cn_flags |= SAVENAME; - break; - } - } - if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) { - int ltype = 0; - - if (cnp->cn_flags & ISDOTDOT) { - ltype = VOP_ISLOCKED(dvp); - VOP_UNLOCK(dvp, 0); - } - error = vn_lock(*vpp, cnp->cn_lkflags); - if (cnp->cn_flags & ISDOTDOT) - vn_lock(dvp, ltype | LK_RETRY); - if (error != 0) { - VN_RELE(*vpp); - *vpp = NULL; - return (error); - } - } - -#ifdef FREEBSD_NAMECACHE - /* - * Insert name into cache (as non-existent) if appropriate. - */ - if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) - cache_enter(dvp, *vpp, cnp); - /* - * Insert name into cache if appropriate. - */ - if (error == 0 && (cnp->cn_flags & MAKEENTRY)) { - if (!(cnp->cn_flags & ISLASTCN) || - (nameiop != DELETE && nameiop != RENAME)) { - cache_enter(dvp, *vpp, cnp); - } - } -#endif - - return (error); -} - -/* - * Attempt to create a new entry in a directory. If the entry - * already exists, truncate the file if permissible, else return - * an error. Return the vp of the created or trunc'd file. - * - * IN: dvp - vnode of directory to put new file entry in. - * name - name of new file entry. - * vap - attributes of new file. - * excl - flag indicating exclusive or non-exclusive mode. - * mode - mode to open file with. - * cr - credentials of caller. - * flag - large file flag [UNUSED]. - * - * OUT: vpp - vnode of created or trunc'd entry. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dvp - ctime|mtime updated if new entry created - * vp - ctime|mtime always, atime if new - */ -/* ARGSUSED */ -static int -zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, - vnode_t **vpp, cred_t *cr) -{ - znode_t *zp, *dzp = VTOZ(dvp); - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - objset_t *os = zfsvfs->z_os; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - int error; - uint64_t zoid; - - ZFS_ENTER(zfsvfs); - -top: - *vpp = NULL; - - if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr)) - vap->va_mode &= ~VSVTX; - - if (*name == '\0') { - /* - * Null component name refers to the directory itself. - */ - VN_HOLD(dvp); - zp = dzp; - dl = NULL; - error = 0; - } else { - /* possible VN_HOLD(zp) */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) { - if (strcmp(name, "..") == 0) - error = EISDIR; - ZFS_EXIT(zfsvfs); - return (error); - } - } - - zoid = zp ? zp->z_id : -1ULL; - - if (zp == NULL) { - /* - * Create a new file object and update the directory - * to reference it. - */ - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { - goto out; - } - - /* - * We only support the creation of regular files in - * extended attribute directories. 
- */ - if ((dzp->z_phys->zp_flags & ZFS_XATTR) && - (vap->va_type != VREG)) { - error = EINVAL; - goto out; - } - - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_bonus(tx, dzp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, SPA_MAXBLOCKSIZE); - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); - ASSERT(zp->z_id == zoid); - (void) zfs_link_create(dl, zp, tx, ZNEW); - zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name); - dmu_tx_commit(tx); - } else { - /* - * A directory entry already exists for this name. - */ - /* - * Can't truncate an existing file if in exclusive mode. - */ - if (excl == EXCL) { - error = EEXIST; - goto out; - } - /* - * Can't open a directory for writing. - */ - if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { - error = EISDIR; - goto out; - } - /* - * Verify requested access to file. - */ - if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) { - goto out; - } - - mutex_enter(&dzp->z_lock); - dzp->z_seq++; - mutex_exit(&dzp->z_lock); - - /* - * Truncate regular files if requested. - */ - if ((ZTOV(zp)->v_type == VREG) && - (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { - error = zfs_freesp(zp, 0, 0, mode, TRUE); - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { - /* NB: we already did dmu_tx_wait() */ - zfs_dirent_unlock(dl); - VN_RELE(ZTOV(zp)); - goto top; - } - } - } -out: - - if (error == 0) { - *vpp = ZTOV(zp); - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - } - - if (dl) - zfs_dirent_unlock(dl); - - if (error) { - if (zp) - VN_RELE(ZTOV(zp)); - } else { - *vpp = ZTOV(zp); - /* - * If vnode is for a device return a specfs vnode instead. - */ - if (IS_DEVVP(*vpp)) { - struct vnode *svp; - - svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); - VN_RELE(*vpp); - if (svp == NULL) { - error = ENOSYS; - } - *vpp = svp; - } - } - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Remove an entry from a directory. - * - * IN: dvp - vnode of directory to remove entry from. - * name - name of entry to remove. - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dvp - ctime|mtime - * vp - ctime (if nlink > 0) - */ -static int -zfs_remove(vnode_t *dvp, char *name, cred_t *cr) -{ - znode_t *zp, *dzp = VTOZ(dvp); - znode_t *xzp = NULL; - vnode_t *vp; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - uint64_t acl_obj, xattr_obj; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - boolean_t may_delete_now, delete_now = FALSE; - boolean_t unlinked; - int error; - - ZFS_ENTER(zfsvfs); - -top: - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - vp = ZTOV(zp); - - if (error = zfs_zaccess_delete(dzp, zp, cr)) { - goto out; - } - - /* - * Need to use rmdir for removing directories. 
- */ - if (vp->v_type == VDIR) { - error = EPERM; - goto out; - } - - vnevent_remove(vp); - - dnlc_remove(dvp, name); - - may_delete_now = FALSE; - - /* - * We may delete the znode now, or we may put it in the unlinked set; - * it depends on whether we're the last link, and on whether there are - * other holds on the vnode. So we dmu_tx_hold() the right things to - * allow for either case. - */ - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_bonus(tx, zp->z_id); - if (may_delete_now) - dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); - - /* are there any extended attributes? */ - if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { - /* XXX - do we need this if we are deleting? */ - dmu_tx_hold_bonus(tx, xattr_obj); - } - - /* are there any additional acls */ - if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && - may_delete_now) - dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); - - /* charge as an update -- would be nice not to charge at all */ - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - zfs_dirent_unlock(dl); - VN_RELE(vp); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Remove the directory entry. - */ - error = zfs_link_destroy(dl, zp, tx, 0, &unlinked); - - if (error) { - dmu_tx_commit(tx); - goto out; - } - - if (0 && unlinked) { - VI_LOCK(vp); - delete_now = may_delete_now && - vp->v_count == 1 && !vn_has_cached_data(vp) && - zp->z_phys->zp_xattr == xattr_obj && - zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; - VI_UNLOCK(vp); - } - - if (delete_now) { - if (zp->z_phys->zp_xattr) { - error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); - ASSERT3U(error, ==, 0); - ASSERT3U(xzp->z_phys->zp_links, ==, 2); - dmu_buf_will_dirty(xzp->z_dbuf, tx); - mutex_enter(&xzp->z_lock); - xzp->z_unlinked = 1; - xzp->z_phys->zp_links = 0; - mutex_exit(&xzp->z_lock); - zfs_unlinked_add(xzp, tx); - zp->z_phys->zp_xattr = 0; /* probably unnecessary */ - } - mutex_enter(&zp->z_lock); - VI_LOCK(vp); - vp->v_count--; - ASSERT3U(vp->v_count, ==, 0); - VI_UNLOCK(vp); - mutex_exit(&zp->z_lock); - zfs_znode_delete(zp, tx); - VFS_RELE(zfsvfs->z_vfs); - } else if (unlinked) { - zfs_unlinked_add(zp, tx); - } - - zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name); - - dmu_tx_commit(tx); -out: - zfs_dirent_unlock(dl); - - if (!delete_now) { - VN_RELE(vp); - } else if (xzp) { - /* this rele delayed to prevent nesting transactions */ - VN_RELE(ZTOV(xzp)); - } - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Create a new directory and insert it into dvp using the name - * provided. Return a pointer to the inserted directory. - * - * IN: dvp - vnode of directory to add subdir to. - * dirname - name of new directory. - * vap - attributes of new directory. - * cr - credentials of caller. - * - * OUT: vpp - vnode of created directory. 
- * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dvp - ctime|mtime updated - * vp - ctime|mtime|atime updated - */ -static int -zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) -{ - znode_t *zp, *dzp = VTOZ(dvp); - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - zfs_dirlock_t *dl; - uint64_t zoid = 0; - dmu_tx_t *tx; - int error; - - ASSERT(vap->va_type == VDIR); - - ZFS_ENTER(zfsvfs); - - if (dzp->z_phys->zp_flags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } -top: - *vpp = NULL; - - /* - * First make sure the new directory doesn't exist. - */ - if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) { - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Add a new entry to the directory. - */ - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, SPA_MAXBLOCKSIZE); - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Create new node. - */ - zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); - - /* - * Now put new name in parent dir. - */ - (void) zfs_link_create(dl, zp, tx, ZNEW); - - *vpp = ZTOV(zp); - - zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname); - dmu_tx_commit(tx); - - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - - zfs_dirent_unlock(dl); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Remove a directory subdir entry. If the current working - * directory is the same as the subdir to be removed, the - * remove will fail. - * - * IN: dvp - vnode of directory to remove from. - * name - name of directory to be removed. - * cwd - vnode of current working directory. - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dvp - ctime|mtime updated - */ -static int -zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) -{ - znode_t *dzp = VTOZ(dvp); - znode_t *zp; - vnode_t *vp; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - int error; - - ZFS_ENTER(zfsvfs); - -top: - zp = NULL; - - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - vp = ZTOV(zp); - - if (error = zfs_zaccess_delete(dzp, zp, cr)) { - goto out; - } - - if (vp->v_type != VDIR) { - error = ENOTDIR; - goto out; - } - - if (vp == cwd) { - error = EINVAL; - goto out; - } - - vnevent_rmdir(vp); - - /* - * Grab a lock on the directory to make sure that noone is - * trying to add (or lookup) entries while we are removing it. - */ - rw_enter(&zp->z_name_lock, RW_WRITER); - - /* - * Grab a lock on the parent pointer to make sure we play well - * with the treewalk and directory rename code. 
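The two rwlocks taken here bracket the whole removal: z_name_lock keeps lookups and new entries out of the doomed directory, and z_parent_lock keeps the rename/treewalk code from re-parenting it mid-removal. Schematically (the body of zfs_rmdir() sits between the pairs, released in reverse order, as the code below shows):

rw_enter(&zp->z_name_lock, RW_WRITER);		/* block lookup/create in zp */
rw_enter(&zp->z_parent_lock, RW_WRITER);	/* block rename/treewalk     */
/* ... assign tx, zfs_link_destroy(), zfs_log_remove(), commit ... */
rw_exit(&zp->z_parent_lock);
rw_exit(&zp->z_name_lock);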
- */ - rw_enter(&zp->z_parent_lock, RW_WRITER); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_bonus(tx, zp->z_id); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); - zfs_dirent_unlock(dl); - VN_RELE(vp); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - -#ifdef FREEBSD_NAMECACHE - cache_purge(dvp); -#endif - - error = zfs_link_destroy(dl, zp, tx, 0, NULL); - - if (error == 0) - zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name); - - dmu_tx_commit(tx); - - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); -#ifdef FREEBSD_NAMECACHE - cache_purge(vp); -#endif -out: - zfs_dirent_unlock(dl); - - VN_RELE(vp); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Read as many directory entries as will fit into the provided - * buffer from the given directory cursor position (specified in - * the uio structure. - * - * IN: vp - vnode of directory to read. - * uio - structure supplying read location, range info, - * and return buffer. - * cr - credentials of caller. - * - * OUT: uio - updated offset and range, buffer filled. - * eofp - set to true if end-of-file detected. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * vp - atime updated - * - * Note that the low 4 bits of the cookie returned by zap is always zero. - * This allows us to use the low range for "special" directory entries: - * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, - * we use the offset 2 for the '.zfs' directory. - */ -/* ARGSUSED */ -static int -zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) -{ - znode_t *zp = VTOZ(vp); - iovec_t *iovp; - dirent64_t *odp; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os; - caddr_t outbuf; - size_t bufsize; - zap_cursor_t zc; - zap_attribute_t zap; - uint_t bytes_wanted; - uint64_t offset; /* must be unsigned; checks for < 1 */ - int local_eof; - int outcount; - int error; - uint8_t prefetch; - uint8_t type; - int ncooks; - u_long *cooks = NULL; - - ZFS_ENTER(zfsvfs); - - /* - * If we are not given an eof variable, - * use a local one. - */ - if (eofp == NULL) - eofp = &local_eof; - - /* - * Check for valid iov_len. - */ - if (uio->uio_iov->iov_len <= 0) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* - * Quit if directory has been removed (posix) - */ - if ((*eofp = zp->z_unlinked) != 0) { - ZFS_EXIT(zfsvfs); - return (0); - } - - error = 0; - os = zfsvfs->z_os; - offset = uio->uio_loffset; - prefetch = zp->z_zn_prefetch; - - /* - * Initialize the iterator cursor. - */ - if (offset <= 3) { - /* - * Start iteration from the beginning of the directory. - */ - zap_cursor_init(&zc, os, zp->z_id); - } else { - /* - * The offset is a serialized cursor. - */ - zap_cursor_init_serialized(&zc, os, zp->z_id, offset); - } - - /* - * Get space to change directory entries into fs independent format. 
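As the header comment above explains, the low cursor values are reserved for synthetic entries and everything else is a serialized ZAP cursor. The loop below implements that mapping; condensed, it amounts to the following (the real loop also fills in d_type and the record length):

/* Offset-to-entry mapping used by the zfs_readdir() loop below. */
if (offset == 0) {
	name = ".";  objnum = zp->z_id;
} else if (offset == 1) {
	name = ".."; objnum = zp->z_phys->zp_parent;
} else if (offset == 2 && zfs_show_ctldir(zp)) {
	name = ZFS_CTLDIR_NAME; objnum = ZFSCTL_INO_ROOT;	/* ".zfs" */
} else {
	/* offset is a serialized ZAP cursor; fetch the next real entry */
	error = zap_cursor_retrieve(&zc, &zap);
	name = zap.za_name;
	objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
}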
- */ - iovp = uio->uio_iov; - bytes_wanted = iovp->iov_len; - if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { - bufsize = bytes_wanted; - outbuf = kmem_alloc(bufsize, KM_SLEEP); - odp = (struct dirent64 *)outbuf; - } else { - bufsize = bytes_wanted; - odp = (struct dirent64 *)iovp->iov_base; - } - - if (ncookies != NULL) { - /* - * Minimum entry size is dirent size and 1 byte for a file name. - */ - ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); - cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); - *cookies = cooks; - *ncookies = ncooks; - } - - /* - * Transform to file-system independent format - */ - outcount = 0; - while (outcount < bytes_wanted) { - ino64_t objnum; - ushort_t reclen; - - /* - * Special case `.', `..', and `.zfs'. - */ - if (offset == 0) { - (void) strcpy(zap.za_name, "."); - objnum = zp->z_id; - type = DT_DIR; - } else if (offset == 1) { - (void) strcpy(zap.za_name, ".."); - objnum = zp->z_phys->zp_parent; - type = DT_DIR; - } else if (offset == 2 && zfs_show_ctldir(zp)) { - (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); - objnum = ZFSCTL_INO_ROOT; - type = DT_DIR; - } else { - /* - * Grab next entry. - */ - if (error = zap_cursor_retrieve(&zc, &zap)) { - if ((*eofp = (error == ENOENT)) != 0) - break; - else - goto update; - } - - if (zap.za_integer_length != 8 || - zap.za_num_integers != 1) { - cmn_err(CE_WARN, "zap_readdir: bad directory " - "entry, obj = %lld, offset = %lld\n", - (u_longlong_t)zp->z_id, - (u_longlong_t)offset); - error = ENXIO; - goto update; - } - - objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); - /* - * MacOS X can extract the object type here such as: - * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); - */ - type = ZFS_DIRENT_TYPE(zap.za_first_integer); - } - reclen = DIRENT64_RECLEN(strlen(zap.za_name)); - - /* - * Will this entry fit in the buffer? - */ - if (outcount + reclen > bufsize) { - /* - * Did we manage to fit anything in the buffer? - */ - if (!outcount) { - error = EINVAL; - goto update; - } - break; - } - /* - * Add this entry: - */ - odp->d_ino = objnum; - odp->d_reclen = reclen; - odp->d_namlen = strlen(zap.za_name); - (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); - odp->d_type = type; - outcount += reclen; - odp = (dirent64_t *)((intptr_t)odp + reclen); - - ASSERT(outcount <= bufsize); - - /* Prefetch znode */ - if (prefetch) - dmu_prefetch(os, objnum, 0, 0); - - /* - * Move to the next entry, fill in the previous offset. - */ - if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { - zap_cursor_advance(&zc); - offset = zap_cursor_serialize(&zc); - } else { - offset += 1; - } - - if (cooks != NULL) { - *cooks++ = offset; - ncooks--; - KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); - } - } - zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ - - /* Subtract unused cookies */ - if (ncookies != NULL) - *ncookies -= ncooks; - - if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { - iovp->iov_base += outcount; - iovp->iov_len -= outcount; - uio->uio_resid -= outcount; - } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { - /* - * Reset the pointer. 
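The FreeBSD cookie handling above deliberately over-allocates: it assumes every record could be as small as a dirent header plus a one-byte name, so uio_resid divided by that minimum bounds how many cookies could possibly be needed, and the unused tail is subtracted from *ncookies after the loop. A restatement of that sizing:

/* Cookie array sizing from zfs_readdir() above. */
size_t min_reclen = sizeof (struct dirent) -
    sizeof (((struct dirent *)NULL)->d_name) + 1;	/* header + 1-byte name   */
ncooks = uio->uio_resid / min_reclen;			/* upper bound on entries */
cooks = malloc(ncooks * sizeof (u_long), M_TEMP, M_WAITOK);
/* after the loop: *ncookies -= ncooks;  (ncooks now holds the unused slots) */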
- */ - offset = uio->uio_loffset; - } - -update: - zap_cursor_fini(&zc); - if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) - kmem_free(outbuf, bufsize); - - if (error == ENOENT) - error = 0; - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - - uio->uio_loffset = offset; - ZFS_EXIT(zfsvfs); - if (error != 0 && cookies != NULL) { - free(*cookies, M_TEMP); - *cookies = NULL; - *ncookies = 0; - } - return (error); -} - -static int -zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - ZFS_ENTER(zfsvfs); - zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Get the requested file attributes and place them in the provided - * vattr structure. - * - * IN: vp - vnode of file. - * vap - va_mask identifies requested attributes. - * flags - [UNUSED] - * cr - credentials of caller. - * - * OUT: vap - attribute values. - * - * RETURN: 0 (always succeeds) - */ -/* ARGSUSED */ -static int -zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_phys_t *pzp = zp->z_phys; - uint32_t blksize; - u_longlong_t nblocks; - int error; - - ZFS_ENTER(zfsvfs); - - /* - * Return all attributes. It's cheaper to provide the answer - * than to determine whether we were asked the question. - */ - mutex_enter(&zp->z_lock); - - vap->va_type = IFTOVT(pzp->zp_mode); - vap->va_mode = pzp->zp_mode & ~S_IFMT; - vap->va_uid = zp->z_phys->zp_uid; - vap->va_gid = zp->z_phys->zp_gid; - vap->va_nodeid = zp->z_id; - vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX); /* nlink_t limit! */ - vap->va_size = pzp->zp_size; - vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; - vap->va_rdev = zfs_cmpldev(pzp->zp_rdev); - vap->va_seq = zp->z_seq; - vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ - - ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); - ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); - ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); - ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime); - - /* - * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. - * Also, if we are the owner don't bother, since owner should - * always be allowed to read basic attributes of file. - */ - if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) && - (zp->z_phys->zp_uid != crgetuid(cr))) { - if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) { - mutex_exit(&zp->z_lock); - ZFS_EXIT(zfsvfs); - return (error); - } - } - - mutex_exit(&zp->z_lock); - - dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks); - vap->va_blksize = blksize; - vap->va_bytes = nblocks << 9; /* nblocks * 512 */ - - if (zp->z_blksz == 0) { - /* - * Block size hasn't been set; suggest maximal I/O transfers. - */ - vap->va_blksize = zfsvfs->z_max_blksz; - } - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Set the file attributes to the values contained in the - * vattr structure. - * - * IN: vp - vnode of file to be modified. - * vap - new attribute values. - * flags - ATTR_UTIME set if non-default time values provided. - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * vp - ctime updated, mtime updated if size changed. 
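Two conversions in zfs_getattr() above are worth spelling out: the on-disk zp_mode carries both the file type and the permission bits, so the type is peeled off with IFTOVT() and the permissions with ~S_IFMT; and va_bytes reports allocated storage by shifting the DMU's 512-byte block count. A worked example (the mode value is illustrative):

/* zp_mode = 0100644: S_IFREG | 0644 */
vap->va_type = IFTOVT(0100644);		/* -> VREG */
vap->va_mode = 0100644 & ~S_IFMT;	/* -> 0644 */

/* dmu_object_size_from_db() returns nblocks in 512-byte units */
vap->va_bytes = nblocks << 9;		/* nblocks = 3 -> 1536 bytes */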
- */ -/* ARGSUSED */ -static int -zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, - caller_context_t *ct) -{ - struct znode *zp = VTOZ(vp); - znode_phys_t *pzp = zp->z_phys; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - dmu_tx_t *tx; - vattr_t oldva; - uint_t mask = vap->va_mask; - uint_t saved_mask; - int trim_mask = 0; - uint64_t new_mode; - znode_t *attrzp; - int need_policy = FALSE; - int err; - - if (mask == 0) - return (0); - - if (mask & AT_NOSET) - return (EINVAL); - - if (mask & AT_SIZE && vp->v_type == VDIR) - return (EISDIR); - - if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) - return (EINVAL); - - ZFS_ENTER(zfsvfs); - -top: - attrzp = NULL; - - if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - ZFS_EXIT(zfsvfs); - return (EROFS); - } - - /* - * First validate permissions - */ - - if (mask & AT_SIZE) { - err = zfs_zaccess(zp, ACE_WRITE_DATA, cr); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } - /* - * XXX - Note, we are not providing any open - * mode flags here (like FNDELAY), so we may - * block if there are locks present... this - * should be addressed in openat(). - */ - do { - err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); - /* NB: we already did dmu_tx_wait() if necessary */ - } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } - } - - if (mask & (AT_ATIME|AT_MTIME)) - need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr); - - if (mask & (AT_UID|AT_GID)) { - int idmask = (mask & (AT_UID|AT_GID)); - int take_owner; - int take_group; - - /* - * NOTE: even if a new mode is being set, - * we may clear S_ISUID/S_ISGID bits. - */ - - if (!(mask & AT_MODE)) - vap->va_mode = pzp->zp_mode; - - /* - * Take ownership or chgrp to group we are a member of - */ - - take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); - take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr); - - /* - * If both AT_UID and AT_GID are set then take_owner and - * take_group must both be set in order to allow taking - * ownership. - * - * Otherwise, send the check through secpolicy_vnode_setattr() - * - */ - - if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || - ((idmask == AT_UID) && take_owner) || - ((idmask == AT_GID) && take_group)) { - if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) { - /* - * Remove setuid/setgid for non-privileged users - */ - secpolicy_setid_clear(vap, cr); - trim_mask = (mask & (AT_UID|AT_GID)); - } else { - need_policy = TRUE; - } - } else { - need_policy = TRUE; - } - } - - mutex_enter(&zp->z_lock); - oldva.va_mode = pzp->zp_mode; - oldva.va_uid = zp->z_phys->zp_uid; - oldva.va_gid = zp->z_phys->zp_gid; - mutex_exit(&zp->z_lock); - - if (mask & AT_MODE) { - if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) { - err = secpolicy_setid_setsticky_clear(vp, vap, - &oldva, cr); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } - trim_mask |= AT_MODE; - } else { - need_policy = TRUE; - } - } - - if (need_policy) { - /* - * If trim_mask is set then take ownership - * has been granted or write_acl is present and user - * has the ability to modify mode. In that case remove - * UID|GID and or MODE from mask so that - * secpolicy_vnode_setattr() doesn't revoke it. 
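One detail of the mode update assembled further below: the caller's va_mode carries only permission bits, so the new on-disk mode keeps the existing file-type bits and swaps in the new permissions. A worked example with illustrative values:

uint64_t pmode    = 0100600;			/* existing: S_IFREG | 0600 */
uint64_t new_mode = (pmode & S_IFMT) |		/* keep the type bits       */
    (0644 & ~S_IFMT);				/* take the new permissions */
/* new_mode == 0100644 */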
- */ - - if (trim_mask) { - saved_mask = vap->va_mask; - vap->va_mask &= ~trim_mask; - - } - err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, - (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } - - if (trim_mask) - vap->va_mask |= saved_mask; - } - - /* - * secpolicy_vnode_setattr, or take ownership may have - * changed va_mask - */ - mask = vap->va_mask; - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - - if (mask & AT_MODE) { - uint64_t pmode = pzp->zp_mode; - - new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - - if (zp->z_phys->zp_acl.z_acl_extern_obj) - dmu_tx_hold_write(tx, - pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE); - else - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, ZFS_ACL_SIZE(MAX_ACL_SIZE)); - } - - if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) { - err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp); - if (err) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (err); - } - dmu_tx_hold_bonus(tx, attrzp->z_id); - } - - err = dmu_tx_assign(tx, zfsvfs->z_assign); - if (err) { - if (attrzp) - VN_RELE(ZTOV(attrzp)); - if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (err); - } - - dmu_buf_will_dirty(zp->z_dbuf, tx); - - /* - * Set each attribute requested. - * We group settings according to the locks they need to acquire. - * - * Note: you cannot set ctime directly, although it will be - * updated as a side-effect of calling this function. - */ - - mutex_enter(&zp->z_lock); - - if (mask & AT_MODE) { - err = zfs_acl_chmod_setattr(zp, new_mode, tx); - ASSERT3U(err, ==, 0); - } - - if (attrzp) - mutex_enter(&attrzp->z_lock); - - if (mask & AT_UID) { - zp->z_phys->zp_uid = (uint64_t)vap->va_uid; - if (attrzp) { - attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid; - } - } - - if (mask & AT_GID) { - zp->z_phys->zp_gid = (uint64_t)vap->va_gid; - if (attrzp) - attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid; - } - - if (attrzp) - mutex_exit(&attrzp->z_lock); - - if (mask & AT_ATIME) - ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); - - if (mask & AT_MTIME) - ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); - - if (mask & AT_SIZE) - zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); - else if (mask != 0) - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); - - if (mask != 0) - zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask); - - mutex_exit(&zp->z_lock); - - if (attrzp) - VN_RELE(ZTOV(attrzp)); - - dmu_tx_commit(tx); - - ZFS_EXIT(zfsvfs); - return (err); -} - -typedef struct zfs_zlock { - krwlock_t *zl_rwlock; /* lock we acquired */ - znode_t *zl_znode; /* znode we held */ - struct zfs_zlock *zl_next; /* next in list */ -} zfs_zlock_t; - -/* - * Drop locks and release vnodes that were held by zfs_rename_lock(). - */ -static void -zfs_rename_unlock(zfs_zlock_t **zlpp) -{ - zfs_zlock_t *zl; - - while ((zl = *zlpp) != NULL) { - if (zl->zl_znode != NULL) - VN_RELE(ZTOV(zl->zl_znode)); - rw_exit(zl->zl_rwlock); - *zlpp = zl->zl_next; - kmem_free(zl, sizeof (*zl)); - } -} - -/* - * Search back through the directory tree, using the ".." entries. - * Lock each directory in the chain to prevent concurrent renames. - * Fail any attempt to move a directory into one of its own descendants. 
- * XXX - z_parent_lock can overlap with map or grow locks - */ -static int -zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) -{ - zfs_zlock_t *zl; - znode_t *zp = tdzp; - uint64_t rootid = zp->z_zfsvfs->z_root; - uint64_t *oidp = &zp->z_id; - krwlock_t *rwlp = &szp->z_parent_lock; - krw_t rw = RW_WRITER; - - /* - * First pass write-locks szp and compares to zp->z_id. - * Later passes read-lock zp and compare to zp->z_parent. - */ - do { - if (!rw_tryenter(rwlp, rw)) { - /* - * Another thread is renaming in this path. - * Note that if we are a WRITER, we don't have any - * parent_locks held yet. - */ - if (rw == RW_READER && zp->z_id > szp->z_id) { - /* - * Drop our locks and restart - */ - zfs_rename_unlock(&zl); - *zlpp = NULL; - zp = tdzp; - oidp = &zp->z_id; - rwlp = &szp->z_parent_lock; - rw = RW_WRITER; - continue; - } else { - /* - * Wait for other thread to drop its locks - */ - rw_enter(rwlp, rw); - } - } - - zl = kmem_alloc(sizeof (*zl), KM_SLEEP); - zl->zl_rwlock = rwlp; - zl->zl_znode = NULL; - zl->zl_next = *zlpp; - *zlpp = zl; - - if (*oidp == szp->z_id) /* We're a descendant of szp */ - return (EINVAL); - - if (*oidp == rootid) /* We've hit the top */ - return (0); - - if (rw == RW_READER) { /* i.e. not the first pass */ - int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp); - if (error) - return (error); - zl->zl_znode = zp; - } - oidp = &zp->z_phys->zp_parent; - rwlp = &zp->z_parent_lock; - rw = RW_READER; - - } while (zp->z_id != sdzp->z_id); - - return (0); -} - -/* - * Move an entry from the provided source directory to the target - * directory. Change the entry name as indicated. - * - * IN: sdvp - Source directory containing the "old entry". - * snm - Old entry name. - * tdvp - Target directory to contain the "new entry". - * tnm - New entry name. - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * sdvp,tdvp - ctime|mtime updated - */ -static int -zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr) -{ - znode_t *tdzp, *szp, *tzp; - znode_t *sdzp = VTOZ(sdvp); - zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - vnode_t *realvp; - zfs_dirlock_t *sdl, *tdl; - dmu_tx_t *tx; - zfs_zlock_t *zl; - int cmp, serr, terr, error; - - ZFS_ENTER(zfsvfs); - - /* - * Make sure we have the real vp for the target directory. - */ - if (VOP_REALVP(tdvp, &realvp) == 0) - tdvp = realvp; - - if (tdvp->v_vfsp != sdvp->v_vfsp) { - ZFS_EXIT(zfsvfs); - return (EXDEV); - } - - tdzp = VTOZ(tdvp); -top: - szp = NULL; - tzp = NULL; - zl = NULL; - - /* - * This is to prevent the creation of links into attribute space - * by renaming a linked file into/outof an attribute directory. - * See the comment in zfs_link() for why this is considered bad. - */ - if ((tdzp->z_phys->zp_flags & ZFS_XATTR) != - (sdzp->z_phys->zp_flags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* - * Lock source and target directory entries. To prevent deadlock, - * a lock ordering must be defined. We lock the directory with - * the smallest object id first, or if it's a tie, the one with - * the lexically first name. - */ - if (sdzp->z_id < tdzp->z_id) { - cmp = -1; - } else if (sdzp->z_id > tdzp->z_id) { - cmp = 1; - } else { - cmp = strcmp(snm, tnm); - if (cmp == 0) { - /* - * POSIX: "If the old argument and the new argument - * both refer to links to the same existing file, - * the rename() function shall return successfully - * and perform no other action." 
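The comparison described above is the whole deadlock-avoidance scheme for rename: the two directories' dirent locks are always taken in a globally consistent order, smaller object id first, with the name comparison breaking the tie when source and target are the same directory. Condensed from the code around this point:

/* Globally consistent lock order for zfs_rename(). */
if (sdzp->z_id < tdzp->z_id)
	cmp = -1;			/* lock the source directory first */
else if (sdzp->z_id > tdzp->z_id)
	cmp = 1;			/* lock the target directory first */
else
	cmp = strcmp(snm, tnm);		/* same directory: order by name   */

if (cmp < 0) {
	serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
	terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
} else {
	terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
	serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
}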
- */ - ZFS_EXIT(zfsvfs); - return (0); - } - } - if (cmp < 0) { - serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); - terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); - } else { - terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); - serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); - } - - if (serr) { - /* - * Source entry invalid or not there. - */ - if (!terr) { - zfs_dirent_unlock(tdl); - if (tzp) - VN_RELE(ZTOV(tzp)); - } - if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) - serr = EINVAL; - ZFS_EXIT(zfsvfs); - return (serr); - } - if (terr) { - zfs_dirent_unlock(sdl); - VN_RELE(ZTOV(szp)); - if (strcmp(tnm, "..") == 0) - terr = EINVAL; - ZFS_EXIT(zfsvfs); - return (terr); - } - - /* - * Must have write access at the source to remove the old entry - * and write access at the target to create the new entry. - * Note that if target and source are the same, this can be - * done in a single check. - */ - - if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) - goto out; - - if (ZTOV(szp)->v_type == VDIR) { - /* - * Check to make sure rename is valid. - * Can't do a move like this: /usr/a/b to /usr/a/b/c/d - */ - if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) - goto out; - } - - /* - * Does target exist? - */ - if (tzp) { - /* - * Source and target must be the same type. - */ - if (ZTOV(szp)->v_type == VDIR) { - if (ZTOV(tzp)->v_type != VDIR) { - error = ENOTDIR; - goto out; - } - } else { - if (ZTOV(tzp)->v_type == VDIR) { - error = EISDIR; - goto out; - } - } - /* - * POSIX dictates that when the source and target - * entries refer to the same file object, rename - * must do nothing and exit without error. - */ - if (szp->z_id == tzp->z_id) { - error = 0; - goto out; - } - } - - vnevent_rename_src(ZTOV(szp)); - if (tzp) - vnevent_rename_dest(ZTOV(tzp)); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ - dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ - dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); - dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); - if (sdzp != tdzp) - dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ - if (tzp) - dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - if (zl != NULL) - zfs_rename_unlock(&zl); - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - VN_RELE(ZTOV(szp)); - if (tzp) - VN_RELE(ZTOV(tzp)); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (tzp) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdl, tzp, tx, 0, NULL); - - if (error == 0) { - error = zfs_link_create(tdl, szp, tx, ZRENAMING); - if (error == 0) { - error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); - ASSERT(error == 0); - zfs_log_rename(zilog, tx, TX_RENAME, sdzp, - sdl->dl_name, tdzp, tdl->dl_name, szp); - } -#ifdef FREEBSD_NAMECACHE - if (error == 0) { - cache_purge(sdvp); - cache_purge(tdvp); - } -#endif - } - - dmu_tx_commit(tx); -out: - if (zl != NULL) - zfs_rename_unlock(&zl); - - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - - VN_RELE(ZTOV(szp)); - if (tzp) - VN_RELE(ZTOV(tzp)); - - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* - * Insert the indicated symbolic reference entry into the directory. - * - * IN: dvp - Directory to contain new symbolic link. - * link - Name for new symlink entry. 
- * vap - Attributes of new entry. - * target - Target path of new symlink. - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dvp - ctime|mtime updated - */ -static int -zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) -{ - znode_t *zp, *dzp = VTOZ(dvp); - zfs_dirlock_t *dl; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - uint64_t zoid; - int len = strlen(link); - int error; - - ASSERT(vap->va_type == VLNK); - - ZFS_ENTER(zfsvfs); -top: - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (len > MAXPATHLEN) { - ZFS_EXIT(zfsvfs); - return (ENAMETOOLONG); - } - - /* - * Attempt to lock directory; fail if entry already exists. - */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); - dmu_tx_hold_bonus(tx, dzp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - dmu_buf_will_dirty(dzp->z_dbuf, tx); - - /* - * Create a new object for the symlink. - * Put the link content into bonus buffer if it will fit; - * otherwise, store it just like any other file data. - */ - zoid = 0; - if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { - zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len); - if (len != 0) - bcopy(link, zp->z_phys + 1, len); - } else { - dmu_buf_t *dbp; - - zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); - - /* - * Nothing can access the znode yet so no locking needed - * for growing the znode's blocksize. - */ - zfs_grow_blocksize(zp, len, tx); - - VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp)); - dmu_buf_will_dirty(dbp, tx); - - ASSERT3U(len, <=, dbp->db_size); - bcopy(link, dbp->db_data, len); - dmu_buf_rele(dbp, FTAG); - } - zp->z_phys->zp_size = len; - - /* - * Insert the new object into the directory. - */ - (void) zfs_link_create(dl, zp, tx, ZNEW); -out: - if (error == 0) { - zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link); - *vpp = ZTOV(zp); - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - } - - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Return, in the buffer contained in the provided uio structure, - * the symbolic path referred to by vp. - * - * IN: vp - vnode of symbolic link. - * uoip - structure to contain the link path. - * cr - credentials of caller. - * - * OUT: uio - structure to contain the link path. 
- * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * vp - atime updated - */ -/* ARGSUSED */ -static int -zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - size_t bufsz; - int error; - - ZFS_ENTER(zfsvfs); - - bufsz = (size_t)zp->z_phys->zp_size; - if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { - error = uiomove(zp->z_phys + 1, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - } else { - dmu_buf_t *dbp; - error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - error = uiomove(dbp->db_data, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - dmu_buf_rele(dbp, FTAG); - } - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Insert a new entry into directory tdvp referencing svp. - * - * IN: tdvp - Directory to contain new entry. - * svp - vnode of new entry. - * name - name of new entry. - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * tdvp - ctime|mtime updated - * svp - ctime updated - */ -/* ARGSUSED */ -static int -zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr) -{ - znode_t *dzp = VTOZ(tdvp); - znode_t *tzp, *szp; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - vnode_t *realvp; - int error; - - ASSERT(tdvp->v_type == VDIR); - - ZFS_ENTER(zfsvfs); - - if (VOP_REALVP(svp, &realvp) == 0) - svp = realvp; - - if (svp->v_vfsp != tdvp->v_vfsp) { - ZFS_EXIT(zfsvfs); - return (EXDEV); - } - - szp = VTOZ(svp); -top: - /* - * We do not support links between attributes and non-attributes - * because of the potential security risk of creating links - * into "normal" file space in order to circumvent restrictions - * imposed in attribute space. - */ - if ((szp->z_phys->zp_flags & ZFS_XATTR) != - (dzp->z_phys->zp_flags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* - * POSIX dictates that we return EPERM here. - * Better choices include ENOTSUP or EISDIR. - */ - if (svp->v_type == VDIR) { - ZFS_EXIT(zfsvfs); - return (EPERM); - } - - if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) && - secpolicy_basic_link(cr) != 0) { - ZFS_EXIT(zfsvfs); - return (EPERM); - } - - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Attempt to lock directory; fail if entry already exists. 
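zfs_symlink() above stores short targets directly in the znode's bonus buffer, immediately after the znode_phys_t, and only allocates a data block when the target will not fit; zfs_readlink() applies the matching size test when reading the target back. The storage decision, restated from the code above:

/* Where the link target lives (from zfs_symlink()/zfs_readlink() above). */
if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
	/* short target: copy it into the bonus buffer after znode_phys_t */
	bcopy(link, zp->z_phys + 1, len);
} else {
	/* long target: grow the object's block and store it as file data */
	zfs_grow_blocksize(zp, len, tx);
	VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp));
	dmu_buf_will_dirty(dbp, tx);
	bcopy(link, dbp->db_data, len);
	dmu_buf_rele(dbp, FTAG);
}
zp->z_phys->zp_size = len;	/* readlink uses zp_size as the target length */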
- */ - if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, szp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - error = zfs_link_create(dl, szp, tx, 0); - - if (error == 0) - zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name); - - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - ZFS_EXIT(zfsvfs); - return (error); -} - -void -zfs_inactive(vnode_t *vp, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - rw_enter(&zfsvfs->z_um_lock, RW_READER); - if (zfsvfs->z_unmounted2) { - ASSERT(zp->z_dbuf_held == 0); - - mutex_enter(&zp->z_lock); - VI_LOCK(vp); - vp->v_count = 0; /* count arrives as 1 */ - VI_UNLOCK(vp); - if (zp->z_dbuf == NULL) { - mutex_exit(&zp->z_lock); - zfs_znode_free(zp); - } else { - mutex_exit(&zp->z_lock); - } - rw_exit(&zfsvfs->z_um_lock); - VFS_RELE(zfsvfs->z_vfs); - return; - } - - if (zp->z_atime_dirty && zp->z_unlinked == 0) { - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_bonus(tx, zp->z_id); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_buf_will_dirty(zp->z_dbuf, tx); - mutex_enter(&zp->z_lock); - zp->z_atime_dirty = 0; - mutex_exit(&zp->z_lock); - dmu_tx_commit(tx); - } - } - - zfs_zinactive(zp); - rw_exit(&zfsvfs->z_um_lock); -} - -CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); -CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); - -static int -zfs_fid(vnode_t *vp, fid_t *fidp) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint32_t gen = (uint32_t)zp->z_phys->zp_gen; - uint64_t object = zp->z_id; - zfid_short_t *zfid; - int size, i; - - ZFS_ENTER(zfsvfs); - - size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; - fidp->fid_len = size; - - zfid = (zfid_short_t *)fidp; - - zfid->zf_len = size; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); - - /* Must have a non-zero generation number to distinguish from .zfs */ - if (gen == 0) - gen = 1; - for (i = 0; i < sizeof (zfid->zf_gen); i++) - zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); - - if (size == LONG_FID_LEN) { - uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); - zfid_long_t *zlfid; - - zlfid = (zfid_long_t *)fidp; - - for (i = 0; i < sizeof (zlfid->zf_setid); i++) - zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); - - /* XXX - this should be the generation number for the objset */ - for (i = 0; i < sizeof (zlfid->zf_setgen); i++) - zlfid->zf_setgen[i] = 0; - } - - ZFS_EXIT(zfsvfs); - return (0); -} - -static int -zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) -{ - znode_t *zp, *xzp; - zfsvfs_t *zfsvfs; - zfs_dirlock_t *dl; - int error; - - switch (cmd) { - case _PC_LINK_MAX: - *valp = INT_MAX; - return (0); - - case _PC_FILESIZEBITS: - *valp = 64; - return (0); - -#if 0 - case _PC_XATTR_EXISTS: - zp = VTOZ(vp); - zfsvfs = zp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - *valp = 0; - error = zfs_dirent_lock(&dl, zp, "", &xzp, - ZXATTR | ZEXISTS | ZSHARED); - if (error == 0) { - zfs_dirent_unlock(dl); - if (!zfs_dirempty(xzp)) - *valp = 1; - VN_RELE(ZTOV(xzp)); - } else if (error == ENOENT) { - /* - * If there aren't extended attributes, it's the - * same as having zero of them. - */ - error = 0; - } - ZFS_EXIT(zfsvfs); - return (error); -#endif - - case _PC_ACL_EXTENDED: - *valp = 0; /* TODO */ - return (0); - - case _PC_MIN_HOLE_SIZE: - *valp = (int)SPA_MINBLOCKSIZE; - return (0); - - default: - return (EOPNOTSUPP); - } -} - -#ifdef TODO -/*ARGSUSED*/ -static int -zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - ZFS_ENTER(zfsvfs); - error = zfs_getacl(zp, vsecp, cr); - ZFS_EXIT(zfsvfs); - - return (error); -} -#endif /* TODO */ - -#ifdef TODO -/*ARGSUSED*/ -static int -zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - ZFS_ENTER(zfsvfs); - error = zfs_setacl(zp, vsecp, cr); - ZFS_EXIT(zfsvfs); - return (error); -} -#endif /* TODO */ - -static int -zfs_freebsd_open(ap) - struct vop_open_args /* { - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - znode_t *zp = VTOZ(vp); - int error; - - error = zfs_open(&vp, ap->a_mode, ap->a_cred); - if (error == 0) - vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td); - return (error); -} - -static int -zfs_freebsd_close(ap) - struct vop_close_args /* { - struct vnode *a_vp; - int a_fflag; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - - return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred)); -} - -static int -zfs_freebsd_ioctl(ap) - struct vop_ioctl_args /* { - struct vnode *a_vp; - u_long a_command; - caddr_t a_data; - int a_fflag; - struct ucred *cred; - struct thread *td; - } */ *ap; -{ - - return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, - ap->a_fflag, ap->a_cred, NULL)); -} - -static int -zfs_freebsd_read(ap) - struct vop_read_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; -{ - - return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, 
NULL)); -} - -static int -zfs_freebsd_write(ap) - struct vop_write_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; -{ - - return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL)); -} - -static int -zfs_freebsd_access(ap) - struct vop_access_args /* { - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - - return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred)); -} - -static int -zfs_freebsd_lookup(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - char nm[NAME_MAX + 1]; - - ASSERT(cnp->cn_namelen < sizeof(nm)); - strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); - - return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, - cnp->cn_cred, cnp->cn_thread)); -} - -static int -zfs_freebsd_create(ap) - struct vop_create_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - vattr_t *vap = ap->a_vap; - int mode; - - ASSERT(cnp->cn_flags & SAVENAME); - - vattr_init_mask(vap); - mode = vap->va_mode & ALLPERMS; - - return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, - ap->a_vpp, cnp->cn_cred)); -} - -static int -zfs_freebsd_remove(ap) - struct vop_remove_args /* { - struct vnode *a_dvp; - struct vnode *a_vp; - struct componentname *a_cnp; - } */ *ap; -{ - - ASSERT(ap->a_cnp->cn_flags & SAVENAME); - - return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr, - ap->a_cnp->cn_cred)); -} - -static int -zfs_freebsd_mkdir(ap) - struct vop_mkdir_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; - } */ *ap; -{ - vattr_t *vap = ap->a_vap; - - ASSERT(ap->a_cnp->cn_flags & SAVENAME); - - vattr_init_mask(vap); - - return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, - ap->a_cnp->cn_cred)); -} - -static int -zfs_freebsd_rmdir(ap) - struct vop_rmdir_args /* { - struct vnode *a_dvp; - struct vnode *a_vp; - struct componentname *a_cnp; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - - ASSERT(cnp->cn_flags & SAVENAME); - - return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred)); -} - -static int -zfs_freebsd_readdir(ap) - struct vop_readdir_args /* { - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; - int *a_eofflag; - int *a_ncookies; - u_long **a_cookies; - } */ *ap; -{ - - return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, - ap->a_ncookies, ap->a_cookies)); -} - -static int -zfs_freebsd_fsync(ap) - struct vop_fsync_args /* { - struct vnode *a_vp; - int a_waitfor; - struct thread *a_td; - } */ *ap; -{ - - vop_stdfsync(ap); - return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred)); -} - -static int -zfs_freebsd_getattr(ap) - struct vop_getattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - - return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred)); -} - -static int -zfs_freebsd_setattr(ap) - struct vop_setattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - vattr_t *vap = ap->a_vap; - - /* No support for FreeBSD's chflags(2). 
*/ - if (vap->va_flags != VNOVAL) - return (EOPNOTSUPP); - - vattr_init_mask(vap); - vap->va_mask &= ~AT_NOSET; - - return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL)); -} - -static int -zfs_freebsd_rename(ap) - struct vop_rename_args /* { - struct vnode *a_fdvp; - struct vnode *a_fvp; - struct componentname *a_fcnp; - struct vnode *a_tdvp; - struct vnode *a_tvp; - struct componentname *a_tcnp; - } */ *ap; -{ - vnode_t *fdvp = ap->a_fdvp; - vnode_t *fvp = ap->a_fvp; - vnode_t *tdvp = ap->a_tdvp; - vnode_t *tvp = ap->a_tvp; - int error; - - ASSERT(ap->a_fcnp->cn_flags & SAVENAME); - ASSERT(ap->a_tcnp->cn_flags & SAVENAME); - - error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp, - ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred); - - if (tdvp == tvp) - VN_RELE(tdvp); - else - VN_URELE(tdvp); - if (tvp) - VN_URELE(tvp); - VN_RELE(fdvp); - VN_RELE(fvp); - - return (error); -} - -static int -zfs_freebsd_symlink(ap) - struct vop_symlink_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; - char *a_target; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - vattr_t *vap = ap->a_vap; - - ASSERT(cnp->cn_flags & SAVENAME); - - vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ - vattr_init_mask(vap); - - return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, - ap->a_target, cnp->cn_cred, cnp->cn_thread)); -} - -static int -zfs_freebsd_readlink(ap) - struct vop_readlink_args /* { - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; - } */ *ap; -{ - - return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred)); -} - -static int -zfs_freebsd_link(ap) - struct vop_link_args /* { - struct vnode *a_tdvp; - struct vnode *a_vp; - struct componentname *a_cnp; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - - ASSERT(cnp->cn_flags & SAVENAME); - - return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); -} - -static int -zfs_freebsd_inactive(ap) - struct vop_inactive_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - - zfs_inactive(vp, ap->a_td->td_ucred); - return (0); -} - -static int -zfs_freebsd_reclaim(ap) - struct vop_reclaim_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs; - int rele = 1; - - ASSERT(zp != NULL); - - /* - * Destroy the vm object and flush associated pages. 
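Worth noting for zfs_fid() earlier in this file: the file handle is built by serializing the 64-bit object number and the (forced non-zero) generation count as little-endian byte arrays, and the objset id is appended only when the dataset is not its own parent, i.e. the long form typically used for snapshots under .zfs. A worked example of the byte packing, with an illustrative object number:

/* object = 0x1234, gen = 7 (see zfs_fid() above) */
for (i = 0; i < sizeof (zfid->zf_object); i++)
	zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
/* zf_object[] = { 0x34, 0x12, 0x00, ... }  -- least significant byte first */

if (gen == 0)
	gen = 1;			/* 0 is reserved to distinguish .zfs entries */
for (i = 0; i < sizeof (zfid->zf_gen); i++)
	zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));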
- */ - vnode_destroy_vobject(vp); - - mutex_enter(&zp->z_lock); - ASSERT(zp->z_phys); - ASSERT(zp->z_dbuf_held); - zfsvfs = zp->z_zfsvfs; - if (!zp->z_unlinked) { - zp->z_dbuf_held = 0; - ZTOV(zp) = NULL; - mutex_exit(&zp->z_lock); - dmu_buf_rele(zp->z_dbuf, NULL); - } else { - mutex_exit(&zp->z_lock); - } - VI_LOCK(vp); - if (vp->v_count > 0) - rele = 0; - vp->v_data = NULL; - ASSERT(vp->v_holdcnt >= 1); - VI_UNLOCK(vp); - if (!zp->z_unlinked && rele) - VFS_RELE(zfsvfs->z_vfs); - return (0); -} - -static int -zfs_freebsd_fid(ap) - struct vop_fid_args /* { - struct vnode *a_vp; - struct fid *a_fid; - } */ *ap; -{ - - return (zfs_fid(ap->a_vp, (void *)ap->a_fid)); -} - -static int -zfs_freebsd_pathconf(ap) - struct vop_pathconf_args /* { - struct vnode *a_vp; - int a_name; - register_t *a_retval; - } */ *ap; -{ - ulong_t val; - int error; - - error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred); - if (error == 0) - *ap->a_retval = val; - else if (error == EOPNOTSUPP) - error = vop_stdpathconf(ap); - return (error); -} - -/* - * Advisory record locking support - */ -static int -zfs_freebsd_advlock(ap) - struct vop_advlock_args /* { - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - } */ *ap; -{ - znode_t *zp = VTOZ(ap->a_vp); - - return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size)); -} - -/* - * Advisory record locking support - */ -static int -zfs_freebsd_advlockasync(ap) - struct vop_advlockasync_args /* { - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - struct task *a_task; - } */ *ap; -{ - znode_t *zp = VTOZ(ap->a_vp); - - return (lf_advlockasync(ap, &(zp->z_lockf), zp->z_phys->zp_size)); -} - -struct vop_vector zfs_vnodeops; -struct vop_vector zfs_fifoops; - -struct vop_vector zfs_vnodeops = { - .vop_default = &default_vnodeops, - .vop_inactive = zfs_freebsd_inactive, - .vop_reclaim = zfs_freebsd_reclaim, - .vop_access = zfs_freebsd_access, -#ifdef FREEBSD_NAMECACHE - .vop_lookup = vfs_cache_lookup, - .vop_cachedlookup = zfs_freebsd_lookup, -#else - .vop_lookup = zfs_freebsd_lookup, -#endif - .vop_getattr = zfs_freebsd_getattr, - .vop_setattr = zfs_freebsd_setattr, - .vop_create = zfs_freebsd_create, - .vop_mknod = zfs_freebsd_create, - .vop_mkdir = zfs_freebsd_mkdir, - .vop_readdir = zfs_freebsd_readdir, - .vop_fsync = zfs_freebsd_fsync, - .vop_open = zfs_freebsd_open, - .vop_close = zfs_freebsd_close, - .vop_rmdir = zfs_freebsd_rmdir, - .vop_ioctl = zfs_freebsd_ioctl, - .vop_link = zfs_freebsd_link, - .vop_symlink = zfs_freebsd_symlink, - .vop_readlink = zfs_freebsd_readlink, - .vop_read = zfs_freebsd_read, - .vop_write = zfs_freebsd_write, - .vop_remove = zfs_freebsd_remove, - .vop_rename = zfs_freebsd_rename, - .vop_advlock = zfs_freebsd_advlock, - .vop_advlockasync = zfs_freebsd_advlockasync, - .vop_pathconf = zfs_freebsd_pathconf, - .vop_bmap = VOP_EOPNOTSUPP, - .vop_fid = zfs_freebsd_fid, -}; - -struct vop_vector zfs_fifoops = { - .vop_default = &fifo_specops, - .vop_fsync = VOP_PANIC, - .vop_access = zfs_freebsd_access, - .vop_getattr = zfs_freebsd_getattr, - .vop_inactive = zfs_freebsd_inactive, - .vop_read = VOP_PANIC, - .vop_reclaim = zfs_freebsd_reclaim, - .vop_setattr = zfs_freebsd_setattr, - .vop_write = VOP_PANIC, - .vop_fid = zfs_freebsd_fid, -}; diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c deleted file mode 100644 index 46e501c..0000000 --- 
a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ /dev/null @@ -1,1072 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* Portions Copyright 2007 Jeremy Teo */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef _KERNEL -#include <sys/types.h> -#include <sys/param.h> -#include <sys/time.h> -#include <sys/systm.h> -#include <sys/sysmacros.h> -#include <sys/resource.h> -#include <sys/mntent.h> -#include <sys/vfs.h> -#include <sys/vnode.h> -#include <sys/file.h> -#include <sys/kmem.h> -#include <sys/cmn_err.h> -#include <sys/errno.h> -#include <sys/unistd.h> -#include <sys/atomic.h> -#include <sys/zfs_dir.h> -#include <sys/zfs_acl.h> -#include <sys/zfs_ioctl.h> -#include <sys/zfs_rlock.h> -#include <sys/fs/zfs.h> -#endif /* _KERNEL */ - -#include <sys/dmu.h> -#include <sys/refcount.h> -#include <sys/stat.h> -#include <sys/zap.h> -#include <sys/zfs_znode.h> -#include <sys/refcount.h> - -/* Used by fstat(1). */ -SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t), - "sizeof(znode_t)"); - -/* - * Functions needed for userland (ie: libzpool) are not put under - * #ifdef_KERNEL; the rest of the functions have dependencies - * (such as VFS logic) that will not compile easily in userland. - */ -#ifdef _KERNEL -struct kmem_cache *znode_cache = NULL; - -/*ARGSUSED*/ -static void -znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr) -{ - znode_t *zp = user_ptr; - vnode_t *vp; - - mutex_enter(&zp->z_lock); - vp = ZTOV(zp); - if (vp == NULL) { - mutex_exit(&zp->z_lock); - zfs_znode_free(zp); - } else if (vp->v_count == 0) { - ZTOV(zp) = NULL; - vhold(vp); - mutex_exit(&zp->z_lock); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - vrecycle(vp, curthread); - VOP_UNLOCK(vp, 0); - vdrop(vp); - zfs_znode_free(zp); - } else { - /* signal force unmount that this znode can be freed */ - zp->z_dbuf = NULL; - mutex_exit(&zp->z_lock); - } -} - -extern struct vop_vector zfs_vnodeops; -extern struct vop_vector zfs_fifoops; - -/* - * XXX: We cannot use this function as a cache constructor, because - * there is one global cache for all file systems and we need - * to pass vfsp here, which is not possible, because argument - * 'cdrarg' is defined at kmem_cache_create() time. 
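The XXX comment above describes a FreeBSD-specific wrinkle in the znode cache: the cache is shared by every mounted ZFS filesystem, but allocating the vnode inside a cache constructor would need the per-mount vfs_t, which kmem_cache_create() has no way to pass in. The port therefore registers no constructor and runs it by hand, as the code further below shows:

/* zfs_znode_init(): the cache is created with a NULL constructor ... */
znode_cache = kmem_cache_create("zfs_znode_cache",
    sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);

/* ... and zfs_znode_alloc() invokes it manually with the vfs_t it needs. */
zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0);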
- */ -static int -zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags) -{ - znode_t *zp = buf; - vnode_t *vp; - vfs_t *vfsp = cdrarg; - int error; - - if (cdrarg != NULL) { - error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp); - ASSERT(error == 0); - zp->z_vnode = vp; - vp->v_data = (caddr_t)zp; - VN_LOCK_AREC(vp); - VN_LOCK_ASHARE(vp); - } else { - zp->z_vnode = NULL; - } - mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); - rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); - rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); - mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); - - mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&zp->z_range_avl, zfs_range_compare, - sizeof (rl_t), offsetof(rl_t, r_node)); - - zp->z_dbuf_held = 0; - zp->z_dirlocks = 0; - zp->z_lockf = NULL; - return (0); -} - -/*ARGSUSED*/ -static void -zfs_znode_cache_destructor(void *buf, void *cdarg) -{ - znode_t *zp = buf; - - ASSERT(zp->z_dirlocks == 0); - mutex_destroy(&zp->z_lock); - rw_destroy(&zp->z_map_lock); - rw_destroy(&zp->z_parent_lock); - rw_destroy(&zp->z_name_lock); - mutex_destroy(&zp->z_acl_lock); - mutex_destroy(&zp->z_range_lock); - avl_destroy(&zp->z_range_avl); - - ASSERT(zp->z_dbuf_held == 0); -} - -void -zfs_znode_init(void) -{ - /* - * Initialize zcache - */ - ASSERT(znode_cache == NULL); - znode_cache = kmem_cache_create("zfs_znode_cache", - sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL, - zfs_znode_cache_destructor, NULL, NULL, NULL, 0); -} - -void -zfs_znode_fini(void) -{ - /* - * Cleanup zcache - */ - if (znode_cache) - kmem_cache_destroy(znode_cache); - znode_cache = NULL; -} - -/* - * zfs_init_fs - Initialize the zfsvfs struct and the file system - * incore "master" object. Verify version compatibility. - */ -int -zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) -{ - objset_t *os = zfsvfs->z_os; - uint64_t version = ZPL_VERSION; - int i, error; - dmu_object_info_t doi; - uint64_t fsid_guid; - - *zpp = NULL; - - /* - * XXX - hack to auto-create the pool root filesystem at - * the first attempted mount. - */ - if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) { - dmu_tx_t *tx = dmu_tx_create(os); - - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */ - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */ - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */ - error = dmu_tx_assign(tx, TXG_WAIT); - ASSERT3U(error, ==, 0); - zfs_create_fs(os, cr, tx); - dmu_tx_commit(tx); - } - - error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1, - &version); - if (error) { - return (error); - } else if (version != ZPL_VERSION) { - (void) printf("Mismatched versions: File system " - "is version %lld on-disk format, which is " - "incompatible with this software version %lld!", - (u_longlong_t)version, ZPL_VERSION); - return (ENOTSUP); - } - - /* - * The fsid is 64 bits, composed of an 8-bit fs type, which - * separates our fsid from any other filesystem types, and a - * 56-bit objset unique ID. The objset unique ID is unique to - * all objsets open on this system, provided by unique_create(). - * The 8-bit fs type must be put in the low bits of fsid[1] - * because that's where other Solaris filesystems put it. 
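The packing described above fits two facts into the two 32-bit words of the fsid: val[0] takes the low 32 bits of the objset's 56-bit unique id, and val[1] takes the remaining 24 bits shifted up to leave the low byte for the 8-bit filesystem type. Restated from the assignments that follow:

/* fsid_guid is asserted below to fit in 56 bits. */
zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;		/* low 32 bits            */
zfsvfs->z_vfs->vfs_fsid.val[1] =
    ((fsid_guid >> 32) << 8) |				/* upper 24 bits          */
    (zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF);	/* 8-bit type in low byte */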
- */ - fsid_guid = dmu_objset_fsid_guid(os); - ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); - zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid; - zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | - zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF; - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, - &zfsvfs->z_root); - if (error) - return (error); - ASSERT(zfsvfs->z_root != 0); - - /* - * Create the per mount vop tables. - */ - - /* - * Initialize zget mutex's - */ - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); - - error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); - if (error) - return (error); - ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, - &zfsvfs->z_unlinkedobj); - if (error) - return (error); - - return (0); -} - -/* - * define a couple of values we need available - * for both 64 and 32 bit environments. - */ -#ifndef NBITSMINOR64 -#define NBITSMINOR64 32 -#endif -#ifndef MAXMAJ64 -#define MAXMAJ64 0xffffffffUL -#endif -#ifndef MAXMIN64 -#define MAXMIN64 0xffffffffUL -#endif -#ifndef major -#define major(x) ((int)(((u_int)(x) >> 8)&0xff)) /* major number */ -#endif -#ifndef minor -#define minor(x) ((int)((x)&0xffff00ff)) /* minor number */ -#endif - -/* - * Create special expldev for ZFS private use. - * Can't use standard expldev since it doesn't do - * what we want. The standard expldev() takes a - * dev32_t in LP64 and expands it to a long dev_t. - * We need an interface that takes a dev32_t in ILP32 - * and expands it to a long dev_t. - */ -static uint64_t -zfs_expldev(dev_t dev) -{ - return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); -} -/* - * Special cmpldev for ZFS private use. - * Can't use standard cmpldev since it takes - * a long dev_t and compresses it to dev32_t in - * LP64. We need to do a compaction of a long dev_t - * to a dev32_t in ILP32. - */ -dev_t -zfs_cmpldev(uint64_t dev) -{ - return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); -} - -/* - * Construct a new znode/vnode and intialize. 
- * - * This does not do a call to dmu_set_user() that is - * up to the caller to do, in case you don't want to - * return the znode - */ -static znode_t * -zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz) -{ - znode_t *zp; - vnode_t *vp; - int error; - - zp = kmem_cache_alloc(znode_cache, KM_SLEEP); - zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0); - - ASSERT(zp->z_dirlocks == NULL); - - zp->z_phys = db->db_data; - zp->z_zfsvfs = zfsvfs; - zp->z_unlinked = 0; - zp->z_atime_dirty = 0; - zp->z_dbuf_held = 0; - zp->z_mapcnt = 0; - zp->z_last_itx = 0; - zp->z_dbuf = db; - zp->z_id = obj_num; - zp->z_blksz = blksz; - zp->z_seq = 0x7A4653; - zp->z_sync_cnt = 0; - - mutex_enter(&zfsvfs->z_znodes_lock); - list_insert_tail(&zfsvfs->z_all_znodes, zp); - mutex_exit(&zfsvfs->z_znodes_lock); - - vp = ZTOV(zp); - if (vp == NULL) - return (zp); - - error = insmntque(vp, zfsvfs->z_vfs); - KASSERT(error == 0, ("insmntque() failed: error %d", error)); - - vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); - switch (vp->v_type) { - case VDIR: - zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ - break; - case VFIFO: - vp->v_op = &zfs_fifoops; - break; - } - - return (zp); -} - -static void -zfs_znode_dmu_init(znode_t *zp) -{ - znode_t *nzp; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - dmu_buf_t *db = zp->z_dbuf; - - mutex_enter(&zp->z_lock); - - nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_pageout_func); - - /* - * there should be no - * concurrent zgets on this object. - */ - ASSERT3P(nzp, ==, NULL); - - /* - * Slap on VROOT if we are the root znode - */ - if (zp->z_id == zfsvfs->z_root) { - ZTOV(zp)->v_flag |= VROOT; - } - - ASSERT(zp->z_dbuf_held == 0); - zp->z_dbuf_held = 1; - VFS_HOLD(zfsvfs->z_vfs); - mutex_exit(&zp->z_lock); -} - -/* - * Create a new DMU object to hold a zfs znode. - * - * IN: dzp - parent directory for new znode - * vap - file attributes for new znode - * tx - dmu transaction id for zap operations - * cr - credentials of caller - * flag - flags: - * IS_ROOT_NODE - new object will be root - * IS_XATTR - new object is an attribute - * IS_REPLAY - intent log replay - * - * OUT: oid - ID of created object - * - */ -void -zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, - uint_t flag, znode_t **zpp, int bonuslen) -{ - dmu_buf_t *dbp; - znode_phys_t *pzp; - znode_t *zp; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - timestruc_t now; - uint64_t gen; - int err; - - ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); - - if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ - *oid = vap->va_nodeid; - flag |= IS_REPLAY; - now = vap->va_ctime; /* see zfs_replay_create() */ - gen = vap->va_nblocks; /* ditto */ - } else { - *oid = 0; - gethrestime(&now); - gen = dmu_tx_get_txg(tx); - } - - /* - * Create a new DMU object. - */ - /* - * There's currently no mechanism for pre-reading the blocks that will - * be to needed allocate a new object, so we accept the small chance - * that there will be an i/o error and we will fail one of the - * assertions below. 
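 * [Editorial note: illustrative summary, not part of the original source.]
 * The four branches below form a 2x2 choice:
 *
 *                  normal create           intent-log replay (IS_REPLAY)
 *     VDIR         zap_create()            zap_create_claim(*oid)
 *     other        dmu_object_alloc()      dmu_object_claim(*oid)
 *
 * Directories are ZAP objects (DMU_OT_DIRECTORY_CONTENTS), plain files are
 * DMU_OT_PLAIN_FILE_CONTENTS, and replay claims the object number recorded
 * in the log rather than allocating a fresh one, so the replayed file
 * system ends up with the same object numbers as the original.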
- */ - if (vap->va_type == VDIR) { - if (flag & IS_REPLAY) { - err = zap_create_claim(zfsvfs->z_os, *oid, - DMU_OT_DIRECTORY_CONTENTS, - DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); - ASSERT3U(err, ==, 0); - } else { - *oid = zap_create(zfsvfs->z_os, - DMU_OT_DIRECTORY_CONTENTS, - DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); - } - } else { - if (flag & IS_REPLAY) { - err = dmu_object_claim(zfsvfs->z_os, *oid, - DMU_OT_PLAIN_FILE_CONTENTS, 0, - DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); - ASSERT3U(err, ==, 0); - } else { - *oid = dmu_object_alloc(zfsvfs->z_os, - DMU_OT_PLAIN_FILE_CONTENTS, 0, - DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); - } - } - VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp)); - dmu_buf_will_dirty(dbp, tx); - - /* - * Initialize the znode physical data to zero. - */ - ASSERT(dbp->db_size >= sizeof (znode_phys_t)); - bzero(dbp->db_data, dbp->db_size); - pzp = dbp->db_data; - - /* - * If this is the root, fix up the half-initialized parent pointer - * to reference the just-allocated physical data area. - */ - if (flag & IS_ROOT_NODE) { - dzp->z_phys = pzp; - dzp->z_id = *oid; - } - - /* - * If parent is an xattr, so am I. - */ - if (dzp->z_phys->zp_flags & ZFS_XATTR) - flag |= IS_XATTR; - - if (vap->va_type == VBLK || vap->va_type == VCHR) { - pzp->zp_rdev = zfs_expldev(vap->va_rdev); - } - - if (vap->va_type == VDIR) { - pzp->zp_size = 2; /* contents ("." and "..") */ - pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; - } - - pzp->zp_parent = dzp->z_id; - if (flag & IS_XATTR) - pzp->zp_flags |= ZFS_XATTR; - - pzp->zp_gen = gen; - - ZFS_TIME_ENCODE(&now, pzp->zp_crtime); - ZFS_TIME_ENCODE(&now, pzp->zp_ctime); - - if (vap->va_mask & AT_ATIME) { - ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); - } else { - ZFS_TIME_ENCODE(&now, pzp->zp_atime); - } - - if (vap->va_mask & AT_MTIME) { - ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); - } else { - ZFS_TIME_ENCODE(&now, pzp->zp_mtime); - } - - pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); - zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0); - - zfs_perm_init(zp, dzp, flag, vap, tx, cr); - - if (zpp) { - kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp); - - mutex_enter(hash_mtx); - zfs_znode_dmu_init(zp); - mutex_exit(hash_mtx); - - *zpp = zp; - } else { - if (ZTOV(zp) != NULL) - ZTOV(zp)->v_count = 0; - dmu_buf_rele(dbp, NULL); - zfs_znode_free(zp); - } -} - -int -zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) -{ - dmu_object_info_t doi; - dmu_buf_t *db; - znode_t *zp; - vnode_t *vp; - int err; - - *zpp = NULL; - - ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); - - err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); - if (err) { - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (err); - } - - dmu_object_info_from_db(db, &doi); - if (doi.doi_bonus_type != DMU_OT_ZNODE || - doi.doi_bonus_size < sizeof (znode_phys_t)) { - dmu_buf_rele(db, NULL); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (EINVAL); - } - - ASSERT(db->db_object == obj_num); - ASSERT(db->db_offset == -1); - ASSERT(db->db_data != NULL); - - zp = dmu_buf_get_user(db); - - if (zp != NULL) { - mutex_enter(&zp->z_lock); - - ASSERT3U(zp->z_id, ==, obj_num); - if (zp->z_unlinked) { - dmu_buf_rele(db, NULL); - mutex_exit(&zp->z_lock); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (ENOENT); - } else if (zp->z_dbuf_held) { - dmu_buf_rele(db, NULL); - } else { - zp->z_dbuf_held = 1; - VFS_HOLD(zfsvfs->z_vfs); - } - - if (ZTOV(zp) != NULL) - VN_HOLD(ZTOV(zp)); - else { - err = getnewvnode("zfs", zfsvfs->z_vfs, 
&zfs_vnodeops, - &zp->z_vnode); - ASSERT(err == 0); - vp = ZTOV(zp); - vp->v_data = (caddr_t)zp; - VN_LOCK_AREC(vp); - VN_LOCK_ASHARE(vp); - vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); - if (vp->v_type == VDIR) - zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ - err = insmntque(vp, zfsvfs->z_vfs); - KASSERT(err == 0, ("insmntque() failed: error %d", err)); - } - mutex_exit(&zp->z_lock); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - *zpp = zp; - return (0); - } - - /* - * Not found create new znode/vnode - */ - zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size); - ASSERT3U(zp->z_id, ==, obj_num); - zfs_znode_dmu_init(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - *zpp = zp; - return (0); -} - -void -zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); - if (zp->z_phys->zp_acl.z_acl_extern_obj) { - error = dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx); - ASSERT3U(error, ==, 0); - } - error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx); - ASSERT3U(error, ==, 0); - zp->z_dbuf_held = 0; - ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); - dmu_buf_rele(zp->z_dbuf, NULL); -} - -void -zfs_zinactive(znode_t *zp) -{ - vnode_t *vp = ZTOV(zp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t z_id = zp->z_id; - - ASSERT(zp->z_dbuf_held && zp->z_phys); - - /* - * Don't allow a zfs_zget() while were trying to release this znode - */ - ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); - - mutex_enter(&zp->z_lock); - VI_LOCK(vp); - if (vp->v_count > 0) { - /* - * If the hold count is greater than zero, somebody has - * obtained a new reference on this znode while we were - * processing it here, so we are done. - */ - VI_UNLOCK(vp); - mutex_exit(&zp->z_lock); - ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); - return; - } - VI_UNLOCK(vp); - - /* - * If this was the last reference to a file with no links, - * remove the file from the file system. - */ - if (zp->z_unlinked) { - ZTOV(zp) = NULL; - mutex_exit(&zp->z_lock); - ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); - ASSERT(vp->v_count == 0); - vrecycle(vp, curthread); - zfs_rmnode(zp); - VFS_RELE(zfsvfs->z_vfs); - return; - } - ASSERT(zp->z_phys); - ASSERT(zp->z_dbuf_held); - mutex_exit(&zp->z_lock); - ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); -} - -void -zfs_znode_free(znode_t *zp) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - mutex_enter(&zfsvfs->z_znodes_lock); - list_remove(&zfsvfs->z_all_znodes, zp); - mutex_exit(&zfsvfs->z_znodes_lock); - - kmem_cache_free(znode_cache, zp); -} - -void -zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) -{ - timestruc_t now; - - ASSERT(MUTEX_HELD(&zp->z_lock)); - - gethrestime(&now); - - if (tx) { - dmu_buf_will_dirty(zp->z_dbuf, tx); - zp->z_atime_dirty = 0; - zp->z_seq++; - } else { - zp->z_atime_dirty = 1; - } - - if (flag & AT_ATIME) - ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); - - if (flag & AT_MTIME) - ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); - - if (flag & AT_CTIME) - ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); -} - -/* - * Update the requested znode timestamps with the current time. - * If we are in a transaction, then go ahead and mark the znode - * dirty in the transaction so the timestamps will go to disk. - * Otherwise, we will get pushed next time the znode is updated - * in a transaction, or when this znode eventually goes inactive. - * - * Why is this OK? - * 1 - Only the ACCESS time is ever updated outside of a transaction. 
- * 2 - Multiple consecutive updates will be collapsed into a single - * znode update by the transaction grouping semantics of the DMU. - */ -void -zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) -{ - mutex_enter(&zp->z_lock); - zfs_time_stamper_locked(zp, flag, tx); - mutex_exit(&zp->z_lock); -} - -/* - * Grow the block size for a file. - * - * IN: zp - znode of file to free data in. - * size - requested block size - * tx - open transaction. - * - * NOTE: this function assumes that the znode is write locked. - */ -void -zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) -{ - int error; - u_longlong_t dummy; - - if (size <= zp->z_blksz) - return; - /* - * If the file size is already greater than the current blocksize, - * we will not grow. If there is more than one block in a file, - * the blocksize cannot change. - */ - if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) - return; - - error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, - size, 0, tx); - if (error == ENOTSUP) - return; - ASSERT3U(error, ==, 0); - - /* What blocksize did we actually get? */ - dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); -} - -/* - * Free space in a file. - * - * IN: zp - znode of file to free data in. - * off - start of section to free. - * len - length of section to free (0 => to EOF). - * flag - current file open mode flags. - * - * RETURN: 0 if success - * error code if failure - */ -int -zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) -{ - vnode_t *vp = ZTOV(zp); - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - rl_t *rl; - uint64_t end = off + len; - uint64_t size, new_blksz; - int error; - - if (ZTOV(zp)->v_type == VFIFO) - return (0); - - /* - * If we will change zp_size then lock the whole file, - * otherwise just lock the range being freed. - */ - if (len == 0 || off + len > zp->z_phys->zp_size) { - rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); - } else { - rl = zfs_range_lock(zp, off, len, RL_WRITER); - /* recheck, in case zp_size changed */ - if (off + len > zp->z_phys->zp_size) { - /* lost race: file size changed, lock whole file */ - zfs_range_unlock(rl); - rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); - } - } - - /* - * Nothing to do if file already at desired length. - */ - size = zp->z_phys->zp_size; - if (len == 0 && size == off && off != 0) { - zfs_range_unlock(rl); - return (0); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - new_blksz = 0; - if (end > size && - (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { - /* - * We are growing the file past the current block size. - */ - if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end, SPA_MAXBLOCKSIZE); - } else { - new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz); - } - dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz)); - } else if (off < size) { - /* - * If len == 0, we are truncating the file. - */ - dmu_tx_hold_free(tx, zp->z_id, off, len ? 
len : DMU_OBJECT_END); - } - - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) - dmu_tx_wait(tx); - dmu_tx_abort(tx); - zfs_range_unlock(rl); - return (error); - } - - if (new_blksz) - zfs_grow_blocksize(zp, new_blksz, tx); - - if (end > size || len == 0) - zp->z_phys->zp_size = end; - - if (off < size) { - objset_t *os = zfsvfs->z_os; - uint64_t rlen = len; - - if (len == 0) - rlen = -1; - else if (end > size) - rlen = size - off; - VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx)); - } - - if (log) { - zfs_time_stamper(zp, CONTENT_MODIFIED, tx); - zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); - } - - zfs_range_unlock(rl); - - dmu_tx_commit(tx); - - /* - * Clear any mapped pages in the truncated region. This has to - * happen outside of the transaction to avoid the possibility of - * a deadlock with someone trying to push a page that we are - * about to invalidate. - */ - rw_enter(&zp->z_map_lock, RW_WRITER); - if (end > size) - vnode_pager_setsize(vp, end); - else if (len == 0) { -#if 0 - error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE); -#else - error = vinvalbuf(vp, V_SAVE, curthread, 0, 0); - vnode_pager_setsize(vp, end); -#endif - } - rw_exit(&zp->z_map_lock); - - return (0); -} - -void -zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) -{ - zfsvfs_t zfsvfs; - uint64_t moid, doid, roid = 0; - uint64_t version = ZPL_VERSION; - int error; - znode_t *rootzp = NULL; - vattr_t vattr; - - /* - * First attempt to create master node. - */ - /* - * In an empty objset, there are no blocks to read and thus - * there can be no i/o errors (which we assert below). - */ - moid = MASTER_NODE_OBJ; - error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, - DMU_OT_NONE, 0, tx); - ASSERT(error == 0); - - /* - * Set starting attributes. - */ - - error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx); - ASSERT(error == 0); - - /* - * Create a delete queue. - */ - doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); - - error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx); - ASSERT(error == 0); - - /* - * Create root znode. Create minimal znode/vnode/zfsvfs - * to allow zfs_mknode to work. - */ - vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; - vattr.va_type = VDIR; - vattr.va_mode = S_IFDIR|0755; - vattr.va_uid = UID_ROOT; - vattr.va_gid = GID_WHEEL; - - rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); - zfs_znode_cache_constructor(rootzp, NULL, 0); - rootzp->z_zfsvfs = &zfsvfs; - rootzp->z_unlinked = 0; - rootzp->z_atime_dirty = 0; - rootzp->z_dbuf_held = 0; - - bzero(&zfsvfs, sizeof (zfsvfs_t)); - - zfsvfs.z_os = os; - zfsvfs.z_assign = TXG_NOWAIT; - zfsvfs.z_parent = &zfsvfs; - - mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), - offsetof(znode_t, z_link_node)); - - zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0); - ASSERT3U(rootzp->z_id, ==, roid); - error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx); - ASSERT(error == 0); - - mutex_destroy(&zfsvfs.z_znodes_lock); - kmem_cache_free(znode_cache, rootzp); -} -#endif /* _KERNEL */ - -/* - * Given an object number, return its parent object number and whether - * or not the object is an extended attribute directory. 
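 * [Editorial note, not part of the original source.]  zfs_obj_to_path()
 * below uses this helper to walk zp_parent links from a leaf object up to
 * the root (detected when an object is its own parent), writing each name
 * component backwards from the end of the caller's buffer, so the full
 * path is assembled right-to-left with no intermediate allocation or
 * string reversal.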
- */ -static int -zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) -{ - dmu_buf_t *db; - dmu_object_info_t doi; - znode_phys_t *zp; - int error; - - if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) - return (error); - - dmu_object_info_from_db(db, &doi); - if (doi.doi_bonus_type != DMU_OT_ZNODE || - doi.doi_bonus_size < sizeof (znode_phys_t)) { - dmu_buf_rele(db, FTAG); - return (EINVAL); - } - - zp = db->db_data; - *pobjp = zp->zp_parent; - *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && - S_ISDIR(zp->zp_mode); - dmu_buf_rele(db, FTAG); - - return (0); -} - -int -zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) -{ - char *path = buf + len - 1; - int error; - - *path = '\0'; - - for (;;) { - uint64_t pobj; - char component[MAXNAMELEN + 2]; - size_t complen; - int is_xattrdir; - - if ((error = zfs_obj_to_pobj(osp, obj, &pobj, - &is_xattrdir)) != 0) - break; - - if (pobj == obj) { - if (path[0] != '/') - *--path = '/'; - break; - } - - component[0] = '/'; - if (is_xattrdir) { - (void) sprintf(component + 1, "<xattrdir>"); - } else { - error = zap_value_search(osp, pobj, obj, component + 1); - if (error != 0) - break; - } - - complen = strlen(component); - path -= complen; - ASSERT(path >= buf); - bcopy(component, path, complen); - obj = pobj; - } - - if (error == 0) - (void) memmove(buf, path, buf + len - path); - return (error); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c deleted file mode 100644 index 69ee509..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ /dev/null @@ -1,1607 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/dmu.h> -#include <sys/zap.h> -#include <sys/arc.h> -#include <sys/stat.h> -#include <sys/resource.h> -#include <sys/zil.h> -#include <sys/zil_impl.h> -#include <sys/dsl_dataset.h> -#include <sys/vdev.h> -#include <sys/dmu_tx.h> - -/* - * The zfs intent log (ZIL) saves transaction records of system calls - * that change the file system in memory with enough information - * to be able to replay them. These are stored in memory until - * either the DMU transaction group (txg) commits them to the stable pool - * and they can be discarded, or they are flushed to the stable log - * (also in the pool) due to a fsync, O_DSYNC or other synchronous - * requirement. In the event of a panic or power fail then those log - * records (transactions) are replayed. - * - * There is one ZIL per file system. 
Its on-disk (pool) format consists - * of 3 parts: - * - * - ZIL header - * - ZIL blocks - * - ZIL records - * - * A log record holds a system call transaction. Log blocks can - * hold many log records and the blocks are chained together. - * Each ZIL block contains a block pointer (blkptr_t) to the next - * ZIL block in the chain. The ZIL header points to the first - * block in the chain. Note there is not a fixed place in the pool - * to hold blocks. They are dynamically allocated and freed as - * needed from the blocks available. Figure X shows the ZIL structure: - */ - -/* - * This global ZIL switch affects all pools - */ -int zil_disable = 0; /* disable intent logging */ -SYSCTL_DECL(_vfs_zfs); -TUNABLE_INT("vfs.zfs.zil_disable", &zil_disable); -SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_disable, CTLFLAG_RW, &zil_disable, 0, - "Disable ZFS Intent Log (ZIL)"); - -/* - * Tunable parameter for debugging or performance analysis. Setting - * zfs_nocacheflush will cause corruption on power loss if a volatile - * out-of-order write cache is enabled. - */ -boolean_t zfs_nocacheflush = B_FALSE; -TUNABLE_INT("vfs.zfs.cache_flush_disable", &zfs_nocacheflush); -SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN, - &zfs_nocacheflush, 0, "Disable cache flush"); - -static kmem_cache_t *zil_lwb_cache; - -static int -zil_dva_compare(const void *x1, const void *x2) -{ - const dva_t *dva1 = x1; - const dva_t *dva2 = x2; - - if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) - return (-1); - if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) - return (1); - - if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) - return (-1); - if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) - return (1); - - return (0); -} - -static void -zil_dva_tree_init(avl_tree_t *t) -{ - avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t), - offsetof(zil_dva_node_t, zn_node)); -} - -static void -zil_dva_tree_fini(avl_tree_t *t) -{ - zil_dva_node_t *zn; - void *cookie = NULL; - - while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(zn, sizeof (zil_dva_node_t)); - - avl_destroy(t); -} - -static int -zil_dva_tree_add(avl_tree_t *t, dva_t *dva) -{ - zil_dva_node_t *zn; - avl_index_t where; - - if (avl_find(t, dva, &where) != NULL) - return (EEXIST); - - zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP); - zn->zn_dva = *dva; - avl_insert(t, zn, where); - - return (0); -} - -static zil_header_t * -zil_header_in_syncing_context(zilog_t *zilog) -{ - return ((zil_header_t *)zilog->zl_header); -} - -static void -zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) -{ - zio_cksum_t *zc = &bp->blk_cksum; - - zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); - zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); - zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); - zc->zc_word[ZIL_ZC_SEQ] = 1ULL; -} - -/* - * Read a log block, make sure it's valid, and byteswap it if necessary. 
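 * [Editorial note: illustrative sketch, not part of the original source.]
 * The chain is self-validating: block N's trailer holds the blkptr of
 * block N+1, and the checksum seeded into that blkptr must equal block N's
 * own checksum with the sequence word bumped by one, i.e. roughly
 *
 *     expected = bp->blk_cksum;
 *     expected.zc_word[ZIL_ZC_SEQ]++;
 *     valid = (bcmp(&expected, &ztp->zit_next_blk.blk_cksum,
 *         sizeof (expected)) == 0);
 *
 * so a next block that was never written, or that belongs to an older
 * chain, fails the comparison and the log walk stops there.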
- */ -static int -zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp) -{ - blkptr_t blk = *bp; - zbookmark_t zb; - uint32_t aflags = ARC_WAIT; - int error; - - zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; - - *abufpp = NULL; - - error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array, - arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb); - - if (error == 0) { - char *data = (*abufpp)->b_data; - uint64_t blksz = BP_GET_LSIZE(bp); - zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1; - zio_cksum_t cksum = bp->blk_cksum; - - /* - * Sequence numbers should be... sequential. The checksum - * verifier for the next block should be bp's checksum plus 1. - */ - cksum.zc_word[ZIL_ZC_SEQ]++; - - if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum))) - error = ESTALE; - else if (BP_IS_HOLE(&ztp->zit_next_blk)) - error = ENOENT; - else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) - error = EOVERFLOW; - - if (error) { - VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1); - *abufpp = NULL; - } - } - - dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid); - - return (error); -} - -/* - * Parse the intent log, and call parse_func for each valid record within. - * Return the highest sequence number. - */ -uint64_t -zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, - zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) -{ - const zil_header_t *zh = zilog->zl_header; - uint64_t claim_seq = zh->zh_claim_seq; - uint64_t seq = 0; - uint64_t max_seq = 0; - blkptr_t blk = zh->zh_log; - arc_buf_t *abuf; - char *lrbuf, *lrp; - zil_trailer_t *ztp; - int reclen, error; - - if (BP_IS_HOLE(&blk)) - return (max_seq); - - /* - * Starting at the block pointed to by zh_log we read the log chain. - * For each block in the chain we strongly check that block to - * ensure its validity. We stop when an invalid block is found. - * For each block pointer in the chain we call parse_blk_func(). - * For each record in each valid block we call parse_lr_func(). - * If the log has been claimed, stop if we encounter a sequence - * number greater than the highest claimed sequence number. - */ - zil_dva_tree_init(&zilog->zl_dva_tree); - for (;;) { - seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; - - if (claim_seq != 0 && seq > claim_seq) - break; - - ASSERT(max_seq < seq); - max_seq = seq; - - error = zil_read_log_block(zilog, &blk, &abuf); - - if (parse_blk_func != NULL) - parse_blk_func(zilog, &blk, arg, txg); - - if (error) - break; - - lrbuf = abuf->b_data; - ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1; - blk = ztp->zit_next_blk; - - if (parse_lr_func == NULL) { - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); - continue; - } - - for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) { - lr_t *lr = (lr_t *)lrp; - reclen = lr->lrc_reclen; - ASSERT3U(reclen, >=, sizeof (lr_t)); - parse_lr_func(zilog, lr, arg, txg); - } - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); - } - zil_dva_tree_fini(&zilog->zl_dva_tree); - - return (max_seq); -} - -/* ARGSUSED */ -static void -zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) -{ - spa_t *spa = zilog->zl_spa; - int err; - - /* - * Claim log block if not already committed and not already claimed. 
- */ - if (bp->blk_birth >= first_txg && - zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) { - err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL)); - ASSERT(err == 0); - } -} - -static void -zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) -{ - if (lrc->lrc_txtype == TX_WRITE) { - lr_write_t *lr = (lr_write_t *)lrc; - zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg); - } -} - -/* ARGSUSED */ -static void -zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) -{ - zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx)); -} - -static void -zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) -{ - /* - * If we previously claimed it, we need to free it. - */ - if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) { - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - if (bp->blk_birth >= claim_txg && - !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) { - (void) arc_free(NULL, zilog->zl_spa, - dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT); - } - } -} - -/* - * Create an on-disk intent log. - */ -static void -zil_create(zilog_t *zilog) -{ - const zil_header_t *zh = zilog->zl_header; - lwb_t *lwb; - uint64_t txg = 0; - dmu_tx_t *tx = NULL; - blkptr_t blk; - int error = 0; - - /* - * Wait for any previous destroy to complete. - */ - txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); - - ASSERT(zh->zh_claim_txg == 0); - ASSERT(zh->zh_replay_seq == 0); - - blk = zh->zh_log; - - /* - * If we don't already have an initial log block, allocate one now. - */ - if (BP_IS_HOLE(&blk)) { - tx = dmu_tx_create(zilog->zl_os); - (void) dmu_tx_assign(tx, TXG_WAIT); - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - - error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk, - NULL, txg); - - if (error == 0) - zil_init_log_chain(zilog, &blk); - } - - /* - * Allocate a log write buffer (lwb) for the first log block. - */ - if (error == 0) { - lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); - lwb->lwb_zilog = zilog; - lwb->lwb_blk = blk; - lwb->lwb_nused = 0; - lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk); - lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz); - lwb->lwb_max_txg = txg; - lwb->lwb_zio = NULL; - - mutex_enter(&zilog->zl_lock); - list_insert_tail(&zilog->zl_lwb_list, lwb); - mutex_exit(&zilog->zl_lock); - } - - /* - * If we just allocated the first log block, commit our transaction - * and wait for zil_sync() to stuff the block poiner into zh_log. - * (zh is part of the MOS, so we cannot modify it in open context.) - */ - if (tx != NULL) { - dmu_tx_commit(tx); - txg_wait_synced(zilog->zl_dmu_pool, txg); - } - - ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); -} - -/* - * In one tx, free all log blocks and clear the log header. - * If keep_first is set, then we're replaying a log with no content. - * We want to keep the first block, however, so that the first - * synchronous transaction doesn't require a txg_wait_synced() - * in zil_create(). We don't need to txg_wait_synced() here either - * when keep_first is set, because both zil_create() and zil_destroy() - * will wait for any in-progress destroys to complete. - */ -void -zil_destroy(zilog_t *zilog, boolean_t keep_first) -{ - const zil_header_t *zh = zilog->zl_header; - lwb_t *lwb; - dmu_tx_t *tx; - uint64_t txg; - - /* - * Wait for any previous destroy to complete. 
- */ - txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); - - if (BP_IS_HOLE(&zh->zh_log)) - return; - - tx = dmu_tx_create(zilog->zl_os); - (void) dmu_tx_assign(tx, TXG_WAIT); - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - - mutex_enter(&zilog->zl_lock); - - ASSERT3U(zilog->zl_destroy_txg, <, txg); - zilog->zl_destroy_txg = txg; - zilog->zl_keep_first = keep_first; - - if (!list_is_empty(&zilog->zl_lwb_list)) { - ASSERT(zh->zh_claim_txg == 0); - ASSERT(!keep_first); - while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { - list_remove(&zilog->zl_lwb_list, lwb); - if (lwb->lwb_buf != NULL) - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg); - kmem_cache_free(zil_lwb_cache, lwb); - } - } else { - if (!keep_first) { - (void) zil_parse(zilog, zil_free_log_block, - zil_free_log_record, tx, zh->zh_claim_txg); - } - } - mutex_exit(&zilog->zl_lock); - - dmu_tx_commit(tx); - - if (keep_first) /* no need to wait in this case */ - return; - - txg_wait_synced(zilog->zl_dmu_pool, txg); - ASSERT(BP_IS_HOLE(&zh->zh_log)); -} - -int -zil_claim(char *osname, void *txarg) -{ - dmu_tx_t *tx = txarg; - uint64_t first_txg = dmu_tx_get_txg(tx); - zilog_t *zilog; - zil_header_t *zh; - objset_t *os; - int error; - - error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os); - if (error) { - cmn_err(CE_WARN, "can't process intent log for %s", osname); - return (0); - } - - zilog = dmu_objset_zil(os); - zh = zil_header_in_syncing_context(zilog); - - /* - * Claim all log blocks if we haven't already done so, and remember - * the highest claimed sequence number. This ensures that if we can - * read only part of the log now (e.g. due to a missing device), - * but we can read the entire log later, we will not try to replay - * or destroy beyond the last block we successfully claimed. 
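 * [Editorial note, not part of the original source.]  zh_claim_txg doubles
 * as the "already claimed" flag: it is zero for a log that has never been
 * claimed, and once it and zh_claim_seq are recorded here they persist on
 * disk with the objset, so a later import (or one that can suddenly read
 * more of the log) never claims, replays or frees blocks past the point
 * noted on the first pass.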
- */ - ASSERT3U(zh->zh_claim_txg, <=, first_txg); - if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { - zh->zh_claim_txg = first_txg; - zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block, - zil_claim_log_record, tx, first_txg); - dsl_dataset_dirty(dmu_objset_ds(os), tx); - } - - ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); - dmu_objset_close(os); - return (0); -} - -void -zil_add_vdev(zilog_t *zilog, uint64_t vdev) -{ - zil_vdev_t *zv, *new; - uint64_t bmap_sz = sizeof (zilog->zl_vdev_bmap) << 3; - uchar_t *cp; - - if (zfs_nocacheflush) - return; - - if (vdev < bmap_sz) { - cp = zilog->zl_vdev_bmap + (vdev / 8); - atomic_or_8(cp, 1 << (vdev % 8)); - } else { - /* - * insert into ordered list - */ - mutex_enter(&zilog->zl_lock); - for (zv = list_head(&zilog->zl_vdev_list); zv != NULL; - zv = list_next(&zilog->zl_vdev_list, zv)) { - if (zv->vdev == vdev) { - /* duplicate found - just return */ - mutex_exit(&zilog->zl_lock); - return; - } - if (zv->vdev > vdev) { - /* insert before this entry */ - new = kmem_alloc(sizeof (zil_vdev_t), - KM_SLEEP); - new->vdev = vdev; - list_insert_before(&zilog->zl_vdev_list, - zv, new); - mutex_exit(&zilog->zl_lock); - return; - } - } - /* ran off end of list, insert at the end */ - ASSERT(zv == NULL); - new = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP); - new->vdev = vdev; - list_insert_tail(&zilog->zl_vdev_list, new); - mutex_exit(&zilog->zl_lock); - } -} - -/* start an async flush of the write cache for this vdev */ -void -zil_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio) -{ - vdev_t *vd; - - if (*zio == NULL) - *zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - - vd = vdev_lookup_top(spa, vdev); - ASSERT(vd); - - (void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); -} - -void -zil_flush_vdevs(zilog_t *zilog) -{ - zil_vdev_t *zv; - zio_t *zio = NULL; - spa_t *spa = zilog->zl_spa; - uint64_t vdev; - uint8_t b; - int i, j; - - ASSERT(zilog->zl_writer); - - for (i = 0; i < sizeof (zilog->zl_vdev_bmap); i++) { - b = zilog->zl_vdev_bmap[i]; - if (b == 0) - continue; - for (j = 0; j < 8; j++) { - if (b & (1 << j)) { - vdev = (i << 3) + j; - zil_flush_vdev(spa, vdev, &zio); - } - } - zilog->zl_vdev_bmap[i] = 0; - } - - while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) { - zil_flush_vdev(spa, zv->vdev, &zio); - list_remove(&zilog->zl_vdev_list, zv); - kmem_free(zv, sizeof (zil_vdev_t)); - } - /* - * Wait for all the flushes to complete. Not all devices actually - * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails. - */ - if (zio) - (void) zio_wait(zio); -} - -/* - * Function called when a log block write completes - */ -static void -zil_lwb_write_done(zio_t *zio) -{ - lwb_t *lwb = zio->io_private; - zilog_t *zilog = lwb->lwb_zilog; - - /* - * Now that we've written this log block, we have a stable pointer - * to the next block in the chain, so it's OK to let the txg in - * which we allocated the next block sync. - */ - txg_rele_to_sync(&lwb->lwb_txgh); - - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - mutex_enter(&zilog->zl_lock); - lwb->lwb_buf = NULL; - if (zio->io_error) { - zilog->zl_log_error = B_TRUE; - mutex_exit(&zilog->zl_lock); - return; - } - mutex_exit(&zilog->zl_lock); -} - -/* - * Initialize the io for a log block. - * - * Note, we should not initialize the IO until we are about - * to use it, since zio_rewrite() does a spa_config_enter(). 
- */ -static void -zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) -{ - zbookmark_t zb; - - zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; - - if (zilog->zl_root_zio == NULL) { - zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL, - ZIO_FLAG_CANFAIL); - } - if (lwb->lwb_zio == NULL) { - lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf, - lwb->lwb_sz, zil_lwb_write_done, lwb, - ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - } -} - -/* - * Start a log block write and advance to the next log block. - * Calls are serialized. - */ -static lwb_t * -zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) -{ - lwb_t *nlwb; - zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1; - spa_t *spa = zilog->zl_spa; - blkptr_t *bp = &ztp->zit_next_blk; - uint64_t txg; - uint64_t zil_blksz; - int error; - - ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb)); - - /* - * Allocate the next block and save its address in this block - * before writing it in order to establish the log chain. - * Note that if the allocation of nlwb synced before we wrote - * the block that points at it (lwb), we'd leak it if we crashed. - * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done(). - */ - txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh); - txg_rele_to_quiesce(&lwb->lwb_txgh); - - /* - * Pick a ZIL blocksize. We request a size that is the - * maximum of the previous used size, the current used size and - * the amount waiting in the queue. - */ - zil_blksz = MAX(zilog->zl_prev_used, - zilog->zl_cur_used + sizeof (*ztp)); - zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp)); - zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t); - if (zil_blksz > ZIL_MAX_BLKSZ) - zil_blksz = ZIL_MAX_BLKSZ; - - BP_ZERO(bp); - /* pass the old blkptr in order to spread log blocks across devs */ - error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg); - if (error) { - dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg); - - /* - * We dirty the dataset to ensure that zil_sync() will - * be called to remove this lwb from our zl_lwb_list. - * Failing to do so, may leave an lwb with a NULL lwb_buf - * hanging around on the zl_lwb_list. - */ - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - dmu_tx_commit(tx); - - /* - * Since we've just experienced an allocation failure so we - * terminate the current lwb and send it on its way. - */ - ztp->zit_pad = 0; - ztp->zit_nused = lwb->lwb_nused; - ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; - zio_nowait(lwb->lwb_zio); - - /* - * By returning NULL the caller will call tx_wait_synced() - */ - return (NULL); - } - - ASSERT3U(bp->blk_birth, ==, txg); - ztp->zit_pad = 0; - ztp->zit_nused = lwb->lwb_nused; - ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; - bp->blk_cksum = lwb->lwb_blk.blk_cksum; - bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; - - /* - * Allocate a new log write buffer (lwb). 
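 * [Editorial note: illustrative example, not part of the original source.]
 * The size chosen above is max(zl_prev_used, zl_cur_used + trailer,
 * zl_itx_list_sz + trailer), rounded up to a multiple of ZIL_MIN_BLKSZ and
 * capped at ZIL_MAX_BLKSZ.  For instance, if the previous block used 20KB,
 * 6KB has gone into the current one and 40KB of itxs are queued, the new
 * block is sized for the 40KB backlog (plus trailer, rounded up), so a
 * burst of synchronous activity grows the block size rather than chaining
 * many minimum-sized blocks.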
- */ - nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); - - nlwb->lwb_zilog = zilog; - nlwb->lwb_blk = *bp; - nlwb->lwb_nused = 0; - nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk); - nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz); - nlwb->lwb_max_txg = txg; - nlwb->lwb_zio = NULL; - - /* - * Put new lwb at the end of the log chain - */ - mutex_enter(&zilog->zl_lock); - list_insert_tail(&zilog->zl_lwb_list, nlwb); - mutex_exit(&zilog->zl_lock); - - /* Record the vdev for later flushing */ - zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk)))); - - /* - * kick off the write for the old log block - */ - dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg); - ASSERT(lwb->lwb_zio); - zio_nowait(lwb->lwb_zio); - - return (nlwb); -} - -static lwb_t * -zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) -{ - lr_t *lrc = &itx->itx_lr; /* common log record */ - lr_write_t *lr = (lr_write_t *)lrc; - uint64_t txg = lrc->lrc_txg; - uint64_t reclen = lrc->lrc_reclen; - uint64_t dlen; - - if (lwb == NULL) - return (NULL); - ASSERT(lwb->lwb_buf != NULL); - - if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) - dlen = P2ROUNDUP_TYPED( - lr->lr_length, sizeof (uint64_t), uint64_t); - else - dlen = 0; - - zilog->zl_cur_used += (reclen + dlen); - - zil_lwb_write_init(zilog, lwb); - - /* - * If this record won't fit in the current log block, start a new one. - */ - if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) { - lwb = zil_lwb_write_start(zilog, lwb); - if (lwb == NULL) - return (NULL); - zil_lwb_write_init(zilog, lwb); - ASSERT(lwb->lwb_nused == 0); - if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) { - txg_wait_synced(zilog->zl_dmu_pool, txg); - return (lwb); - } - } - - /* - * Update the lrc_seq, to be log record sequence number. See zil.h - * Then copy the record to the log buffer. - */ - lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */ - bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen); - - /* - * If it's a write, fetch the data or get its blkptr as appropriate. 
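 * [Editorial note: illustrative summary, not part of the original source.]
 * For TX_WRITE records the itx carries one of three write states, visible
 * in the branches below:
 *
 *     WR_COPIED      data was already copied into the log record when the
 *                    itx was built; nothing further is fetched here.
 *     WR_NEED_COPY   dlen is nonzero; zl_get_data() copies the file data
 *                    into the space reserved after the record in this
 *                    log block.
 *     WR_INDIRECT    dbuf is NULL; zl_get_data() instead arranges for
 *                    lr_blkptr to point at the data block, so the log
 *                    records a reference rather than a copy.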
- */ - if (lrc->lrc_txtype == TX_WRITE) { - if (txg > spa_freeze_txg(zilog->zl_spa)) - txg_wait_synced(zilog->zl_dmu_pool, txg); - if (itx->itx_wr_state != WR_COPIED) { - char *dbuf; - int error; - - /* alignment is guaranteed */ - lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused); - if (dlen) { - ASSERT(itx->itx_wr_state == WR_NEED_COPY); - dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen; - lr->lr_common.lrc_reclen += dlen; - } else { - ASSERT(itx->itx_wr_state == WR_INDIRECT); - dbuf = NULL; - } - error = zilog->zl_get_data( - itx->itx_private, lr, dbuf, lwb->lwb_zio); - if (error) { - ASSERT(error == ENOENT || error == EEXIST || - error == EALREADY); - return (lwb); - } - } - } - - lwb->lwb_nused += reclen + dlen; - lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); - ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb)); - ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0); - - return (lwb); -} - -itx_t * -zil_itx_create(int txtype, size_t lrsize) -{ - itx_t *itx; - - lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); - - itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); - itx->itx_lr.lrc_txtype = txtype; - itx->itx_lr.lrc_reclen = lrsize; - itx->itx_lr.lrc_seq = 0; /* defensive */ - - return (itx); -} - -uint64_t -zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) -{ - uint64_t seq; - - ASSERT(itx->itx_lr.lrc_seq == 0); - - mutex_enter(&zilog->zl_lock); - list_insert_tail(&zilog->zl_itx_list, itx); - zilog->zl_itx_list_sz += itx->itx_lr.lrc_reclen; - itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); - itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq; - mutex_exit(&zilog->zl_lock); - - return (seq); -} - -/* - * Free up all in-memory intent log transactions that have now been synced. - */ -static void -zil_itx_clean(zilog_t *zilog) -{ - uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa); - uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa); - list_t clean_list; - itx_t *itx; - - list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); - - mutex_enter(&zilog->zl_lock); - /* wait for a log writer to finish walking list */ - while (zilog->zl_writer) { - cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); - } - - /* - * Move the sync'd log transactions to a separate list so we can call - * kmem_free without holding the zl_lock. - * - * There is no need to set zl_writer as we don't drop zl_lock here - */ - while ((itx = list_head(&zilog->zl_itx_list)) != NULL && - itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) { - list_remove(&zilog->zl_itx_list, itx); - zilog->zl_itx_list_sz -= itx->itx_lr.lrc_reclen; - list_insert_tail(&clean_list, itx); - } - cv_broadcast(&zilog->zl_cv_writer); - mutex_exit(&zilog->zl_lock); - - /* destroy sync'd log transactions */ - while ((itx = list_head(&clean_list)) != NULL) { - list_remove(&clean_list, itx); - kmem_free(itx, offsetof(itx_t, itx_lr) - + itx->itx_lr.lrc_reclen); - } - list_destroy(&clean_list); -} - -/* - * If there are any in-memory intent log transactions which have now been - * synced then start up a taskq to free them. 
- */ -void -zil_clean(zilog_t *zilog) -{ - itx_t *itx; - - mutex_enter(&zilog->zl_lock); - itx = list_head(&zilog->zl_itx_list); - if ((itx != NULL) && - (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) { - (void) taskq_dispatch(zilog->zl_clean_taskq, - (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP); - } - mutex_exit(&zilog->zl_lock); -} - -void -zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) -{ - uint64_t txg; - uint64_t reclen; - uint64_t commit_seq = 0; - itx_t *itx, *itx_next = (itx_t *)-1; - lwb_t *lwb; - spa_t *spa; - - zilog->zl_writer = B_TRUE; - zilog->zl_root_zio = NULL; - spa = zilog->zl_spa; - - if (zilog->zl_suspend) { - lwb = NULL; - } else { - lwb = list_tail(&zilog->zl_lwb_list); - if (lwb == NULL) { - /* - * Return if there's nothing to flush before we - * dirty the fs by calling zil_create() - */ - if (list_is_empty(&zilog->zl_itx_list)) { - zilog->zl_writer = B_FALSE; - return; - } - mutex_exit(&zilog->zl_lock); - zil_create(zilog); - mutex_enter(&zilog->zl_lock); - lwb = list_tail(&zilog->zl_lwb_list); - } - } - - /* Loop through in-memory log transactions filling log blocks. */ - DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); - for (;;) { - /* - * Find the next itx to push: - * Push all transactions related to specified foid and all - * other transactions except TX_WRITE, TX_TRUNCATE, - * TX_SETATTR and TX_ACL for all other files. - */ - if (itx_next != (itx_t *)-1) - itx = itx_next; - else - itx = list_head(&zilog->zl_itx_list); - for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) { - if (foid == 0) /* push all foids? */ - break; - if (itx->itx_sync) /* push all O_[D]SYNC */ - break; - switch (itx->itx_lr.lrc_txtype) { - case TX_SETATTR: - case TX_WRITE: - case TX_TRUNCATE: - case TX_ACL: - /* lr_foid is same offset for these records */ - if (((lr_write_t *)&itx->itx_lr)->lr_foid - != foid) { - continue; /* skip this record */ - } - } - break; - } - if (itx == NULL) - break; - - reclen = itx->itx_lr.lrc_reclen; - if ((itx->itx_lr.lrc_seq > seq) && - ((lwb == NULL) || (lwb->lwb_nused == 0) || - (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)))) { - break; - } - - /* - * Save the next pointer. Even though we soon drop - * zl_lock all threads that may change the list - * (another writer or zil_itx_clean) can't do so until - * they have zl_writer. - */ - itx_next = list_next(&zilog->zl_itx_list, itx); - list_remove(&zilog->zl_itx_list, itx); - mutex_exit(&zilog->zl_lock); - txg = itx->itx_lr.lrc_txg; - ASSERT(txg); - - if (txg > spa_last_synced_txg(spa) || - txg > spa_freeze_txg(spa)) - lwb = zil_lwb_commit(zilog, itx, lwb); - kmem_free(itx, offsetof(itx_t, itx_lr) - + itx->itx_lr.lrc_reclen); - mutex_enter(&zilog->zl_lock); - zilog->zl_itx_list_sz -= reclen; - } - DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); - /* determine commit sequence number */ - itx = list_head(&zilog->zl_itx_list); - if (itx) - commit_seq = itx->itx_lr.lrc_seq; - else - commit_seq = zilog->zl_itx_seq; - mutex_exit(&zilog->zl_lock); - - /* write the last block out */ - if (lwb != NULL && lwb->lwb_zio != NULL) - lwb = zil_lwb_write_start(zilog, lwb); - - zilog->zl_prev_used = zilog->zl_cur_used; - zilog->zl_cur_used = 0; - - /* - * Wait if necessary for the log blocks to be on stable storage. 
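 * [Editorial note, not part of the original source.]  commit_seq, taken
 * above from the first itx still queued (or from zl_itx_seq when the
 * queue has been drained), is published as zl_commit_seq after the wait
 * below; zil_commit() compares a caller's sequence number against it so
 * that threads whose records are already on stable storage can return
 * without becoming the writer themselves.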
- */ - if (zilog->zl_root_zio) { - DTRACE_PROBE1(zil__cw3, zilog_t *, zilog); - (void) zio_wait(zilog->zl_root_zio); - DTRACE_PROBE1(zil__cw4, zilog_t *, zilog); - if (!zfs_nocacheflush) - zil_flush_vdevs(zilog); - } - - if (zilog->zl_log_error || lwb == NULL) { - zilog->zl_log_error = 0; - txg_wait_synced(zilog->zl_dmu_pool, 0); - } - - mutex_enter(&zilog->zl_lock); - zilog->zl_writer = B_FALSE; - - ASSERT3U(commit_seq, >=, zilog->zl_commit_seq); - zilog->zl_commit_seq = commit_seq; -} - -/* - * Push zfs transactions to stable storage up to the supplied sequence number. - * If foid is 0 push out all transactions, otherwise push only those - * for that file or might have been used to create that file. - */ -void -zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid) -{ - if (zilog == NULL || seq == 0) - return; - - mutex_enter(&zilog->zl_lock); - - seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */ - - while (zilog->zl_writer) { - cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); - if (seq < zilog->zl_commit_seq) { - mutex_exit(&zilog->zl_lock); - return; - } - } - zil_commit_writer(zilog, seq, foid); /* drops zl_lock */ - /* wake up others waiting on the commit */ - cv_broadcast(&zilog->zl_cv_writer); - mutex_exit(&zilog->zl_lock); -} - -/* - * Called in syncing context to free committed log blocks and update log header. - */ -void -zil_sync(zilog_t *zilog, dmu_tx_t *tx) -{ - zil_header_t *zh = zil_header_in_syncing_context(zilog); - uint64_t txg = dmu_tx_get_txg(tx); - spa_t *spa = zilog->zl_spa; - lwb_t *lwb; - - mutex_enter(&zilog->zl_lock); - - ASSERT(zilog->zl_stop_sync == 0); - - zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK]; - - if (zilog->zl_destroy_txg == txg) { - blkptr_t blk = zh->zh_log; - - ASSERT(list_head(&zilog->zl_lwb_list) == NULL); - ASSERT(spa_sync_pass(spa) == 1); - - bzero(zh, sizeof (zil_header_t)); - bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq)); - - if (zilog->zl_keep_first) { - /* - * If this block was part of log chain that couldn't - * be claimed because a device was missing during - * zil_claim(), but that device later returns, - * then this block could erroneously appear valid. - * To guard against this, assign a new GUID to the new - * log chain so it doesn't matter what blk points to. - */ - zil_init_log_chain(zilog, &blk); - zh->zh_log = blk; - } - } - - for (;;) { - lwb = list_head(&zilog->zl_lwb_list); - if (lwb == NULL) { - mutex_exit(&zilog->zl_lock); - return; - } - zh->zh_log = lwb->lwb_blk; - if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) - break; - list_remove(&zilog->zl_lwb_list, lwb); - zio_free_blk(spa, &lwb->lwb_blk, txg); - kmem_cache_free(zil_lwb_cache, lwb); - - /* - * If we don't have anything left in the lwb list then - * we've had an allocation failure and we need to zero - * out the zil_header blkptr so that we don't end - * up freeing the same block twice. 
- */ - if (list_head(&zilog->zl_lwb_list) == NULL) - BP_ZERO(&zh->zh_log); - } - mutex_exit(&zilog->zl_lock); -} - -void -zil_init(void) -{ - zil_lwb_cache = kmem_cache_create("zil_lwb_cache", - sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -zil_fini(void) -{ - kmem_cache_destroy(zil_lwb_cache); -} - -zilog_t * -zil_alloc(objset_t *os, zil_header_t *zh_phys) -{ - zilog_t *zilog; - - zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); - - zilog->zl_header = zh_phys; - zilog->zl_os = os; - zilog->zl_spa = dmu_objset_spa(os); - zilog->zl_dmu_pool = dmu_objset_pool(os); - zilog->zl_destroy_txg = TXG_INITIAL - 1; - - mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); - cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); - - list_create(&zilog->zl_itx_list, sizeof (itx_t), - offsetof(itx_t, itx_node)); - - list_create(&zilog->zl_lwb_list, sizeof (lwb_t), - offsetof(lwb_t, lwb_node)); - - list_create(&zilog->zl_vdev_list, sizeof (zil_vdev_t), - offsetof(zil_vdev_t, vdev_seq_node)); - - return (zilog); -} - -void -zil_free(zilog_t *zilog) -{ - lwb_t *lwb; - zil_vdev_t *zv; - - zilog->zl_stop_sync = 1; - - while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { - list_remove(&zilog->zl_lwb_list, lwb); - if (lwb->lwb_buf != NULL) - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - kmem_cache_free(zil_lwb_cache, lwb); - } - list_destroy(&zilog->zl_lwb_list); - - while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) { - list_remove(&zilog->zl_vdev_list, zv); - kmem_free(zv, sizeof (zil_vdev_t)); - } - list_destroy(&zilog->zl_vdev_list); - - ASSERT(list_head(&zilog->zl_itx_list) == NULL); - list_destroy(&zilog->zl_itx_list); - cv_destroy(&zilog->zl_cv_suspend); - cv_destroy(&zilog->zl_cv_writer); - mutex_destroy(&zilog->zl_lock); - - kmem_free(zilog, sizeof (zilog_t)); -} - -/* - * return true if the initial log block is not valid - */ -static int -zil_empty(zilog_t *zilog) -{ - const zil_header_t *zh = zilog->zl_header; - arc_buf_t *abuf = NULL; - - if (BP_IS_HOLE(&zh->zh_log)) - return (1); - - if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0) - return (1); - - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); - return (0); -} - -/* - * Open an intent log. - */ -zilog_t * -zil_open(objset_t *os, zil_get_data_t *get_data) -{ - zilog_t *zilog = dmu_objset_zil(os); - - zilog->zl_get_data = get_data; - zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, - 2, 2, TASKQ_PREPOPULATE); - - return (zilog); -} - -/* - * Close an intent log. - */ -void -zil_close(zilog_t *zilog) -{ - /* - * If the log isn't already committed, mark the objset dirty - * (so zil_sync() will be called) and wait for that txg to sync. - */ - if (!zil_is_committed(zilog)) { - uint64_t txg; - dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); - (void) dmu_tx_assign(tx, TXG_WAIT); - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - dmu_tx_commit(tx); - txg_wait_synced(zilog->zl_dmu_pool, txg); - } - - taskq_destroy(zilog->zl_clean_taskq); - zilog->zl_clean_taskq = NULL; - zilog->zl_get_data = NULL; - - zil_itx_clean(zilog); - ASSERT(list_head(&zilog->zl_itx_list) == NULL); -} - -/* - * Suspend an intent log. While in suspended mode, we still honor - * synchronous semantics, but we rely on txg_wait_synced() to do it. - * We suspend the log briefly when taking a snapshot so that the snapshot - * contains all the data it's supposed to, and has an empty intent log. 
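 * [Editorial note: illustrative usage sketch, not part of the original
 * source; the snapshot caller shown here is assumed, not taken from this
 * file.]
 *
 *     if (zil_suspend(zilog) == 0) {
 *             ... create the snapshot: the intent log is now empty, so
 *             ... the snapshot needs no replay ...
 *             zil_resume(zilog);
 *     }
 *
 * zil_suspend() returns EBUSY if the log still carries an unreplayed
 * claim (zh_claim_txg != 0), since destroying such a log would lose
 * committed transactions.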
- */ -int -zil_suspend(zilog_t *zilog) -{ - const zil_header_t *zh = zilog->zl_header; - - mutex_enter(&zilog->zl_lock); - if (zh->zh_claim_txg != 0) { /* unplayed log */ - mutex_exit(&zilog->zl_lock); - return (EBUSY); - } - if (zilog->zl_suspend++ != 0) { - /* - * Someone else already began a suspend. - * Just wait for them to finish. - */ - while (zilog->zl_suspending) - cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); - ASSERT(BP_IS_HOLE(&zh->zh_log)); - mutex_exit(&zilog->zl_lock); - return (0); - } - zilog->zl_suspending = B_TRUE; - mutex_exit(&zilog->zl_lock); - - zil_commit(zilog, UINT64_MAX, 0); - - /* - * Wait for any in-flight log writes to complete. - */ - mutex_enter(&zilog->zl_lock); - while (zilog->zl_writer) - cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); - mutex_exit(&zilog->zl_lock); - - zil_destroy(zilog, B_FALSE); - - mutex_enter(&zilog->zl_lock); - ASSERT(BP_IS_HOLE(&zh->zh_log)); - zilog->zl_suspending = B_FALSE; - cv_broadcast(&zilog->zl_cv_suspend); - mutex_exit(&zilog->zl_lock); - - return (0); -} - -void -zil_resume(zilog_t *zilog) -{ - mutex_enter(&zilog->zl_lock); - ASSERT(zilog->zl_suspend != 0); - zilog->zl_suspend--; - mutex_exit(&zilog->zl_lock); -} - -typedef struct zil_replay_arg { - objset_t *zr_os; - zil_replay_func_t **zr_replay; - void *zr_arg; - uint64_t *zr_txgp; - boolean_t zr_byteswap; - char *zr_lrbuf; -} zil_replay_arg_t; - -static void -zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) -{ - zil_replay_arg_t *zr = zra; - const zil_header_t *zh = zilog->zl_header; - uint64_t reclen = lr->lrc_reclen; - uint64_t txtype = lr->lrc_txtype; - char *name; - int pass, error, sunk; - - if (zilog->zl_stop_replay) - return; - - if (lr->lrc_txg < claim_txg) /* already committed */ - return; - - if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ - return; - - /* - * Make a copy of the data so we can revise and extend it. - */ - bcopy(lr, zr->zr_lrbuf, reclen); - - /* - * The log block containing this lr may have been byteswapped - * so that we can easily examine common fields like lrc_txtype. - * However, the log is a mix of different data types, and only the - * replay vectors know how to byteswap their records. Therefore, if - * the lr was byteswapped, undo it before invoking the replay vector. - */ - if (zr->zr_byteswap) - byteswap_uint64_array(zr->zr_lrbuf, reclen); - - /* - * If this is a TX_WRITE with a blkptr, suck in the data. - */ - if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { - lr_write_t *lrw = (lr_write_t *)lr; - blkptr_t *wbp = &lrw->lr_blkptr; - uint64_t wlen = lrw->lr_length; - char *wbuf = zr->zr_lrbuf + reclen; - - if (BP_IS_HOLE(wbp)) { /* compressed to a hole */ - bzero(wbuf, wlen); - } else { - /* - * A subsequent write may have overwritten this block, - * in which case wbp may have been been freed and - * reallocated, and our read of wbp may fail with a - * checksum error. We can safely ignore this because - * the later write will provide the correct data. 
- */ - zbookmark_t zb; - - zb.zb_objset = dmu_objset_id(zilog->zl_os); - zb.zb_object = lrw->lr_foid; - zb.zb_level = -1; - zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp); - - (void) zio_wait(zio_read(NULL, zilog->zl_spa, - wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL, - ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); - (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen); - } - } - - /* - * We must now do two things atomically: replay this log record, - * and update the log header to reflect the fact that we did so. - * We use the DMU's ability to assign into a specific txg to do this. - */ - for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) { - uint64_t replay_txg; - dmu_tx_t *replay_tx; - - replay_tx = dmu_tx_create(zr->zr_os); - error = dmu_tx_assign(replay_tx, TXG_WAIT); - if (error) { - dmu_tx_abort(replay_tx); - break; - } - - replay_txg = dmu_tx_get_txg(replay_tx); - - if (txtype == 0 || txtype >= TX_MAX_TYPE) { - error = EINVAL; - } else { - /* - * On the first pass, arrange for the replay vector - * to fail its dmu_tx_assign(). That's the only way - * to ensure that those code paths remain well tested. - */ - *zr->zr_txgp = replay_txg - (pass == 1); - error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, - zr->zr_byteswap); - *zr->zr_txgp = TXG_NOWAIT; - } - - if (error == 0) { - dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx); - zilog->zl_replay_seq[replay_txg & TXG_MASK] = - lr->lrc_seq; - } - - dmu_tx_commit(replay_tx); - - if (!error) - return; - - /* - * The DMU's dnode layer doesn't see removes until the txg - * commits, so a subsequent claim can spuriously fail with - * EEXIST. So if we receive any error other than ERESTART - * we try syncing out any removes then retrying the - * transaction. - */ - if (error != ERESTART && !sunk) { - txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); - sunk = B_TRUE; - continue; /* retry */ - } - - if (error != ERESTART) - break; - - if (pass != 1) - txg_wait_open(spa_get_dsl(zilog->zl_spa), - replay_txg + 1); - - dprintf("pass %d, retrying\n", pass); - } - - ASSERT(error && error != ERESTART); - name = kmem_alloc(MAXNAMELEN, KM_SLEEP); - dmu_objset_name(zr->zr_os, name); - cmn_err(CE_WARN, "ZFS replay transaction error %d, " - "dataset %s, seq 0x%llx, txtype %llu\n", - error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype); - zilog->zl_stop_replay = 1; - kmem_free(name, MAXNAMELEN); -} - -/* ARGSUSED */ -static void -zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) -{ - zilog->zl_replay_blks++; -} - -/* - * If this dataset has a non-empty intent log, replay it and destroy it. - */ -void -zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE]) -{ - zilog_t *zilog = dmu_objset_zil(os); - const zil_header_t *zh = zilog->zl_header; - zil_replay_arg_t zr; - - if (zil_empty(zilog)) { - zil_destroy(zilog, B_TRUE); - return; - } - //printf("ZFS: Replaying ZIL on %s...\n", os->os->os_spa->spa_name); - - zr.zr_os = os; - zr.zr_replay = replay_func; - zr.zr_arg = arg; - zr.zr_txgp = txgp; - zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); - zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); - - /* - * Wait for in-progress removes to sync before starting replay. 
- */ - txg_wait_synced(zilog->zl_dmu_pool, 0); - - zilog->zl_stop_replay = 0; - zilog->zl_replay_time = LBOLT; - ASSERT(zilog->zl_replay_blks == 0); - (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, - zh->zh_claim_txg); - kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE); - - zil_destroy(zilog, B_FALSE); - //printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name); -} - -/* - * Report whether all transactions are committed - */ -int -zil_is_committed(zilog_t *zilog) -{ - lwb_t *lwb; - int ret; - - mutex_enter(&zilog->zl_lock); - while (zilog->zl_writer) - cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); - - /* recent unpushed intent log transactions? */ - if (!list_is_empty(&zilog->zl_itx_list)) { - ret = B_FALSE; - goto out; - } - - /* intent log never used? */ - lwb = list_head(&zilog->zl_lwb_list); - if (lwb == NULL) { - ret = B_TRUE; - goto out; - } - - /* - * more than 1 log buffer means zil_sync() hasn't yet freed - * entries after a txg has committed - */ - if (list_next(&zilog->zl_lwb_list, lwb)) { - ret = B_FALSE; - goto out; - } - - ASSERT(zil_empty(zilog)); - ret = B_TRUE; -out: - cv_broadcast(&zilog->zl_cv_writer); - mutex_exit(&zilog->zl_lock); - return (ret); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c deleted file mode 100644 index b5dd35f..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ /dev/null @@ -1,1861 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/fm/fs/zfs.h> -#include <sys/spa.h> -#include <sys/txg.h> -#include <sys/spa_impl.h> -#include <sys/vdev_impl.h> -#include <sys/zio_impl.h> -#include <sys/zio_compress.h> -#include <sys/zio_checksum.h> - -/* - * ========================================================================== - * I/O priority table - * ========================================================================== - */ -uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { - 0, /* ZIO_PRIORITY_NOW */ - 0, /* ZIO_PRIORITY_SYNC_READ */ - 0, /* ZIO_PRIORITY_SYNC_WRITE */ - 6, /* ZIO_PRIORITY_ASYNC_READ */ - 4, /* ZIO_PRIORITY_ASYNC_WRITE */ - 4, /* ZIO_PRIORITY_FREE */ - 0, /* ZIO_PRIORITY_CACHE_FILL */ - 0, /* ZIO_PRIORITY_LOG_WRITE */ - 10, /* ZIO_PRIORITY_RESILVER */ - 20, /* ZIO_PRIORITY_SCRUB */ -}; - -/* - * ========================================================================== - * I/O type descriptions - * ========================================================================== - */ -char *zio_type_name[ZIO_TYPES] = { - "null", "read", "write", "free", "claim", "ioctl" }; - -/* At or above this size, force gang blocking - for testing */ -uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; - -/* Force an allocation failure when non-zero */ -uint16_t zio_zil_fail_shift = 0; - -typedef struct zio_sync_pass { - int zp_defer_free; /* defer frees after this pass */ - int zp_dontcompress; /* don't compress after this pass */ - int zp_rewrite; /* rewrite new bps after this pass */ -} zio_sync_pass_t; - -zio_sync_pass_t zio_sync_pass = { - 1, /* zp_defer_free */ - 4, /* zp_dontcompress */ - 1, /* zp_rewrite */ -}; - -/* - * ========================================================================== - * I/O kmem caches - * ========================================================================== - */ -kmem_cache_t *zio_cache; -#ifdef ZIO_USE_UMA -kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; -kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; -#endif - -#ifdef _KERNEL -extern vmem_t *zio_alloc_arena; -#endif - -void -zio_init(void) -{ -#ifdef ZIO_USE_UMA - size_t c; -#endif -#if 0 - vmem_t *data_alloc_arena = NULL; - -#ifdef _KERNEL - data_alloc_arena = zio_alloc_arena; -#endif -#endif - - zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, - NULL, NULL, NULL, NULL, NULL, 0); - -#ifdef ZIO_USE_UMA - /* - * For small buffers, we want a cache for each multiple of - * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache - * for each quarter-power of 2. For large buffers, we want - * a cache for each multiple of PAGESIZE. 
- */ - for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { - size_t size = (c + 1) << SPA_MINBLOCKSHIFT; - size_t p2 = size; - size_t align = 0; - - while (p2 & (p2 - 1)) - p2 &= p2 - 1; - - if (size <= 4 * SPA_MINBLOCKSIZE) { - align = SPA_MINBLOCKSIZE; - } else if (P2PHASE(size, PAGESIZE) == 0) { - align = PAGESIZE; - } else if (P2PHASE(size, p2 >> 2) == 0) { - align = p2 >> 2; - } - - if (align != 0) { - char name[36]; - (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); - - (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); - zio_data_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, data_alloc_arena, - KMC_NODEBUG); - - dprintf("creating cache for size %5lx align %5lx\n", - size, align); - } - } - - while (--c != 0) { - ASSERT(zio_buf_cache[c] != NULL); - if (zio_buf_cache[c - 1] == NULL) - zio_buf_cache[c - 1] = zio_buf_cache[c]; - - ASSERT(zio_data_buf_cache[c] != NULL); - if (zio_data_buf_cache[c - 1] == NULL) - zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; - } -#endif - - zio_inject_init(); -} - -void -zio_fini(void) -{ -#ifdef ZIO_USE_UMA - size_t c; - kmem_cache_t *last_cache = NULL; - kmem_cache_t *last_data_cache = NULL; - - for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { - if (zio_buf_cache[c] != last_cache) { - last_cache = zio_buf_cache[c]; - kmem_cache_destroy(zio_buf_cache[c]); - } - zio_buf_cache[c] = NULL; - - if (zio_data_buf_cache[c] != last_data_cache) { - last_data_cache = zio_data_buf_cache[c]; - kmem_cache_destroy(zio_data_buf_cache[c]); - } - zio_data_buf_cache[c] = NULL; - } -#endif - - kmem_cache_destroy(zio_cache); - - zio_inject_fini(); -} - -/* - * ========================================================================== - * Allocate and free I/O buffers - * ========================================================================== - */ - -/* - * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a - * crashdump if the kernel panics, so use it judiciously. Obviously, it's - * useful to inspect ZFS metadata, but if possible, we should avoid keeping - * excess / transient data in-core during a crashdump. - */ -void * -zio_buf_alloc(size_t size) -{ -#ifdef ZIO_USE_UMA - size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - - ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - - return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); -#else - return (kmem_alloc(size, KM_SLEEP)); -#endif -} - -/* - * Use zio_data_buf_alloc to allocate data. The data will not appear in a - * crashdump if the kernel panics. This exists so that we will limit the amount - * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount - * of kernel heap dumped to disk when the kernel panics) - */ -void * -zio_data_buf_alloc(size_t size) -{ -#ifdef ZIO_USE_UMA - size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - - ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - - return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP)); -#else - return (kmem_alloc(size, KM_SLEEP)); -#endif -} - -void -zio_buf_free(void *buf, size_t size) -{ -#ifdef ZIO_USE_UMA - size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - - ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - - kmem_cache_free(zio_buf_cache[c], buf); -#else - kmem_free(buf, size); -#endif -} - -void -zio_data_buf_free(void *buf, size_t size) -{ -#ifdef ZIO_USE_UMA - size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - - ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - - kmem_cache_free(zio_data_buf_cache[c], buf); -#else - kmem_free(buf, size); -#endif -} - -/* - * ========================================================================== - * Push and pop I/O transform buffers - * ========================================================================== - */ -static void -zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) -{ - zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); - - zt->zt_data = data; - zt->zt_size = size; - zt->zt_bufsize = bufsize; - - zt->zt_next = zio->io_transform_stack; - zio->io_transform_stack = zt; - - zio->io_data = data; - zio->io_size = size; -} - -static void -zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) -{ - zio_transform_t *zt = zio->io_transform_stack; - - *data = zt->zt_data; - *size = zt->zt_size; - *bufsize = zt->zt_bufsize; - - zio->io_transform_stack = zt->zt_next; - kmem_free(zt, sizeof (zio_transform_t)); - - if ((zt = zio->io_transform_stack) != NULL) { - zio->io_data = zt->zt_data; - zio->io_size = zt->zt_size; - } -} - -static void -zio_clear_transform_stack(zio_t *zio) -{ - void *data; - uint64_t size, bufsize; - - ASSERT(zio->io_transform_stack != NULL); - - zio_pop_transform(zio, &data, &size, &bufsize); - while (zio->io_transform_stack != NULL) { - zio_buf_free(data, bufsize); - zio_pop_transform(zio, &data, &size, &bufsize); - } -} - -/* - * ========================================================================== - * Create the various types of I/O (read, write, free) - * ========================================================================== - */ -static zio_t * -zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, - zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) -{ - zio_t *zio; - - ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); - ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); - - zio = kmem_cache_alloc(zio_cache, KM_SLEEP); - bzero(zio, sizeof (zio_t)); - zio->io_parent = pio; - zio->io_spa = spa; - zio->io_txg = txg; - if (bp != NULL) { - zio->io_bp = bp; - zio->io_bp_copy = *bp; - zio->io_bp_orig = *bp; - } - zio->io_done = done; - zio->io_private = private; - zio->io_type = type; - zio->io_priority = priority; - zio->io_stage = stage; - zio->io_pipeline = pipeline; - zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; - zio->io_timestamp = lbolt64; - zio->io_flags = flags; - mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); - zio_push_transform(zio, data, size, size); - - /* - * Note on config lock: - * - * If CONFIG_HELD is set, then the caller already has the config - * lock, so we 
don't need it for this io. - * - * We set CONFIG_GRABBED to indicate that we have grabbed the - * config lock on behalf of this io, so it should be released - * in zio_done. - * - * Unless CONFIG_HELD is set, we will grab the config lock for - * any top-level (parent-less) io, *except* NULL top-level ios. - * The NULL top-level ios rarely have any children, so we delay - * grabbing the lock until the first child is added (but it is - * still grabbed on behalf of the top-level i/o, so additional - * children don't need to also grab it). This greatly reduces - * contention on the config lock. - */ - if (pio == NULL) { - if (type != ZIO_TYPE_NULL && - !(flags & ZIO_FLAG_CONFIG_HELD)) { - spa_config_enter(zio->io_spa, RW_READER, zio); - zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; - } - zio->io_root = zio; - } else { - zio->io_root = pio->io_root; - if (!(flags & ZIO_FLAG_NOBOOKMARK)) - zio->io_logical = pio->io_logical; - mutex_enter(&pio->io_lock); - if (pio->io_parent == NULL && - pio->io_type == ZIO_TYPE_NULL && - !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && - !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { - pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; - spa_config_enter(zio->io_spa, RW_READER, pio); - } - if (stage < ZIO_STAGE_READY) - pio->io_children_notready++; - pio->io_children_notdone++; - zio->io_sibling_next = pio->io_child; - zio->io_sibling_prev = NULL; - if (pio->io_child != NULL) - pio->io_child->io_sibling_prev = zio; - pio->io_child = zio; - zio->io_ndvas = pio->io_ndvas; - mutex_exit(&pio->io_lock); - } - - return (zio); -} - -zio_t * -zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, - int flags) -{ - zio_t *zio; - - zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, - ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, - ZIO_WAIT_FOR_CHILDREN_PIPELINE); - - return (zio); -} - -zio_t * -zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) -{ - return (zio_null(NULL, spa, done, private, flags)); -} - -zio_t * -zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, - uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, zbookmark_t *zb) -{ - zio_t *zio; - - ASSERT3U(size, ==, BP_GET_LSIZE(bp)); - - zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, - ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, - ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); - zio->io_bookmark = *zb; - - zio->io_logical = zio; - - /* - * Work off our copy of the bp so the caller can free it. 
- */ - zio->io_bp = &zio->io_bp_copy; - - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { - uint64_t csize = BP_GET_PSIZE(bp); - void *cbuf = zio_buf_alloc(csize); - - zio_push_transform(zio, cbuf, csize, csize); - zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; - } - - if (BP_IS_GANG(bp)) { - uint64_t gsize = SPA_GANGBLOCKSIZE; - void *gbuf = zio_buf_alloc(gsize); - - zio_push_transform(zio, gbuf, gsize, gsize); - zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; - } - - return (zio); -} - -zio_t * -zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, - uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb) -{ - zio_t *zio; - - ASSERT(checksum >= ZIO_CHECKSUM_OFF && - checksum < ZIO_CHECKSUM_FUNCTIONS); - - ASSERT(compress >= ZIO_COMPRESS_OFF && - compress < ZIO_COMPRESS_FUNCTIONS); - - zio = zio_create(pio, spa, txg, bp, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, - ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); - - zio->io_ready = ready; - - zio->io_bookmark = *zb; - - zio->io_logical = zio; - - zio->io_checksum = checksum; - zio->io_compress = compress; - zio->io_ndvas = ncopies; - - if (compress != ZIO_COMPRESS_OFF) - zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; - - if (bp->blk_birth != txg) { - /* XXX the bp usually (always?) gets re-zeroed later */ - BP_ZERO(bp); - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - } else { - /* Make sure someone doesn't change their mind on overwrites */ - ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), - spa_max_replication(spa)) == BP_GET_NDVAS(bp)); - } - - return (zio); -} - -zio_t * -zio_rewrite(zio_t *pio, spa_t *spa, int checksum, - uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags, - zbookmark_t *zb) -{ - zio_t *zio; - - zio = zio_create(pio, spa, txg, bp, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, - ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); - - zio->io_bookmark = *zb; - zio->io_checksum = checksum; - zio->io_compress = ZIO_COMPRESS_OFF; - - if (pio != NULL) - ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); - - return (zio); -} - -static zio_t * -zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, - uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags) -{ - zio_t *zio; - - BP_ZERO(bp); - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - - zio = zio_create(pio, spa, txg, bp, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags, - ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); - - zio->io_checksum = checksum; - zio->io_compress = ZIO_COMPRESS_OFF; - - return (zio); -} - -zio_t * -zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private) -{ - zio_t *zio; - - ASSERT(!BP_IS_HOLE(bp)); - - if (txg == spa->spa_syncing_txg && - spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { - bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); - return (zio_null(pio, spa, NULL, NULL, 0)); - } - - zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, - ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, - ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); - - zio->io_bp = &zio->io_bp_copy; - - return (zio); -} - -zio_t * -zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private) -{ - zio_t *zio; - - /* - 
* A claim is an allocation of a specific block. Claims are needed - * to support immediate writes in the intent log. The issue is that - * immediate writes contain committed data, but in a txg that was - * *not* committed. Upon opening the pool after an unclean shutdown, - * the intent log claims all blocks that contain immediate write data - * so that the SPA knows they're in use. - * - * All claims *must* be resolved in the first txg -- before the SPA - * starts allocating blocks -- so that nothing is allocated twice. - */ - ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); - ASSERT3U(spa_first_txg(spa), <=, txg); - - zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, - ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, - ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); - - zio->io_bp = &zio->io_bp_copy; - - return (zio); -} - -zio_t * -zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, int priority, int flags) -{ - zio_t *zio; - int c; - - if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, - ZIO_TYPE_IOCTL, priority, flags, - ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); - - zio->io_vd = vd; - zio->io_cmd = cmd; - } else { - zio = zio_null(pio, spa, NULL, NULL, flags); - - for (c = 0; c < vd->vdev_children; c++) - zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, - done, private, priority, flags)); - } - - return (zio); -} - -static void -zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, - int checksum) -{ - ASSERT(vd->vdev_children == 0); - - ASSERT(size <= SPA_MAXBLOCKSIZE); - ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); - ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); - - ASSERT(offset + size <= VDEV_LABEL_START_SIZE || - offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); - ASSERT3U(offset + size, <=, vd->vdev_psize); - - BP_ZERO(bp); - - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - - BP_SET_CHECKSUM(bp, checksum); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - if (checksum != ZIO_CHECKSUM_OFF) - ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); -} - -zio_t * -zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, - int priority, int flags) -{ - zio_t *zio; - blkptr_t blk; - - zio_phys_bp_init(vd, &blk, offset, size, checksum); - - zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, - ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, - ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); - - zio->io_vd = vd; - zio->io_offset = offset; - - /* - * Work off our copy of the bp so the caller can free it. - */ - zio->io_bp = &zio->io_bp_copy; - - return (zio); -} - -zio_t * -zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, - int priority, int flags) -{ - zio_block_tail_t *zbt; - void *wbuf; - zio_t *zio; - blkptr_t blk; - - zio_phys_bp_init(vd, &blk, offset, size, checksum); - - zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, - ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); - - zio->io_vd = vd; - zio->io_offset = offset; - - zio->io_bp = &zio->io_bp_copy; - zio->io_checksum = checksum; - - if (zio_checksum_table[checksum].ci_zbt) { - /* - * zbt checksums are necessarily destructive -- they modify - * one word of the write buffer to hold the verifier/checksum. 
- * Therefore, we must make a local copy in case the data is - * being written to multiple places. - */ - wbuf = zio_buf_alloc(size); - bcopy(data, wbuf, size); - zio_push_transform(zio, wbuf, size, size); - - zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; - zbt->zbt_cksum = blk.blk_cksum; - } - - return (zio); -} - -/* - * Create a child I/O to do some work for us. It has no associated bp. - */ -zio_t * -zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, int priority, int flags, - zio_done_func_t *done, void *private) -{ - uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; - zio_t *cio; - - if (type == ZIO_TYPE_READ && bp != NULL) { - /* - * If we have the bp, then the child should perform the - * checksum and the parent need not. This pushes error - * detection as close to the leaves as possible and - * eliminates redundant checksums in the interior nodes. - */ - pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; - zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); - } - - cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, - done, private, type, priority, - (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, - ZIO_STAGE_VDEV_IO_START - 1, pipeline); - - cio->io_vd = vd; - cio->io_offset = offset; - - return (cio); -} - -/* - * ========================================================================== - * Initiate I/O, either sync or async - * ========================================================================== - */ -int -zio_wait(zio_t *zio) -{ - int error; - - ASSERT(zio->io_stage == ZIO_STAGE_OPEN); - - zio->io_waiter = curthread; - - zio_next_stage_async(zio); - - mutex_enter(&zio->io_lock); - while (zio->io_stalled != ZIO_STAGE_DONE) - cv_wait(&zio->io_cv, &zio->io_lock); - mutex_exit(&zio->io_lock); - - error = zio->io_error; - cv_destroy(&zio->io_cv); - mutex_destroy(&zio->io_lock); - kmem_cache_free(zio_cache, zio); - - return (error); -} - -void -zio_nowait(zio_t *zio) -{ - zio_next_stage_async(zio); -} - -/* - * ========================================================================== - * I/O pipeline interlocks: parent/child dependency scoreboarding - * ========================================================================== - */ -static void -zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) -{ - mutex_enter(&zio->io_lock); - if (*countp == 0) { - ASSERT(zio->io_stalled == 0); - mutex_exit(&zio->io_lock); - zio_next_stage(zio); - } else { - zio->io_stalled = stage; - mutex_exit(&zio->io_lock); - } -} - -static void -zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) -{ - zio_t *pio = zio->io_parent; - - mutex_enter(&pio->io_lock); - if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) - pio->io_error = zio->io_error; - if (--*countp == 0 && pio->io_stalled == stage) { - pio->io_stalled = 0; - mutex_exit(&pio->io_lock); - zio_next_stage_async(pio); - } else { - mutex_exit(&pio->io_lock); - } -} - -static void -zio_wait_children_ready(zio_t *zio) -{ - zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, - &zio->io_children_notready); -} - -void -zio_wait_children_done(zio_t *zio) -{ - zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, - &zio->io_children_notdone); -} - -static void -zio_ready(zio_t *zio) -{ - zio_t *pio = zio->io_parent; - - if (zio->io_ready) - zio->io_ready(zio); - - if (pio != NULL) - zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, - &pio->io_children_notready); - - if (zio->io_bp) - zio->io_bp_copy 
= *zio->io_bp; - - zio_next_stage(zio); -} - -static void -zio_done(zio_t *zio) -{ - zio_t *pio = zio->io_parent; - spa_t *spa = zio->io_spa; - blkptr_t *bp = zio->io_bp; - vdev_t *vd = zio->io_vd; - - ASSERT(zio->io_children_notready == 0); - ASSERT(zio->io_children_notdone == 0); - - if (bp != NULL) { - ASSERT(bp->blk_pad[0] == 0); - ASSERT(bp->blk_pad[1] == 0); - ASSERT(bp->blk_pad[2] == 0); - ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); - if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && - !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { - ASSERT(!BP_SHOULD_BYTESWAP(bp)); - if (zio->io_ndvas != 0) - ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); - ASSERT(BP_COUNT_GANG(bp) == 0 || - (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); - } - } - - if (vd != NULL) - vdev_stat_update(zio); - - if (zio->io_error) { - /* - * If this I/O is attached to a particular vdev, - * generate an error message describing the I/O failure - * at the block level. We ignore these errors if the - * device is currently unavailable. - */ - if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) - zfs_ereport_post(FM_EREPORT_ZFS_IO, - zio->io_spa, vd, zio, 0, 0); - - if ((zio->io_error == EIO || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && - zio->io_logical == zio) { - /* - * For root I/O requests, tell the SPA to log the error - * appropriately. Also, generate a logical data - * ereport. - */ - spa_log_error(zio->io_spa, zio); - - zfs_ereport_post(FM_EREPORT_ZFS_DATA, - zio->io_spa, NULL, zio, 0, 0); - } - - /* - * For I/O requests that cannot fail, panic appropriately. - */ - if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { - char *blkbuf; - - blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); - if (blkbuf) { - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, - bp ? bp : &zio->io_bp_copy); - } - panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " - "%d", zio->io_error == ECKSUM ? - "bad checksum" : "I/O failure", - zio_type_name[zio->io_type], - vdev_description(vd), - (u_longlong_t)zio->io_offset, - zio, blkbuf ? blkbuf : "", zio->io_error); - } - } - zio_clear_transform_stack(zio); - - if (zio->io_done) - zio->io_done(zio); - - ASSERT(zio->io_delegate_list == NULL); - ASSERT(zio->io_delegate_next == NULL); - - if (pio != NULL) { - zio_t *next, *prev; - - mutex_enter(&pio->io_lock); - next = zio->io_sibling_next; - prev = zio->io_sibling_prev; - if (next != NULL) - next->io_sibling_prev = prev; - if (prev != NULL) - prev->io_sibling_next = next; - if (pio->io_child == zio) - pio->io_child = next; - mutex_exit(&pio->io_lock); - - zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, - &pio->io_children_notdone); - } - - /* - * Note: this I/O is now done, and will shortly be freed, so there is no - * need to clear this (or any other) flag. 
- */ - if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED) - spa_config_exit(spa, zio); - - if (zio->io_waiter != NULL) { - mutex_enter(&zio->io_lock); - ASSERT(zio->io_stage == ZIO_STAGE_DONE); - zio->io_stalled = zio->io_stage; - cv_broadcast(&zio->io_cv); - mutex_exit(&zio->io_lock); - } else { - cv_destroy(&zio->io_cv); - mutex_destroy(&zio->io_lock); - kmem_cache_free(zio_cache, zio); - } -} - -/* - * ========================================================================== - * Compression support - * ========================================================================== - */ -static void -zio_write_compress(zio_t *zio) -{ - int compress = zio->io_compress; - blkptr_t *bp = zio->io_bp; - void *cbuf; - uint64_t lsize = zio->io_size; - uint64_t csize = lsize; - uint64_t cbufsize = 0; - int pass; - - if (bp->blk_birth == zio->io_txg) { - /* - * We're rewriting an existing block, which means we're - * working on behalf of spa_sync(). For spa_sync() to - * converge, it must eventually be the case that we don't - * have to allocate new blocks. But compression changes - * the blocksize, which forces a reallocate, and makes - * convergence take longer. Therefore, after the first - * few passes, stop compressing to ensure convergence. - */ - pass = spa_sync_pass(zio->io_spa); - if (pass > zio_sync_pass.zp_dontcompress) - compress = ZIO_COMPRESS_OFF; - } else { - ASSERT(BP_IS_HOLE(bp)); - pass = 1; - } - - if (compress != ZIO_COMPRESS_OFF) - if (!zio_compress_data(compress, zio->io_data, zio->io_size, - &cbuf, &csize, &cbufsize)) - compress = ZIO_COMPRESS_OFF; - - if (compress != ZIO_COMPRESS_OFF && csize != 0) - zio_push_transform(zio, cbuf, csize, cbufsize); - - /* - * The final pass of spa_sync() must be all rewrites, but the first - * few passes offer a trade-off: allocating blocks defers convergence, - * but newly allocated blocks are sequential, so they can be written - * to disk faster. Therefore, we allow the first few passes of - * spa_sync() to reallocate new blocks, but force rewrites after that. - * There should only be a handful of blocks after pass 1 in any case. - */ - if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && - pass > zio_sync_pass.zp_rewrite) { - ASSERT(csize != 0); - BP_SET_LSIZE(bp, lsize); - BP_SET_COMPRESS(bp, compress); - zio->io_pipeline = ZIO_REWRITE_PIPELINE; - } else { - if (bp->blk_birth == zio->io_txg) - BP_ZERO(bp); - if (csize == 0) { - BP_ZERO(bp); - zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; - } else { - ASSERT3U(BP_GET_NDVAS(bp), ==, 0); - BP_SET_LSIZE(bp, lsize); - BP_SET_PSIZE(bp, csize); - BP_SET_COMPRESS(bp, compress); - zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE; - } - } - - zio_next_stage(zio); -} - -static void -zio_read_decompress(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - void *data; - uint64_t size; - uint64_t bufsize; - int compress = BP_GET_COMPRESS(bp); - - ASSERT(compress != ZIO_COMPRESS_OFF); - - zio_pop_transform(zio, &data, &size, &bufsize); - - if (zio_decompress_data(compress, data, size, - zio->io_data, zio->io_size)) - zio->io_error = EIO; - - zio_buf_free(data, bufsize); - - zio_next_stage(zio); -} - -/* - * ========================================================================== - * Gang block support - * ========================================================================== - */ -static void -zio_gang_pipeline(zio_t *zio) -{ - /* - * By default, the pipeline assumes that we're dealing with a gang - * block. If we're not, strip out any gang-specific stages. 
- */ - if (!BP_IS_GANG(zio->io_bp)) - zio->io_pipeline &= ~ZIO_GANG_STAGES; - - zio_next_stage(zio); -} - -static void -zio_gang_byteswap(zio_t *zio) -{ - ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); - - if (BP_SHOULD_BYTESWAP(zio->io_bp)) - byteswap_uint64_array(zio->io_data, zio->io_size); -} - -static void -zio_get_gang_header(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - uint64_t gsize = SPA_GANGBLOCKSIZE; - void *gbuf = zio_buf_alloc(gsize); - - ASSERT(BP_IS_GANG(bp)); - - zio_push_transform(zio, gbuf, gsize, gsize); - - zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, - NULL, NULL, ZIO_TYPE_READ, zio->io_priority, - zio->io_flags & ZIO_FLAG_GANG_INHERIT, - ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); - - zio_wait_children_done(zio); -} - -static void -zio_read_gang_members(zio_t *zio) -{ - zio_gbh_phys_t *gbh; - uint64_t gsize, gbufsize, loff, lsize; - int i; - - ASSERT(BP_IS_GANG(zio->io_bp)); - - zio_gang_byteswap(zio); - zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); - - for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; - lsize = BP_GET_PSIZE(gbp); - - ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); - ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); - ASSERT3U(loff + lsize, <=, zio->io_size); - ASSERT(i < SPA_GBH_NBLKPTRS); - ASSERT(!BP_IS_HOLE(gbp)); - - zio_nowait(zio_read(zio, zio->io_spa, gbp, - (char *)zio->io_data + loff, lsize, NULL, NULL, - zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, - &zio->io_bookmark)); - } - - zio_buf_free(gbh, gbufsize); - zio_wait_children_done(zio); -} - -static void -zio_rewrite_gang_members(zio_t *zio) -{ - zio_gbh_phys_t *gbh; - uint64_t gsize, gbufsize, loff, lsize; - int i; - - ASSERT(BP_IS_GANG(zio->io_bp)); - ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); - - zio_gang_byteswap(zio); - zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); - - ASSERT(gsize == gbufsize); - - for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; - lsize = BP_GET_PSIZE(gbp); - - ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); - ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); - ASSERT3U(loff + lsize, <=, zio->io_size); - ASSERT(i < SPA_GBH_NBLKPTRS); - ASSERT(!BP_IS_HOLE(gbp)); - - zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, - zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, - NULL, NULL, zio->io_priority, zio->io_flags, - &zio->io_bookmark)); - } - - zio_push_transform(zio, gbh, gsize, gbufsize); - zio_wait_children_ready(zio); -} - -static void -zio_free_gang_members(zio_t *zio) -{ - zio_gbh_phys_t *gbh; - uint64_t gsize, gbufsize; - int i; - - ASSERT(BP_IS_GANG(zio->io_bp)); - - zio_gang_byteswap(zio); - zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); - - for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; - - if (BP_IS_HOLE(gbp)) - continue; - zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, - gbp, NULL, NULL)); - } - - zio_buf_free(gbh, gbufsize); - zio_next_stage(zio); -} - -static void -zio_claim_gang_members(zio_t *zio) -{ - zio_gbh_phys_t *gbh; - uint64_t gsize, gbufsize; - int i; - - ASSERT(BP_IS_GANG(zio->io_bp)); - - zio_gang_byteswap(zio); - zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); - - for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; - if (BP_IS_HOLE(gbp)) - continue; - zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, - gbp, NULL, NULL)); - } - - zio_buf_free(gbh, gbufsize); - zio_next_stage(zio); -} - -static void 
-zio_write_allocate_gang_member_done(zio_t *zio) -{ - zio_t *pio = zio->io_parent; - dva_t *cdva = zio->io_bp->blk_dva; - dva_t *pdva = pio->io_bp->blk_dva; - uint64_t asize; - int d; - - ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); - ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); - ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); - ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); - - mutex_enter(&pio->io_lock); - for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { - ASSERT(DVA_GET_GANG(&pdva[d])); - asize = DVA_GET_ASIZE(&pdva[d]); - asize += DVA_GET_ASIZE(&cdva[d]); - DVA_SET_ASIZE(&pdva[d], asize); - } - mutex_exit(&pio->io_lock); -} - -static void -zio_write_allocate_gang_members(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - dva_t *dva = bp->blk_dva; - spa_t *spa = zio->io_spa; - zio_gbh_phys_t *gbh; - uint64_t txg = zio->io_txg; - uint64_t resid = zio->io_size; - uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE); - uint64_t gsize, loff, lsize; - uint32_t gbps_left; - int ndvas = zio->io_ndvas; - int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); - int error; - int i, d; - - gsize = SPA_GANGBLOCKSIZE; - gbps_left = SPA_GBH_NBLKPTRS; - - error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE); - if (error == ENOSPC) - panic("can't allocate gang block header"); - ASSERT(error == 0); - - for (d = 0; d < gbh_ndvas; d++) - DVA_SET_GANG(&dva[d], 1); - - bp->blk_birth = txg; - - gbh = zio_buf_alloc(gsize); - bzero(gbh, gsize); - - /* We need to test multi-level gang blocks */ - if (maxalloc >= zio_gang_bang && (LBOLT & 0x1) == 0) - maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE); - - for (loff = 0, i = 0; loff != zio->io_size; - loff += lsize, resid -= lsize, gbps_left--, i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; - dva = gbp->blk_dva; - - ASSERT(gbps_left != 0); - maxalloc = MIN(maxalloc, resid); - - while (resid <= maxalloc * gbps_left) { - error = metaslab_alloc(spa, maxalloc, gbp, ndvas, - txg, bp, B_FALSE); - if (error == 0) - break; - ASSERT3U(error, ==, ENOSPC); - if (maxalloc == SPA_MINBLOCKSIZE) - panic("really out of space"); - maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); - } - - if (resid <= maxalloc * gbps_left) { - lsize = maxalloc; - BP_SET_LSIZE(gbp, lsize); - BP_SET_PSIZE(gbp, lsize); - BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); - gbp->blk_birth = txg; - zio_nowait(zio_rewrite(zio, spa, - zio->io_checksum, txg, gbp, - (char *)zio->io_data + loff, lsize, - zio_write_allocate_gang_member_done, NULL, - zio->io_priority, zio->io_flags, - &zio->io_bookmark)); - } else { - lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); - ASSERT(lsize != SPA_MINBLOCKSIZE); - zio_nowait(zio_write_allocate(zio, spa, - zio->io_checksum, txg, gbp, - (char *)zio->io_data + loff, lsize, - zio_write_allocate_gang_member_done, NULL, - zio->io_priority, zio->io_flags)); - } - } - - ASSERT(resid == 0 && loff == zio->io_size); - - zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; - - zio_push_transform(zio, gbh, gsize, gsize); - /* - * As much as we'd like this to be zio_wait_children_ready(), - * updating our ASIZE doesn't happen until the io_done callback, - * so we have to wait for that to finish in order for our BP - * to be stable. 
- */ - zio_wait_children_done(zio); -} - -/* - * ========================================================================== - * Allocate and free blocks - * ========================================================================== - */ -static void -zio_dva_allocate(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - int error; - - ASSERT(BP_IS_HOLE(bp)); - ASSERT3U(BP_GET_NDVAS(bp), ==, 0); - ASSERT3U(zio->io_ndvas, >, 0); - ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa)); - - /* For testing, make some blocks above a certain size be gang blocks */ - if (zio->io_size >= zio_gang_bang && (LBOLT & 0x3) == 0) { - zio_write_allocate_gang_members(zio); - return; - } - - ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - - error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas, - zio->io_txg, NULL, B_FALSE); - - if (error == 0) { - bp->blk_birth = zio->io_txg; - } else if (error == ENOSPC) { - if (zio->io_size == SPA_MINBLOCKSIZE) - panic("really, truly out of space"); - zio_write_allocate_gang_members(zio); - return; - } else { - zio->io_error = error; - } - zio_next_stage(zio); -} - -static void -zio_dva_free(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); - - BP_ZERO(bp); - - zio_next_stage(zio); -} - -static void -zio_dva_claim(zio_t *zio) -{ - zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); - - zio_next_stage(zio); -} - -/* - * ========================================================================== - * Read and write to physical devices - * ========================================================================== - */ - -static void -zio_vdev_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd ? vd->vdev_top : NULL; - blkptr_t *bp = zio->io_bp; - uint64_t align; - - if (vd == NULL) { - /* The mirror_ops handle multiple DVAs in a single BP */ - vdev_mirror_ops.vdev_op_io_start(zio); - return; - } - - align = 1ULL << tvd->vdev_ashift; - - if (zio->io_retries == 0 && vd == tvd) - zio->io_flags |= ZIO_FLAG_FAILFAST; - - if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && - vd->vdev_children == 0) { - zio->io_flags |= ZIO_FLAG_PHYSICAL; - zio->io_offset += VDEV_LABEL_START_SIZE; - } - - if (P2PHASE(zio->io_size, align) != 0) { - uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio_buf_alloc(asize); - ASSERT(vd == tvd); - if (zio->io_type == ZIO_TYPE_WRITE) { - bcopy(zio->io_data, abuf, zio->io_size); - bzero(abuf + zio->io_size, asize - zio->io_size); - } - zio_push_transform(zio, abuf, asize, asize); - ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); - zio->io_flags |= ZIO_FLAG_SUBBLOCK; - } - - ASSERT(P2PHASE(zio->io_offset, align) == 0); - ASSERT(P2PHASE(zio->io_size, align) == 0); - ASSERT(bp == NULL || - P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); - ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); - - vdev_io_start(zio); - - /* zio_next_stage_async() gets called from io completion interrupt */ -} - -static void -zio_vdev_io_done(zio_t *zio) -{ - if (zio->io_vd == NULL) - /* The mirror_ops handle multiple DVAs in a single BP */ - vdev_mirror_ops.vdev_op_io_done(zio); - else - vdev_io_done(zio); -} - -/* XXPOLICY */ -boolean_t -zio_should_retry(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - - if (zio->io_error == 0) - return (B_FALSE); - if (zio->io_delegate_list != NULL) - return (B_FALSE); - if (vd && vd != vd->vdev_top) - return (B_FALSE); - if (zio->io_flags & ZIO_FLAG_DONT_RETRY) - return (B_FALSE); - if (zio->io_retries > 0) - return (B_FALSE); 
- - return (B_TRUE); -} - -static void -zio_vdev_io_assess(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd ? vd->vdev_top : NULL; - - ASSERT(zio->io_vsd == NULL); - - if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { - void *abuf; - uint64_t asize; - ASSERT(vd == tvd); - zio_pop_transform(zio, &abuf, &asize, &asize); - if (zio->io_type == ZIO_TYPE_READ) - bcopy(abuf, zio->io_data, zio->io_size); - zio_buf_free(abuf, asize); - zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; - } - - if (zio_injection_enabled && !zio->io_error) - zio->io_error = zio_handle_fault_injection(zio, EIO); - - /* - * If the I/O failed, determine whether we should attempt to retry it. - */ - /* XXPOLICY */ - if (zio_should_retry(zio)) { - ASSERT(tvd == vd); - - zio->io_retries++; - zio->io_error = 0; - zio->io_flags &= ZIO_FLAG_VDEV_INHERIT | - ZIO_FLAG_CONFIG_GRABBED; - /* XXPOLICY */ - zio->io_flags &= ~ZIO_FLAG_FAILFAST; - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; - - dprintf("retry #%d for %s to %s offset %llx\n", - zio->io_retries, zio_type_name[zio->io_type], - vdev_description(vd), zio->io_offset); - - zio_next_stage_async(zio); - return; - } - - if (zio->io_error != 0 && zio->io_error != ECKSUM && - !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) { - /* - * Poor man's hotplug support. Even if we're done retrying this - * I/O, try to reopen the vdev to see if it's still attached. - * To avoid excessive thrashing, we only try it once a minute. - * This also has the effect of detecting when missing devices - * have come back, by polling the device once a minute. - * - * We need to do this asynchronously because we can't grab - * all the necessary locks way down here. - */ - if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) { - vd->vdev_last_try = gethrtime(); - tvd->vdev_reopen_wanted = 1; - spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN); - } - } - - zio_next_stage(zio); -} - -void -zio_vdev_io_reissue(zio_t *zio) -{ - ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); - ASSERT(zio->io_error == 0); - - zio->io_stage--; -} - -void -zio_vdev_io_redone(zio_t *zio) -{ - ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); - - zio->io_stage--; -} - -void -zio_vdev_io_bypass(zio_t *zio) -{ - ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); - ASSERT(zio->io_error == 0); - - zio->io_flags |= ZIO_FLAG_IO_BYPASS; - zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; -} - -/* - * ========================================================================== - * Generate and verify checksums - * ========================================================================== - */ -static void -zio_checksum_generate(zio_t *zio) -{ - int checksum = zio->io_checksum; - blkptr_t *bp = zio->io_bp; - - ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - - BP_SET_CHECKSUM(bp, checksum); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); - - zio_next_stage(zio); -} - -static void -zio_gang_checksum_generate(zio_t *zio) -{ - zio_cksum_t zc; - zio_gbh_phys_t *gbh = zio->io_data; - - ASSERT(BP_IS_GANG(zio->io_bp)); - ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); - - zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); - - zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); - - zio_next_stage(zio); -} - -static void -zio_checksum_verify(zio_t *zio) -{ - if (zio->io_bp != NULL) { - zio->io_error = zio_checksum_error(zio); - if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - 
zio->io_spa, zio->io_vd, zio, 0, 0); - } - - zio_next_stage(zio); -} - -/* - * Called by RAID-Z to ensure we don't compute the checksum twice. - */ -void -zio_checksum_verified(zio_t *zio) -{ - zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); -} - -/* - * Set the external verifier for a gang block based on stuff in the bp - */ -void -zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) -{ - blkptr_t *bp = zio->io_bp; - - zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp)); - zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp)); - zcp->zc_word[2] = bp->blk_birth; - zcp->zc_word[3] = 0; -} - -/* - * ========================================================================== - * Define the pipeline - * ========================================================================== - */ -typedef void zio_pipe_stage_t(zio_t *zio); - -static void -zio_badop(zio_t *zio) -{ - panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio); -} - -zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { - zio_badop, - zio_wait_children_ready, - zio_write_compress, - zio_checksum_generate, - zio_gang_pipeline, - zio_get_gang_header, - zio_rewrite_gang_members, - zio_free_gang_members, - zio_claim_gang_members, - zio_dva_allocate, - zio_dva_free, - zio_dva_claim, - zio_gang_checksum_generate, - zio_ready, - zio_vdev_io_start, - zio_vdev_io_done, - zio_vdev_io_assess, - zio_wait_children_done, - zio_checksum_verify, - zio_read_gang_members, - zio_read_decompress, - zio_done, - zio_badop -}; - -/* - * Move an I/O to the next stage of the pipeline and execute that stage. - * There's no locking on io_stage because there's no legitimate way for - * multiple threads to be attempting to process the same I/O. - */ -void -zio_next_stage(zio_t *zio) -{ - uint32_t pipeline = zio->io_pipeline; - - ASSERT(!MUTEX_HELD(&zio->io_lock)); - - if (zio->io_error) { - dprintf("zio %p vdev %s offset %llx stage %d error %d\n", - zio, vdev_description(zio->io_vd), - zio->io_offset, zio->io_stage, zio->io_error); - if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) - pipeline &= ZIO_ERROR_PIPELINE_MASK; - } - - while (((1U << ++zio->io_stage) & pipeline) == 0) - continue; - - ASSERT(zio->io_stage <= ZIO_STAGE_DONE); - ASSERT(zio->io_stalled == 0); - - /* - * See the comment in zio_next_stage_async() about per-CPU taskqs. - */ - if (((1U << zio->io_stage) & zio->io_async_stages) && - (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) && - !(zio->io_flags & ZIO_FLAG_METADATA)) { - taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; - (void) taskq_dispatch(tq, - (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); - } else { - zio_pipeline[zio->io_stage](zio); - } -} - -void -zio_next_stage_async(zio_t *zio) -{ - taskq_t *tq; - uint32_t pipeline = zio->io_pipeline; - - ASSERT(!MUTEX_HELD(&zio->io_lock)); - - if (zio->io_error) { - dprintf("zio %p vdev %s offset %llx stage %d error %d\n", - zio, vdev_description(zio->io_vd), - zio->io_offset, zio->io_stage, zio->io_error); - if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) - pipeline &= ZIO_ERROR_PIPELINE_MASK; - } - - while (((1U << ++zio->io_stage) & pipeline) == 0) - continue; - - ASSERT(zio->io_stage <= ZIO_STAGE_DONE); - ASSERT(zio->io_stalled == 0); - - /* - * For performance, we'll probably want two sets of task queues: - * per-CPU issue taskqs and per-CPU completion taskqs. 
The per-CPU - * part is for read performance: since we have to make a pass over - * the data to checksum it anyway, we want to do this on the same CPU - * that issued the read, because (assuming CPU scheduling affinity) - * that thread is probably still there. Getting this optimization - * right avoids performance-hostile cache-to-cache transfers. - * - * Note that having two sets of task queues is also necessary for - * correctness: if all of the issue threads get bogged down waiting - * for dependent reads (e.g. metaslab freelist) to complete, then - * there won't be any threads available to service I/O completion - * interrupts. - */ - if ((1U << zio->io_stage) & zio->io_async_stages) { - if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) - tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; - else - tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; - (void) taskq_dispatch(tq, - (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); - } else { - zio_pipeline[zio->io_stage](zio); - } -} - -static boolean_t -zio_alloc_should_fail(void) -{ - static uint16_t allocs = 0; - - return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0); -} - -/* - * Try to allocate an intent log block. Return 0 on success, errno on failure. - */ -int -zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, - uint64_t txg) -{ - int error; - - spa_config_enter(spa, RW_READER, FTAG); - - if (zio_zil_fail_shift && zio_alloc_should_fail()) { - spa_config_exit(spa, FTAG); - return (ENOSPC); - } - - /* - * We were passed the previous log blocks dva_t in bp->blk_dva[0]. - */ - error = metaslab_alloc(spa, size, new_bp, 1, txg, old_bp, B_TRUE); - - if (error == 0) { - BP_SET_LSIZE(new_bp, size); - BP_SET_PSIZE(new_bp, size); - BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); - BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); - BP_SET_LEVEL(new_bp, 0); - BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); - new_bp->blk_birth = txg; - } - - spa_config_exit(spa, FTAG); - - return (error); -} - -/* - * Free an intent log block. We know it can't be a gang block, so there's - * nothing to do except metaslab_free() it. - */ -void -zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) -{ - ASSERT(!BP_IS_GANG(bp)); - - spa_config_enter(spa, RW_READER, FTAG); - - metaslab_free(spa, bp, txg, B_FALSE); - - spa_config_exit(spa, FTAG); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c deleted file mode 100644 index f0d9a14..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. 
All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/zio.h> -#include <sys/zio_checksum.h> - -/* - * Checksum vectors. - * - * In the SPA, everything is checksummed. We support checksum vectors - * for three distinct reasons: - * - * 1. Different kinds of data need different levels of protection. - * For SPA metadata, we always want a very strong checksum. - * For user data, we let users make the trade-off between speed - * and checksum strength. - * - * 2. Cryptographic hash and MAC algorithms are an area of active research. - * It is likely that in future hash functions will be at least as strong - * as current best-of-breed, and may be substantially faster as well. - * We want the ability to take advantage of these new hashes as soon as - * they become available. - * - * 3. If someone develops hardware that can compute a strong hash quickly, - * we want the ability to take advantage of that hardware. - * - * Of course, we don't want a checksum upgrade to invalidate existing - * data, so we store the checksum *function* in five bits of the DVA. - * This gives us room for up to 32 different checksum functions. - * - * When writing a block, we always checksum it with the latest-and-greatest - * checksum function of the appropriate strength. When reading a block, - * we compare the expected checksum against the actual checksum, which we - * compute via the checksum function specified in the DVA encoding. - */ - -/*ARGSUSED*/ -static void -zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) -{ - ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); -} - -zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { - {{NULL, NULL}, 0, 0, "inherit"}, - {{NULL, NULL}, 0, 0, "on"}, - {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, "zilog"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"}, - {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"}, -}; - -uint8_t -zio_checksum_select(uint8_t child, uint8_t parent) -{ - ASSERT(child < ZIO_CHECKSUM_FUNCTIONS); - ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS); - ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); - - if (child == ZIO_CHECKSUM_INHERIT) - return (parent); - - if (child == ZIO_CHECKSUM_ON) - return (ZIO_CHECKSUM_ON_VALUE); - - return (child); -} - -/* - * Generate the checksum. - */ -void -zio_checksum(uint_t checksum, zio_cksum_t *zcp, void *data, uint64_t size) -{ - zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; - zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t zbt_cksum; - - ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); - ASSERT(ci->ci_func[0] != NULL); - - if (ci->ci_zbt) { - *zcp = zbt->zbt_cksum; - zbt->zbt_magic = ZBT_MAGIC; - ci->ci_func[0](data, size, &zbt_cksum); - zbt->zbt_cksum = zbt_cksum; - } else { - ci->ci_func[0](data, size, zcp); - } -} - -int -zio_checksum_error(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - zio_cksum_t zc = bp->blk_cksum; - uint_t checksum = BP_IS_GANG(bp) ? 
ZIO_CHECKSUM_GANG_HEADER : - BP_GET_CHECKSUM(bp); - int byteswap = BP_SHOULD_BYTESWAP(bp); - void *data = zio->io_data; - uint64_t size = ZIO_GET_IOSIZE(zio); - zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; - zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t actual_cksum, expected_cksum; - - if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) - return (EINVAL); - - if (ci->ci_zbt) { - if (checksum == ZIO_CHECKSUM_GANG_HEADER) - zio_set_gang_verifier(zio, &zc); - - if (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)) { - expected_cksum = zbt->zbt_cksum; - byteswap_uint64_array(&expected_cksum, - sizeof (zio_cksum_t)); - zbt->zbt_cksum = zc; - byteswap_uint64_array(&zbt->zbt_cksum, - sizeof (zio_cksum_t)); - ci->ci_func[1](data, size, &actual_cksum); - zbt->zbt_cksum = expected_cksum; - byteswap_uint64_array(&zbt->zbt_cksum, - sizeof (zio_cksum_t)); - } else { - expected_cksum = zbt->zbt_cksum; - zbt->zbt_cksum = zc; - ci->ci_func[0](data, size, &actual_cksum); - zbt->zbt_cksum = expected_cksum; - } - zc = expected_cksum; - } else { - ASSERT(!BP_IS_GANG(bp)); - ci->ci_func[byteswap](data, size, &actual_cksum); - } - - if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) - return (ECKSUM); - - if (zio_injection_enabled && !zio->io_error) - return (zio_handle_fault_injection(zio, ECKSUM)); - - return (0); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c deleted file mode 100644 index c563be4..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/compress.h> -#include <sys/spa.h> -#include <sys/zio.h> -#include <sys/zio_compress.h> - -/* - * Compression vectors. 
- */ - -zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { - {NULL, NULL, 0, "inherit"}, - {NULL, NULL, 0, "on"}, - {NULL, NULL, 0, "uncompressed"}, - {lzjb_compress, lzjb_decompress, 0, "lzjb"}, - {NULL, NULL, 0, "empty"}, - {gzip_compress, gzip_decompress, 1, "gzip-1"}, - {gzip_compress, gzip_decompress, 2, "gzip-2"}, - {gzip_compress, gzip_decompress, 3, "gzip-3"}, - {gzip_compress, gzip_decompress, 4, "gzip-4"}, - {gzip_compress, gzip_decompress, 5, "gzip-5"}, - {gzip_compress, gzip_decompress, 6, "gzip-6"}, - {gzip_compress, gzip_decompress, 7, "gzip-7"}, - {gzip_compress, gzip_decompress, 8, "gzip-8"}, - {gzip_compress, gzip_decompress, 9, "gzip-9"}, -}; - -uint8_t -zio_compress_select(uint8_t child, uint8_t parent) -{ - ASSERT(child < ZIO_COMPRESS_FUNCTIONS); - ASSERT(parent < ZIO_COMPRESS_FUNCTIONS); - ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON); - - if (child == ZIO_COMPRESS_INHERIT) - return (parent); - - if (child == ZIO_COMPRESS_ON) - return (ZIO_COMPRESS_ON_VALUE); - - return (child); -} - -int -zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp, - uint64_t *destsizep, uint64_t *destbufsizep) -{ - uint64_t *word, *word_end; - uint64_t ciosize, gapsize, destbufsize; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - char *dest; - uint_t allzero; - - ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); - ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); - - /* - * If the data is all zeroes, we don't even need to allocate - * a block for it. We indicate this by setting *destsizep = 0. - */ - allzero = 1; - word = src; - word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize); - while (word < word_end) { - if (*word++ != 0) { - allzero = 0; - break; - } - } - if (allzero) { - *destp = NULL; - *destsizep = 0; - *destbufsizep = 0; - return (1); - } - - if (cpfunc == ZIO_COMPRESS_EMPTY) - return (0); - - /* Compress at least 12.5% */ - destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE); - if (destbufsize == 0) - return (0); - dest = zio_buf_alloc(destbufsize); - - ciosize = ci->ci_compress(src, dest, (size_t)srcsize, - (size_t)destbufsize, ci->ci_level); - if (ciosize > destbufsize) { - zio_buf_free(dest, destbufsize); - return (0); - } - - /* Cool. We compressed at least as much as we were hoping to. */ - - /* For security, make sure we don't write random heap crap to disk */ - gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize; - if (gapsize != 0) { - bzero(dest + ciosize, gapsize); - ciosize += gapsize; - } - - ASSERT3U(ciosize, <=, destbufsize); - ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0); - *destp = dest; - *destsizep = ciosize; - *destbufsizep = destbufsize; - - return (1); -} - -int -zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, - void *dest, uint64_t destsize) -{ - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - - ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); - - return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c deleted file mode 100644 index 4cada09..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c +++ /dev/null @@ -1,315 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * ZFS fault injection - * - * To handle fault injection, we keep track of a series of zinject_record_t - * structures which describe which logical block(s) should be injected with a - * fault. These are kept in a global list. Each record corresponds to a given - * spa_t and maintains a special hold on the spa_t so that it cannot be deleted - * or exported while the injection record exists. - * - * Device level injection is done using the 'zi_guid' field. If this is set, it - * means that the error is destined for a particular device, not a piece of - * data. - * - * This is a rather poor data structure and algorithm, but we don't expect more - * than a few faults at any one time, so it should be sufficient for our needs. - */ - -#include <sys/arc.h> -#include <sys/zio_impl.h> -#include <sys/zfs_ioctl.h> -#include <sys/spa_impl.h> -#include <sys/vdev_impl.h> - -uint32_t zio_injection_enabled; - -typedef struct inject_handler { - int zi_id; - spa_t *zi_spa; - zinject_record_t zi_record; - list_node_t zi_link; -} inject_handler_t; - -static list_t inject_handlers; -static krwlock_t inject_lock; -static int inject_next_id = 1; - -/* - * Returns true if the given record matches the I/O in progress. - */ -static boolean_t -zio_match_handler(zbookmark_t *zb, uint64_t type, - zinject_record_t *record, int error) -{ - /* - * Check for a match against the MOS, which is based on type - */ - if (zb->zb_objset == 0 && record->zi_objset == 0 && - record->zi_object == 0) { - if (record->zi_type == DMU_OT_NONE || - type == record->zi_type) - return (record->zi_freq == 0 || - spa_get_random(100) < record->zi_freq); - else - return (B_FALSE); - } - - /* - * Check for an exact match. - */ - if (zb->zb_objset == record->zi_objset && - zb->zb_object == record->zi_object && - zb->zb_level == record->zi_level && - zb->zb_blkid >= record->zi_start && - zb->zb_blkid <= record->zi_end && - error == record->zi_error) - return (record->zi_freq == 0 || - spa_get_random(100) < record->zi_freq); - - return (B_FALSE); -} - -/* - * Determine if the I/O in question should return failure. Returns the errno - * to be returned to the caller. - */ -int -zio_handle_fault_injection(zio_t *zio, int error) -{ - int ret = 0; - inject_handler_t *handler; - - /* - * Ignore I/O not associated with any logical data. - */ - if (zio->io_logical == NULL) - return (0); - - /* - * Currently, we only support fault injection on reads. 
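The matching rules implemented by zio_match_handler() above are easiest to see from the record that drives them. The fragment below is a hedged sketch of how such a record might be populated before being registered with zio_inject_fault(); the field names and flags are the ones used in this file, but the pool name "tank", the concrete object numbers, and the surrounding context are hypothetical.

	zinject_record_t record = { 0 };
	int id;

	/*
	 * Ask for ECKSUM on roughly half of the reads of blocks 0-15,
	 * level 0, of object 42 in objset 5.  zi_freq == 0 would mean
	 * "match every time".
	 */
	record.zi_objset = 5;
	record.zi_object = 42;
	record.zi_level = 0;
	record.zi_start = 0;
	record.zi_end = 15;
	record.zi_error = ECKSUM;
	record.zi_freq = 50;

	(void) zio_inject_fault("tank", ZINJECT_FLUSH_ARC, &id, &record);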
- */ - if (zio->io_type != ZIO_TYPE_READ) - return (0); - - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { - - /* Ignore errors not destined for this pool */ - if (zio->io_spa != handler->zi_spa) - continue; - - /* Ignore device errors */ - if (handler->zi_record.zi_guid != 0) - continue; - - /* If this handler matches, return EIO */ - if (zio_match_handler(&zio->io_logical->io_bookmark, - zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, - &handler->zi_record, error)) { - ret = error; - break; - } - } - - rw_exit(&inject_lock); - - return (ret); -} - -int -zio_handle_device_injection(vdev_t *vd, int error) -{ - inject_handler_t *handler; - int ret = 0; - - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { - - if (vd->vdev_guid == handler->zi_record.zi_guid) { - if (handler->zi_record.zi_error == error) { - /* - * For a failed open, pretend like the device - * has gone away. - */ - if (error == ENXIO) - vd->vdev_stat.vs_aux = - VDEV_AUX_OPEN_FAILED; - ret = error; - break; - } - if (handler->zi_record.zi_error == ENXIO) { - ret = EIO; - break; - } - } - } - - rw_exit(&inject_lock); - - return (ret); -} - -/* - * Create a new handler for the given record. We add it to the list, adding - * a reference to the spa_t in the process. We increment zio_injection_enabled, - * which is the switch to trigger all fault injection. - */ -int -zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) -{ - inject_handler_t *handler; - int error; - spa_t *spa; - - /* - * If this is pool-wide metadata, make sure we unload the corresponding - * spa_t, so that the next attempt to load it will trigger the fault. - * We call spa_reset() to unload the pool appropriately. - */ - if (flags & ZINJECT_UNLOAD_SPA) - if ((error = spa_reset(name)) != 0) - return (error); - - if (!(flags & ZINJECT_NULL)) { - /* - * spa_inject_ref() will add an injection reference, which will - * prevent the pool from being removed from the namespace while - * still allowing it to be unloaded. - */ - if ((spa = spa_inject_addref(name)) == NULL) - return (ENOENT); - - handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); - - rw_enter(&inject_lock, RW_WRITER); - - *id = handler->zi_id = inject_next_id++; - handler->zi_spa = spa; - handler->zi_record = *record; - list_insert_tail(&inject_handlers, handler); - atomic_add_32(&zio_injection_enabled, 1); - - rw_exit(&inject_lock); - } - - /* - * Flush the ARC, so that any attempts to read this data will end up - * going to the ZIO layer. Note that this is a little overkill, but - * we don't have the necessary ARC interfaces to do anything else, and - * fault injection isn't a performance critical path. - */ - if (flags & ZINJECT_FLUSH_ARC) - arc_flush(); - - return (0); -} - -/* - * Returns the next record with an ID greater than that supplied to the - * function. Used to iterate over all handlers in the system. 
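The iteration contract described just above (each call returns the handler with the smallest id greater than *id, or ENOENT when none remain) would typically be driven from the ioctl path roughly as follows; the buffer size and the empty loop body are illustrative.

	zinject_record_t record;
	char pool[MAXNAMELEN];
	int id = 0;		/* handler ids are assigned starting at 1 */

	while (zio_inject_list_next(&id, pool, sizeof (pool), &record) == 0) {
		/* report this handler: pool name in 'pool', details in 'record' */
	}
	/* the walk terminates when zio_inject_list_next() returns ENOENT */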
- */ -int -zio_inject_list_next(int *id, char *name, size_t buflen, - zinject_record_t *record) -{ - inject_handler_t *handler; - int ret; - - mutex_enter(&spa_namespace_lock); - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) - if (handler->zi_id > *id) - break; - - if (handler) { - *record = handler->zi_record; - *id = handler->zi_id; - (void) strncpy(name, spa_name(handler->zi_spa), buflen); - ret = 0; - } else { - ret = ENOENT; - } - - rw_exit(&inject_lock); - mutex_exit(&spa_namespace_lock); - - return (ret); -} - -/* - * Clear the fault handler with the given identifier, or return ENOENT if none - * exists. - */ -int -zio_clear_fault(int id) -{ - inject_handler_t *handler; - int ret; - - rw_enter(&inject_lock, RW_WRITER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) - if (handler->zi_id == id) - break; - - if (handler == NULL) { - ret = ENOENT; - } else { - list_remove(&inject_handlers, handler); - spa_inject_delref(handler->zi_spa); - kmem_free(handler, sizeof (inject_handler_t)); - atomic_add_32(&zio_injection_enabled, -1); - ret = 0; - } - - rw_exit(&inject_lock); - - return (ret); -} - -void -zio_inject_init(void) -{ - list_create(&inject_handlers, sizeof (inject_handler_t), - offsetof(inject_handler_t, zi_link)); -} - -void -zio_inject_fini(void) -{ - list_destroy(&inject_handlers); -} diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c deleted file mode 100644 index fedae03..0000000 --- a/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ /dev/null @@ -1,801 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org> - * All rights reserved. - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * ZFS volume emulation driver. - * - * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. - * Volumes are accessed through the symbolic links named: - * - * /dev/zvol/dsk/<pool_name>/<dataset_name> - * /dev/zvol/rdsk/<pool_name>/<dataset_name> - * - * These links are created by the ZFS-specific devfsadm link generator. - * Volumes are persistent through reboot. No user command needs to be - * run before opening and using a device. 
- */ - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/kernel.h> -#include <sys/errno.h> -#include <sys/uio.h> -#include <sys/bio.h> -#include <sys/buf.h> -#include <sys/kmem.h> -#include <sys/conf.h> -#include <sys/cmn_err.h> -#include <sys/stat.h> -#include <sys/zap.h> -#include <sys/spa.h> -#include <sys/zio.h> -#include <sys/dsl_prop.h> -#include <sys/dkio.h> -#include <sys/byteorder.h> -#include <sys/sunddi.h> -#include <sys/dirent.h> -#include <sys/policy.h> -#include <sys/fs/zfs.h> -#include <sys/zfs_ioctl.h> -#include <sys/zil.h> -#include <sys/refcount.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_rlock.h> -#include <geom/geom.h> - -#include "zfs_namecheck.h" - -struct g_class zfs_zvol_class = { - .name = "ZFS::ZVOL", - .version = G_VERSION, -}; - -DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); - -#define ZVOL_OBJ 1ULL -#define ZVOL_ZAP_OBJ 2ULL - -static uint32_t zvol_minors; - -/* - * The in-core state of each volume. - */ -typedef struct zvol_state { - char zv_name[MAXPATHLEN]; /* pool/dd name */ - uint64_t zv_volsize; /* amount of space we advertise */ - uint64_t zv_volblocksize; /* volume block size */ - struct g_provider *zv_provider; /* GEOM provider */ - uint8_t zv_min_bs; /* minimum addressable block shift */ - uint8_t zv_readonly; /* hard readonly; like write-protect */ - objset_t *zv_objset; /* objset handle */ - uint32_t zv_mode; /* DS_MODE_* flags at open time */ - uint32_t zv_total_opens; /* total open count */ - zilog_t *zv_zilog; /* ZIL handle */ - uint64_t zv_txg_assign; /* txg to assign during ZIL replay */ - znode_t zv_znode; /* for range locking */ - int zv_state; - struct bio_queue_head zv_queue; - struct mtx zv_queue_mtx; /* zv_queue mutex */ -} zvol_state_t; - -/* - * zvol maximum transfer in one DMU tx. 
- */ -int zvol_maxphys = DMU_MAX_ACCESS/2; - -static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio); - -int -zvol_check_volsize(uint64_t volsize, uint64_t blocksize) -{ - if (volsize == 0) - return (EINVAL); - - if (volsize % blocksize != 0) - return (EINVAL); - -#ifdef _ILP32 - if (volsize - 1 > SPEC_MAXOFFSET_T) - return (EOVERFLOW); -#endif - return (0); -} - -int -zvol_check_volblocksize(uint64_t volblocksize) -{ - if (volblocksize < SPA_MINBLOCKSIZE || - volblocksize > SPA_MAXBLOCKSIZE || - !ISP2(volblocksize)) - return (EDOM); - - return (0); -} - -static void -zvol_readonly_changed_cb(void *arg, uint64_t newval) -{ - zvol_state_t *zv = arg; - - zv->zv_readonly = (uint8_t)newval; -} - -int -zvol_get_stats(objset_t *os, nvlist_t *nv) -{ - int error; - dmu_object_info_t doi; - uint64_t val; - - - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); - if (error) - return (error); - - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); - - error = dmu_object_info(os, ZVOL_OBJ, &doi); - - if (error == 0) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, - doi.doi_data_block_size); - } - - return (error); -} - -static zvol_state_t * -zvol_minor_lookup(const char *name) -{ - struct g_provider *pp; - struct g_geom *gp; - - g_topology_assert(); - - LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) { - LIST_FOREACH(pp, &gp->provider, provider) { - if (strcmp(pp->name + sizeof(ZVOL_DEV_DIR), name) == 0) - return (pp->private); - } - } - - return (NULL); -} - -static int -zvol_access(struct g_provider *pp, int acr, int acw, int ace) -{ - zvol_state_t *zv; - - g_topology_assert(); - - zv = pp->private; - if (zv == NULL) { - if (acr <= 0 && acw <= 0 && ace <= 0) - return (0); - return (pp->error); - } - - ASSERT(zv->zv_objset != NULL); - - if (acw > 0 && (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY))) - return (EROFS); - - zv->zv_total_opens += acr + acw + ace; - - return (0); -} - -/* - * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. - * - * We store data in the log buffers if it's small enough. - * Otherwise we will later flush the data out via dmu_sync(). - */ -ssize_t zvol_immediate_write_sz = 32768; - -static void -zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) -{ - uint32_t blocksize = zv->zv_volblocksize; - lr_write_t *lr; - - while (len) { - ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); - itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr)); - - itx->itx_wr_state = - len > zvol_immediate_write_sz ? 
WR_INDIRECT : WR_NEED_COPY; - itx->itx_private = zv; - lr = (lr_write_t *)&itx->itx_lr; - lr->lr_foid = ZVOL_OBJ; - lr->lr_offset = off; - lr->lr_length = nbytes; - lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t); - BP_ZERO(&lr->lr_blkptr); - - (void) zil_itx_assign(zv->zv_zilog, itx, tx); - len -= nbytes; - off += nbytes; - } -} - -static void -zvol_start(struct bio *bp) -{ - zvol_state_t *zv; - - switch (bp->bio_cmd) { - case BIO_READ: - case BIO_WRITE: - case BIO_FLUSH: - zv = bp->bio_to->private; - ASSERT(zv != NULL); - mtx_lock(&zv->zv_queue_mtx); - bioq_insert_tail(&zv->zv_queue, bp); - wakeup_one(&zv->zv_queue); - mtx_unlock(&zv->zv_queue_mtx); - break; - case BIO_DELETE: - case BIO_GETATTR: - default: - g_io_deliver(bp, EOPNOTSUPP); - break; - } -} - -static void -zvol_serve_one(zvol_state_t *zv, struct bio *bp) -{ - uint64_t off, volsize; - size_t size, resid; - char *addr; - objset_t *os; - rl_t *rl; - int error = 0; - boolean_t reading; - - off = bp->bio_offset; - volsize = zv->zv_volsize; - - os = zv->zv_objset; - ASSERT(os != NULL); - - addr = bp->bio_data; - resid = bp->bio_length; - - error = 0; - - /* - * There must be no buffer changes when doing a dmu_sync() because - * we can't change the data whilst calculating the checksum. - * A better approach than a per zvol rwlock would be to lock ranges. - */ - reading = (bp->bio_cmd == BIO_READ); - rl = zfs_range_lock(&zv->zv_znode, off, resid, - reading ? RL_READER : RL_WRITER); - - while (resid != 0 && off < volsize) { - - size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */ - - if (size > volsize - off) /* don't write past the end */ - size = volsize - off; - - if (reading) { - error = dmu_read(os, ZVOL_OBJ, off, size, addr); - } else { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_write(os, ZVOL_OBJ, off, size, addr, tx); - zvol_log_write(zv, tx, off, size); - dmu_tx_commit(tx); - } - } - if (error) - break; - off += size; - addr += size; - resid -= size; - } - zfs_range_unlock(rl); - - bp->bio_completed = bp->bio_length - resid; - if (bp->bio_completed < bp->bio_length) - bp->bio_error = (off > volsize ? EINVAL : error); -} - -static void -zvol_worker(void *arg) -{ - zvol_state_t *zv; - struct bio *bp; - - zv = arg; - for (;;) { - mtx_lock(&zv->zv_queue_mtx); - bp = bioq_takefirst(&zv->zv_queue); - if (bp == NULL) { - if (zv->zv_state == 1) { - zv->zv_state = 2; - wakeup(&zv->zv_state); - mtx_unlock(&zv->zv_queue_mtx); - kproc_exit(0); - } - msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP, - "zvol:io", 0); - continue; - } - mtx_unlock(&zv->zv_queue_mtx); - switch (bp->bio_cmd) { - case BIO_FLUSH: - break; - case BIO_READ: - case BIO_WRITE: - zvol_serve_one(zv, bp); - break; - } - - if (bp->bio_cmd != BIO_READ && !zil_disable) - zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); - - g_io_deliver(bp, bp->bio_error); - } -} - -void -zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx) -{ - zfs_create_data_t *zc = arg; - int error; - uint64_t volblocksize, volsize; - - VERIFY(nvlist_lookup_uint64(zc->zc_props, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); - if (nvlist_lookup_uint64(zc->zc_props, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) - volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); - - /* - * These properites must be removed from the list so the generic - * property setting step won't apply to them. 
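To make the per-block splitting in zvol_log_write() above concrete, here is a standalone sketch of the same arithmetic with P2PHASE and MIN redefined locally; the block size, offset, and length are arbitrary example values, not anything taken from the code.

#include <stdint.h>
#include <stdio.h>

#define	P2PHASE(x, align)	((x) & ((align) - 1))
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

/*
 * Each TX_WRITE record covers at most the remainder of the current
 * volblocksize-sized block, so a later dmu_sync() deals with exactly
 * one block per record.  With blocksize 8192, offset 12800 and length
 * 20000 this prints records of 3584, 8192, 8192 and 32 bytes.
 */
int
main(void)
{
	uint64_t blocksize = 8192, off = 12800, len = 20000;

	while (len != 0) {
		uint64_t nbytes =
		    MIN(len, blocksize - P2PHASE(off, blocksize));

		printf("record: offset %ju length %ju\n",
		    (uintmax_t)off, (uintmax_t)nbytes);
		len -= nbytes;
		off += nbytes;
	}
	return (0);
}

The same capping idea appears again in zvol_serve_one(), where each pass through the loop is limited to MIN(resid, zvol_maxphys) so that no single DMU transaction exceeds zvol_maxphys.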
- */ - VERIFY(nvlist_remove_all(zc->zc_props, - zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); - (void) nvlist_remove_all(zc->zc_props, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); - - error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, - DMU_OT_NONE, 0, tx); - ASSERT(error == 0); - - error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, - DMU_OT_NONE, 0, tx); - ASSERT(error == 0); - - error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); - ASSERT(error == 0); -} - -/* - * Replay a TX_WRITE ZIL transaction that didn't get committed - * after a system failure - */ -static int -zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) -{ - objset_t *os = zv->zv_objset; - char *data = (char *)(lr + 1); /* data follows lr_write_t */ - uint64_t off = lr->lr_offset; - uint64_t len = lr->lr_length; - dmu_tx_t *tx; - int error; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); - error = dmu_tx_assign(tx, zv->zv_txg_assign); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_write(os, ZVOL_OBJ, off, len, data, tx); - dmu_tx_commit(tx); - } - - return (error); -} - -/* ARGSUSED */ -static int -zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap) -{ - return (ENOTSUP); -} - -/* - * Callback vectors for replaying records. - * Only TX_WRITE is needed for zvol. - */ -zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { - zvol_replay_err, /* 0 no such transaction type */ - zvol_replay_err, /* TX_CREATE */ - zvol_replay_err, /* TX_MKDIR */ - zvol_replay_err, /* TX_MKXATTR */ - zvol_replay_err, /* TX_SYMLINK */ - zvol_replay_err, /* TX_REMOVE */ - zvol_replay_err, /* TX_RMDIR */ - zvol_replay_err, /* TX_LINK */ - zvol_replay_err, /* TX_RENAME */ - zvol_replay_write, /* TX_WRITE */ - zvol_replay_err, /* TX_TRUNCATE */ - zvol_replay_err, /* TX_SETATTR */ - zvol_replay_err, /* TX_ACL */ -}; - -/* - * Create a minor node for the specified volume. 
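The replay table above only does real work for TX_WRITE; every other slot deliberately fails with ENOTSUP. Conceptually, ZIL replay routes each committed-but-unapplied record by its transaction type, roughly as sketched below; this fragment is illustrative only, and the real dispatch lives in the ZIL code rather than in this file.

	/* illustrative dispatch: 'lr' is a log record handed back by the ZIL */
	uint64_t txtype = ((lr_t *)lr)->lrc_txtype;

	if (txtype == 0 || txtype >= TX_MAX_TYPE)
		return (EINVAL);	/* unknown or corrupt record */

	error = zvol_replay_vector[txtype](zv, lr, byteswap);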
- */ -int -zvol_create_minor(const char *name, dev_t dev) -{ - struct g_provider *pp; - struct g_geom *gp; - zvol_state_t *zv; - objset_t *os; - dmu_object_info_t doi; - uint64_t volsize; - int ds_mode = DS_MODE_PRIMARY; - int error; - - DROP_GIANT(); - g_topology_lock(); - - if ((zv = zvol_minor_lookup(name)) != NULL) { - error = EEXIST; - goto end; - } - - if (strchr(name, '@') != 0) - ds_mode |= DS_MODE_READONLY; - - error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os); - if (error) - goto end; - - g_topology_unlock(); - PICKUP_GIANT(); - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); - DROP_GIANT(); - g_topology_lock(); - if (error) { - dmu_objset_close(os); - goto end; - } - - gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); - gp->start = zvol_start; - gp->access = zvol_access; - pp = g_new_providerf(gp, "%s/%s", ZVOL_DEV_DIR, name); - pp->mediasize = volsize; - pp->sectorsize = DEV_BSIZE; - - zv = kmem_zalloc(sizeof(*zv), KM_SLEEP); - (void) strcpy(zv->zv_name, name); - zv->zv_min_bs = DEV_BSHIFT; - zv->zv_provider = pp; - zv->zv_volsize = pp->mediasize; - zv->zv_objset = os; - zv->zv_mode = ds_mode; - zv->zv_zilog = zil_open(os, zvol_get_data); - mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, - sizeof (rl_t), offsetof(rl_t, r_node)); - - - /* get and cache the blocksize */ - error = dmu_object_info(os, ZVOL_OBJ, &doi); - ASSERT(error == 0); - zv->zv_volblocksize = doi.doi_data_block_size; - - zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector); - - /* XXX this should handle the possible i/o error */ - VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset), - "readonly", zvol_readonly_changed_cb, zv) == 0); - - pp->private = zv; - g_error_provider(pp, 0); - - bioq_init(&zv->zv_queue); - mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF); - zv->zv_state = 0; - kproc_create(zvol_worker, zv, NULL, 0, 0, "zvol:worker %s", pp->name); - - zvol_minors++; -end: - g_topology_unlock(); - PICKUP_GIANT(); - - return (error); -} - -/* - * Remove minor node for the specified volume. 
- */ -int -zvol_remove_minor(const char *name) -{ - struct g_provider *pp; - zvol_state_t *zv; - int error = 0; - - DROP_GIANT(); - g_topology_lock(); - - if ((zv = zvol_minor_lookup(name)) == NULL) { - error = ENXIO; - goto end; - } - - if (zv->zv_total_opens != 0) { - error = EBUSY; - goto end; - } - - VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset), - "readonly", zvol_readonly_changed_cb, zv) == 0); - - mtx_lock(&zv->zv_queue_mtx); - zv->zv_state = 1; - wakeup_one(&zv->zv_queue); - while (zv->zv_state != 2) - msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0); - mtx_unlock(&zv->zv_queue_mtx); - mtx_destroy(&zv->zv_queue_mtx); - - pp = zv->zv_provider; - pp->private = NULL; - g_wither_geom(pp->geom, ENXIO); - - zil_close(zv->zv_zilog); - zv->zv_zilog = NULL; - dmu_objset_close(zv->zv_objset); - zv->zv_objset = NULL; - avl_destroy(&zv->zv_znode.z_range_avl); - mutex_destroy(&zv->zv_znode.z_range_lock); - - kmem_free(zv, sizeof(*zv)); - - zvol_minors--; -end: - g_topology_unlock(); - PICKUP_GIANT(); - - return (error); -} - -int -zvol_set_volsize(const char *name, dev_t dev, uint64_t volsize) -{ - zvol_state_t *zv; - dmu_tx_t *tx; - int error; - dmu_object_info_t doi; - - DROP_GIANT(); - g_topology_lock(); - - if ((zv = zvol_minor_lookup(name)) == NULL) { - error = ENXIO; - goto end; - } - - if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 || - (error = zvol_check_volsize(volsize, - doi.doi_data_block_size)) != 0) { - goto end; - } - - if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) { - error = EROFS; - goto end; - } - - tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); - dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - goto end; - } - - error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1, - &volsize, tx); - if (error == 0) { - error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, volsize, - DMU_OBJECT_END, tx); - } - - dmu_tx_commit(tx); - - if (error == 0) { - zv->zv_volsize = volsize; - zv->zv_provider->mediasize = volsize; /* XXX: Not supported. */ - } -end: - g_topology_unlock(); - PICKUP_GIANT(); - - return (error); -} - -int -zvol_set_volblocksize(const char *name, uint64_t volblocksize) -{ - zvol_state_t *zv; - dmu_tx_t *tx; - int error; - - DROP_GIANT(); - g_topology_lock(); - - if ((zv = zvol_minor_lookup(name)) == NULL) { - error = ENXIO; - goto end; - } - - if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) { - error = EROFS; - goto end; - } - - tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_bonus(tx, ZVOL_OBJ); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, - volblocksize, 0, tx); - if (error == ENOTSUP) - error = EBUSY; - dmu_tx_commit(tx); - /* XXX: Not supported. */ -#if 0 - if (error == 0) - zv->zv_provider->sectorsize = zc->zc_volblocksize; -#endif - } -end: - g_topology_unlock(); - PICKUP_GIANT(); - - return (error); -} - -void -zvol_get_done(dmu_buf_t *db, void *vzgd) -{ - zgd_t *zgd = (zgd_t *)vzgd; - rl_t *rl = zgd->zgd_rl; - - dmu_buf_rele(db, vzgd); - zfs_range_unlock(rl); - zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp))); - kmem_free(zgd, sizeof (zgd_t)); -} - -/* - * Get data to generate a TX_WRITE intent log record. 
- */ -static int -zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) -{ - zvol_state_t *zv = arg; - objset_t *os = zv->zv_objset; - dmu_buf_t *db; - rl_t *rl; - zgd_t *zgd; - uint64_t boff; /* block starting offset */ - int dlen = lr->lr_length; /* length of user data */ - int error; - - ASSERT(zio); - ASSERT(dlen != 0); - - /* - * Write records come in two flavors: immediate and indirect. - * For small writes it's cheaper to store the data with the - * log record (immediate); for large writes it's cheaper to - * sync the data and get a pointer to it (indirect) so that - * we don't have to write the data twice. - */ - if (buf != NULL) /* immediate write */ - return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf)); - - zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_zilog = zv->zv_zilog; - zgd->zgd_bp = &lr->lr_blkptr; - - /* - * Lock the range of the block to ensure that when the data is - * written out and it's checksum is being calculated that no other - * thread can change the block. - */ - boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t); - rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize, - RL_READER); - zgd->zgd_rl = rl; - - VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db)); - error = dmu_sync(zio, db, &lr->lr_blkptr, - lr->lr_common.lrc_txg, zvol_get_done, zgd); - if (error == 0) - zil_add_vdev(zv->zv_zilog, - DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr))); - /* - * If we get EINPROGRESS, then we need to wait for a - * write IO initiated by dmu_sync() to complete before - * we can release this dbuf. We will finish everything - * up in the zvol_get_done() callback. - */ - if (error == EINPROGRESS) - return (0); - dmu_buf_rele(db, zgd); - zfs_range_unlock(rl); - kmem_free(zgd, sizeof (zgd_t)); - return (error); -} - -int -zvol_busy(void) -{ - return (zvol_minors != 0); -} - -void -zvol_init(void) -{ - ZFS_LOG(1, "ZVOL Initialized."); -} - -void -zvol_fini(void) -{ - ZFS_LOG(1, "ZVOL Deinitialized."); -} |
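To summarize the two TX_WRITE flavors handled above: for an immediate record the ZIL later calls zvol_get_data() with a buffer inside the log record and the data is simply dmu_read() into it, whereas for an indirect record it calls with buf == NULL, the block is written in place via dmu_sync() under the range lock, and only lr_blkptr is logged. The fragment below is a condensed, hypothetical view of that contract from the caller's side; log_buf, lr, zv, and zio stand in for the ZIL's own state and the real caller is the ZIL commit path, which chose itx_wr_state back in zvol_log_write().

	if (itx->itx_wr_state == WR_NEED_COPY) {
		/* immediate: copy the data into the log record itself */
		error = zvol_get_data(zv, lr, log_buf, zio);
	} else {
		/* WR_INDIRECT: dmu_sync() the block, fill in lr->lr_blkptr */
		error = zvol_get_data(zv, lr, NULL, zio);
	}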